├── LICENSE
├── MANIFEST.in
├── PyCLUE.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── requires.txt
│   └── top_level.txt
├── PyCLUE
│   ├── tasks
│   │   ├── __init__.py
│   │   ├── run_classifier.py
│   │   ├── run_cmrc_drcd.py
│   │   ├── run_ner.py
│   │   └── run_squad.py
│   └── utils
│       ├── __init__.py
│       ├── classifier_utils
│       │   ├── __init__.py
│       │   ├── bert_utils.py
│       │   ├── core.py
│       │   ├── modeling.py
│       │   ├── optimization_finetuning.py
│       │   └── tokenization.py
│       ├── configs
│       │   ├── __init__.py
│       │   ├── data_configs.py
│       │   └── model_configs.py
│       └── file_utils.py
├── README.md
├── dist
│   ├── PyCLUE-2019.12.5-py3-none-any.whl
│   └── PyCLUE-2019.12.5.tar.gz
├── examples
│   └── classifications
│       ├── run_clue_task.py
│       ├── run_clue_task_tpu.py
│       ├── run_user_task.py
│       └── run_user_task_tpu.py
├── requirements.txt
├── setup.py
└── upload.sh
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | Copyright (C)
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
7 | 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | global-include *
2 | 
-------------------------------------------------------------------------------- /PyCLUE.egg-info/PKG-INFO: --------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: PyCLUE
3 | Version: 2019.12.5
4 | Summary: Python toolkit for Chinese Language Understanding Evaluation benchmark.
5 | Home-page: https://github.com/ChineseGLUE/PyCLUE
6 | Author: Liu Shaoweihua
7 | Author-email: liushaoweihua@126.com
8 | Maintainer: CLUE
9 | Maintainer-email: chineseGLUE@163.com
10 | License: UNKNOWN
11 | Description: # PyCLUE
12 | 
13 | Python toolkit for Chinese Language Understanding Evaluation benchmark.
14 | 
15 | A Python toolkit for the Chinese Language Understanding Evaluation benchmark: quickly evaluate representative datasets against baseline (pre-trained) models, and pick a suitable baseline (pre-trained) model to apply rapidly to your own data.
16 | 
17 | ## Install PyCLUE
18 | 
19 | PyCLUE can now be installed via pip:
20 | 
21 | ```bash
22 | pip install PyCLUE
23 | ```
24 | 
25 | ## Use PyCLUE
26 | 
27 | ### Classification / Sentence-Pair Tasks
28 | 
29 | #### Quickly evaluate the CLUE datasets
30 | 
31 | The following runs on CPU/GPU; the complete example is in `PyCLUE/examples/classifications/run_clue_task.py`, and a TPU example is in `PyCLUE/examples/classifications/run_clue_task_tpu.py`.
32 | 
33 | ```python
34 | # Specify which GPU to use; omit this if no GPU is available
35 | import os
36 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
37 | 
38 | # Import the components for classification/sentence-pair evaluation tasks
39 | from PyCLUE.tasks.run_classifier import clue_tasks, configs
40 | ```
41 | 
42 | The `clue_tasks` function takes the evaluation-task configuration as a `dict`; that `dict` is `configs`, whose defaults and meanings are:
43 | 
44 | ```json
45 | {
46 |     # Evaluation task name
47 |     # CLUE benchmark: afqmc, cmnli, copa, csl, iflytek, tnews, wsc
48 |     # chineseGLUE: bq, xnli, lcqmc, inews, thucnews
49 |     "task_name": "afqmc",
50 |     # Pre-trained language model
51 |     # If this is None, the three parameters vocab_file, bert_config_file and init_checkpoint must be given.
52 |     # Alternatively, one of the following baseline (pre-trained) models can be named directly:
53 |     # bert, bert_wwm_ext, albert_xlarge, albert_large, albert_base, albert_base_ext,
54 |     # albert_small, albert_tiny, roberta, roberta_wwm_ext, roberta_wwm_ext_large
55 |     "pretrained_lm_name": "bert",
56 |     # Actions to perform
57 |     "do_train": true,
58 |     "do_eval": true,
59 |     "do_predict": true,
60 |     # Data directory
61 |     # Defaults to PyCLUE/datasets if not specified
62 |     # Accepted file extensions include "txt", "tsv" and "json"
63 |     # If do_train = True, the data directory must contain at least a train file whose name starts with "train"
64 |     # If do_eval = True, the data directory must contain at least a dev file whose name starts with "dev"
65 |     # If do_predict = True, the data directory must contain at least a test file whose name starts with "test"
66 |     "data_dir": null,
67 |     # Output directory
68 |     # Defaults to PyCLUE/task_outputs if not specified
69 |     # Holds the trained model files, the tf_record data, and the result files dev_results.txt/test_results.txt/test_results.tsv
70 |     "output_dir": null,
71 |     # Specify the three pre-trained language model files yourself
72 |     "vocab_file": null,
73 |     "bert_config_file": null,
74 |     "init_checkpoint": null,
75 |     # Training parameters
76 |     "do_lower_case": true,
77 |     "max_seq_length": 128,
78 |     "train_batch_size": 8,
79 |     "eval_batch_size": 8,
80 |     "predict_batch_size": 8,
81 |     "learning_rate": 2e-05,
82 |     "num_train_epochs": 1,
83 |     "warmup_proportion": 0.1,
84 |     "save_checkpoints_steps": 1000,
85 |     "iterations_per_loop": 1000,
86 |     # TPU options
87 |     "use_tpu": false,
88 |     "tpu_name": null,
89 |     "tpu_zone": null,
90 |     "gcp_project": null,
91 |     "master": null,
92 |     "num_tpu_cores": 8,
93 |     # Whether to print the training process
94 |     "verbose": 0
95 | }
96 | ```
97 | 
98 | Run the evaluation as follows:
99 | 
100 | ```python
101 | # task_name
102 | configs["task_name"] = "wsc"
103 | 
104 | # pretrained_lm_name
105 | configs["pretrained_lm_name"] = "bert"
106 | 
107 | # actions
108 | configs["do_train"] = True
109 | configs["do_eval"] = True
110 | configs["do_predict"] = True
111 | 
112 | # train parameters
113 | configs["max_seq_length"] = 128
114 | configs["train_batch_size"] = 8
115 | configs["learning_rate"] = 2e-5
116 | configs["warmup_proportion"] = 0.1
117 | configs["num_train_epochs"] = 50
118 | 
119 | # show training process
120 | configs["verbose"] = 0
121 | 
122 | wsc_result = clue_tasks(configs)
123 | print(wsc_result)
124 | ```
125 | 
126 | The evaluation results are returned by `clue_tasks` in the following form:
127 | 
128 | ```json
129 | {
130 |     # Metrics on the dev set
131 |     "dev_res":{
132 |         "eval_accuracy": "",
133 |         "eval_loss": "",
134 |         "global_step": "",
135 |         "loss": ""
136 |     },
137 |     # Metrics on the test set (some test sets have labels, so these numbers are meaningful; others have no labels, so they are not)
138 |     "test_res":{
139 |         "eval_accuracy": "",
140 |         "eval_loss": "",
141 |         "global_step": "",
142 |         "loss": ""
143 |     },
144 |     # Predictions on the test set
145 |     "test_outputs": [
146 |         {
147 |             "guid": "test-0",
148 |             "text_a": "",
149 |             "text_b": "",
150 |             "label": ""
151 |         },
152 |         ...
153 |     ]
154 | }
155 | ```
156 | 
157 | The results are also saved under the output directory given in `configs`, `${output_dir}/classifications/${task_name}/${pretrained_lm_name}`; in this example, `PyCLUE/task_outputs/classifications/wsc/bert`. There, `dev_results.txt` holds the dev-set metrics and `test_results.txt` holds the test-set metrics (some test sets have labels, so these numbers are meaningful; others have no labels, so they are not). `test_results.tsv` holds the test-set predictions, in the following form:
158 | 
159 | ```json
160 | {"guid": "test-0", "text_a": "_毛德和朵拉_看到火车冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的轰鸣声和狂野而清晰的汽笛声。当[它们]行进到近处时,马都跑开了。", "text_b": null, "label": "false"}
161 | {"guid": "test-1", "text_a": "毛德和朵拉看到_火车_冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的轰鸣声和狂野而清晰的汽笛声。当[它们]行进到近处时,马都跑开了。", "text_b": null, "label": "false"}
162 | {"guid": "test-2", "text_a": "毛德和朵拉看到火车冲过大草原,引擎上冒着滚滚_黑烟_。从远处就能听见它们的轰鸣声和狂野而清晰的汽笛声。当[它们]行进到近处时,马都跑开了。", "text_b": null, "label": "false"}
163 | {"guid": "test-3", "text_a": "毛德和朵拉看到火车冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的_轰鸣声_和狂野而清晰的汽笛声。当[它们]行进到近处时,马都跑开了。", "text_b": null, "label": "false"}
164 | {"guid": "test-4", "text_a": "毛德和朵拉看到火车冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的轰鸣声和狂野而清晰的_汽笛声_。当[它们]行进到近处时,马都跑开了。", "text_b": null, "label": "false"}
165 | {"guid": "test-5", "text_a": "毛德和朵拉看到火车冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的轰鸣声和狂野而清晰的汽笛声。当[它们]行进到近处时,_马_都跑开了。", "text_b": null, "label": "false"}
166 | ```
167 | 
168 | #### Apply to a custom dataset
169 | 
170 | The following runs on CPU/GPU; the complete example is in `PyCLUE/examples/classifications/run_user_task.py`, and a TPU example is in `PyCLUE/examples/classifications/run_user_task_tpu.py`.
171 | 
172 | ```python
173 | # Specify which GPU to use; omit this if no GPU is available
174 | import os
175 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
176 | 
177 | # Import the components for classification/sentence-pair evaluation tasks
178 | from PyCLUE.tasks.run_classifier import user_tasks, configs
179 | 
180 | # task_name: default is "user_defined_task"
181 | configs["task_name"] = ""
182 | 
183 | # pretrained_lm_name
184 | configs["pretrained_lm_name"] = "bert"
185 | 
186 | # actions
187 | configs["do_train"] = True
188 | configs["do_eval"] = True
189 | configs["do_predict"] = True
190 | 
191 | # data_dir
192 | configs["data_dir"] = "my_file_path"
193 | 
194 | # data configs
195 | configs["labels"] = ["0", "1"]
196 | # label_column, text_a_column, text_b_column & delimiter:
197 | # examples_1: a txt file, delimiter "_!_", sentence-pair task:
198 | #     0_!_我想要回家_!_我准备回家
199 | #     1_!_我想要回家_!_我准备吃饭
200 | #     >> label_column = 0, text_a_column = 1, text_b_column = 2, delimiter = "_!_"
201 | # examples_2: a tsv file, delimiter "\t", classification task:
202 | #     0\t我很生气
203 | #     1\t我很开心
204 | #     >> label_column = 0, text_a_column = 1, text_b_column = None, delimiter = "\t"
205 | # examples_3: a json file, sentence-pair task:
206 | #     {"label": 0, "sentence1": "我想要回家", "sentence2": "我很生气"}
207 | #     >> label_column = "label", text_a_column = "sentence1", text_b_column = "sentence2", delimiter = None
208 | configs["label_column"] = ""
209 | configs["text_a_column"] = ""
210 | configs["text_b_column"] = ""
211 | configs["delimiter"] = ""
212 | # ignore_header
213 | # Whether to drop the first line (set to True when the first line is a header describing the columns)
214 | configs["ignore_header"] = False
215 | # min_seq_length
216 | # Drop training examples whose shortest text is shorter than min_seq_length
217 | configs["min_seq_length"] = 3
218 | # file_type
219 | # Data file extension: may be "txt", "tsv" or "json"
220 | configs["file_type"] = ""
221 | 
222 | # output_dir
223 | configs["output_dir"] = ""
224 | 
225 | # Pre-trained language model files
226 | # If pretrained_lm_name is not None, the following need not be specified.
227 | configs["vocab_file"] = "vocab.txt"
228 | configs["bert_config_file"] = "XXX_config.json"
229 | configs["init_checkpoint"] = "XXX_model.ckpt"
230 | 
231 | # train parameters
232 | configs["max_seq_length"] = 128
233 | configs["train_batch_size"] = 8
234 | configs["learning_rate"] = 2e-5
235 | configs["warmup_proportion"] = 0.1
236 | configs["num_train_epochs"] = 50
237 | 
238 | # show training process
239 | configs["verbose"] = 0
240 | 
241 | my_result = user_tasks(configs)
242 | print(my_result)
243 | ```
244 | 
245 | The results are returned and saved in the same form as when evaluating the CLUE datasets. A concrete column mapping is sketched below.
246 | 
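For a two-column TSV classification file like `examples_2` above, the key data settings would look as follows (the task name and paths here are illustrative, not part of the package):

```python
# Minimal sketch for a classification task stored as TSV, e.g.:
# 0\t我很生气
# 1\t我很开心
configs["task_name"] = "my_sentiment_task"   # illustrative name
configs["data_dir"] = "/path/to/my_data"     # holds train.tsv / dev.tsv / test.tsv
configs["labels"] = ["0", "1"]
configs["label_column"] = 0      # first column is the label
configs["text_a_column"] = 1     # second column is the sentence
configs["text_b_column"] = None  # single-sentence task
configs["delimiter"] = "\t"
configs["file_type"] = "tsv"
```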
247 | ### Reading Comprehension Tasks
248 | 
249 | #### Quickly evaluate the CLUE datasets
250 | 
251 | Coming soon.
252 | 
253 | #### Apply to a custom dataset
254 | 
255 | Coming soon.
256 | 
257 | ### Named Entity Recognition Tasks
258 | 
259 | #### Quickly evaluate the CLUE datasets
260 | 
261 | Coming soon.
262 | 
263 | #### Apply to a custom dataset
264 | 
265 | Coming soon.
266 | 
267 | ## Baseline (Pre-trained) Models
268 | 
269 | **The following models are currently supported:**
270 | 
271 | 1. [BERT-base](https://github.com/google-research/bert)
272 | 2. [BERT-wwm-ext](https://github.com/ymcui/Chinese-BERT-wwm)
273 | 3. [albert_xlarge](https://github.com/brightmart/albert_zh)
274 | 4. [albert_large](https://github.com/brightmart/albert_zh)
275 | 5. [albert_base](https://github.com/brightmart/albert_zh)
276 | 6. [albert_base_ext](https://github.com/brightmart/albert_zh)
277 | 7. [albert_small](https://github.com/brightmart/albert_zh)
278 | 8. [albert_tiny](https://github.com/brightmart/albert_zh)
279 | 9. [roberta](https://github.com/brightmart/roberta_zh)
280 | 10. [roberta_wwm_ext](https://github.com/ymcui/Chinese-BERT-wwm)
281 | 11. [roberta_wwm_ext_large](https://github.com/ymcui/Chinese-BERT-wwm)
282 | 
283 | Coming soon:
284 | 
285 | 1. [XLNet_mid](https://github.com/ymcui/Chinese-PreTrained-XLNet)
286 | 2. [ERNIE_base](https://github.com/PaddlePaddle/ERNIE)
287 | 
288 | ## Supported Task Types
289 | 
290 | ### Classification Tasks
291 | 
292 | **The following datasets are currently supported:**
293 | 
294 | #### CLUEbenchmark Tasks
295 | 
296 | Reference: https://github.com/CLUEbenchmark/CLUE
297 | 
298 | 1. **AFQMC Ant Financial Semantic Similarity**
299 | 
300 |    ```
301 |    Data size: train (34,334), dev (4,316), test (3,861)
302 |    Example:
303 |    {"sentence1": "双十一花呗提额在哪", "sentence2": "里可以提花呗额度", "label": "0"}
304 |    Each record has three fields, in order: sentence 1, sentence 2, and a similarity label, where label 1 means sentence1 and sentence2 have similar meanings and 0 means their meanings differ.
305 |    ```
306 | 
307 | 2. **TNEWS' Toutiao Chinese News (Short Text) Classification**
308 | 
309 |    ```
310 |    Data size: train (266,000), dev (57,000), test (57,000)
311 |    Example:
312 |    {"label": "102", "label_des": "news_entertainment", "sentence": "江疏影甜甜圈自拍,迷之角度竟这么好看,美吸引一切事物"}
313 |    Each record has three fields, in order: category ID, category name, and the news string (title only).
314 |    ```
315 | 
316 | 3. **IFLYTEK' Long Text Classification**
317 | 
318 |    This dataset contains more than 17,000 labeled long texts describing mobile apps, covering application topics from daily life in 119 categories: "打车": 0, "地图导航": 1, "免费WIFI": 2, "租车": 3, …, "女性": 115, "经营": 116, "收款": 117, "其他": 118 (represented by 0-118).
319 | 
320 |    ```
321 |    Data size: train (12,133), dev (2,599), test (2,600)
322 |    Example:
323 |    {"label": "110", "label_des": "社区超市", "sentence": "朴朴快送超市创立于2016年,专注于打造移动端30分钟即时配送一站式购物平台,商品品类包含水果、蔬菜、肉禽蛋奶、海鲜水产、粮油调味、酒水饮料、休闲食品、日用品、外卖等。朴朴公司希望能以全新的商业模式,更高效快捷的仓储配送模式,致力于成为更快、更好、更多、更省的在线零售平台,带给消费者更好的消费体验,同时推动中国食品安全进程,成为一家让社会尊敬的互联网公司。,朴朴一下,又好又快,1.配送时间提示更加清晰友好2.保障用户隐私的一些优化3.其他提高使用体验的调整4.修复了一些已知bug"}
324 |    Each record has three fields, in order: category ID, category name, and the text content.
325 |    ```
326 | 
327 | 4. **CMNLI Chinese Multi-Genre NLI**
328 | 
329 |    CMNLI consists of two parts, XNLI and MNLI, with data drawn from genres such as fiction, telephone, travel, government and slate. The original MNLI and XNLI data were converted between Chinese and English. The original training sets are kept; XNLI dev and MNLI matched are merged into the CMNLI dev set, XNLI test and MNLI mismatched are merged into the CMNLI test set, and the order is shuffled. The dataset is used to judge whether two given sentences stand in an entailment, neutral, or contradiction relation.
330 | 
331 |    ```
332 |    Data size: train (391,782), matched (12,426), mismatched (13,880)
333 |    Example:
334 |    {"sentence1": "新的权利已经足够好了", "sentence2": "每个人都很喜欢最新的福利", "label": "neutral"}
335 |    Each record has three fields, in order: sentence 1, sentence 2, and an entailment label with three possible values: neutral, entailment, contradiction.
336 |    ```
337 | 
338 | 5.
**COPA Choice of Plausible Alternatives, Chinese Version (causal inference)**
339 | 
340 |    A natural language inference dataset: given a premise and a question indicating whether to look for the cause or the effect, choose the more plausible of two alternatives. Following the original dataset, we use accuracy as the evaluation metric.
341 | 
342 |    ```
343 |    Data size: train (400), dev (100), test (500)
344 |    Example:
345 |    {"idx": 7, "premise": "那人在杂货店买东西时打折了。", "choice0": "他向收银员打招呼。", "choice1": "他用了一张优惠券。", "question": "cause", "label": 1}
346 |    In the label annotation, 0 denotes choice0 and 1 denotes choice1. The original COPA dataset is in English; this dataset was produced by machine and human translation, with slight adjustments to usage and with annotation adapted to Chinese conventions.
347 |    ```
348 | 
349 | 6. **WSC The Winograd Schema Challenge, Chinese Version**
350 | 
351 |    The Winograd Schema Challenge is a variant of the Turing test intended to assess an AI system's common-sense reasoning. A participating program must answer a special but simply phrased kind of common-sense question: pronoun disambiguation, i.e., deciding whether a given noun and pronoun refer to the same thing.
352 | 
353 |    ```
354 |    Data size: train (532), dev (104), test (143)
355 |    Example:
356 |    {"target":
357 |        {"span2_index": 28,
358 |         "span1_index": 0,
359 |         "span1_text": "马克",
360 |         "span2_text": "他"
361 |        },
362 |    "idx": 0,
363 |    "label": "false",
364 |    "text": "马克告诉皮特许多关于他自己的谎言,皮特也把这些谎言写进了他的书里。他应该多怀疑。"
365 |    }
366 |    In the label field, true means the noun and pronoun co-refer and false means they do not.
367 |    ```
368 | 
369 | 7. **CSL Keyword Recognition**
370 | 
371 |    A Chinese scientific literature dataset containing abstracts of core Chinese papers together with their keywords. Fake keywords generated with tf-idf are mixed with the papers' real keywords to form abstract-keyword pairs; a pair whose keywords include fabricated ones is labeled 0.
372 | 
373 |    ```
374 |    Data size: train (20,000), dev (3,000), test (3,000)
375 |    Example:
376 |    {"id": 1, "abst": "为解决传统均匀FFT波束形成算法引起的3维声呐成像分辨率降低的问题,该文提出分区域FFT波束形成算法.远场条件下,以保证成像分辨率为约束条件,以划分数量最少为目标,采用遗传算法作为优化手段将成像区域划分为多个区域.在每个区域内选取一个波束方向,获得每一个接收阵元收到该方向回波时的解调输出,以此为原始数据在该区域内进行传统均匀FFT波束形成.对FFT计算过程进行优化,降低新算法的计算量,使其满足3维成像声呐实时性的要求.仿真与实验结果表明,采用分区域FFT波束形成算法的成像分辨率较传统均匀FFT波束形成算法有显著提高,且满足实时性要求.", "keyword": ["水声学", "FFT", "波束形成", "3维成像声呐"], "label": "1"}
377 |    Each record has four fields, in order: data ID, paper abstract, keywords, and a genuine/fake label.
378 |    ```
379 | 
380 | #### ChineseGLUE Tasks
381 | 
382 | Reference: https://github.com/ChineseGLUE/ChineseGLUE
383 | 
384 | 1. **LCQMC Semantic Similarity Task (colloquial descriptions)**
385 | 
386 |    The input is two sentences and the output is 0 or 1, where 0 means semantically dissimilar and 1 means semantically similar.
387 | 
388 |    ```
389 |    Data size: train (238,766), dev (8,802), test (12,500)
390 |    Example:
391 |    1.聊天室都有哪些好的 [分隔符] 聊天室哪个好 [分隔符] 1
392 |    2.飞行员没钱买房怎么办? [分隔符] 父母没钱买房子 [分隔符] 0
393 |    ```
394 | 
395 | 2. **XNLI Natural Language Inference**
396 | 
397 |    A cross-lingual understanding dataset: given a premise and a hypothesis, judge whether the hypothesis stands in an entailment, contradiction, or neutral relation to the premise.
398 | 
399 |    ```
400 |    Data size: train (392,703), dev (2,491), test (5,011)
401 |    Example:
402 |    1.从 概念 上 看 , 奶油 收入 有 两 个 基本 方面 产品 和 地理 .[分隔符] 产品 和 地理 是 什么 使 奶油 抹 霜 工作 . [分隔符] neutral
403 |    2.我们 的 一个 号码 会 非常 详细 地 执行 你 的 指示 [分隔符] 我 团队 的 一个 成员 将 非常 精确 地 执行 你 的 命令 [分隔符] entailment
404 | 
405 |    The original XNLI covers 15 languages (including low-resource ones). We take the Chinese portion and convert its format so that it can go straight into the training and test stages.
406 |    ```
407 | 
408 | 3. **INEWS Sentiment Analysis for Internet News**
409 | 
410 |    ```
411 |    Data size: train (5,356), dev (1,000), test (1,000)
412 |    Example:
413 |    1_!_00005a3efe934a19adc0b69b05faeae7_!_九江办好人民满意教育_!_近3年来,九江市紧紧围绕“人本教育、公平教育、优质教育、幸福教育”的目标,努力办好人民满意教育,促进了义务教育均衡发展,农村贫困地区办学条件改善。目前,该市特色教育学校有70所 ......
414 |    Each line is one record with 4 fields separated by _!_, in order: sentiment category, data id, news title, news content.
415 |    ```
416 | 
417 | 4. **BQ Question Matching for Customer Service**
418 | 
419 |    This dataset is a corpus for automatic question-answering systems, with 120,000 sentence pairs annotated with a similarity value of 0 or 1 (0 means dissimilar, 1 means similar). The data contain typos and non-standard grammar, which makes them closer to industrial scenarios.
420 | 
421 |    ```
422 |    Data size: train (100,000), dev (10,000), test (10,000)
423 |    Example:
424 |    1.我存钱还不扣的 [分隔符] 借了每天都要还利息吗 [分隔符] 0
425 |    2.为什么我的还没有额度 [分隔符] 为啥没有额度!! [分隔符] 1
426 |    ```
427 | 
428 | 5. **THUCNEWS Long Text Classification**
429 | 
430 |    This dataset contains more than 40,000 labeled Chinese news long texts in 14 categories: "体育": 0, "娱乐": 1, "家居": 2, "彩票": 3, "房产": 4, "教育": 5, "时尚": 6, "时政": 7, "星座": 8, "游戏": 9, "社会": 10, "科技": 11, "股票": 12, "财经": 13.
431 | 
432 |    ```
433 |    Data size: train (33,437), dev (4,180), test (4,180)
434 |    Example:
435 |    11_!_科技_!_493337.txt_!_爱国者A-Touch MK3533高清播放器试用  爱国者MP5简介:  "爱国者"北京华旗资讯,作为国内知名数码产品制>造商。1993年创立于北京中关村,是一家致力于......
436 |    Each line is one record with 4 fields separated by _!_, in order: category ID, category name, text ID, text content.
437 |    ```
438 | 
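The `_!_`-separated datasets above (INEWS, THUCNEWS) map directly onto the `user_tasks` column settings described earlier; a sketch following the INEWS field order shown above (the column choice for text_a/text_b is an illustrative assumption):

```python
# INEWS-style line: sentiment-label _!_ id _!_ title _!_ content
configs["delimiter"] = "_!_"
configs["file_type"] = "txt"
configs["label_column"] = 0    # sentiment category
configs["text_a_column"] = 2   # news title
configs["text_b_column"] = 3   # news content
```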
439 | ### Reading Comprehension Tasks
440 | 
441 | Coming soon.
442 | 
443 | ### Named Entity Recognition Tasks
444 | 
445 | Coming soon.
446 | 
447 | Platform: UNKNOWN
448 | Classifier: Programming Language :: Python :: 3
449 | Classifier: License :: OSI Approved :: MIT License
450 | Classifier: Operating System :: OS Independent
451 | Description-Content-Type: text/markdown
452 | 
-------------------------------------------------------------------------------- /PyCLUE.egg-info/SOURCES.txt: --------------------------------------------------------------------------------
1 | LICENSE
2 | MANIFEST.in
3 | README.md
4 | requirements.txt
5 | setup.py
6 | upload.sh
7-162 | .git/* (about 150 internal version-control files swept in by MANIFEST.in's "global-include *"; omitted)
163 | PyCLUE.egg-info/PKG-INFO
164 | PyCLUE.egg-info/SOURCES.txt
165 | PyCLUE.egg-info/dependency_links.txt
166 | PyCLUE.egg-info/requires.txt
167 | PyCLUE.egg-info/top_level.txt
168 | PyCLUE/tasks/__init__.py
169 | PyCLUE/tasks/run_classifier.py
170 | PyCLUE/tasks/run_cmrc_drcd.py
171 | PyCLUE/tasks/run_ner.py
172 | PyCLUE/tasks/run_squad.py
173 | PyCLUE/utils/__init__.py
174 | PyCLUE/utils/file_utils.py
175 | PyCLUE/utils/classifier_utils/__init__.py
176 | PyCLUE/utils/classifier_utils/bert_utils.py
177 | PyCLUE/utils/classifier_utils/core.py
178 | PyCLUE/utils/classifier_utils/modeling.py
179 | PyCLUE/utils/classifier_utils/optimization_finetuning.py
180 | PyCLUE/utils/classifier_utils/tokenization.py
181 | PyCLUE/utils/configs/__init__.py
182 | PyCLUE/utils/configs/data_configs.py
183 | PyCLUE/utils/configs/model_configs.py
184 | examples/classifications/run_clue_task.py
185 | examples/classifications/run_clue_task_tpu.py
186 | examples/classifications/run_user_task.py
187 | examples/classifications/run_user_task_tpu.py
-------------------------------------------------------------------------------- /PyCLUE.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
-------------------------------------------------------------------------------- /PyCLUE.egg-info/requires.txt: --------------------------------------------------------------------------------
1 | tensorflow
2 | requests
3 | numpy
4 | 
-------------------------------------------------------------------------------- /PyCLUE.egg-info/top_level.txt: --------------------------------------------------------------------------------
1 | 
2 | 
-------------------------------------------------------------------------------- /PyCLUE/tasks/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/tasks/__init__.py
-------------------------------------------------------------------------------- /PyCLUE/tasks/run_classifier.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Liu Shaoweihua
3 | # @Date: 2019-12-04
4 | 
5 | 
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 | 
10 | import os
11 | from ..utils.classifier_utils.core import run_classifier, TaskConfigs, UserConfigs, ClassificationProcessor, PRETRAINED_LM_DICT
12 | from ..utils.classifier_utils.core import default_configs as configs
13 | from ..utils.file_utils import wget, unzip, rm, mkdir, rmdir, mv
14 | from ..utils.configs.data_configs import DATA_URLS, DATA_PROCESSORS
15 | from ..utils.configs.model_configs import PRETRAINED_LM_URLS
16 | 
17 | 
18 | _CWD = os.path.dirname(__file__)
19 | DATA_DIR = os.path.abspath(os.path.join(_CWD, "../datasets"))
20 | PRETRAINED_LM_DIR = os.path.abspath(os.path.join(_CWD, "../pretrained_lm"))
21 | 
22 | 
23 | def clue_tasks(configs):
24 |     ##########################################################################################################
25 |     # download and unzip dataset and pretrained language model
26 |     ##########################################################################################################
27 |     configs = TaskConfigs(configs)
28 |     if configs.task_name not in DATA_URLS:
29 |         raise ValueError(
30 |             "Unsupported task: %s" % configs.task_name)
31 |     if configs.pretrained_lm_name not in PRETRAINED_LM_URLS:
32 |         raise ValueError(
33 |             "Unsupported pretrained language model: %s" % configs.pretrained_lm_name)
34 |     processor = DATA_PROCESSORS.get(configs.task_name)
35 | 
36 |     if not os.path.exists(DATA_DIR):
37 |         mkdir(DATA_DIR)
38 |     if not os.path.exists(PRETRAINED_LM_DIR):
39 |         mkdir(PRETRAINED_LM_DIR)
40 | 
41 |     data_dir = os.path.join(DATA_DIR, configs.task_name)
42 |     pretrained_lm_dir = os.path.join(PRETRAINED_LM_DIR, configs.pretrained_lm_name)
43 | 
44 |     if not os.path.exists(data_dir):
45 |         data_zip = wget(
46 |             url=DATA_URLS.get(configs.task_name),
47 |             save_path=DATA_DIR,
48 |             rename=configs.task_name+".zip")
49 |         unzip(file_path=data_zip)
50 |         rm(data_zip)
51 |         if not os.path.exists(data_dir):
52 |             mkdir(data_dir)
53 |             for item in os.listdir(DATA_DIR):
54 |                 if "train" in item:
55 |                     mv(os.path.join(DATA_DIR, item), os.path.join(data_dir, item))
56 |                 if "test" in item:
57 |                     mv(os.path.join(DATA_DIR, item), os.path.join(data_dir, item))
58 |                 if "dev" in item:
59 |                     mv(os.path.join(DATA_DIR, item), os.path.join(data_dir, item))
60 |                 if "label" in item:
61 |                     mv(os.path.join(DATA_DIR, item), os.path.join(data_dir, item))
print("[saved] data saved at: %s" 63 | % data_dir) 64 | else: 65 | print("[exists] data already exists: %s" 66 | % data_dir) 67 | 68 | if not os.path.exists(pretrained_lm_dir): 69 | mkdir(pretrained_lm_dir) 70 | pretrained_lm_zip = wget( 71 | url=PRETRAINED_LM_URLS.get(configs.pretrained_lm_name), 72 | save_path=pretrained_lm_dir, 73 | rename=configs.pretrained_lm_name+".zip") 74 | unzip(file_path=pretrained_lm_zip) 75 | print("[saved] pretrained language model saved at: %s" 76 | % os.path.join(pretrained_lm_dir, PRETRAINED_LM_DICT.get(configs.pretrained_lm_name))) 77 | rm(pretrained_lm_zip) 78 | else: 79 | print("[exists] pretrained language model already exists: %s" 80 | % pretrained_lm_dir) 81 | 82 | ########################################################################################################## 83 | # run classifier 84 | ########################################################################################################## 85 | if not os.path.exists(configs.output_dir): 86 | os.makedirs(configs.output_dir) 87 | result_res = run_classifier(processor, configs) 88 | return result_res 89 | 90 | 91 | def user_tasks(configs): 92 | ########################################################################################################## 93 | # download and unzip dataset and pretrained language model 94 | ########################################################################################################## 95 | configs = UserConfigs(configs) 96 | processor = ClassificationProcessor(configs.labels, 97 | configs.label_column, 98 | configs.text_a_column, 99 | configs.text_b_column, 100 | configs.ignore_header, 101 | configs.min_seq_length, 102 | configs.file_type, 103 | configs.delimiter) 104 | if configs.pretrained_lm_name != "user_defined_pretrained_lm": 105 | if configs.pretrained_lm_name not in PRETRAINED_LM_URLS: 106 | raise ValueError( 107 | "Not support pretrained language model: %s" % configs.pretrained_lm_name) 108 | if not os.path.exists(PRETRAINED_LM_DIR): 109 | mkdir(PRETRAINED_LM_DIR) 110 | pretrained_lm_dir = os.path.join(PRETRAINED_LM_DIR, configs.pretrained_lm_name) 111 | if not os.path.exists(pretrained_lm_dir): 112 | mkdir(pretrained_lm_dir) 113 | pretrained_lm_zip = wget( 114 | url=PRETRAINED_LM_URLS.get(configs.pretrained_lm_name), 115 | save_path=pretrained_lm_dir, 116 | rename=configs.pretrained_lm_name+".zip") 117 | unzip(file_path=pretrained_lm_zip) 118 | print("[saved] pretrained language model saved at: %s" 119 | % os.path.exists(os.path.join(pretrained_lm_dir, PRETRAINED_LM_DICT.get(configs.pretrained_lm_name)))) 120 | rm(pretrained_lm_zip) 121 | else: 122 | print("[exists] pretrained language model already exists: %s" 123 | % pretrained_lm_dir) 124 | else: 125 | # TODO: should consider some other cases 126 | if "albert" in configs.init_checkpoint.lower(): 127 | configs.pretrained_lm_name = "albert" 128 | else: 129 | configs.pretrained_lm_name = "bert" 130 | if not os.path.exists(configs.output_dir): 131 | os.makedirs(configs.output_dir) 132 | result_res = run_classifier(processor, configs) 133 | return result_res -------------------------------------------------------------------------------- /PyCLUE/tasks/run_cmrc_drcd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/tasks/run_cmrc_drcd.py -------------------------------------------------------------------------------- /PyCLUE/tasks/run_ner.py: 
-------------------------------------------------------------------------------- /PyCLUE/tasks/run_cmrc_drcd.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/tasks/run_cmrc_drcd.py
-------------------------------------------------------------------------------- /PyCLUE/tasks/run_ner.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/tasks/run_ner.py
-------------------------------------------------------------------------------- /PyCLUE/tasks/run_squad.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/tasks/run_squad.py
-------------------------------------------------------------------------------- /PyCLUE/utils/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/utils/__init__.py
-------------------------------------------------------------------------------- /PyCLUE/utils/classifier_utils/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/utils/classifier_utils/__init__.py
-------------------------------------------------------------------------------- /PyCLUE/utils/classifier_utils/bert_utils.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Liu Shaoweihua
3 | # @Date: 2019-11-18
4 | 
5 | # Copyright 2018 The Google AI Language Team Authors.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | 
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 | 
24 | 
25 | import re
26 | import six
27 | import copy
28 | import json
29 | import math
30 | import collections
31 | import tensorflow as tf
32 | 
33 | 
34 | def get_shape_list(tensor, expected_rank=None, name=None):
35 |     """Returns a list of the shape of tensor, preferring static dimensions.
36 | 
37 |     Args:
38 |         tensor: A tf.Tensor object to find the shape of.
39 |         expected_rank: (optional) int. The expected rank of `tensor`. If this is
40 |             specified and the `tensor` has a different rank, an exception will be
41 |             thrown.
42 |         name: Optional name of the tensor for the error message.
43 | 
44 |     Returns:
45 |         A list of dimensions of the shape of tensor. All static dimensions will
46 |         be returned as python integers, and dynamic dimensions will be returned
47 |         as tf.Tensor scalars.
48 | """ 49 | if name is None: 50 | name = tensor.name 51 | 52 | if expected_rank is not None: 53 | assert_rank(tensor, expected_rank, name) 54 | 55 | shape = tensor.shape.as_list() 56 | 57 | non_static_indexes = [] 58 | for (index, dim) in enumerate(shape): 59 | if dim is None: 60 | non_static_indexes.append(index) 61 | 62 | if not non_static_indexes: 63 | return shape 64 | 65 | dyn_shape = tf.shape(tensor) 66 | for index in non_static_indexes: 67 | shape[index] = dyn_shape[index] 68 | return shape 69 | 70 | def reshape_to_matrix(input_tensor): 71 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 72 | ndims = input_tensor.shape.ndims 73 | if ndims < 2: 74 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 75 | (input_tensor.shape)) 76 | if ndims == 2: 77 | return input_tensor 78 | 79 | width = input_tensor.shape[-1] 80 | output_tensor = tf.reshape(input_tensor, [-1, width]) 81 | return output_tensor 82 | 83 | def reshape_from_matrix(output_tensor, orig_shape_list): 84 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 85 | if len(orig_shape_list) == 2: 86 | return output_tensor 87 | 88 | output_shape = get_shape_list(output_tensor) 89 | 90 | orig_dims = orig_shape_list[0:-1] 91 | width = output_shape[-1] 92 | 93 | return tf.reshape(output_tensor, orig_dims + [width]) 94 | 95 | def assert_rank(tensor, expected_rank, name=None): 96 | """Raises an exception if the tensor rank is not of the expected rank. 97 | 98 | Args: 99 | tensor: A tf.Tensor to check the rank of. 100 | expected_rank: Python integer or list of integers, expected rank. 101 | name: Optional name of the tensor for the error message. 102 | 103 | Raises: 104 | ValueError: If the expected shape doesn't match the actual shape. 105 | """ 106 | if name is None: 107 | name = tensor.name 108 | 109 | expected_rank_dict = {} 110 | if isinstance(expected_rank, six.integer_types): 111 | expected_rank_dict[expected_rank] = True 112 | else: 113 | for x in expected_rank: 114 | expected_rank_dict[x] = True 115 | 116 | actual_rank = tensor.shape.ndims 117 | if actual_rank not in expected_rank_dict: 118 | scope_name = tf.get_variable_scope().name 119 | raise ValueError( 120 | "For the tensor `%s` in scope `%s`, the actual rank " 121 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 122 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 123 | 124 | def gather_indexes(sequence_tensor, positions): 125 | """Gathers the vectors at the specific positions over a minibatch.""" 126 | sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) 127 | batch_size = sequence_shape[0] 128 | seq_length = sequence_shape[1] 129 | width = sequence_shape[2] 130 | 131 | flat_offsets = tf.reshape( 132 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 133 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 134 | flat_sequence_tensor = tf.reshape(sequence_tensor, 135 | [batch_size * seq_length, width]) 136 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 137 | return output_tensor 138 | 139 | # add sequence mask for: 140 | # 1. random shuffle lm modeling---xlnet with random shuffled input 141 | # 2. left2right and right2left language modeling 142 | # 3. 
139 | # Add a sequence mask for:
140 | # 1. randomly shuffled LM modeling (XLNet with randomly shuffled input)
141 | # 2. left-to-right and right-to-left language modeling
142 | # 3. conditional generation
143 | def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kwargs):
144 |     if seq_type == 'seq2seq':
145 |         if mask_sequence is not None:
146 |             seq_shape = get_shape_list(mask_sequence, expected_rank=2)
147 |             seq_len = seq_shape[1]
148 |             ones = tf.ones((1, seq_len, seq_len))
149 |             a_mask = tf.matrix_band_part(ones, -1, 0)
150 |             s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2)
151 |             s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3)
152 |             a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask
153 |             # generate mask of batch x seq_len x seq_len
154 |             a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len))
155 |             out_mask = attention_mask * a_mask
156 |         else:
157 |             ones = tf.ones_like(attention_mask[:1])
158 |             mask = (tf.matrix_band_part(ones, -1, 0))
159 |             out_mask = attention_mask * mask
160 |     else:
161 |         out_mask = attention_mask
162 | 
163 |     return out_mask
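if __name__ == "__main__":
    # Usage sketch (not part of the original file): with seq_type="seq2seq"
    # and no mask_sequence, every position may attend only to itself and to
    # earlier positions, i.e. a lower-triangular (causal) band:
    #   [[1., 0., 0.],
    #    [1., 1., 0.],
    #    [1., 1., 1.]]
    _attention_mask = tf.ones((2, 3, 3))
    _causal_mask = generate_seq2seq_mask(_attention_mask, None, "seq2seq")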
-------------------------------------------------------------------------------- /PyCLUE/utils/classifier_utils/core.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Liu Shaoweihua
3 | # @Date: 2019-12-04
4 | 
5 | # Copyright 2018 The Google AI Language Team Authors.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | """BERT finetuning runner."""
19 | 
20 | 
21 | from __future__ import absolute_import
22 | from __future__ import division
23 | from __future__ import print_function
24 | 
25 | 
26 | import os
27 | import json
28 | import collections
29 | import numpy as np
30 | import tensorflow as tf
31 | from . import tokenization, modeling
32 | from . import optimization_finetuning as optimization
33 | from ..configs.model_configs import PRETRAINED_LM_DICT, PRETRAINED_LM_CONFIG, PRETRAINED_LM_CKPT
34 | 
35 | 
36 | _CWD = os.path.dirname(__file__)
37 | DATA_DIR = os.path.abspath(os.path.join(_CWD, "../../datasets"))
38 | OUTPUT_DIR = os.path.abspath(os.path.join(_CWD, "../../task_outputs/classifications"))
39 | PRETRAINED_LM_DIR = os.path.abspath(os.path.join(_CWD, "../../pretrained_lm"))
40 | 
41 | 
42 | __all__ = [
43 |     "TaskConfigs", "UserConfigs", "InputExample", "PaddingInputExample", "InputFeatures",
44 |     "DataProcessor", "ClassificationProcessor", "convert_single_example",
45 |     "file_based_input_fn_builder", "create_model", "model_fn_builder", "run_classifier"
46 | ]
47 | 
48 | 
49 | default_configs = {
50 |     "task_name": None,
51 |     "pretrained_lm_name": None,
52 |     "do_train": False,
53 |     "do_eval": False,
54 |     "do_predict": False,
55 |     "data_dir": None,
56 |     "output_dir": None,
57 |     "vocab_file": None,
58 |     "bert_config_file": None,
59 |     "init_checkpoint": None,
60 |     "do_lower_case": True,
61 |     "max_seq_length": 128,
62 |     "train_batch_size": 32,
63 |     "eval_batch_size": 8,
64 |     "predict_batch_size": 8,
65 |     "learning_rate": 5e-5,
66 |     "num_train_epochs": 3.0,
67 |     "warmup_proportion": 0.1,
68 |     "save_checkpoints_steps": 1000,
69 |     "iterations_per_loop": 1000,
70 |     "use_tpu": False,
71 |     "tpu_name": None,
72 |     "tpu_zone": None,
73 |     "gcp_project": None,
74 |     "master": None,
75 |     "num_tpu_cores": 8,
76 |     "verbose": 0
77 | }
78 | 
79 | 
80 | class TaskConfigs(object):
81 | 
82 |     def __init__(self, configs):
83 |         # fall back to the defaults before lower-casing: both values may be None
84 |         self.task_name = (configs.get("task_name") or "user_defined_task").lower()
85 |         self.pretrained_lm_name = (configs.get("pretrained_lm_name") or "user_defined_pretrained_lm").lower()
86 |         self.do_train = configs.get("do_train")
87 |         self.do_eval = configs.get("do_eval")
88 |         self.do_predict = configs.get("do_predict")
89 |         self.data_dir = configs.get("data_dir") or os.path.join(DATA_DIR, self.task_name)
90 |         self.output_dir = configs.get("output_dir") or os.path.join(OUTPUT_DIR, self.task_name, self.pretrained_lm_name)
91 |         self.vocab_file = configs.get("vocab_file") or os.path.join(PRETRAINED_LM_DIR, self.pretrained_lm_name, PRETRAINED_LM_DICT.get(self.pretrained_lm_name), "vocab.txt")
92 |         self.bert_config_file = configs.get("bert_config_file") or os.path.join(PRETRAINED_LM_DIR, self.pretrained_lm_name, PRETRAINED_LM_DICT.get(self.pretrained_lm_name), PRETRAINED_LM_CONFIG.get(self.pretrained_lm_name))
93 |         self.init_checkpoint = configs.get("init_checkpoint") or os.path.join(PRETRAINED_LM_DIR, self.pretrained_lm_name, PRETRAINED_LM_DICT.get(self.pretrained_lm_name), PRETRAINED_LM_CKPT.get(self.pretrained_lm_name))
94 |         self.do_lower_case = configs.get("do_lower_case")
95 |         self.max_seq_length = configs.get("max_seq_length")
96 |         self.train_batch_size = configs.get("train_batch_size")
97 |         self.eval_batch_size = configs.get("eval_batch_size")
98 |         self.predict_batch_size = configs.get("predict_batch_size")
99 |         self.learning_rate = configs.get("learning_rate")
100 |         self.num_train_epochs = configs.get("num_train_epochs")
101 |         self.warmup_proportion = configs.get("warmup_proportion")
102 |         self.save_checkpoints_steps = configs.get("save_checkpoints_steps")
103 |         self.iterations_per_loop = configs.get("iterations_per_loop")
104 |         self.use_tpu = configs.get("use_tpu")
105 |         self.tpu_name = configs.get("tpu_name")
106 |         self.tpu_zone = configs.get("tpu_zone")
107 |         self.gcp_project = configs.get("gcp_project")
108 |         self.master = configs.get("master")
configs.get("master") 108 | self.num_tpu_cores = configs.get("num_tpu_cores") 109 | self.verbose = configs.get("verbose") 110 | 111 | 112 | class UserConfigs(TaskConfigs): 113 | 114 | def __init__(self, configs): 115 | self.label_column = configs.get("label_column") 116 | self.text_a_column = configs.get("text_a_column") 117 | self.text_b_column = configs.get("text_b_column") 118 | self.delimiter = configs.get("delimiter") 119 | self.ignore_header = configs.get("ignore_header") 120 | self.min_seq_length = configs.get("min_seq_length") 121 | self.file_type = configs.get("file_type") 122 | super().__init__(configs) 123 | 124 | 125 | 126 | class InputExample(object): 127 | """A single training/test example for simple sequence classification.""" 128 | 129 | def __init__(self, guid, text_a, text_b=None, label=None): 130 | """Constructs a InputExample. 131 | Args: 132 | guid: Unique id for the example. 133 | text_a: string. The untokenized text of the first sequence. For single 134 | sequence tasks, only this sequence must be specified. 135 | text_b: (Optional) string. The untokenized text of the second sequence. 136 | Only must be specified for sequence pair tasks. 137 | label: (Optional) string. The label of the example. This should be 138 | specified for train and dev examples, but not for test examples. 139 | """ 140 | self.guid = guid 141 | self.text_a = text_a 142 | self.text_b = text_b 143 | self.label = label 144 | 145 | 146 | class PaddingInputExample(object): 147 | """Fake example so the num input examples is a multiple of the batch size. 148 | When running eval/predict on the TPU, we need to pad the number of examples 149 | to be a multiple of the batch size, because the TPU requires a fixed batch 150 | size. The alternative is to drop the last batch, which is bad because it means 151 | the entire output data won't be generated. 152 | We use this class instead of `None` because treating `None` as padding 153 | battches could cause silent errors. 
154 | """ 155 | 156 | 157 | class InputFeatures(object): 158 | """A single set of features of data""" 159 | def __init__(self, 160 | input_ids, 161 | input_mask, 162 | segment_ids, 163 | label_id, 164 | is_real_example=True): 165 | self.input_ids = input_ids 166 | self.input_mask = input_mask 167 | self.segment_ids = segment_ids 168 | self.label_id = label_id 169 | self.is_real_example = is_real_example 170 | 171 | 172 | class DataProcessor(object): 173 | """Base class for data converters for sequence classification data sets.""" 174 | 175 | def get_train_examples(self, data_dir): 176 | """Gets a collection of `InputExample`s for the train set.""" 177 | raise NotImplementedError() 178 | 179 | def get_dev_examples(self, data_dir): 180 | """Gets a collection of `InputExample`s for the dev set.""" 181 | raise NotImplementedError() 182 | 183 | def get_test_examples(self, data_dir): 184 | """Gets a collection of `InputExample`s for prediction.""" 185 | raise NotImplementedError() 186 | 187 | def get_labels(self): 188 | """Gets the list of labels for this data set.""" 189 | raise NotImplementedError() 190 | 191 | @classmethod 192 | def _read_file(cls, input_file, file_type, delimiter): 193 | """Reads files.""" 194 | with tf.gfile.Open(input_file, "r") as f: 195 | reader = f.readlines() 196 | lines = [] 197 | for line in reader: 198 | lines.append(line.strip()) 199 | if file_type == "json": 200 | lines = [json.loads(item) for item in lines] 201 | else: 202 | lines = [item.split(delimiter) for item in lines] 203 | return lines 204 | 205 | 206 | class ClassificationProcessor(DataProcessor): 207 | 208 | def __init__(self, labels, label_column, text_a_column, text_b_column=None, ignore_header=False, min_seq_length=None, file_type="json", delimiter=None): 209 | self.language = "zh" 210 | self.labels = labels 211 | self.label_column = label_column 212 | self.text_a_column = text_a_column 213 | self.text_b_column = text_b_column 214 | self.ignore_header = ignore_header 215 | self.min_seq_length = min_seq_length 216 | self.file_type = file_type 217 | self.delimiter = delimiter 218 | 219 | def get_train_examples(self, data_dir): 220 | """See base class.""" 221 | return self._create_examples( 222 | self._read_file(os.path.join(data_dir, "train."+self.file_type), self.file_type, delimiter=self.delimiter), "train" 223 | ) 224 | 225 | def get_dev_examples(self, data_dir): 226 | """See base class.""" 227 | return self._create_examples( 228 | self._read_file(os.path.join(data_dir, "dev."+self.file_type), self.file_type, delimiter=self.delimiter), "dev" 229 | ) 230 | 231 | def get_test_examples(self, data_dir): 232 | """See base class.""" 233 | return self._create_examples( 234 | self._read_file(os.path.join(data_dir, "test."+self.file_type), self.file_type, delimiter=self.delimiter), "test" 235 | ) 236 | 237 | def get_labels(self): 238 | """See base class.""" 239 | return self.labels 240 | 241 | def _create_examples(self, lines, set_type): 242 | """Creates examples for the training and dev sets.""" 243 | examples = [] 244 | if self.ignore_header: 245 | lines = lines[1:] 246 | if self.min_seq_length: 247 | lines = [line for line in lines if len(line) >= self.min_seq_length] 248 | for i, line in enumerate(lines): 249 | guid = "%s-%s" %(set_type, i) 250 | try: 251 | label = tokenization.convert_to_unicode(line[self.label_column]) if set_type != "test" else self.labels[0] 252 | text_a = tokenization.convert_to_unicode(line[self.text_a_column]) 253 | text_b = None if not self.text_b_column else 
253 |                 text_b = None if not self.text_b_column else tokenization.convert_to_unicode(line[self.text_b_column])
254 |                 examples.append(
255 |                     InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
256 |                 )
257 |             except Exception:
258 |                 print("### Error {}: {}".format(i, line))
259 |         return examples
260 | 
261 | 
262 | def convert_single_example(
263 |         ex_index, example, label_list, max_seq_length, tokenizer):
264 |     """Converts a single `InputExample` into a single `InputFeatures`."""
265 | 
266 |     if isinstance(example, PaddingInputExample):
267 |         return InputFeatures(
268 |             input_ids=[0]*max_seq_length,
269 |             input_mask=[0]*max_seq_length,
270 |             segment_ids=[0]*max_seq_length,
271 |             label_id=0,
272 |             is_real_example=False
273 |         )
274 | 
275 |     label_map = {}
276 |     for i, label in enumerate(label_list):
277 |         label_map[label] = i
278 | 
279 |     tokens_a = tokenizer.tokenize(example.text_a)
280 |     tokens_b = None
281 |     if example.text_b:
282 |         tokens_b = tokenizer.tokenize(example.text_b)
283 | 
284 |     if tokens_b:
285 |         # Modifies `tokens_a` and `tokens_b` in place so that the total
286 |         # length is less than the specified length.
287 |         # Account for [CLS], [SEP], [SEP] with "- 3"
288 |         _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
289 |     else:
290 |         # Account for [CLS] and [SEP] with "- 2"
291 |         if len(tokens_a) > max_seq_length - 2:
292 |             tokens_a = tokens_a[:max_seq_length - 2]
293 | 
294 |     # The convention in BERT is:
295 |     # (a) For sequence pairs:
296 |     #     tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
297 |     #     type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
298 |     # (b) For single sequences:
299 |     #     tokens:   [CLS] the dog is hairy . [SEP]
300 |     #     type_ids: 0     0   0   0  0     0 0
301 |     #
302 |     # Where "type_ids" are used to indicate whether this is the first
303 |     # sequence or the second sequence. The embedding vectors for `type=0` and
304 |     # `type=1` were learned during pre-training and are added to the wordpiece
305 |     # embedding vector (and position vector). This is not *strictly* necessary
306 |     # since the [SEP] token unambiguously separates the sequences, but it makes
307 |     # it easier for the model to learn the concept of sequences.
308 |     #
309 |     # For classification tasks, the first vector (corresponding to [CLS]) is
310 |     # used as the "sentence vector". Note that this only makes sense because
311 |     # the entire model is fine-tuned.
312 |     tokens = []
313 |     segment_ids = []
314 |     tokens.append("[CLS]")
315 |     segment_ids.append(0)
316 |     for token in tokens_a:
317 |         tokens.append(token)
318 |         segment_ids.append(0)
319 |     tokens.append("[SEP]")
320 |     segment_ids.append(0)
321 | 
322 |     if tokens_b:
323 |         for token in tokens_b:
324 |             tokens.append(token)
325 |             segment_ids.append(1)
326 |         tokens.append("[SEP]")
327 |         segment_ids.append(1)
328 | 
329 |     input_ids = tokenizer.convert_tokens_to_ids(tokens)
330 | 
331 |     # The mask has 1 for real tokens and 0 for padding tokens. Only real
332 |     # tokens are attended to.
333 |     input_mask = [1] * len(input_ids)
334 | 
335 |     # Zero-pad up to the sequence length.
336 |     while len(input_ids) < max_seq_length:
337 |         input_ids.append(0)
338 |         input_mask.append(0)
339 |         segment_ids.append(0)
340 | 
341 |     assert len(input_ids) == max_seq_length
342 |     assert len(input_mask) == max_seq_length
343 |     assert len(segment_ids) == max_seq_length
344 | 
345 |     label_id = label_map[example.label]
346 |     if ex_index < 5:
347 |         tf.logging.info("*** Example ***")
348 |         tf.logging.info("guid: %s" % (example.guid))
349 |         tf.logging.info("tokens: %s" % " ".join(
350 |             [tokenization.printable_text(x) for x in tokens]))
351 |         tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
352 |         tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
353 |         tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
354 |         tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
355 | 
356 |     feature = InputFeatures(
357 |         input_ids=input_ids,
358 |         input_mask=input_mask,
359 |         segment_ids=segment_ids,
360 |         label_id=label_id,
361 |         is_real_example=True
362 |     )
363 |     return feature
364 | 
365 | 
366 | def file_based_convert_examples_to_features(
367 |         examples, label_list, max_seq_length, tokenizer, output_file):
368 |     """Convert a set of `InputExample`s to a TFRecord file."""
369 | 
370 |     writer = tf.python_io.TFRecordWriter(output_file)
371 | 
372 |     for (ex_index, example) in enumerate(examples):
373 |         if ex_index % 10000 == 0:
374 |             tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
375 | 
376 |         feature = convert_single_example(ex_index, example, label_list,
377 |                                          max_seq_length, tokenizer)
378 | 
379 |         def create_int_feature(values):
380 |             f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
381 |             return f
382 | 
383 |         features = collections.OrderedDict()
384 |         features["input_ids"] = create_int_feature(feature.input_ids)
385 |         features["input_mask"] = create_int_feature(feature.input_mask)
386 |         features["segment_ids"] = create_int_feature(feature.segment_ids)
387 |         features["label_ids"] = create_int_feature([feature.label_id])
388 |         features["is_real_example"] = create_int_feature(
389 |             [int(feature.is_real_example)])
390 | 
391 |         tf_features = tf.train.Features(feature=features)
392 |         tf_example = tf.train.Example(features=tf_features)
393 |         writer.write(tf_example.SerializeToString())
394 | 
395 |     writer.close()
396 | 
397 | 
398 | def file_based_input_fn_builder(
399 |         input_file, seq_length, is_training, drop_remainder
400 | ):
401 |     """Creates an `input_fn` closure to be passed to TPUEstimator."""
402 | 
403 |     name_to_features = {
404 |         "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
405 |         "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
406 |         "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
407 |         "label_ids": tf.FixedLenFeature([], tf.int64),
408 |         "is_real_example": tf.FixedLenFeature([], tf.int64)
409 |     }
410 | 
411 |     def _decode_record(record, name_to_features):
412 |         """Decodes a record to a tensorflow example."""
413 |         examples = tf.parse_single_example(record, name_to_features)
414 | 
415 |         # tf.train.Example only supports tf.int64, but the TPU only supports tf.int32,
416 |         # so cast all tf.int64 to tf.int32
417 |         for name in list(examples.keys()):
418 |             t = examples[name]
419 |             if t.dtype == tf.int64:
420 |                 t = tf.to_int32(t)
421 |             examples[name] = t
422 |         return examples
423 | 
424 |     def input_fn(params):
425 |         """The actual input function."""
426 |         batch_size = params["batch_size"]
427 | 
428 |         # For training, we want a lot of parallel reading and shuffling.
shuffling. 429 | # For evaluation, we want no shuffling and parallel reading doesn't matter. 430 | d = tf.data.TFRecordDataset(input_file) 431 | if is_training: 432 | d = d.repeat() 433 | d = d.shuffle(buffer_size=100) 434 | 435 | d = d.apply( 436 | tf.contrib.data.map_and_batch( 437 | lambda record: _decode_record(record, name_to_features), 438 | batch_size=batch_size, 439 | drop_remainder=drop_remainder 440 | ) 441 | ) 442 | 443 | return d 444 | 445 | return input_fn 446 | 447 | 448 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 449 | """Truncates a sequence pair in place to the maximum length.""" 450 | 451 | # This is a simple heuristic which will always truncate the longer sequence 452 | # one token at a time. This makes more sense than truncating an equal percent 453 | # of tokens from each, since if one sequence is very short then each token 454 | # that's truncated likely contains more information than a longer sequence. 455 | while True: 456 | total_length = len(tokens_a) + len(tokens_b) 457 | if total_length <= max_length: 458 | break 459 | if len(tokens_a) > len(tokens_b): 460 | tokens_a.pop() 461 | else: 462 | tokens_b.pop() 463 | 464 | 465 | def create_model( 466 | model_type, bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): 467 | """Create a classification model.""" 468 | if model_type.startswith("bert") or model_type.startswith("roberta"): 469 | model = modeling.BertModel( 470 | config=bert_config, 471 | is_training=is_training, 472 | input_ids=input_ids, 473 | input_mask=input_mask, 474 | token_type_ids=segment_ids, 475 | use_one_hot_embeddings=use_one_hot_embeddings 476 | ) 477 | elif model_type.startswith("albert"): 478 | model = modeling.AlBertModel( 479 | config=bert_config, 480 | is_training=is_training, 481 | input_ids=input_ids, 482 | input_mask=input_mask, 483 | token_type_ids=segment_ids, 484 | use_one_hot_embeddings=use_one_hot_embeddings 485 | ) 486 | 487 | # In this demo, we are doing a simple classification task on the entire segment. 488 | # 489 | # If you want to use the token-level output, use model.get_sequence_output() instead. 490 | output_layer = model.get_pooled_output() 491 | 492 | hidden_size = output_layer.shape[-1].value 493 | 494 | output_weights = tf.get_variable( 495 | "output_weights", [num_labels, hidden_size], 496 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 497 | 498 | output_bias = tf.get_variable( 499 | "output_bias", [num_labels], 500 | initializer=tf.zeros_initializer()) 501 | 502 | with tf.variable_scope("loss"): 503 | if model_type.startswith("albert"): 504 | try: 505 | ln_type = bert_config.ln_type 506 | except: 507 | ln_type = None 508 | if ln_type == "preln": 509 | # Add by brightmart, 10-06. If it is preln, we need to add an additional 510 | # layer: layer normalization as suggested in paper "ON LAYER NORMALIZATION 511 | # IN THE TRANSFORMER ARCHITECTURE" 512 | print("ln_type is preln. 
add LN layer.")
513 |                 output_layer = layer_norm(output_layer)  # keep the normalized tensor (previously assigned to an unused variable)
514 |             else:
515 |                 print("ln_type is postln or other, do nothing.")
516 | 
517 |         if is_training:
518 |             # i.e. apply dropout with rate 0.1
519 |             output_layer = tf.nn.dropout(output_layer, rate=0.1)
520 | 
521 |         logits = tf.matmul(output_layer, output_weights, transpose_b=True)
522 |         logits = tf.nn.bias_add(logits, output_bias)
523 |         probabilities = tf.nn.softmax(logits, axis=-1)
524 |         log_probs = tf.nn.log_softmax(logits, axis=-1)
525 | 
526 |         one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
527 | 
528 |         per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
529 |                                           axis=-1)  # todo 08-29 try temp-loss
530 |         ############### bi_tempered_logistic_loss (kept for reference) ###############
531 |         # print("##cross entropy loss is used...."); tf.logging.info("##cross entropy loss is used....")
532 |         # t1 = 0.9   # t1=0.90
533 |         # t2 = 1.05  # t2=1.05
534 |         # per_example_loss = bi_tempered_logistic_loss(log_probs, one_hot_labels, t1, t2, label_smoothing=0.1, num_iters=5)  # TODO label_smoothing=0.0
535 |         # tf.logging.info("per_example_loss:" + str(per_example_loss.shape))
536 |         ###############################################################################
537 | 
538 |         loss = tf.reduce_mean(per_example_loss)
539 | 
540 |         return loss, per_example_loss, logits, probabilities
541 | 
542 | 
543 | def layer_norm(input_tensor, name=None):
544 |     """Run layer normalization on the last dimension of the tensor."""
545 |     return tf.contrib.layers.layer_norm(
546 |         inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
547 | 
548 | 
549 | def model_fn_builder(model_type, bert_config, num_labels, init_checkpoint, learning_rate,
550 |                      num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings):
551 |     """Returns `model_fn` closure for TPUEstimator."""
552 |     def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
553 |         """The `model_fn` for TPUEstimator."""
554 | 
555 |         tf.logging.info("*** Features ***")
556 |         for name in sorted(features.keys()):
557 |             tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
558 | 
559 |         input_ids = features["input_ids"]
560 |         input_mask = features["input_mask"]
561 |         segment_ids = features["segment_ids"]
562 |         label_ids = features["label_ids"]
563 |         is_real_example = None
564 |         if "is_real_example" in features:
565 |             is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
566 |         else:
567 |             is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
568 | 
569 |         is_training = (mode == tf.estimator.ModeKeys.TRAIN)
570 | 
571 |         total_loss, per_example_loss, logits, probabilities = create_model(
572 |             model_type, bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings)
573 | 
574 |         tvars = tf.trainable_variables()
575 |         initialized_variable_names = {}
576 |         scaffold_fn = None
577 |         if init_checkpoint:
578 |             assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
579 |             if use_tpu:
580 |                 def tpu_scaffold():
581 |                     tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
582 |                     return tf.train.Scaffold()
583 |                 scaffold_fn = tpu_scaffold
584 |             else:
585 |                 tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
586 |         tf.logging.info("**** Trainable Variables ****")
587 |         for var in tvars:
588 |             init_string = ""
589 |             if var.name in initialized_variable_names:
590 |                 init_string
= ", *INIT_FROM_CKPT*" 591 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) 592 | 593 | output_spec = None 594 | if mode == tf.estimator.ModeKeys.TRAIN: 595 | train_op = optimization.create_optimizer( 596 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 597 | 598 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 599 | mode=mode, 600 | loss=total_loss, 601 | train_op=train_op, 602 | scaffold_fn=scaffold_fn) 603 | elif mode == tf.estimator.ModeKeys.EVAL: 604 | def metric_fn(per_example_loss, label_ids, logits, is_real_example): 605 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 606 | accuracy = tf.metrics.accuracy( 607 | labels=label_ids, predictions=predictions, weights=is_real_example) 608 | loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) 609 | return { 610 | "eval_accuracy": accuracy, 611 | "eval_loss": loss, 612 | } 613 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits, is_real_example]) 614 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 615 | mode=mode, 616 | loss=total_loss, 617 | eval_metrics=eval_metrics, 618 | scaffold_fn=scaffold_fn) 619 | else: 620 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 621 | mode=mode, 622 | predictions={"probabilities": probabilities}, 623 | scaffold_fn=scaffold_fn) 624 | return output_spec 625 | 626 | return model_fn 627 | 628 | 629 | def run_classifier(processor, configs): 630 | if configs.verbose == 0: 631 | tf.logging.set_verbosity(tf.logging.ERROR) 632 | else: 633 | tf.logging.set_verbosity(tf.logging.INFO) 634 | 635 | tokenization.validate_case_matches_checkpoint(configs.do_lower_case, configs.init_checkpoint) 636 | 637 | if not configs.do_train and not configs.do_eval and not configs.do_predict: 638 | raise ValueError( 639 | "At least one of `do_train`, `do_eval` or `do_predict' must be True.") 640 | 641 | bert_config = modeling.BertConfig.from_json_file(configs.bert_config_file) 642 | 643 | if configs.max_seq_length > bert_config.max_position_embeddings: 644 | raise ValueError( 645 | "Cannot use sequence length %d because the BERT model " 646 | "was only trained up to sequence length %d" % 647 | (configs.max_seq_length, bert_config.max_position_embeddings)) 648 | 649 | tf.gfile.MakeDirs(configs.output_dir) 650 | 651 | task_name = configs.task_name.lower() 652 | 653 | label_list = processor.get_labels() 654 | 655 | tokenizer = tokenization.FullTokenizer( 656 | vocab_file=configs.vocab_file, do_lower_case=configs.do_lower_case) 657 | 658 | tpu_cluster_resolver = None 659 | if configs.use_tpu and configs.tpu_name: 660 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 661 | configs.tpu_name, zone=configs.tpu_zone, project=configs.gcp_project) 662 | 663 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 664 | # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu. 
665 | print("[tpu] tpu cluster resolver:", tpu_cluster_resolver) 666 | run_config = tf.contrib.tpu.RunConfig( 667 | cluster=tpu_cluster_resolver, 668 | master=configs.master, 669 | model_dir=configs.output_dir, 670 | save_checkpoints_steps=configs.save_checkpoints_steps, 671 | tpu_config=tf.contrib.tpu.TPUConfig( 672 | iterations_per_loop=configs.iterations_per_loop, 673 | num_shards=configs.num_tpu_cores, 674 | per_host_input_for_training=is_per_host)) 675 | 676 | train_examples = None 677 | num_train_steps = None 678 | num_warmup_steps = None 679 | if configs.do_train: 680 | train_examples = processor.get_train_examples(configs.data_dir) 681 | print("[train] length of total train_examples:", len(train_examples)) 682 | num_train_steps = int(len(train_examples) / configs.train_batch_size * configs.num_train_epochs) 683 | num_warmup_steps = int(num_train_steps * configs.warmup_proportion) 684 | 685 | model_fn = model_fn_builder( 686 | model_type=configs.pretrained_lm_name, 687 | bert_config=bert_config, 688 | num_labels=len(label_list), 689 | init_checkpoint=configs.init_checkpoint, 690 | learning_rate=configs.learning_rate, 691 | num_train_steps=num_train_steps, 692 | num_warmup_steps=num_warmup_steps, 693 | use_tpu=configs.use_tpu, 694 | use_one_hot_embeddings=configs.use_tpu) 695 | 696 | # If TPU is not available, this will fall back to normal Estimator on CPU 697 | # or GPU. 698 | estimator = tf.contrib.tpu.TPUEstimator( 699 | use_tpu=configs.use_tpu, 700 | model_fn=model_fn, 701 | config=run_config, 702 | train_batch_size=configs.train_batch_size, 703 | eval_batch_size=configs.eval_batch_size, 704 | predict_batch_size=configs.predict_batch_size) 705 | 706 | if configs.do_train: 707 | train_file = os.path.join(configs.output_dir, "train.tf_record") 708 | train_file_exists = os.path.exists(train_file) 709 | print("[train] train file exists:", train_file_exists) 710 | print("[train] train file path:", train_file) 711 | if not train_file_exists: # if tf_record file not exist, convert from raw text file. # TODO 712 | file_based_convert_examples_to_features( 713 | train_examples, label_list, configs.max_seq_length, tokenizer, train_file) 714 | tf.logging.info("***** Running training *****") 715 | tf.logging.info(" Num examples = %d", len(train_examples)) 716 | tf.logging.info(" Batch size = %d", configs.train_batch_size) 717 | tf.logging.info(" Num steps = %d", num_train_steps) 718 | train_input_fn = file_based_input_fn_builder( 719 | input_file=train_file, 720 | seq_length=configs.max_seq_length, 721 | is_training=True, 722 | drop_remainder=True) 723 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 724 | 725 | if configs.do_eval: 726 | # dev dataset 727 | dev_examples = processor.get_dev_examples(configs.data_dir) 728 | num_actual_dev_examples = len(dev_examples) 729 | if configs.use_tpu: 730 | # TPU requires a fixed batch size for all batches, therefore the number 731 | # of examples must be a multiple of the batch size, or else examples 732 | # will get dropped. So we pad with fake examples which are ignored 733 | # later on. These do NOT count towards the metric (all tf.metrics 734 | # support a per-instance weight, and these get a weight of 0.0). 
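            # (Editor's note) Worked example with made-up sizes: for 1003 dev
            # examples and eval_batch_size=8, 1003 % 8 == 3, so five fake
            # PaddingInputExamples are appended and eval runs over 1008
            # examples (126 full batches); the five fakes carry
            # is_real_example=0 and receive metric weight 0.0.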
735 |             while len(dev_examples) % configs.eval_batch_size != 0:
736 |                 dev_examples.append(PaddingInputExample())
737 | 
738 |         eval_file = os.path.join(configs.output_dir, "dev.tf_record")
739 |         file_based_convert_examples_to_features(
740 |             dev_examples, label_list, configs.max_seq_length, tokenizer, eval_file)
741 | 
742 |         tf.logging.info("***** Running evaluation (dev set) *****")
743 |         tf.logging.info("  Num examples = %d (%d actual, %d padding)",
744 |                         len(dev_examples), num_actual_dev_examples,
745 |                         len(dev_examples) - num_actual_dev_examples)
746 |         tf.logging.info("  Batch size = %d", configs.eval_batch_size)
747 | 
748 |         # This tells the estimator to run through the entire set.
749 |         eval_steps = None
750 |         # However, if running eval on the TPU, you will need to specify the
751 |         # number of steps.
752 |         if configs.use_tpu:
753 |             assert len(dev_examples) % configs.eval_batch_size == 0
754 |             eval_steps = int(len(dev_examples) // configs.eval_batch_size)
755 | 
756 |         eval_drop_remainder = True if configs.use_tpu else False
757 |         eval_input_fn = file_based_input_fn_builder(
758 |             input_file=eval_file,
759 |             seq_length=configs.max_seq_length,
760 |             is_training=False,
761 |             drop_remainder=eval_drop_remainder)
762 | 
763 |         #######################################################################
764 |         # Evaluate all checkpoints; you can use the checkpoint with the best dev accuracy.
765 |         steps_and_files = []
766 |         filenames = tf.gfile.ListDirectory(configs.output_dir)
767 |         for filename in filenames:
768 |             if filename.endswith(".index"):
769 |                 ckpt_name = filename[:-6]
770 |                 cur_filename = os.path.join(configs.output_dir, ckpt_name)
771 |                 global_step = int(cur_filename.split("-")[-1])
772 |                 tf.logging.info("Add {} to eval list.".format(cur_filename))
773 |                 steps_and_files.append([global_step, cur_filename])
774 |         steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
775 | 
776 |         output_eval_file = os.path.join(configs.output_dir, "dev_results.txt")
777 |         print("[eval] dev result saved at:", output_eval_file)
778 |         tf.logging.info("dev_eval_file:" + output_eval_file)
779 |         with tf.gfile.GFile(output_eval_file, "w") as writer:
780 |             for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
781 |                 result = estimator.evaluate(input_fn=eval_input_fn,
782 |                                             steps=eval_steps, checkpoint_path=filename)
783 | 
784 |                 tf.logging.info("***** Eval results %s *****" % (filename))
785 |                 writer.write("***** Eval results %s *****\n" % (filename))
786 |                 for key in sorted(result.keys()):
787 |                     tf.logging.info("  %s = %s", key, str(result[key]))
788 |                     writer.write("%s = %s\n" % (key, str(result[key])))
789 |         #######################################################################
790 | 
791 |         # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
792 |         #
793 |         # output_eval_file = os.path.join(configs.output_dir, "dev_results.txt")
794 |         # with tf.gfile.GFile(output_eval_file, "w") as writer:
795 |         #     tf.logging.info("***** Eval results *****")
796 |         #     for key in sorted(result.keys()):
797 |         #         tf.logging.info("  %s = %s", key, str(result[key]))
798 |         #         writer.write("%s = %s\n" % (key, str(result[key])))
799 | 
800 |         # test dataset
801 |         test_examples = processor.get_test_examples(configs.data_dir)
802 |         num_actual_test_examples = len(test_examples)
803 |         if configs.use_tpu:
804 |             # TPU requires a fixed batch size for all batches, therefore the number
805 |             # of examples must be a multiple of the batch size, or else examples
806 |             # will get dropped. So we pad with fake examples which are ignored
807 |             # later on. These do NOT count towards the metric (all tf.metrics
808 |             # support a per-instance weight, and these get a weight of 0.0).
809 |             while len(test_examples) % configs.eval_batch_size != 0:
810 |                 test_examples.append(PaddingInputExample())
811 | 
812 |         eval_file = os.path.join(configs.output_dir, "test.tf_record")
813 |         file_based_convert_examples_to_features(
814 |             test_examples, label_list, configs.max_seq_length, tokenizer, eval_file)
815 | 
816 |         tf.logging.info("***** Running evaluation (test set) *****")
817 |         tf.logging.info("  Num examples = %d (%d actual, %d padding)",
818 |                         len(test_examples), num_actual_test_examples,
819 |                         len(test_examples) - num_actual_test_examples)
820 |         tf.logging.info("  Batch size = %d", configs.eval_batch_size)
821 | 
822 |         # This tells the estimator to run through the entire set.
823 |         eval_steps = None
824 |         # However, if running eval on the TPU, you will need to specify the
825 |         # number of steps.
826 |         if configs.use_tpu:
827 |             assert len(test_examples) % configs.eval_batch_size == 0
828 |             eval_steps = int(len(test_examples) // configs.eval_batch_size)
829 | 
830 |         eval_drop_remainder = True if configs.use_tpu else False
831 |         eval_input_fn = file_based_input_fn_builder(
832 |             input_file=eval_file,
833 |             seq_length=configs.max_seq_length,
834 |             is_training=False,
835 |             drop_remainder=eval_drop_remainder)
836 | 
837 |         #######################################################################
838 |         # Evaluate all checkpoints; you can use the checkpoint with the best dev accuracy.
839 |         steps_and_files = []
840 |         filenames = tf.gfile.ListDirectory(configs.output_dir)
841 |         for filename in filenames:
842 |             if filename.endswith(".index"):
843 |                 ckpt_name = filename[:-6]
844 |                 cur_filename = os.path.join(configs.output_dir, ckpt_name)
845 |                 global_step = int(cur_filename.split("-")[-1])
846 |                 tf.logging.info("Add {} to eval list.".format(cur_filename))
847 |                 steps_and_files.append([global_step, cur_filename])
848 |         steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
849 | 
850 |         output_eval_file = os.path.join(configs.output_dir, "test_results.txt")
851 |         print("[test] test result saved at:", output_eval_file)
852 |         tf.logging.info("test_eval_file:" + output_eval_file)
853 |         with tf.gfile.GFile(output_eval_file, "w") as writer:
854 |             for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
855 |                 result = estimator.evaluate(input_fn=eval_input_fn,
856 |                                             steps=eval_steps, checkpoint_path=filename)
857 | 
858 |                 tf.logging.info("***** Eval results %s *****" % (filename))
859 |                 writer.write("***** Eval results %s *****\n" % (filename))
860 |                 for key in sorted(result.keys()):
861 |                     tf.logging.info("  %s = %s", key, str(result[key]))
862 |                     writer.write("%s = %s\n" % (key, str(result[key])))
863 |         #######################################################################
864 |         # Single-checkpoint alternative (kept for reference):
865 |         # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
866 |         #
867 |         # output_eval_file = os.path.join(configs.output_dir, "test_results.txt")
868 |         # with tf.gfile.GFile(output_eval_file, "w") as writer:
869 |         #     tf.logging.info("***** Eval results *****")
870 |         #     for key in sorted(result.keys()):
871 |         #         tf.logging.info("  %s = %s", key, str(result[key]))
872 |         #         writer.write("%s = %s\n" % (key, str(result[key])))
873 | 
874 |     if configs.do_predict:
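        # (Editor's note) The predict branch mirrors the eval branches above:
        # convert examples -> TFRecord -> input_fn -> estimator.predict(), then
        # map each probability vector back to a label string. Hedged sketch of
        # the per-row decoding, using names defined in this file:
        #     pred_label = int(np.argmax(prediction["probabilities"]))
        #     label_str = processor.labels[pred_label]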
875 |         predict_examples = processor.get_test_examples(configs.data_dir)
876 |         num_actual_predict_examples = len(predict_examples)
877 |         if configs.use_tpu:
878 |             # TPU requires a fixed batch size for all batches, therefore the number
879 |             # of examples must be a multiple of the batch size, or else examples
880 |             # will get dropped. So we pad with fake examples which are ignored
881 |             # later on.
882 |             while len(predict_examples) % configs.predict_batch_size != 0:
883 |                 predict_examples.append(PaddingInputExample())
884 | 
885 |         predict_file = os.path.join(configs.output_dir, "predict.tf_record")
886 |         file_based_convert_examples_to_features(
887 |             predict_examples, label_list, configs.max_seq_length, tokenizer, predict_file)
888 | 
889 |         tf.logging.info("***** Running prediction *****")
890 |         tf.logging.info("  Num examples = %d (%d actual, %d padding)",
891 |                         len(predict_examples), num_actual_predict_examples,
892 |                         len(predict_examples) - num_actual_predict_examples)
893 |         tf.logging.info("  Batch size = %d", configs.predict_batch_size)
894 | 
895 |         predict_drop_remainder = True if configs.use_tpu else False
896 |         predict_input_fn = file_based_input_fn_builder(
897 |             input_file=predict_file,
898 |             seq_length=configs.max_seq_length,
899 |             is_training=False,
900 |             drop_remainder=predict_drop_remainder)
901 | 
902 |         result = estimator.predict(input_fn=predict_input_fn)
903 | 
904 |         output_predict_file = os.path.join(configs.output_dir, "test_results.tsv")
905 |         print("[pred] predict result saved at:", output_predict_file)
906 |         with tf.gfile.GFile(output_predict_file, "w") as writer:
907 |             num_written_lines = 0
908 |             tf.logging.info("***** Predict results *****")
909 |             pred_labels = []
910 |             for (i, prediction) in enumerate(result):
911 |                 probabilities = prediction["probabilities"]
912 |                 if i >= num_actual_predict_examples:
913 |                     break
914 |                 pred_label = int(np.argmax(probabilities))
915 |                 pred_labels.append(pred_label)
916 |             output_lines = []
917 |             for pred_data, pred_label in zip(predict_examples, pred_labels):  # iterate the predict set; zip drops the padding tail
918 |                 output_lines.append({"guid": pred_data.guid, "text_a": pred_data.text_a, "text_b": pred_data.text_b, "label": processor.labels[pred_label]})
919 |             for item in output_lines:
920 |                 writer.write(json.dumps(item, ensure_ascii=False) + "\n")
921 |                 num_written_lines += 1
922 |         assert num_written_lines == num_actual_predict_examples
923 | 
924 |     dev_res, test_res = [], []
925 |     test_outputs = []
926 | 
927 |     dev_res_file = os.path.join(configs.output_dir, "dev_results.txt")
928 |     test_res_file = os.path.join(configs.output_dir, "test_results.txt")
929 |     test_output_file = os.path.join(configs.output_dir, "test_results.tsv")
930 | 
931 |     if configs.do_eval:
932 |         with open(dev_res_file, "r") as f:
933 |             dev_res = [item.strip().split(" = ") for item in f.readlines()[-4:]]
934 |         with open(test_res_file, "r") as f:
935 |             test_res = [item.strip().split(" = ") for item in f.readlines()[-4:]]
936 | 
937 |     if configs.do_predict:
938 |         test_outputs = output_lines
939 | 
940 |     result_dict = {
941 |         "dev_res": {item[0]: item[1] for item in dev_res},
942 |         "test_res": {item[0]: item[1] for item in test_res},
943 |         "test_outputs": test_outputs
944 |     }
945 | 
946 |     return result_dict
--------------------------------------------------------------------------------
/PyCLUE/utils/classifier_utils/modeling.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """The main BERT model and related functions.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import copy 23 | import json 24 | import math 25 | import re 26 | import numpy as np 27 | import six 28 | import tensorflow as tf 29 | from . import bert_utils 30 | 31 | class BertConfig(object): 32 | """Configuration for `BertModel`.""" 33 | 34 | def __init__(self, 35 | vocab_size, 36 | hidden_size=768, 37 | num_hidden_layers=12, 38 | num_attention_heads=12, 39 | intermediate_size=3072, 40 | hidden_act="gelu", 41 | hidden_dropout_prob=0.1, 42 | attention_probs_dropout_prob=0.1, 43 | max_position_embeddings=512, 44 | type_vocab_size=16, 45 | initializer_range=0.02): 46 | """Constructs BertConfig. 47 | 48 | Args: 49 | vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. 50 | hidden_size: Size of the encoder layers and the pooler layer. 51 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 52 | num_attention_heads: Number of attention heads for each attention layer in 53 | the Transformer encoder. 54 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 55 | layer in the Transformer encoder. 56 | hidden_act: The non-linear activation function (function or string) in the 57 | encoder and pooler. 58 | hidden_dropout_prob: The dropout probability for all fully connected 59 | layers in the embeddings, encoder, and pooler. 60 | attention_probs_dropout_prob: The dropout ratio for the attention 61 | probabilities. 62 | max_position_embeddings: The maximum sequence length that this model might 63 | ever be used with. Typically set this to something large just in case 64 | (e.g., 512 or 1024 or 2048). 65 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 66 | `BertModel`. 67 | initializer_range: The stdev of the truncated_normal_initializer for 68 | initializing all weight matrices. 
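      Example (editor's illustration; the values approximate the released
      Chinese BERT-base config, so treat them as indicative only):
        config = BertConfig(vocab_size=21128, hidden_size=768,
                            num_hidden_layers=12, num_attention_heads=12,
                            intermediate_size=3072, type_vocab_size=2)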
69 | """ 70 | self.vocab_size = vocab_size 71 | self.hidden_size = hidden_size 72 | self.num_hidden_layers = num_hidden_layers 73 | self.num_attention_heads = num_attention_heads 74 | self.hidden_act = hidden_act 75 | self.intermediate_size = intermediate_size 76 | self.hidden_dropout_prob = hidden_dropout_prob 77 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 78 | self.max_position_embeddings = max_position_embeddings 79 | self.type_vocab_size = type_vocab_size 80 | self.initializer_range = initializer_range 81 | 82 | @classmethod 83 | def from_dict(cls, json_object): 84 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 85 | config = BertConfig(vocab_size=None) 86 | for (key, value) in six.iteritems(json_object): 87 | config.__dict__[key] = value 88 | return config 89 | 90 | @classmethod 91 | def from_json_file(cls, json_file): 92 | """Constructs a `BertConfig` from a json file of parameters.""" 93 | with tf.gfile.GFile(json_file, "r") as reader: 94 | text = reader.read() 95 | return cls.from_dict(json.loads(text)) 96 | 97 | def to_dict(self): 98 | """Serializes this instance to a Python dictionary.""" 99 | output = copy.deepcopy(self.__dict__) 100 | return output 101 | 102 | def to_json_string(self): 103 | """Serializes this instance to a JSON string.""" 104 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 105 | 106 | 107 | class BertModel(object): 108 | """BERT model ("Bidirectional Encoder Representations from Transformers"). 109 | 110 | Example usage: 111 | 112 | ```python 113 | # Already been converted into WordPiece token ids 114 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 115 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 116 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 117 | 118 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 119 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 120 | 121 | model = modeling.BertModel(config=config, is_training=True, 122 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 123 | 124 | label_embeddings = tf.get_variable(...) 125 | pooled_output = model.get_pooled_output() 126 | logits = tf.matmul(pooled_output, label_embeddings) 127 | ... 128 | ``` 129 | """ 130 | 131 | def __init__(self, 132 | config, 133 | is_training, 134 | input_ids, 135 | input_mask=None, 136 | token_type_ids=None, 137 | use_one_hot_embeddings=False, 138 | scope=None): 139 | """Constructor for BertModel. 140 | 141 | Args: 142 | config: `BertConfig` instance. 143 | is_training: bool. true for training model, false for eval model. Controls 144 | whether dropout will be applied. 145 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 146 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 147 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 148 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 149 | embeddings or tf.embedding_lookup() for the word embeddings. 150 | scope: (optional) variable scope. Defaults to "bert". 151 | 152 | Raises: 153 | ValueError: The config is invalid or one of the input tensor shapes 154 | is invalid. 
155 | """ 156 | config = copy.deepcopy(config) 157 | if not is_training: 158 | config.hidden_dropout_prob = 0.0 159 | config.attention_probs_dropout_prob = 0.0 160 | 161 | input_shape = get_shape_list(input_ids, expected_rank=2) 162 | batch_size = input_shape[0] 163 | seq_length = input_shape[1] 164 | 165 | if input_mask is None: 166 | input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) 167 | 168 | if token_type_ids is None: 169 | token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) 170 | 171 | with tf.variable_scope(scope, default_name="bert"): 172 | with tf.variable_scope("embeddings"): 173 | # Perform embedding lookup on the word ids. 174 | (self.embedding_output, self.embedding_table) = embedding_lookup( 175 | input_ids=input_ids, 176 | vocab_size=config.vocab_size, 177 | embedding_size=config.hidden_size, 178 | initializer_range=config.initializer_range, 179 | word_embedding_name="word_embeddings", 180 | use_one_hot_embeddings=use_one_hot_embeddings) 181 | 182 | # Add positional embeddings and token type embeddings, then layer 183 | # normalize and perform dropout. 184 | self.embedding_output = embedding_postprocessor( 185 | input_tensor=self.embedding_output, 186 | use_token_type=True, 187 | token_type_ids=token_type_ids, 188 | token_type_vocab_size=config.type_vocab_size, 189 | token_type_embedding_name="token_type_embeddings", 190 | use_position_embeddings=True, 191 | position_embedding_name="position_embeddings", 192 | initializer_range=config.initializer_range, 193 | max_position_embeddings=config.max_position_embeddings, 194 | dropout_prob=config.hidden_dropout_prob) 195 | 196 | with tf.variable_scope("encoder"): 197 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 198 | # mask of shape [batch_size, seq_length, seq_length] which is used 199 | # for the attention scores. 200 | attention_mask = create_attention_mask_from_input_mask( 201 | input_ids, input_mask) 202 | 203 | # Run the stacked transformer. 204 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 205 | self.all_encoder_layers = transformer_model( 206 | input_tensor=self.embedding_output, 207 | attention_mask=attention_mask, 208 | hidden_size=config.hidden_size, 209 | num_hidden_layers=config.num_hidden_layers, 210 | num_attention_heads=config.num_attention_heads, 211 | intermediate_size=config.intermediate_size, 212 | intermediate_act_fn=get_activation(config.hidden_act), 213 | hidden_dropout_prob=config.hidden_dropout_prob, 214 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 215 | initializer_range=config.initializer_range, 216 | do_return_all_layers=True) 217 | 218 | self.sequence_output = self.all_encoder_layers[-1] 219 | # The "pooler" converts the encoded sequence tensor of shape 220 | # [batch_size, seq_length, hidden_size] to a tensor of shape 221 | # [batch_size, hidden_size]. This is necessary for segment-level 222 | # (or segment-pair-level) classification tasks where we need a fixed 223 | # dimensional representation of the segment. 224 | with tf.variable_scope("pooler"): 225 | # We "pool" the model by simply taking the hidden state corresponding 226 | # to the first token. 
We assume that this has been pre-trained 227 | first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) 228 | self.pooled_output = tf.layers.dense( 229 | first_token_tensor, 230 | config.hidden_size, 231 | activation=tf.tanh, 232 | kernel_initializer=create_initializer(config.initializer_range)) 233 | 234 | def get_pooled_output(self): 235 | return self.pooled_output 236 | 237 | def get_sequence_output(self): 238 | """Gets final hidden layer of encoder. 239 | 240 | Returns: 241 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 242 | to the final hidden of the transformer encoder. 243 | """ 244 | return self.sequence_output 245 | 246 | def get_all_encoder_layers(self): 247 | return self.all_encoder_layers 248 | 249 | def get_embedding_output(self): 250 | """Gets output of the embedding lookup (i.e., input to the transformer). 251 | 252 | Returns: 253 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 254 | to the output of the embedding layer, after summing the word 255 | embeddings with the positional embeddings and the token type embeddings, 256 | then performing layer normalization. This is the input to the transformer. 257 | """ 258 | return self.embedding_output 259 | 260 | def get_embedding_table(self): 261 | return self.embedding_table 262 | 263 | 264 | 265 | class AlBertModel(object): 266 | """BERT model ("Bidirectional Encoder Representations from Transformers"). 267 | 268 | Example usage: 269 | 270 | ```python 271 | # Already been converted into WordPiece token ids 272 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 273 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 274 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 275 | 276 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 277 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 278 | 279 | model = modeling.BertModel(config=config, is_training=True, 280 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 281 | 282 | label_embeddings = tf.get_variable(...) 283 | pooled_output = model.get_pooled_output() 284 | logits = tf.matmul(pooled_output, label_embeddings) 285 | ... 286 | ``` 287 | """ 288 | 289 | def __init__(self, 290 | config, 291 | is_training, 292 | input_ids, 293 | input_mask=None, 294 | token_type_ids=None, 295 | use_one_hot_embeddings=False, 296 | scope=None): 297 | """Constructor for BertModel. 298 | 299 | Args: 300 | config: `BertConfig` instance. 301 | is_training: bool. true for training model, false for eval model. Controls 302 | whether dropout will be applied. 303 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 304 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 305 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 306 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 307 | embeddings or tf.embedding_lookup() for the word embeddings. 308 | scope: (optional) variable scope. Defaults to "bert". 309 | 310 | Raises: 311 | ValueError: The config is invalid or one of the input tensor shapes 312 | is invalid. 
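      Note (editor's addition): unlike `BertModel`, this constructor also reads
      `config.embedding_size` (required by the factorized embedding lookup) and,
      optionally, `config.ln_type` ("preln" selects the pre-LayerNorm
      transformer variant); the ALBERT json config must therefore define
      embedding_size.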
313 | """ 314 | config = copy.deepcopy(config) 315 | if not is_training: 316 | config.hidden_dropout_prob = 0.0 317 | config.attention_probs_dropout_prob = 0.0 318 | 319 | input_shape = get_shape_list(input_ids, expected_rank=2) 320 | batch_size = input_shape[0] 321 | seq_length = input_shape[1] 322 | 323 | if input_mask is None: 324 | input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) 325 | 326 | if token_type_ids is None: 327 | token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) 328 | 329 | with tf.variable_scope(scope, default_name="albert"): 330 | with tf.variable_scope("embeddings"): 331 | # Perform embedding lookup on the word ids, but use stype of factorized embedding parameterization from albert. add by brightmart, 2019-09-28 332 | (self.embedding_output, self.embedding_table,self.embedding_table_2) = embedding_lookup_factorized( 333 | input_ids=input_ids, 334 | vocab_size=config.vocab_size, 335 | hidden_size=config.hidden_size, 336 | embedding_size=config.embedding_size, 337 | initializer_range=config.initializer_range, 338 | word_embedding_name="word_embeddings", 339 | use_one_hot_embeddings=use_one_hot_embeddings) 340 | 341 | # Add positional embeddings and token type embeddings, then layer 342 | # normalize and perform dropout. 343 | self.embedding_output = embedding_postprocessor( 344 | input_tensor=self.embedding_output, 345 | use_token_type=True, 346 | token_type_ids=token_type_ids, 347 | token_type_vocab_size=config.type_vocab_size, 348 | token_type_embedding_name="token_type_embeddings", 349 | use_position_embeddings=True, 350 | position_embedding_name="position_embeddings", 351 | initializer_range=config.initializer_range, 352 | max_position_embeddings=config.max_position_embeddings, 353 | dropout_prob=config.hidden_dropout_prob) 354 | 355 | with tf.variable_scope("encoder"): 356 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 357 | # mask of shape [batch_size, seq_length, seq_length] which is used 358 | # for the attention scores. 359 | attention_mask = create_attention_mask_from_input_mask( 360 | input_ids, input_mask) 361 | 362 | # Run the stacked transformer. 363 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 364 | try: 365 | ln_type = config.ln_type 366 | except: 367 | ln_type = None 368 | print("ln_type:",ln_type) 369 | if ln_type is None or ln_type=='postln': # currently, base or large of albert used post-LN structure 370 | print("old structure of transformer.use: transformer_model,which use post-LN") 371 | self.all_encoder_layers = transformer_model( 372 | input_tensor=self.embedding_output, 373 | attention_mask=attention_mask, 374 | hidden_size=config.hidden_size, 375 | num_hidden_layers=config.num_hidden_layers, 376 | num_attention_heads=config.num_attention_heads, 377 | intermediate_size=config.intermediate_size, 378 | intermediate_act_fn=get_activation(config.hidden_act), 379 | hidden_dropout_prob=config.hidden_dropout_prob, 380 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 381 | initializer_range=config.initializer_range, 382 | do_return_all_layers=True) 383 | else: # xlarge or xxlarge of albert, used pre-LN structure 384 | print("new structure of transformer.use: prelln_transformer_model,which use pre-LN") 385 | self.all_encoder_layers = prelln_transformer_model( # change by brightmart, 4th, oct, 2019. pre-Layer Normalization can converge fast and better. 
check paper: ON LAYER NORMALIZATION IN THE TRANSFORMER ARCHITECTURE 386 | input_tensor=self.embedding_output, 387 | attention_mask=attention_mask, 388 | hidden_size=config.hidden_size, 389 | num_hidden_layers=config.num_hidden_layers, 390 | num_attention_heads=config.num_attention_heads, 391 | intermediate_size=config.intermediate_size, 392 | intermediate_act_fn=get_activation(config.hidden_act), 393 | hidden_dropout_prob=config.hidden_dropout_prob, 394 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 395 | initializer_range=config.initializer_range, 396 | do_return_all_layers=True, 397 | shared_type='all') # do_return_all_layers=True 398 | 399 | self.sequence_output = self.all_encoder_layers[-1] # [batch_size, seq_length, hidden_size] 400 | # The "pooler" converts the encoded sequence tensor of shape 401 | # [batch_size, seq_length, hidden_size] to a tensor of shape 402 | # [batch_size, hidden_size]. This is necessary for segment-level 403 | # (or segment-pair-level) classification tasks where we need a fixed 404 | # dimensional representation of the segment. 405 | with tf.variable_scope("pooler"): 406 | # We "pool" the model by simply taking the hidden state corresponding 407 | # to the first token. We assume that this has been pre-trained 408 | first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) 409 | self.pooled_output = tf.layers.dense( 410 | first_token_tensor, 411 | config.hidden_size, 412 | activation=tf.tanh, 413 | kernel_initializer=create_initializer(config.initializer_range)) 414 | 415 | def get_pooled_output(self): 416 | return self.pooled_output 417 | 418 | def get_sequence_output(self): 419 | """Gets final hidden layer of encoder. 420 | 421 | Returns: 422 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 423 | to the final hidden of the transformer encoder. 424 | """ 425 | return self.sequence_output 426 | 427 | def get_all_encoder_layers(self): 428 | return self.all_encoder_layers 429 | 430 | def get_embedding_output(self): 431 | """Gets output of the embedding lookup (i.e., input to the transformer). 432 | 433 | Returns: 434 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 435 | to the output of the embedding layer, after summing the word 436 | embeddings with the positional embeddings and the token type embeddings, 437 | then performing layer normalization. This is the input to the transformer. 438 | """ 439 | return self.embedding_output 440 | 441 | def get_embedding_table(self): 442 | return self.embedding_table 443 | 444 | def get_embedding_table_2(self): 445 | return self.embedding_table_2 446 | 447 | def gelu(x): 448 | """Gaussian Error Linear Unit. 449 | 450 | This is a smoother version of the RELU. 451 | Original paper: https://arxiv.org/abs/1606.08415 452 | Args: 453 | x: float Tensor to perform activation. 454 | 455 | Returns: 456 | `x` with the GELU activation applied. 457 | """ 458 | cdf = 0.5 * (1.0 + tf.tanh( 459 | (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) 460 | return x * cdf 461 | 462 | 463 | def get_activation(activation_string): 464 | """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. 465 | 466 | Args: 467 | activation_string: String name of the activation function. 468 | 469 | Returns: 470 | A Python function corresponding to the activation function. If 471 | `activation_string` is None, empty, or "linear", this will return None. 472 | If `activation_string` is not a string, it will return `activation_string`. 
473 | 474 | Raises: 475 | ValueError: The `activation_string` does not correspond to a known 476 | activation. 477 | """ 478 | 479 | # We assume that anything that"s not a string is already an activation 480 | # function, so we just return it. 481 | if not isinstance(activation_string, six.string_types): 482 | return activation_string 483 | 484 | if not activation_string: 485 | return None 486 | 487 | act = activation_string.lower() 488 | if act == "linear": 489 | return None 490 | elif act == "relu": 491 | return tf.nn.relu 492 | elif act == "gelu": 493 | return gelu 494 | elif act == "tanh": 495 | return tf.tanh 496 | else: 497 | raise ValueError("Unsupported activation: %s" % act) 498 | 499 | 500 | def get_assignment_map_from_checkpoint(tvars, init_checkpoint): 501 | """Compute the union of the current variables and checkpoint variables.""" 502 | assignment_map = {} 503 | initialized_variable_names = {} 504 | 505 | name_to_variable = collections.OrderedDict() 506 | for var in tvars: 507 | name = var.name 508 | m = re.match("^(.*):\\d+$", name) 509 | if m is not None: 510 | name = m.group(1) 511 | name_to_variable[name] = var 512 | 513 | init_vars = tf.train.list_variables(init_checkpoint) 514 | 515 | assignment_map = collections.OrderedDict() 516 | for x in init_vars: 517 | (name, var) = (x[0], x[1]) 518 | if name not in name_to_variable: 519 | continue 520 | assignment_map[name] = name 521 | initialized_variable_names[name] = 1 522 | initialized_variable_names[name + ":0"] = 1 523 | 524 | return (assignment_map, initialized_variable_names) 525 | 526 | 527 | def dropout(input_tensor, dropout_prob): 528 | """Perform dropout. 529 | 530 | Args: 531 | input_tensor: float Tensor. 532 | dropout_prob: Python float. The probability of dropping out a value (NOT of 533 | *keeping* a dimension as in `tf.nn.dropout`). 534 | 535 | Returns: 536 | A version of `input_tensor` with dropout applied. 537 | """ 538 | if dropout_prob is None or dropout_prob == 0.0: 539 | return input_tensor 540 | 541 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 542 | return output 543 | 544 | 545 | def layer_norm(input_tensor, name=None): 546 | """Run layer normalization on the last dimension of the tensor.""" 547 | return tf.contrib.layers.layer_norm( 548 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 549 | 550 | 551 | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): 552 | """Runs layer normalization followed by dropout.""" 553 | output_tensor = layer_norm(input_tensor, name) 554 | output_tensor = dropout(output_tensor, dropout_prob) 555 | return output_tensor 556 | 557 | 558 | def create_initializer(initializer_range=0.02): 559 | """Creates a `truncated_normal_initializer` with the given range.""" 560 | return tf.truncated_normal_initializer(stddev=initializer_range) 561 | 562 | 563 | def embedding_lookup(input_ids, 564 | vocab_size, 565 | embedding_size=128, 566 | initializer_range=0.02, 567 | word_embedding_name="word_embeddings", 568 | use_one_hot_embeddings=False): 569 | """Looks up words embeddings for id tensor. 570 | 571 | Args: 572 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 573 | ids. 574 | vocab_size: int. Size of the embedding vocabulary. 575 | embedding_size: int. Width of the word embeddings. 576 | initializer_range: float. Embedding initialization range. 577 | word_embedding_name: string. Name of the embedding table. 578 | use_one_hot_embeddings: bool. If True, use one-hot method for word 579 | embeddings. 
If False, use `tf.gather()`. 580 | 581 | Returns: 582 | float Tensor of shape [batch_size, seq_length, embedding_size]. 583 | """ 584 | # This function assumes that the input is of shape [batch_size, seq_length, 585 | # num_inputs]. 586 | # 587 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 588 | # reshape to [batch_size, seq_length, 1]. 589 | if input_ids.shape.ndims == 2: 590 | input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] 591 | 592 | embedding_table = tf.get_variable( # [vocab_size, embedding_size] 593 | name=word_embedding_name, 594 | shape=[vocab_size, embedding_size], 595 | initializer=create_initializer(initializer_range)) 596 | 597 | flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. shape as (batch_size * sequence_length,) 598 | if use_one_hot_embeddings: 599 | one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] 600 | output = tf.matmul(one_hot_input_ids, embedding_table) # output=[batch_size * sequence_length,embedding_size] 601 | else: 602 | output = tf.gather(embedding_table, flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] 603 | 604 | input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] 605 | 606 | output = tf.reshape(output,input_shape[0:-1] + [input_shape[-1] * embedding_size]) # output=[batch_size,sequence_length,embedding_size] 607 | return (output, embedding_table) 608 | 609 | def embedding_lookup_factorized(input_ids, # Factorized embedding parameterization provide by albert 610 | vocab_size, 611 | hidden_size, 612 | embedding_size=128, 613 | initializer_range=0.02, 614 | word_embedding_name="word_embeddings", 615 | use_one_hot_embeddings=False): 616 | """Looks up words embeddings for id tensor, but in a factorized style followed by albert. it is used to reduce much percentage of parameters previous exists. 617 | Check "Factorized embedding parameterization" session in the paper. 618 | 619 | Args: 620 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 621 | ids. 622 | vocab_size: int. Size of the embedding vocabulary. 623 | embedding_size: int. Width of the word embeddings. 624 | initializer_range: float. Embedding initialization range. 625 | word_embedding_name: string. Name of the embedding table. 626 | use_one_hot_embeddings: bool. If True, use one-hot method for word 627 | embeddings. If False, use `tf.gather()`. 628 | 629 | Returns: 630 | float Tensor of shape [batch_size, seq_length, embedding_size]. 631 | """ 632 | # This function assumes that the input is of shape [batch_size, seq_length, 633 | # num_inputs]. 634 | # 635 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 636 | # reshape to [batch_size, seq_length, 1]. 637 | 638 | # 1.first project one-hot vectors into a lower dimensional embedding space of size E 639 | print("embedding_lookup_factorized. factorized embedding parameterization is used.") 640 | if input_ids.shape.ndims == 2: 641 | input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] 642 | 643 | embedding_table = tf.get_variable( # [vocab_size, embedding_size] 644 | name=word_embedding_name, 645 | shape=[vocab_size, embedding_size], 646 | initializer=create_initializer(initializer_range)) 647 | 648 | flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. 
shape as (batch_size * sequence_length,) 649 | if use_one_hot_embeddings: 650 | one_hot_input_ids = tf.one_hot(flat_input_ids,depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] 651 | output_middle = tf.matmul(one_hot_input_ids, embedding_table) # output=[batch_size * sequence_length,embedding_size] 652 | else: 653 | output_middle = tf.gather(embedding_table,flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] 654 | 655 | # 2. project vector(output_middle) to the hidden space 656 | project_variable = tf.get_variable( # [embedding_size, hidden_size] 657 | name=word_embedding_name+"_2", 658 | shape=[embedding_size, hidden_size], 659 | initializer=create_initializer(initializer_range)) 660 | output = tf.matmul(output_middle, project_variable) # ([batch_size * sequence_length, embedding_size] * [embedding_size, hidden_size])--->[batch_size * sequence_length, hidden_size] 661 | # reshape back to 3 rank 662 | input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] 663 | batch_size, sequene_length, _=input_shape 664 | output = tf.reshape(output, (batch_size,sequene_length,hidden_size)) # output=[batch_size, sequence_length, hidden_size] 665 | return (output, embedding_table, project_variable) 666 | 667 | 668 | def embedding_postprocessor(input_tensor, 669 | use_token_type=False, 670 | token_type_ids=None, 671 | token_type_vocab_size=16, 672 | token_type_embedding_name="token_type_embeddings", 673 | use_position_embeddings=True, 674 | position_embedding_name="position_embeddings", 675 | initializer_range=0.02, 676 | max_position_embeddings=512, 677 | dropout_prob=0.1): 678 | """Performs various post-processing on a word embedding tensor. 679 | 680 | Args: 681 | input_tensor: float Tensor of shape [batch_size, seq_length, 682 | embedding_size]. 683 | use_token_type: bool. Whether to add embeddings for `token_type_ids`. 684 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 685 | Must be specified if `use_token_type` is True. 686 | token_type_vocab_size: int. The vocabulary size of `token_type_ids`. 687 | token_type_embedding_name: string. The name of the embedding table variable 688 | for token type ids. 689 | use_position_embeddings: bool. Whether to add position embeddings for the 690 | position of each token in the sequence. 691 | position_embedding_name: string. The name of the embedding table variable 692 | for positional embeddings. 693 | initializer_range: float. Range of the weight initialization. 694 | max_position_embeddings: int. Maximum sequence length that might ever be 695 | used with this model. This can be longer than the sequence length of 696 | input_tensor, but cannot be shorter. 697 | dropout_prob: float. Dropout probability applied to the final output tensor. 698 | 699 | Returns: 700 | float tensor with same shape as `input_tensor`. 701 | 702 | Raises: 703 | ValueError: One of the tensor shapes or input values is invalid. 
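    Example (editor's illustration): for a [2, 128, 768] word-embedding input
    with use_token_type=True, a [2, 128, 768] token-type embedding and a
    position embedding sliced to [128, 768] (broadcast over the batch) are
    added element-wise, then layer norm and dropout are applied; the output
    shape is still [2, 128, 768].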
704 | """ 705 | input_shape = get_shape_list(input_tensor, expected_rank=3) 706 | batch_size = input_shape[0] 707 | seq_length = input_shape[1] 708 | width = input_shape[2] 709 | 710 | output = input_tensor 711 | 712 | if use_token_type: 713 | if token_type_ids is None: 714 | raise ValueError("`token_type_ids` must be specified if" 715 | "`use_token_type` is True.") 716 | token_type_table = tf.get_variable( 717 | name=token_type_embedding_name, 718 | shape=[token_type_vocab_size, width], 719 | initializer=create_initializer(initializer_range)) 720 | # This vocab will be small so we always do one-hot here, since it is always 721 | # faster for a small vocabulary. 722 | flat_token_type_ids = tf.reshape(token_type_ids, [-1]) 723 | one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) 724 | token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) 725 | token_type_embeddings = tf.reshape(token_type_embeddings, 726 | [batch_size, seq_length, width]) 727 | output += token_type_embeddings 728 | 729 | if use_position_embeddings: 730 | assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) 731 | with tf.control_dependencies([assert_op]): 732 | full_position_embeddings = tf.get_variable( 733 | name=position_embedding_name, 734 | shape=[max_position_embeddings, width], 735 | initializer=create_initializer(initializer_range)) 736 | # Since the position embedding table is a learned variable, we create it 737 | # using a (long) sequence length `max_position_embeddings`. The actual 738 | # sequence length might be shorter than this, for faster training of 739 | # tasks that do not have long sequences. 740 | # 741 | # So `full_position_embeddings` is effectively an embedding table 742 | # for position [0, 1, 2, ..., max_position_embeddings-1], and the current 743 | # sequence has positions [0, 1, 2, ... seq_length-1], so we can just 744 | # perform a slice. 745 | position_embeddings = tf.slice(full_position_embeddings, [0, 0], 746 | [seq_length, -1]) 747 | num_dims = len(output.shape.as_list()) 748 | 749 | # Only the last two dimensions are relevant (`seq_length` and `width`), so 750 | # we broadcast among the first dimensions, which is typically just 751 | # the batch size. 752 | position_broadcast_shape = [] 753 | for _ in range(num_dims - 2): 754 | position_broadcast_shape.append(1) 755 | position_broadcast_shape.extend([seq_length, width]) 756 | position_embeddings = tf.reshape(position_embeddings, 757 | position_broadcast_shape) 758 | output += position_embeddings 759 | 760 | output = layer_norm_and_dropout(output, dropout_prob) 761 | return output 762 | 763 | 764 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 765 | """Create 3D attention mask from a 2D tensor mask. 766 | 767 | Args: 768 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 769 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 770 | 771 | Returns: 772 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 773 | """ 774 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 775 | batch_size = from_shape[0] 776 | from_seq_length = from_shape[1] 777 | 778 | to_shape = get_shape_list(to_mask, expected_rank=2) 779 | to_seq_length = to_shape[1] 780 | 781 | to_mask = tf.cast( 782 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 783 | 784 | # We don't assume that `from_tensor` is a mask (although it could be). 
We 785 | # don't actually care if we attend *from* padding tokens (only *to* padding) 786 | # tokens so we create a tensor of all ones. 787 | # 788 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 789 | broadcast_ones = tf.ones( 790 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 791 | 792 | # Here we broadcast along two dimensions to create the mask. 793 | mask = broadcast_ones * to_mask 794 | 795 | return mask 796 | 797 | 798 | def attention_layer(from_tensor, 799 | to_tensor, 800 | attention_mask=None, 801 | num_attention_heads=1, 802 | size_per_head=512, 803 | query_act=None, 804 | key_act=None, 805 | value_act=None, 806 | attention_probs_dropout_prob=0.0, 807 | initializer_range=0.02, 808 | do_return_2d_tensor=False, 809 | batch_size=None, 810 | from_seq_length=None, 811 | to_seq_length=None): 812 | """Performs multi-headed attention from `from_tensor` to `to_tensor`. 813 | 814 | This is an implementation of multi-headed attention based on "Attention 815 | is all you Need". If `from_tensor` and `to_tensor` are the same, then 816 | this is self-attention. Each timestep in `from_tensor` attends to the 817 | corresponding sequence in `to_tensor`, and returns a fixed-with vector. 818 | 819 | This function first projects `from_tensor` into a "query" tensor and 820 | `to_tensor` into "key" and "value" tensors. These are (effectively) a list 821 | of tensors of length `num_attention_heads`, where each tensor is of shape 822 | [batch_size, seq_length, size_per_head]. 823 | 824 | Then, the query and key tensors are dot-producted and scaled. These are 825 | softmaxed to obtain attention probabilities. The value tensors are then 826 | interpolated by these probabilities, then concatenated back to a single 827 | tensor and returned. 828 | 829 | In practice, the multi-headed attention are done with transposes and 830 | reshapes rather than actual separate tensors. 831 | 832 | Args: 833 | from_tensor: float Tensor of shape [batch_size, from_seq_length, 834 | from_width]. 835 | to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. 836 | attention_mask: (optional) int32 Tensor of shape [batch_size, 837 | from_seq_length, to_seq_length]. The values should be 1 or 0. The 838 | attention scores will effectively be set to -infinity for any positions in 839 | the mask that are 0, and will be unchanged for positions that are 1. 840 | num_attention_heads: int. Number of attention heads. 841 | size_per_head: int. Size of each attention head. 842 | query_act: (optional) Activation function for the query transform. 843 | key_act: (optional) Activation function for the key transform. 844 | value_act: (optional) Activation function for the value transform. 845 | attention_probs_dropout_prob: (optional) float. Dropout probability of the 846 | attention probabilities. 847 | initializer_range: float. Range of the weight initializer. 848 | do_return_2d_tensor: bool. If True, the output will be of shape [batch_size 849 | * from_seq_length, num_attention_heads * size_per_head]. If False, the 850 | output will be of shape [batch_size, from_seq_length, num_attention_heads 851 | * size_per_head]. 852 | batch_size: (Optional) int. If the input is 2D, this might be the batch size 853 | of the 3D version of the `from_tensor` and `to_tensor`. 854 | from_seq_length: (Optional) If the input is 2D, this might be the seq length 855 | of the 3D version of the `from_tensor`. 
856 | to_seq_length: (Optional) If the input is 2D, this might be the seq length 857 | of the 3D version of the `to_tensor`. 858 | 859 | Returns: 860 | float Tensor of shape [batch_size, from_seq_length, 861 | num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is 862 | true, this will be of shape [batch_size * from_seq_length, 863 | num_attention_heads * size_per_head]). 864 | 865 | Raises: 866 | ValueError: Any of the arguments or tensor shapes are invalid. 867 | """ 868 | 869 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 870 | seq_length, width): 871 | output_tensor = tf.reshape( 872 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 873 | 874 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 875 | return output_tensor 876 | 877 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 878 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 879 | 880 | if len(from_shape) != len(to_shape): 881 | raise ValueError( 882 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 883 | 884 | if len(from_shape) == 3: 885 | batch_size = from_shape[0] 886 | from_seq_length = from_shape[1] 887 | to_seq_length = to_shape[1] 888 | elif len(from_shape) == 2: 889 | if (batch_size is None or from_seq_length is None or to_seq_length is None): 890 | raise ValueError( 891 | "When passing in rank 2 tensors to attention_layer, the values " 892 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 893 | "must all be specified.") 894 | 895 | # Scalar dimensions referenced here: 896 | # B = batch size (number of sequences) 897 | # F = `from_tensor` sequence length 898 | # T = `to_tensor` sequence length 899 | # N = `num_attention_heads` 900 | # H = `size_per_head` 901 | 902 | from_tensor_2d = reshape_to_matrix(from_tensor) 903 | to_tensor_2d = reshape_to_matrix(to_tensor) 904 | 905 | # `query_layer` = [B*F, N*H] 906 | query_layer = tf.layers.dense( 907 | from_tensor_2d, 908 | num_attention_heads * size_per_head, 909 | activation=query_act, 910 | name="query", 911 | kernel_initializer=create_initializer(initializer_range)) 912 | 913 | # `key_layer` = [B*T, N*H] 914 | key_layer = tf.layers.dense( 915 | to_tensor_2d, 916 | num_attention_heads * size_per_head, 917 | activation=key_act, 918 | name="key", 919 | kernel_initializer=create_initializer(initializer_range)) 920 | 921 | # `value_layer` = [B*T, N*H] 922 | value_layer = tf.layers.dense( 923 | to_tensor_2d, 924 | num_attention_heads * size_per_head, 925 | activation=value_act, 926 | name="value", 927 | kernel_initializer=create_initializer(initializer_range)) 928 | 929 | # `query_layer` = [B, N, F, H] 930 | query_layer = transpose_for_scores(query_layer, batch_size, 931 | num_attention_heads, from_seq_length, 932 | size_per_head) 933 | 934 | # `key_layer` = [B, N, T, H] 935 | key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, 936 | to_seq_length, size_per_head) 937 | 938 | # Take the dot product between "query" and "key" to get the raw 939 | # attention scores. 
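# A minimal NumPy sketch of the scaled dot-product performed by the next few
# lines, using the B/N/F/T/H shape convention above (toy sizes and variable
# names are illustrative, not part of this module):
#
#   import numpy as np
#   B, N, F, T, H = 2, 4, 8, 8, 16
#   q = np.random.randn(B, N, F, H)                    # `query_layer`
#   k = np.random.randn(B, N, T, H)                    # `key_layer`
#   scores = q @ k.transpose(0, 1, 3, 2) / np.sqrt(H)  # [B, N, F, T]
#   probs = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)  # softmax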
940 | # `attention_scores` = [B, N, F, T] 941 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 942 | attention_scores = tf.multiply(attention_scores, 943 | 1.0 / math.sqrt(float(size_per_head))) 944 | 945 | if attention_mask is not None: 946 | # `attention_mask` = [B, 1, F, T] 947 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 948 | 949 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 950 | # masked positions, this operation will create a tensor which is 0.0 for 951 | # positions we want to attend and -10000.0 for masked positions. 952 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 953 | 954 | # Since we are adding it to the raw scores before the softmax, this is 955 | # effectively the same as removing these entirely. 956 | attention_scores += adder 957 | 958 | # Normalize the attention scores to probabilities. 959 | # `attention_probs` = [B, N, F, T] 960 | attention_probs = tf.nn.softmax(attention_scores) 961 | 962 | # This is actually dropping out entire tokens to attend to, which might 963 | # seem a bit unusual, but is taken from the original Transformer paper. 964 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 965 | 966 | # `value_layer` = [B, T, N, H] 967 | value_layer = tf.reshape( 968 | value_layer, 969 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 970 | 971 | # `value_layer` = [B, N, T, H] 972 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 973 | 974 | # `context_layer` = [B, N, F, H] 975 | context_layer = tf.matmul(attention_probs, value_layer) 976 | 977 | # `context_layer` = [B, F, N, H] 978 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 979 | 980 | if do_return_2d_tensor: 981 | # `context_layer` = [B*F, N*H] 982 | context_layer = tf.reshape( 983 | context_layer, 984 | [batch_size * from_seq_length, num_attention_heads * size_per_head]) 985 | else: 986 | # `context_layer` = [B, F, N*H] 987 | context_layer = tf.reshape( 988 | context_layer, 989 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 990 | 991 | return context_layer 992 | 993 | 994 | def transformer_model(input_tensor, 995 | attention_mask=None, 996 | hidden_size=768, 997 | num_hidden_layers=12, 998 | num_attention_heads=12, 999 | intermediate_size=3072, 1000 | intermediate_act_fn=gelu, 1001 | hidden_dropout_prob=0.1, 1002 | attention_probs_dropout_prob=0.1, 1003 | initializer_range=0.02, 1004 | do_return_all_layers=False, 1005 | share_parameter_across_layers=True): 1006 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 1007 | 1008 | This is almost an exact implementation of the original Transformer encoder. 1009 | 1010 | See the original paper: 1011 | https://arxiv.org/abs/1706.03762 1012 | 1013 | Also see: 1014 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 1015 | 1016 | Args: 1017 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 1018 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 1019 | seq_length], with 1 for positions that can be attended to and 0 in 1020 | positions that should not be. 1021 | hidden_size: int. Hidden size of the Transformer. 1022 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 1023 | num_attention_heads: int. Number of attention heads in the Transformer. 1024 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 1025 | forward) layer. 
1026 | intermediate_act_fn: function. The non-linear activation function to apply
1027 | to the output of the intermediate/feed-forward layer.
1028 | hidden_dropout_prob: float. Dropout probability for the hidden layers.
1029 | attention_probs_dropout_prob: float. Dropout probability of the attention
1030 | probabilities.
1031 | initializer_range: float. Range of the initializer (stddev of truncated
1032 | normal).
1033 | do_return_all_layers: Whether to also return all layers or just the final
1034 | layer.
1035 | 
1036 | Returns:
1037 | float Tensor of shape [batch_size, seq_length, hidden_size], the final
1038 | hidden layer of the Transformer.
1039 | 
1040 | Raises:
1041 | ValueError: A Tensor shape or parameter is invalid.
1042 | """
1043 | if hidden_size % num_attention_heads != 0:
1044 | raise ValueError(
1045 | "The hidden size (%d) is not a multiple of the number of attention "
1046 | "heads (%d)" % (hidden_size, num_attention_heads))
1047 | 
1048 | attention_head_size = int(hidden_size / num_attention_heads)
1049 | input_shape = get_shape_list(input_tensor, expected_rank=3)
1050 | batch_size = input_shape[0]
1051 | seq_length = input_shape[1]
1052 | input_width = input_shape[2]
1053 | 
1054 | # The Transformer adds residual connections on all layers, so the input
1055 | # width needs to be the same as the hidden size.
1056 | if input_width != hidden_size:
1057 | raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
1058 | (input_width, hidden_size))
1059 | 
1060 | # We keep the representation as a 2D tensor to avoid re-shaping it back and
1061 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
1062 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
1063 | # help the optimizer.
1064 | prev_output = reshape_to_matrix(input_tensor)
1065 | 
1066 | all_layer_outputs = []
1067 | for layer_idx in range(num_hidden_layers):
1068 | if share_parameter_across_layers:
1069 | name_variable_scope = "layer_shared"
1070 | else:
1071 | name_variable_scope = "layer_%d" % layer_idx
1072 | # Share all parameters across layers (added by brightmart, 2019-09-28); previously the scope was always "layer_%d" % layer_idx.
1073 | with tf.variable_scope(name_variable_scope, reuse=share_parameter_across_layers and layer_idx > 0):
1074 | 
1075 | layer_input = prev_output
1076 | 
1077 | with tf.variable_scope("attention"):
1078 | attention_heads = []
1079 | with tf.variable_scope("self"):
1080 | attention_head = attention_layer(
1081 | from_tensor=layer_input,
1082 | to_tensor=layer_input,
1083 | attention_mask=attention_mask,
1084 | num_attention_heads=num_attention_heads,
1085 | size_per_head=attention_head_size,
1086 | attention_probs_dropout_prob=attention_probs_dropout_prob,
1087 | initializer_range=initializer_range,
1088 | do_return_2d_tensor=True,
1089 | batch_size=batch_size,
1090 | from_seq_length=seq_length,
1091 | to_seq_length=seq_length)
1092 | attention_heads.append(attention_head)
1093 | 
1094 | attention_output = None
1095 | if len(attention_heads) == 1:
1096 | attention_output = attention_heads[0]
1097 | else:
1098 | # In the case where we have other sequences, we just concatenate
1099 | # them to the self-attention head before the projection.
1100 | attention_output = tf.concat(attention_heads, axis=-1)
1101 | 
1102 | # Run a linear projection of `hidden_size` then add a residual
1103 | # with `layer_input`.
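# In condensed form, the remainder of this block implements BERT's
# post-layer-norm residual pattern (pseudo-helpers; dropout and initializers
# elided, names illustrative):
#
#   attention_output = layer_norm(dense(self_attention(x)) + x)
#   layer_output     = layer_norm(dense(gelu_dense(attention_output))
#                                 + attention_output)
#
# `prelln_transformer_model` further below moves the layer_norm in front of
# each sub-layer instead (pre-LN).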
1104 | with tf.variable_scope("output"): 1105 | attention_output = tf.layers.dense( 1106 | attention_output, 1107 | hidden_size, 1108 | kernel_initializer=create_initializer(initializer_range)) 1109 | attention_output = dropout(attention_output, hidden_dropout_prob) 1110 | attention_output = layer_norm(attention_output + layer_input) 1111 | 1112 | # The activation is only applied to the "intermediate" hidden layer. 1113 | with tf.variable_scope("intermediate"): 1114 | intermediate_output = tf.layers.dense( 1115 | attention_output, 1116 | intermediate_size, 1117 | activation=intermediate_act_fn, 1118 | kernel_initializer=create_initializer(initializer_range)) 1119 | 1120 | # Down-project back to `hidden_size` then add the residual. 1121 | with tf.variable_scope("output"): 1122 | layer_output = tf.layers.dense( 1123 | intermediate_output, 1124 | hidden_size, 1125 | kernel_initializer=create_initializer(initializer_range)) 1126 | layer_output = dropout(layer_output, hidden_dropout_prob) 1127 | layer_output = layer_norm(layer_output + attention_output) 1128 | prev_output = layer_output 1129 | all_layer_outputs.append(layer_output) 1130 | 1131 | if do_return_all_layers: 1132 | final_outputs = [] 1133 | for layer_output in all_layer_outputs: 1134 | final_output = reshape_from_matrix(layer_output, input_shape) 1135 | final_outputs.append(final_output) 1136 | return final_outputs 1137 | else: 1138 | final_output = reshape_from_matrix(prev_output, input_shape) 1139 | return final_output 1140 | 1141 | 1142 | def get_shape_list(tensor, expected_rank=None, name=None): 1143 | """Returns a list of the shape of tensor, preferring static dimensions. 1144 | 1145 | Args: 1146 | tensor: A tf.Tensor object to find the shape of. 1147 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 1148 | specified and the `tensor` has a different rank, and exception will be 1149 | thrown. 1150 | name: Optional name of the tensor for the error message. 1151 | 1152 | Returns: 1153 | A list of dimensions of the shape of tensor. All static dimensions will 1154 | be returned as python integers, and dynamic dimensions will be returned 1155 | as tf.Tensor scalars. 1156 | """ 1157 | if name is None: 1158 | name = tensor.name 1159 | 1160 | if expected_rank is not None: 1161 | assert_rank(tensor, expected_rank, name) 1162 | 1163 | shape = tensor.shape.as_list() 1164 | 1165 | non_static_indexes = [] 1166 | for (index, dim) in enumerate(shape): 1167 | if dim is None: 1168 | non_static_indexes.append(index) 1169 | 1170 | if not non_static_indexes: 1171 | return shape 1172 | 1173 | dyn_shape = tf.shape(tensor) 1174 | for index in non_static_indexes: 1175 | shape[index] = dyn_shape[index] 1176 | return shape 1177 | 1178 | 1179 | def reshape_to_matrix(input_tensor): 1180 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 1181 | ndims = input_tensor.shape.ndims 1182 | if ndims < 2: 1183 | raise ValueError("Input tensor must have at least rank 2. 
Shape = %s" % 1184 | (input_tensor.shape)) 1185 | if ndims == 2: 1186 | return input_tensor 1187 | 1188 | width = input_tensor.shape[-1] 1189 | output_tensor = tf.reshape(input_tensor, [-1, width]) 1190 | return output_tensor 1191 | 1192 | 1193 | def reshape_from_matrix(output_tensor, orig_shape_list): 1194 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 1195 | if len(orig_shape_list) == 2: 1196 | return output_tensor 1197 | 1198 | output_shape = get_shape_list(output_tensor) 1199 | 1200 | orig_dims = orig_shape_list[0:-1] 1201 | width = output_shape[-1] 1202 | 1203 | return tf.reshape(output_tensor, orig_dims + [width]) 1204 | 1205 | 1206 | def assert_rank(tensor, expected_rank, name=None): 1207 | """Raises an exception if the tensor rank is not of the expected rank. 1208 | 1209 | Args: 1210 | tensor: A tf.Tensor to check the rank of. 1211 | expected_rank: Python integer or list of integers, expected rank. 1212 | name: Optional name of the tensor for the error message. 1213 | 1214 | Raises: 1215 | ValueError: If the expected shape doesn't match the actual shape. 1216 | """ 1217 | if name is None: 1218 | name = tensor.name 1219 | 1220 | expected_rank_dict = {} 1221 | if isinstance(expected_rank, six.integer_types): 1222 | expected_rank_dict[expected_rank] = True 1223 | else: 1224 | for x in expected_rank: 1225 | expected_rank_dict[x] = True 1226 | 1227 | actual_rank = tensor.shape.ndims 1228 | if actual_rank not in expected_rank_dict: 1229 | scope_name = tf.get_variable_scope().name 1230 | raise ValueError( 1231 | "For the tensor `%s` in scope `%s`, the actual rank " 1232 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 1233 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 1234 | 1235 | def prelln_transformer_model(input_tensor, 1236 | attention_mask=None, 1237 | hidden_size=768, 1238 | num_hidden_layers=12, 1239 | num_attention_heads=12, 1240 | intermediate_size=3072, 1241 | intermediate_act_fn=gelu, 1242 | hidden_dropout_prob=0.1, 1243 | attention_probs_dropout_prob=0.1, 1244 | initializer_range=0.02, 1245 | do_return_all_layers=False, 1246 | shared_type='all', # None, 1247 | adapter_fn=None): 1248 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 1249 | 1250 | This is almost an exact implementation of the original Transformer encoder. 1251 | 1252 | See the original paper: 1253 | https://arxiv.org/abs/1706.03762 1254 | 1255 | Also see: 1256 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 1257 | 1258 | Args: 1259 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 1260 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 1261 | seq_length], with 1 for positions that can be attended to and 0 in 1262 | positions that should not be. 1263 | hidden_size: int. Hidden size of the Transformer. 1264 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 1265 | num_attention_heads: int. Number of attention heads in the Transformer. 1266 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 1267 | forward) layer. 1268 | intermediate_act_fn: function. The non-linear activation function to apply 1269 | to the output of the intermediate/feed-forward layer. 1270 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 1271 | attention_probs_dropout_prob: float. Dropout probability of the attention 1272 | probabilities. 1273 | initializer_range: float. 
Range of the initializer (stddev of truncated 1274 | normal). 1275 | do_return_all_layers: Whether to also return all layers or just the final 1276 | layer. 1277 | 1278 | Returns: 1279 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 1280 | hidden layer of the Transformer. 1281 | 1282 | Raises: 1283 | ValueError: A Tensor shape or parameter is invalid. 1284 | """ 1285 | if hidden_size % num_attention_heads != 0: 1286 | raise ValueError( 1287 | "The hidden size (%d) is not a multiple of the number of attention " 1288 | "heads (%d)" % (hidden_size, num_attention_heads)) 1289 | 1290 | attention_head_size = int(hidden_size / num_attention_heads) 1291 | 1292 | input_shape = bert_utils.get_shape_list(input_tensor, expected_rank=3) 1293 | batch_size = input_shape[0] 1294 | seq_length = input_shape[1] 1295 | input_width = input_shape[2] 1296 | 1297 | # The Transformer performs sum residuals on all layers so the input needs 1298 | # to be the same as the hidden size. 1299 | if input_width != hidden_size: 1300 | raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % 1301 | (input_width, hidden_size)) 1302 | 1303 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 1304 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 1305 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 1306 | # help the optimizer. 1307 | prev_output = bert_utils.reshape_to_matrix(input_tensor) 1308 | 1309 | all_layer_outputs = [] 1310 | 1311 | def layer_scope(idx, shared_type): 1312 | if shared_type == 'all': 1313 | tmp = { 1314 | "layer":"layer_shared", 1315 | 'attention':'attention', 1316 | 'intermediate':'intermediate', 1317 | 'output':'output' 1318 | } 1319 | elif shared_type == 'attention': 1320 | tmp = { 1321 | "layer":"layer_shared", 1322 | 'attention':'attention', 1323 | 'intermediate':'intermediate_{}'.format(idx), 1324 | 'output':'output_{}'.format(idx) 1325 | } 1326 | elif shared_type == 'ffn': 1327 | tmp = { 1328 | "layer":"layer_shared", 1329 | 'attention':'attention_{}'.format(idx), 1330 | 'intermediate':'intermediate', 1331 | 'output':'output' 1332 | } 1333 | else: 1334 | tmp = { 1335 | "layer":"layer_{}".format(idx), 1336 | 'attention':'attention', 1337 | 'intermediate':'intermediate', 1338 | 'output':'output' 1339 | } 1340 | 1341 | return tmp 1342 | 1343 | all_layer_outputs = [] 1344 | 1345 | for layer_idx in range(num_hidden_layers): 1346 | 1347 | idx_scope = layer_scope(layer_idx, shared_type) 1348 | 1349 | with tf.variable_scope(idx_scope['layer'], reuse=tf.AUTO_REUSE): 1350 | layer_input = prev_output 1351 | 1352 | with tf.variable_scope(idx_scope['attention'], reuse=tf.AUTO_REUSE): 1353 | attention_heads = [] 1354 | 1355 | with tf.variable_scope("output", reuse=tf.AUTO_REUSE): 1356 | layer_input_pre = layer_norm(layer_input) 1357 | 1358 | with tf.variable_scope("self"): 1359 | attention_head = attention_layer( 1360 | from_tensor=layer_input_pre, 1361 | to_tensor=layer_input_pre, 1362 | attention_mask=attention_mask, 1363 | num_attention_heads=num_attention_heads, 1364 | size_per_head=attention_head_size, 1365 | attention_probs_dropout_prob=attention_probs_dropout_prob, 1366 | initializer_range=initializer_range, 1367 | do_return_2d_tensor=True, 1368 | batch_size=batch_size, 1369 | from_seq_length=seq_length, 1370 | to_seq_length=seq_length) 1371 | attention_heads.append(attention_head) 1372 | 1373 | attention_output = None 1374 | if len(attention_heads) == 1: 1375 | attention_output = 
attention_heads[0] 1376 | else: 1377 | # In the case where we have other sequences, we just concatenate 1378 | # them to the self-attention head before the projection. 1379 | attention_output = tf.concat(attention_heads, axis=-1) 1380 | 1381 | # Run a linear projection of `hidden_size` then add a residual 1382 | # with `layer_input`. 1383 | with tf.variable_scope("output", reuse=tf.AUTO_REUSE): 1384 | attention_output = tf.layers.dense( 1385 | attention_output, 1386 | hidden_size, 1387 | kernel_initializer=create_initializer(initializer_range)) 1388 | attention_output = dropout(attention_output, hidden_dropout_prob) 1389 | 1390 | # attention_output = layer_norm(attention_output + layer_input) 1391 | attention_output = attention_output + layer_input 1392 | 1393 | with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE): 1394 | attention_output_pre = layer_norm(attention_output) 1395 | 1396 | # The activation is only applied to the "intermediate" hidden layer. 1397 | with tf.variable_scope(idx_scope['intermediate'], reuse=tf.AUTO_REUSE): 1398 | intermediate_output = tf.layers.dense( 1399 | attention_output_pre, 1400 | intermediate_size, 1401 | activation=intermediate_act_fn, 1402 | kernel_initializer=create_initializer(initializer_range)) 1403 | 1404 | # Down-project back to `hidden_size` then add the residual. 1405 | with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE): 1406 | layer_output = tf.layers.dense( 1407 | intermediate_output, 1408 | hidden_size, 1409 | kernel_initializer=create_initializer(initializer_range)) 1410 | layer_output = dropout(layer_output, hidden_dropout_prob) 1411 | 1412 | # layer_output = layer_norm(layer_output + attention_output) 1413 | layer_output = layer_output + attention_output 1414 | prev_output = layer_output 1415 | all_layer_outputs.append(layer_output) 1416 | 1417 | if do_return_all_layers: 1418 | final_outputs = [] 1419 | for layer_output in all_layer_outputs: 1420 | final_output = bert_utils.reshape_from_matrix(layer_output, input_shape) 1421 | final_outputs.append(final_output) 1422 | return final_outputs 1423 | else: 1424 | final_output = bert_utils.reshape_from_matrix(prev_output, input_shape) 1425 | return final_output 1426 | -------------------------------------------------------------------------------- /PyCLUE/utils/classifier_utils/optimization_finetuning.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
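# This module builds BERT's fine-tuning optimizer: a linearly warmed-up,
# linearly decayed learning rate (lr = init_lr * step / warmup while warming
# up, then init_lr * (1 - step / num_train_steps)) driving Adam-style moment
# estimates with decoupled weight decay and, unlike stock Adam, no bias
# correction. A runnable toy version of a single update step under those
# assumptions (NumPy; every name below is illustrative, not part of this
# module):
#
#   import numpy as np
#   param, grad = np.ones(4), np.full(4, 0.1)
#   m, v = np.zeros(4), np.zeros(4)
#   lr, b1, b2, eps, wd = 2e-5, 0.9, 0.999, 1e-6, 0.01
#   m = b1 * m + (1 - b1) * grad
#   v = b2 * v + (1 - b2) * grad ** 2
#   update = m / (np.sqrt(v) + eps) + wd * param  # decay bypasses m and v
#   param = param - lr * update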
15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
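# (The two lines below perform exactly that update: they compute
# `global_step + 1` and group the assignment with `train_op`, so the
# counter advances once per training step.)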
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 | 
86 | 
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 | 
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs an AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 | 
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 | 
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 | 
115 | param_name = self._get_variable_name(param.name)
116 | 
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 | 
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 | 
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 | 
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 | 
149 | update_with_lr = self.learning_rate * update
150 | 
151 | next_param = param - update_with_lr
152 | 
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 | 
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 | 
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 | 
--------------------------------------------------------------------------------
/PyCLUE/utils/classifier_utils/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import re 23 | import unicodedata 24 | import six 25 | import tensorflow as tf 26 | 27 | 28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): 29 | """Checks whether the casing config is consistent with the checkpoint name.""" 30 | 31 | # The casing has to be passed in by the user and there is no explicit check 32 | # as to whether it matches the checkpoint. The casing information probably 33 | # should have been stored in the bert_config.json file, but it's not, so 34 | # we have to heuristically detect it to validate. 35 | 36 | if not init_checkpoint: 37 | return 38 | 39 | m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) 40 | if m is None: 41 | return 42 | 43 | model_name = m.group(1) 44 | 45 | lower_models = [ 46 | "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", 47 | "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" 48 | ] 49 | 50 | cased_models = [ 51 | "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", 52 | "multi_cased_L-12_H-768_A-12" 53 | ] 54 | 55 | is_bad_config = False 56 | if model_name in lower_models and not do_lower_case: 57 | is_bad_config = True 58 | actual_flag = "False" 59 | case_name = "lowercased" 60 | opposite_flag = "True" 61 | 62 | if model_name in cased_models and do_lower_case: 63 | is_bad_config = True 64 | actual_flag = "True" 65 | case_name = "cased" 66 | opposite_flag = "False" 67 | 68 | if is_bad_config: 69 | raise ValueError( 70 | "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " 71 | "However, `%s` seems to be a %s model, so you " 72 | "should pass in `--do_lower_case=%s` so that the fine-tuning matches " 73 | "how the model was pre-training. If this error is wrong, please " 74 | "just comment out this check." % (actual_flag, init_checkpoint, 75 | model_name, case_name, opposite_flag)) 76 | 77 | 78 | def convert_to_unicode(text): 79 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 80 | if six.PY3: 81 | if isinstance(text, str): 82 | return text 83 | elif isinstance(text, bytes): 84 | return text.decode("utf-8", "ignore") 85 | else: 86 | raise ValueError("Unsupported string type: %s" % (type(text))) 87 | elif six.PY2: 88 | if isinstance(text, str): 89 | return text.decode("utf-8", "ignore") 90 | elif isinstance(text, unicode): 91 | return text 92 | else: 93 | raise ValueError("Unsupported string type: %s" % (type(text))) 94 | else: 95 | raise ValueError("Not running on Python2 or Python 3?") 96 | 97 | 98 | def printable_text(text): 99 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 100 | 101 | # These functions want `str` for both Python2 and Python3, but in one case 102 | # it's a Unicode string and in the other it's a byte string. 
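# For example, under Python 3 (assumed inputs, shown for illustration):
#
#   convert_to_unicode(b"\xe4\xb8\xad")  # -> "中" (utf-8 bytes decoded)
#   convert_to_unicode("中")             # -> "中" (already unicode)
#   printable_text(b"abc")               # -> "abc"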
103 | if six.PY3: 104 | if isinstance(text, str): 105 | return text 106 | elif isinstance(text, bytes): 107 | return text.decode("utf-8", "ignore") 108 | else: 109 | raise ValueError("Unsupported string type: %s" % (type(text))) 110 | elif six.PY2: 111 | if isinstance(text, str): 112 | return text 113 | elif isinstance(text, unicode): 114 | return text.encode("utf-8") 115 | else: 116 | raise ValueError("Unsupported string type: %s" % (type(text))) 117 | else: 118 | raise ValueError("Not running on Python2 or Python 3?") 119 | 120 | 121 | def load_vocab(vocab_file): 122 | """Loads a vocabulary file into a dictionary.""" 123 | vocab = collections.OrderedDict() 124 | index = 0 125 | with tf.gfile.GFile(vocab_file, "r") as reader: 126 | while True: 127 | token = convert_to_unicode(reader.readline()) 128 | if not token: 129 | break 130 | token = token.strip() 131 | vocab[token] = index 132 | index += 1 133 | return vocab 134 | 135 | 136 | def convert_by_vocab(vocab, items): 137 | """Converts a sequence of [tokens|ids] using the vocab.""" 138 | output = [] 139 | #print("items:",items) #['[CLS]', '日', '##期', ',', '但', '被', '##告', '金', '##东', '##福', '载', '##明', '[MASK]', 'U', '##N', '##K', ']', '保', '##证', '本', '##月', '1', '##4', '[MASK]', '到', '##位', ',', '2', '##0', '##1', '##5', '年', '6', '[MASK]', '1', '##1', '日', '[', 'U', '##N', '##K', ']', ',', '原', '##告', '[MASK]', '认', '##可', '于', '2', '##0', '##1', '##5', '[MASK]', '6', '月', '[MASK]', '[MASK]', '日', '##向', '被', '##告', '主', '##张', '权', '##利', '。', '而', '[MASK]', '[MASK]', '自', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '年', '6', '月', '1', '##1', '日', '[SEP]', '原', '##告', '于', '2', '##0', '##1', '##6', '[MASK]', '6', '[MASK]', '2', '##4', '日', '起', '##诉', ',', '主', '##张', '保', '##证', '责', '##任', ',', '已', '超', '##过', '保', '##证', '期', '##限', '[MASK]', '保', '##证', '人', '依', '##法', '不', '##再', '承', '##担', '保', '##证', '[MASK]', '[MASK]', '[MASK]', '[SEP]'] 140 | for i,item in enumerate(items): 141 | #print(i,"item:",item) # ##期 142 | output.append(vocab[item]) 143 | return output 144 | 145 | 146 | def convert_tokens_to_ids(vocab, tokens): 147 | return convert_by_vocab(vocab, tokens) 148 | 149 | 150 | def convert_ids_to_tokens(inv_vocab, ids): 151 | return convert_by_vocab(inv_vocab, ids) 152 | 153 | 154 | def whitespace_tokenize(text): 155 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 156 | text = text.strip() 157 | if not text: 158 | return [] 159 | tokens = text.split() 160 | return tokens 161 | 162 | 163 | class FullTokenizer(object): 164 | """Runs end-to-end tokenziation.""" 165 | 166 | def __init__(self, vocab_file, do_lower_case=True): 167 | self.vocab = load_vocab(vocab_file) 168 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 169 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 170 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 171 | 172 | def tokenize(self, text): 173 | split_tokens = [] 174 | for token in self.basic_tokenizer.tokenize(text): 175 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 176 | split_tokens.append(sub_token) 177 | 178 | return split_tokens 179 | 180 | def convert_tokens_to_ids(self, tokens): 181 | return convert_by_vocab(self.vocab, tokens) 182 | 183 | def convert_ids_to_tokens(self, ids): 184 | return convert_by_vocab(self.inv_vocab, ids) 185 | 186 | 187 | class BasicTokenizer(object): 188 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 189 | 190 | def __init__(self, do_lower_case=True): 191 | 
"""Constructs a BasicTokenizer. 192 | 193 | Args: 194 | do_lower_case: Whether to lower case the input. 195 | """ 196 | self.do_lower_case = do_lower_case 197 | 198 | def tokenize(self, text): 199 | """Tokenizes a piece of text.""" 200 | text = convert_to_unicode(text) 201 | text = self._clean_text(text) 202 | 203 | # This was added on November 1st, 2018 for the multilingual and Chinese 204 | # models. This is also applied to the English models now, but it doesn't 205 | # matter since the English models were not trained on any Chinese data 206 | # and generally don't have any Chinese data in them (there are Chinese 207 | # characters in the vocabulary because Wikipedia does have some Chinese 208 | # words in the English Wikipedia.). 209 | text = self._tokenize_chinese_chars(text) 210 | 211 | orig_tokens = whitespace_tokenize(text) 212 | split_tokens = [] 213 | for token in orig_tokens: 214 | if self.do_lower_case: 215 | token = token.lower() 216 | token = self._run_strip_accents(token) 217 | split_tokens.extend(self._run_split_on_punc(token)) 218 | 219 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 220 | return output_tokens 221 | 222 | def _run_strip_accents(self, text): 223 | """Strips accents from a piece of text.""" 224 | text = unicodedata.normalize("NFD", text) 225 | output = [] 226 | for char in text: 227 | cat = unicodedata.category(char) 228 | if cat == "Mn": 229 | continue 230 | output.append(char) 231 | return "".join(output) 232 | 233 | def _run_split_on_punc(self, text): 234 | """Splits punctuation on a piece of text.""" 235 | chars = list(text) 236 | i = 0 237 | start_new_word = True 238 | output = [] 239 | while i < len(chars): 240 | char = chars[i] 241 | if _is_punctuation(char): 242 | output.append([char]) 243 | start_new_word = True 244 | else: 245 | if start_new_word: 246 | output.append([]) 247 | start_new_word = False 248 | output[-1].append(char) 249 | i += 1 250 | 251 | return ["".join(x) for x in output] 252 | 253 | def _tokenize_chinese_chars(self, text): 254 | """Adds whitespace around any CJK character.""" 255 | output = [] 256 | for char in text: 257 | cp = ord(char) 258 | if self._is_chinese_char(cp): 259 | output.append(" ") 260 | output.append(char) 261 | output.append(" ") 262 | else: 263 | output.append(char) 264 | return "".join(output) 265 | 266 | def _is_chinese_char(self, cp): 267 | """Checks whether CP is the codepoint of a CJK character.""" 268 | # This defines a "chinese character" as anything in the CJK Unicode block: 269 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 270 | # 271 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 272 | # despite its name. The modern Korean Hangul alphabet is a different block, 273 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 274 | # space-separated words, so they are not treated specially and handled 275 | # like the all of the other languages. 
276 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
277 | (cp >= 0x3400 and cp <= 0x4DBF) or  #
278 | (cp >= 0x20000 and cp <= 0x2A6DF) or  #
279 | (cp >= 0x2A700 and cp <= 0x2B73F) or  #
280 | (cp >= 0x2B740 and cp <= 0x2B81F) or  #
281 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
282 | (cp >= 0xF900 and cp <= 0xFAFF) or  #
283 | (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
284 | return True
285 | 
286 | return False
287 | 
288 | def _clean_text(self, text):
289 | """Performs invalid character removal and whitespace cleanup on text."""
290 | output = []
291 | for char in text:
292 | cp = ord(char)
293 | if cp == 0 or cp == 0xfffd or _is_control(char):
294 | continue
295 | if _is_whitespace(char):
296 | output.append(" ")
297 | else:
298 | output.append(char)
299 | return "".join(output)
300 | 
301 | 
302 | class WordpieceTokenizer(object):
303 | """Runs WordPiece tokenization."""
304 | 
305 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
306 | self.vocab = vocab
307 | self.unk_token = unk_token
308 | self.max_input_chars_per_word = max_input_chars_per_word
309 | 
310 | def tokenize(self, text):
311 | """Tokenizes a piece of text into its word pieces.
312 | 
313 | This uses a greedy longest-match-first algorithm to perform tokenization
314 | using the given vocabulary.
315 | 
316 | For example:
317 | input = "unaffable"
318 | output = ["un", "##aff", "##able"]
319 | 
320 | Args:
321 | text: A single token or whitespace separated tokens. This should have
322 | already been passed through `BasicTokenizer`.
323 | 
324 | Returns:
325 | A list of wordpiece tokens.
326 | """
327 | 
328 | text = convert_to_unicode(text)
329 | 
330 | output_tokens = []
331 | for token in whitespace_tokenize(text):
332 | chars = list(token)
333 | if len(chars) > self.max_input_chars_per_word:
334 | output_tokens.append(self.unk_token)
335 | continue
336 | 
337 | is_bad = False
338 | start = 0
339 | sub_tokens = []
340 | while start < len(chars):
341 | end = len(chars)
342 | cur_substr = None
343 | while start < end:
344 | substr = "".join(chars[start:end])
345 | if start > 0:
346 | substr = "##" + substr
347 | if substr in self.vocab:
348 | cur_substr = substr
349 | break
350 | end -= 1
351 | if cur_substr is None:
352 | is_bad = True
353 | break
354 | sub_tokens.append(cur_substr)
355 | start = end
356 | 
357 | if is_bad:
358 | output_tokens.append(self.unk_token)
359 | else:
360 | output_tokens.extend(sub_tokens)
361 | return output_tokens
362 | 
363 | 
364 | def _is_whitespace(char):
365 | """Checks whether `char` is a whitespace character."""
366 | # \t, \n, and \r are technically control characters but we treat them
367 | # as whitespace since they are generally considered as such.
368 | if char == " " or char == "\t" or char == "\n" or char == "\r":
369 | return True
370 | cat = unicodedata.category(char)
371 | if cat == "Zs":
372 | return True
373 | return False
374 | 
375 | 
376 | def _is_control(char):
377 | """Checks whether `char` is a control character."""
378 | # These are technically control characters but we count them as whitespace
379 | # characters.
380 | if char == "\t" or char == "\n" or char == "\r":
381 | return False
382 | cat = unicodedata.category(char)
383 | if cat in ("Cc", "Cf"):
384 | return True
385 | return False
386 | 
387 | 
388 | def _is_punctuation(char):
389 | """Checks whether `char` is a punctuation character."""
390 | cp = ord(char)
391 | # We treat all non-letter/number ASCII as punctuation.
392 | # Characters such as "^", "$", and "`" are not in the Unicode 393 | # Punctuation class but we treat them as punctuation anyways, for 394 | # consistency. 395 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 396 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 397 | return True 398 | cat = unicodedata.category(char) 399 | if cat.startswith("P"): 400 | return True 401 | return False 402 | -------------------------------------------------------------------------------- /PyCLUE/utils/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/PyCLUE/utils/configs/__init__.py -------------------------------------------------------------------------------- /PyCLUE/utils/configs/data_configs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Liu Shaoweihua 3 | # @Date: 2019-12-04 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | from ..classifier_utils.core import ClassificationProcessor, InputExample 10 | from ..classifier_utils import tokenization 11 | 12 | 13 | class CmnliProcessor(ClassificationProcessor): 14 | 15 | def _create_examples(self, lines, set_type): 16 | """Creates examples for the training and dev sets.""" 17 | examples = [] 18 | if self.ignore_header: 19 | lines = lines[1:] 20 | if self.min_seq_length: 21 | lines = [line for line in lines if len(line) >= self.min_seq_length] 22 | for i, line in enumerate(lines): 23 | guid = "%s-%s" %(set_type, i) 24 | try: 25 | if set_type == "train": 26 | label = tokenization.convert_to_unicode(line["gold_"+self.label_column]) 27 | elif set_type == "dev": 28 | label = tokenization.convert_to_unicode(line[self.label_column]) 29 | elif set_type == "test": 30 | label = self.labels[0] 31 | text_a = tokenization.convert_to_unicode(line[self.text_a_column]) 32 | text_b = None if not self.text_b_column else tokenization.convert_to_unicode(line[self.text_b_column]) 33 | examples.append( 34 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label) 35 | ) 36 | except Exception: 37 | print("### Error {}: {}".format(i, line)) 38 | return examples 39 | 40 | class CslProcessor(ClassificationProcessor): 41 | 42 | def _create_examples(self, lines, set_type): 43 | """Creates examples for the training and dev sets.""" 44 | examples = [] 45 | if self.ignore_header: 46 | lines = lines[1:] 47 | if self.min_seq_length: 48 | lines = [line for line in lines if len(line) >= self.min_seq_length] 49 | for i, line in enumerate(lines): 50 | guid = "%s-%s" %(set_type, i) 51 | try: 52 | label = tokenization.convert_to_unicode(line[self.label_column]) if set_type != "test" else self.labels[0] 53 | text_a = tokenization.convert_to_unicode(" ".join(line[self.text_a_column])) 54 | text_b = None if not self.text_b_column else tokenization.convert_to_unicode(line[self.text_b_column]) 55 | examples.append( 56 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label) 57 | ) 58 | except Exception: 59 | print("### Error {}: {}".format(i, line)) 60 | return examples 61 | 62 | 63 | class WscProcessor(ClassificationProcessor): 64 | 65 | def _create_examples(self, lines, set_type): 66 | examples = [] 67 | if self.ignore_header: 68 | lines = lines[1:] 69 | if self.min_seq_length: 70 | lines = [line for line in lines if len(line) >= self.min_seq_length] 
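# The loop below makes both target spans visible to the model inside
# `text_a`: the candidate antecedent (`span1_text`) is wrapped in
# underscores and the pronoun (`span2_text`) in square brackets, e.g.
# "_毛德和朵拉_看到火车...当[它们]行进到近处时..." (illustrative). The two
# insertion orders below keep the later span's index valid after the
# earlier span's markers have been inserted.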
71 | for i, line in enumerate(lines): 72 | guid = "%s-%s" %(set_type, i) 73 | try: 74 | label = tokenization.convert_to_unicode(line[self.label_column]) if set_type != "test" else self.labels[0] 75 | text_a = tokenization.convert_to_unicode(line[self.text_a_column]) 76 | text_b = None if not self.text_b_column else tokenization.convert_to_unicode(line[self.text_b_column]) 77 | text_a_list = list(text_a) 78 | target = line["target"] 79 | query = target["span1_text"] 80 | query_idx = target["span1_index"] 81 | pronoun = target["span2_text"] 82 | pronoun_idx = target["span2_index"] 83 | 84 | assert text_a[pronoun_idx:(pronoun_idx + len(pronoun))] == pronoun, "pronoun: {}".format(pronoun) 85 | assert text_a[query_idx: (query_idx + len(query))] == query, "query: {}".format(query) 86 | 87 | if pronoun_idx > query_idx: 88 | text_a_list.insert(query_idx, "_") 89 | text_a_list.insert(query_idx + len(query) + 1, "_") 90 | text_a_list.insert(pronoun_idx + 2, "[") 91 | text_a_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]") 92 | else: 93 | text_a_list.insert(pronoun_idx, "[") 94 | text_a_list.insert(pronoun_idx + len(pronoun) + 1, "]") 95 | text_a_list.insert(query_idx + 2, "_") 96 | text_a_list.insert(query_idx + len(query) + 2 + 1, "_") 97 | 98 | text_a = "".join(text_a_list) 99 | examples.append( 100 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label) 101 | ) 102 | except Exception as e: 103 | print("### Error {}: {}".format(i, line)) 104 | return examples 105 | 106 | 107 | class CopaProcessor(ClassificationProcessor): 108 | 109 | def __init__(self, labels, label_column, ignore_header=False, min_seq_length=None, file_type="json", delimiter=None): 110 | self.language = "zh" 111 | self.labels = labels 112 | self.label_column = label_column 113 | self.ignore_header = ignore_header 114 | self.min_seq_length = min_seq_length 115 | self.file_type = file_type 116 | self.delimiter = delimiter 117 | 118 | def _create_examples(self, lines, set_type): 119 | """Creates examples for the training and dev sets.""" 120 | examples = [] 121 | if self.ignore_header: 122 | lines = lines[1:] 123 | if self.min_seq_length: 124 | lines = [line for line in lines if len(line) >= self.min_seq_length] 125 | for i, line in enumerate(lines): 126 | i = 2*i 127 | guid0 = "%s-%s" %(set_type, i) 128 | guid1 = "%s-%s" %(set_type, i+1) 129 | try: 130 | premise = tokenization.convert_to_unicode(line["premise"]) 131 | choice0 = tokenization.convert_to_unicode(line["choice0"]) 132 | label0 = tokenization.convert_to_unicode(str(1 if line[self.label_column] == 0 else 0)) if set_type != "test" else self.labels[0] 133 | choice1 = tokenization.convert_to_unicode(line["choice1"]) 134 | label1 = tokenization.convert_to_unicode(str(0 if line[self.label_column] == 0 else 1)) if set_type != "test" else self.labels[0] 135 | if line["question"] == "effect": 136 | text_a0 = premise 137 | text_b0 = choice0 138 | text_a1 = premise 139 | text_b1 = choice1 140 | elif line["question"] == "cause": 141 | text_a0 = choice0 142 | text_b0 = premise 143 | text_a1 = choice1 144 | text_b1 = premise 145 | else: 146 | raise Exception 147 | examples.append( 148 | InputExample(guid=guid0, text_a=text_a0, text_b=text_b0, label=label0) 149 | ) 150 | examples.append( 151 | InputExample(guid=guid1, text_a=text_a1, text_b=text_b1, label=label1) 152 | ) 153 | except Exception as e: 154 | print("### Error {}: {}".format(i, line)) 155 | return examples 156 | 157 | 158 | DATA_URLS = { 159 | # chineseGLUE txt Version 160 | "bq": 
"https://storage.googleapis.com/chineseglue/toolkitTasks/bq.zip", # 161 | "xnli": "https://storage.googleapis.com/chineseglue/toolkitTasks/xnli.zip", 162 | "lcqmc": "https://storage.googleapis.com/chineseglue/toolkitTasks/lcqmc.zip", 163 | "inews": "https://storage.googleapis.com/chineseglue/toolkitTasks/inews.zip", 164 | "thucnews": "https://storage.googleapis.com/chineseglue/toolkitTasks/thucnews.zip", 165 | # CLUE json Version 166 | "afqmc": "https://storage.googleapis.com/cluebenchmark/tasks/afqmc_public.zip", 167 | "cmnli": "https://storage.googleapis.com/cluebenchmark/tasks/cmnli_public.zip", 168 | "copa": "https://storage.googleapis.com/cluebenchmark/tasks/copa_public.zip", 169 | "csl": "https://storage.googleapis.com/cluebenchmark/tasks/csl_public.zip", 170 | "iflytek": "https://storage.googleapis.com/cluebenchmark/tasks/iflytek_public.zip", 171 | "tnews": "https://storage.googleapis.com/cluebenchmark/tasks/tnews_public.zip", 172 | "wsc": "https://storage.googleapis.com/cluebenchmark/tasks/wsc_public.zip" 173 | } 174 | 175 | 176 | DATA_PROCESSORS = { 177 | # chineseGLUE txt Version 178 | "bq": ClassificationProcessor( 179 | labels = ["0", "1"], 180 | label_column = 2, 181 | text_a_column = 0, 182 | text_b_column = 1, 183 | file_type = "txt", 184 | delimiter = "_!_" 185 | ), 186 | "xnli": ClassificationProcessor( 187 | labels = ["0", "1", "2"], 188 | label_column = 2, 189 | text_a_column = 0, 190 | text_b_column = 1, 191 | file_type = "txt", 192 | delimiter = "_!_" 193 | ), 194 | "lcqmc": ClassificationProcessor( 195 | labels = ["0", "1"], 196 | label_column = 2, 197 | text_a_column = 0, 198 | text_b_column = 1, 199 | file_type = "txt", 200 | delimiter = "_!_" 201 | ), 202 | "inews": ClassificationProcessor( 203 | labels = ["0", "1", "2"], 204 | label_column = 0, 205 | text_a_column = 2, 206 | text_b_column = 3, 207 | file_type = "txt", 208 | delimiter = "_!_" 209 | ), 210 | "thucnews": ClassificationProcessor( 211 | labels = [str(i) for i in range(14)], 212 | label_column = 0, 213 | text_a_column = 3, 214 | text_b_column = None, 215 | file_type = "txt", 216 | delimiter = "_!_", 217 | min_seq_length = 3 218 | ), 219 | # CLUE json Version 220 | "afqmc": ClassificationProcessor( 221 | labels = ["0", "1"], 222 | label_column = "label", 223 | text_a_column = "sentence1", 224 | text_b_column = "sentence2", 225 | file_type = "json", 226 | delimiter = None 227 | ), 228 | "iflytek": ClassificationProcessor( 229 | labels = [str(i) for i in range(119)], 230 | label_column = "label", 231 | text_a_column = "sentence", 232 | text_b_column = None, 233 | file_type = "json", 234 | delimiter = None 235 | ), 236 | "tnews": ClassificationProcessor( 237 | labels = [str(100 + i) for i in range(17) if i != 5 and i != 11], 238 | label_column = "label", 239 | text_a_column = "sentence", 240 | text_b_column = None, 241 | file_type = "json", 242 | delimiter = None 243 | ), 244 | "wsc": WscProcessor( 245 | labels = ["true", "false"], 246 | label_column = "label", 247 | text_a_column = "text", 248 | text_b_column = None, 249 | file_type = "json", 250 | delimiter = None 251 | ), 252 | "copa": CopaProcessor( 253 | labels = ["0", "1"], 254 | label_column = "label", 255 | file_type = "json", 256 | delimiter = None 257 | ), 258 | "csl": CslProcessor( 259 | labels = ["0", "1"], 260 | label_column = "label", 261 | text_a_column = "keyword", 262 | text_b_column = "abst", 263 | file_type = "json", 264 | delimiter = None 265 | ), 266 | "cmnli": CmnliProcessor( 267 | labels = ["contradiction", "entailment", "neutral"], 
268 | label_column = "label", 269 | text_a_column = "sentence1", 270 | text_b_column = "sentence2", 271 | file_type = "json", 272 | delimiter = None 273 | ) 274 | } -------------------------------------------------------------------------------- /PyCLUE/utils/configs/model_configs.py: -------------------------------------------------------------------------------- 1 | PRETRAINED_LM_DICT = { 2 | "bert": "chinese_L-12_H-768_A-12", 3 | "bert_wwm_ext": "", 4 | "albert_xlarge": "", 5 | "albert_large": "", 6 | "albert_base": "", 7 | "albert_base_ext": "", 8 | "albert_small": "", 9 | "albert_tiny": "", 10 | "roberta": "", 11 | "roberta_wwm_ext": "", 12 | "roberta_wwm_ext_large": "" 13 | } 14 | 15 | PRETRAINED_LM_CONFIG = { 16 | "bert": "bert_config.json", 17 | "bert_wwm_ext": "bert_config.json", 18 | "albert_xlarge": "albert_config_xlarge.json", 19 | "albert_large": "albert_config_large.json", 20 | "albert_base": "albert_config_base.json", 21 | "albert_base_ext": "albert_config_base.json", 22 | "albert_small": "albert_config_small_google.json", 23 | "albert_tiny": "albert_config_tiny_g.json", 24 | "roberta": "bert_config_large.json", 25 | "roberta_wwm_ext": "bert_config.json", 26 | "roberta_wwm_ext_large": "bert_config.json" 27 | 28 | } 29 | 30 | PRETRAINED_LM_CKPT = { 31 | "bert": "bert_model.ckpt", 32 | "bert_wwm_ext": "bert_model.ckpt", 33 | "albert_xlarge": "albert_model.ckpt", 34 | "albert_large": "albert_model.ckpt", 35 | "albert_base": "albert_model.ckpt", 36 | "albert_base_ext": "albert_model.ckpt", 37 | "albert_small": "albert_model.ckpt", 38 | "albert_tiny": "albert_model.ckpt", 39 | "roberta": "roberta_zh_large_model.ckpt", 40 | "roberta_wwm_ext": "bert_model.ckpt", 41 | "roberta_wwm_ext_large": "bert_model.ckpt" 42 | } 43 | 44 | PRETRAINED_LM_URLS = { 45 | "bert": "https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip", 46 | "bert_wwm_ext": "https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip", 47 | "albert_xlarge": "https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip", 48 | "albert_large": "https://storage.googleapis.com/albert_zh/albert_large_zh.zip", 49 | "albert_base": "https://storage.googleapis.com/albert_zh/albert_base_zh.zip", 50 | "albert_base_ext": "https://storage.googleapis.com/albert_zh/albert_base_zh_additional_36k_steps.zip", 51 | "albert_small": "https://storage.googleapis.com/albert_zh/albert_small_zh_google.zip", 52 | "albert_tiny": "https://storage.googleapis.com/albert_zh/albert_tiny_zh_google.zip", 53 | "roberta": "https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip", 54 | "roberta_wwm_ext": "https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip", 55 | "roberta_wwm_ext_large": "https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip" 56 | } -------------------------------------------------------------------------------- /PyCLUE/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Liu Shaoweihua 3 | # @Date: 2019-11-15 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import time 11 | import codecs 12 | import shutil 13 | import zipfile 14 | import requests 15 | 16 | 17 | __all__ = [ 18 | "wget", "unzip", "rm", "mkdir", "rmdir", "mv" 19 | ] 20 | 21 | 22 
--------------------------------------------------------------------------------
/PyCLUE/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Liu Shaoweihua
3 | # @Date: 2019-11-15
4 | 
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 | 
9 | import os
10 | import time
11 | import codecs
12 | import shutil
13 | import zipfile
14 | import requests
15 | 
16 | 
17 | __all__ = [
18 |     "wget", "unzip", "rm", "mkdir", "rmdir", "mv"
19 | ]
20 | 
21 | 
22 | _CURRENT_FILE = os.path.dirname(__file__)
23 | 
24 | 
25 | def wget(url, save_path=None, rename=None):
26 |     current_path = os.getcwd()
27 |     file_name = url[url.rfind("/")+1:]
28 |     if not save_path:
29 |         save_path = current_path
30 |     if not rename:
31 |         rename = file_name
32 |     save_path = os.path.abspath(os.path.join(save_path, rename))
33 |     print("[wget] downloading from {}".format(url))
34 |     start = time.time()
35 |     size = 0
36 |     response = requests.get(url, stream=True)
37 |     chunk_size = 10240
38 |     content_size = int(response.headers["content-length"])
39 |     if response.status_code == 200:
40 |         print("[wget] file size: %.2f MB" %(content_size / 1024 / 1024))
41 |         with codecs.open(save_path, "wb") as f:
42 |             for data in response.iter_content(chunk_size=chunk_size):
43 |                 f.write(data)
44 |                 size += len(data)
45 |                 print("\r"+"[wget] %s%.2f%%"
46 |                       %(">"*int(size*50/content_size), float(size/content_size*100)), end="")
47 |     end = time.time()
48 |     print("\n"+"[wget] complete! cost: %.2fs."%(end-start))
49 |     print("[wget] save at: %s" %save_path)
50 |     return save_path
51 | 
52 | 
53 | def unzip(file_path, save_path=None):
54 |     if not save_path:
55 |         save_path = os.path.abspath("/".join(os.path.abspath(file_path).split("/")[:-1]))
56 |     with zipfile.ZipFile(file_path) as zf:
57 |         zf.extractall(save_path)
58 |     print("[unzip] file path: {}, save at {}".format(file_path, save_path))
59 |     return save_path
60 | 
61 | 
62 | def rm(file_path):
63 |     file_path = os.path.abspath(file_path)
64 |     os.remove(file_path)
65 |     print("[remove] file path {}".format(file_path))
66 |     return
67 | 
68 | 
69 | def mkdir(file_path):
70 |     file_path = os.path.abspath(file_path)
71 |     os.makedirs(file_path)
72 |     print("[mkdir] create directory {}".format(file_path))
73 |     return file_path
74 | 
75 | 
76 | def rmdir(file_path):
77 |     file_path = os.path.abspath(file_path)
78 |     shutil.rmtree(file_path)
79 |     print("[rmdir] remove directory {}".format(file_path))
80 |     return
81 | 
82 | 
83 | def mv(from_file_path, to_file_path):
84 |     from_file_path = os.path.abspath(from_file_path)
85 |     to_file_path = os.path.abspath(to_file_path)
86 |     os.rename(from_file_path, to_file_path)
87 |     print("[move] move file from {} to {}".format(from_file_path, to_file_path))
88 |     return
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyCLUE
2 | 
3 | Python toolkit for Chinese Language Understanding Evaluation benchmark.
4 | 
5 | A Python toolkit for the Chinese Language Understanding Evaluation (CLUE) benchmark: quickly evaluate the representative datasets and baseline (pre-trained) models, and pick a suitable baseline (pre-trained) model for rapid application on your own data.
6 | 
7 | ## Installing PyCLUE
8 | 
9 | PyCLUE can now be installed via pip:
10 | 
11 | ```bash
12 | pip install PyCLUE
13 | ```
14 | 
15 | ## Using PyCLUE
16 | 
17 | ### Classification / sentence-pair tasks
18 | 
19 | #### Quick evaluation on CLUE datasets
20 | 
21 | The following example runs on CPU/GPU; see `PyCLUE/examples/classifications/run_clue_task.py` for the complete example.
22 | 
23 | For running on TPU, refer to `PyCLUE/examples/classifications/run_clue_task_tpu.py`.
24 | 
25 | ```python
26 | # Specify the GPUs to use; leave unset if no GPU is available
27 | import os
28 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
29 | 
30 | # Import the components for classification / sentence-pair evaluation tasks
31 | from PyCLUE.tasks.run_classifier import clue_tasks, configs
32 | 
33 | configs["task_name"] = "wsc"
34 | configs["pretrained_lm_name"] = "bert"
35 | 
36 | result = clue_tasks(configs)
37 | print(result)
38 | ```
39 | 
40 | Here, `clue_tasks` takes the evaluation task's configuration as a `dict`; that `dict` is `configs`, whose fields are as follows:
41 | 
42 | 1. **task_name**
43 | 
44 |    CLUE benchmark: afqmc, cmnli, copa, csl, iflytek, tnews, wsc
45 | 
46 |    chineseGLUE: bq, xnli, lcqmc, inews, thucnews
47 | 
48 | 2. **pretrained_lm_name**
49 | 
50 |    bert, bert_wwm_ext, albert_xlarge, albert_large, albert_base, albert_base_ext, albert_small, albert_tiny, roberta, roberta_wwm_ext, roberta_wwm_ext_large
51 | 
52 |    If this parameter is specified, there is no need to set vocab_file/bert_config_file/init_checkpoint.
53 | 
54 | 3. For the remaining parameters, see `default_configs`, `TaskConfigs`, and `UserConfigs` in `PyCLUE/utils/classifier_utils/core.py`.
55 | 
56 | The evaluation result `result` has the following form:
57 | 
58 | ```python
59 | {
60 |     # Dev set metrics
61 |     "dev_res":{
62 |         "eval_accuracy": "",
63 |         "eval_loss": "",
64 |         "global_step": "",
65 |         "loss": ""
66 |     },
67 |     # Test set metrics (some test sets have labels, so these numbers are meaningful; others have none, so they are not)
68 |     "test_res":{
69 |         "eval_accuracy": "",
70 |         "eval_loss": "",
71 |         "global_step": "",
72 |         "loss": ""
73 |     },
74 |     # Test set predictions
75 |     "test_outputs": [
76 |         {
77 |             "guid": "test-0",
78 |             "text_a": "_毛德和朵拉_看到火车冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的轰鸣声和狂野而清晰的汽笛声。当[它们]行进到近处时,马都跑开了。",
79 |             "text_b": None,
80 |             "label": "false"
81 |         },
82 |         ...
83 |     ]
84 | }
85 | ```
86 | 
87 | The results are also saved under the output directory given in `configs`, `${output_dir}/classifications/${task_name}/${pretrained_lm_name}`; in this example, `PyCLUE/task_outputs/classifications/wsc/bert`.
88 | 
89 | There, `dev_results.txt` holds the dev set metrics, `test_results.txt` holds the test set metrics (again, only meaningful for test sets that have labels), and `test_results.tsv` holds the test set predictions, in the following form:
90 | 
91 | ```python
92 | {"guid": "test-0", "text_a": "_毛德和朵拉_看到火车冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的轰鸣声和狂野而清晰的汽笛声。当[它们]行进到近处时,马都跑开了。", "text_b": null, "label": "false"}
93 | {"guid": "test-1", "text_a": "毛德和朵拉看到_火车_冲过大草原,引擎上冒着滚滚黑烟。从远处就能听见它们的轰鸣声和狂野而清晰的汽笛声。当[它们]行进到近处时,马都跑开了。", "text_b": null, "label": "false"}
94 | ```
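Because each line of `test_results.tsv` is a self-contained JSON object, as shown above, the predictions are easy to post-process. A minimal sketch using only the standard library; the path is illustrative and follows the `${output_dir}/classifications/${task_name}/${pretrained_lm_name}` layout described above:

```python
import json
from collections import Counter

# Illustrative path, following the output layout described above.
path = "PyCLUE/task_outputs/classifications/wsc/bert/test_results.tsv"

with open(path, "r", encoding="utf-8") as f:
    predictions = [json.loads(line) for line in f if line.strip()]

print(len(predictions))                          # number of test examples
print(Counter(p["label"] for p in predictions))  # predicted label distribution
```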
configs["learning_rate"] = 2e-5 149 | configs["warmup_proportion"] = 0.1 150 | configs["num_train_epochs"] = 50 151 | 152 | # show training process 153 | configs["verbose"] = 0 154 | 155 | result = user_tasks(configs) 156 | print(result) 157 | ``` 158 | 159 | 结果的输出和保存形式与测评CLUE数据集时一致。 160 | 161 | ### 阅读理解任务 162 | 163 | #### 快速测评CLUE数据集 164 | 165 | 即将加入。 166 | 167 | #### 应用于自定义数据集 168 | 169 | 即将j加入。 170 | 171 | ### 命名实体识别任务 172 | 173 | #### 快速测评CLUE数据集 174 | 175 | 即将加入。 176 | 177 | #### 应用于自定义数据集 178 | 179 | 即将j加入。 180 | 181 | ## 基准(预训练)模型 182 | 183 | **现已支持以下模型:** 184 | 185 | 1. [BERT-base](https://github.com/google-research/bert) 186 | 2. [BERT-wwm-ext](https://github.com/ymcui/Chinese-BERT-wwm) 187 | 3. [albert_xlarge](https://github.com/brightmart/albert_zh) 188 | 4. [albert_large](https://github.com/brightmart/albert_zh) 189 | 5. [albert_base](https://github.com/brightmart/albert_zh) 190 | 6. [albert_base_ext](https://github.com/brightmart/albert_zh) 191 | 7. [albert_small](https://github.com/brightmart/albert_zh) 192 | 8. [albert_tiny](https://github.com/brightmart/albert_zh) 193 | 9. [roberta](https://github.com/brightmart/roberta_zh) 194 | 10. [roberta_wwm_ext](https://github.com/ymcui/Chinese-BERT-wwm) 195 | 11. [roberta_wwm_ext_large](https://github.com/ymcui/Chinese-BERT-wwm) 196 | 197 | 即将加入: 198 | 199 | 1. [XLNet_mid](https://github.com/ymcui/Chinese-PreTrained-XLNet) 200 | 2. [ERNIE_base](https://github.com/PaddlePaddle/ERNIE) 201 | 202 | ## 支持任务类型 203 | 204 | ### 分类任务 205 | 206 | **现已支持以下数据集:** 207 | 208 | #### CLUEBenchmark任务 209 | 210 | 参考:https://github.com/CLUEbenchmark/CLUE 211 | 212 | 1. **AFQMC 蚂蚁金融语义相似度** 213 | 214 | ``` 215 | 数据量:训练集(34334)验证集(4316)测试集(3861) 216 | 例子: 217 | {"sentence1": "双十一花呗提额在哪", "sentence2": "里可以提花呗额度", "label": "0"} 218 | 每一条数据有三个属性,从前往后分别是 句子1,句子2,句子相似度标签。其中label标签,1 表示sentence1和sentence2的含义类似,0表示两个句子的含义不同。 219 | ``` 220 | 221 | 2. **TNEWS' 今日头条中文新闻(短文本)分类 Short Text Classificaiton for News** 222 | 223 | ``` 224 | 数据量:训练集(266,000),验证集(57,000),测试集(57,000) 225 | 例子: 226 | {"label": "102", "label_des": "news_entertainment", "sentence": "江疏影甜甜圈自拍,迷之角度竟这么好看,美吸引一切事物"} 227 | 每一条数据有三个属性,从前往后分别是 分类ID,分类名称,新闻字符串(仅含标题)。 228 | ``` 229 | 230 | 3. **IFLYTEK' 长文本分类 Long Text classification** 231 | 232 | 该数据集共有1.7万多条关于app应用描述的长文本标注数据,包含和日常生活相关的各类应用主题,共119个类别:"打车":0,"地图导航":1,"免费WIFI":2,"租车":3,….,"女性":115,"经营":116,"收款":117,"其他":118(分别用0-118表示)。 233 | 234 | ``` 235 | 数据量:训练集(12,133),验证集(2,599),测试集(2,600) 236 | 例子: 237 | {"label": "110", "label_des": "社区超市", "sentence": "朴朴快送超市创立于2016年,专注于打造移动端30分钟即时配送一站式购物平台,商品品类包含水果、蔬菜、肉禽蛋奶、海鲜水产、粮油调味、酒水饮料、休闲食品、日用品、外卖等。朴朴公司希望能以全新的商业模式,更高效快捷的仓储配送模式,致力于成为更快、更好、更多、更省的在线零售平台,带给消费者更好的消费体验,同时推动中国食品安全进程,成为一家让社会尊敬的互联网公司。,朴朴一下,又好又快,1.配送时间提示更加清晰友好2.保障用户隐私的一些优化3.其他提高使用体验的调整4.修复了一些已知bug"} 238 | 每一条数据有三个属性,从前往后分别是 类别ID,类别名称,文本内容。 239 | ``` 240 | 241 | 4. **CMNLI 语言推理任务 Chinese Multi-Genre NLI** 242 | 243 | CMNLI数据由两部分组成:XNLI和MNLI。数据来自于fiction,telephone,travel,government,slate等,对原始MNLI数据和XNLI数据进行了中英文转化,保留原始训练集,合并XNLI中的dev和MNLI中的matched作为CMNLI的dev,合并XNLI中的test和MNLI中的mismatched作为CMNLI的test,并打乱顺序。该数据集可用于判断给定的两个句子之间属于蕴涵、中立、矛盾关系。 244 | 245 | ``` 246 | 数据量:train(391,782),matched(12,426),mismatched(13,880) 247 | 例子: 248 | {"sentence1": "新的权利已经足够好了", "sentence2": "每个人都很喜欢最新的福利", "label": "neutral"} 249 | 每一条数据有三个属性,从前往后分别是 句子1,句子2,蕴含关系标签。其中label标签有三种:neutral,entailment,contradiction。 250 | ``` 251 | 252 | 5. 
158 | 
159 | Outputs are produced and saved in the same form as when evaluating the CLUE datasets.
160 | 
161 | ### Reading comprehension tasks
162 | 
163 | #### Quick evaluation on CLUE datasets
164 | 
165 | Coming soon.
166 | 
167 | #### Applying to your own datasets
168 | 
169 | Coming soon.
170 | 
171 | ### Named entity recognition tasks
172 | 
173 | #### Quick evaluation on CLUE datasets
174 | 
175 | Coming soon.
176 | 
177 | #### Applying to your own datasets
178 | 
179 | Coming soon.
180 | 
181 | ## Baseline (pre-trained) models
182 | 
183 | **The following models are currently supported:**
184 | 
185 | 1. [BERT-base](https://github.com/google-research/bert)
186 | 2. [BERT-wwm-ext](https://github.com/ymcui/Chinese-BERT-wwm)
187 | 3. [albert_xlarge](https://github.com/brightmart/albert_zh)
188 | 4. [albert_large](https://github.com/brightmart/albert_zh)
189 | 5. [albert_base](https://github.com/brightmart/albert_zh)
190 | 6. [albert_base_ext](https://github.com/brightmart/albert_zh)
191 | 7. [albert_small](https://github.com/brightmart/albert_zh)
192 | 8. [albert_tiny](https://github.com/brightmart/albert_zh)
193 | 9. [roberta](https://github.com/brightmart/roberta_zh)
194 | 10. [roberta_wwm_ext](https://github.com/ymcui/Chinese-BERT-wwm)
195 | 11. [roberta_wwm_ext_large](https://github.com/ymcui/Chinese-BERT-wwm)
196 | 
197 | Coming soon:
198 | 
199 | 1. [XLNet_mid](https://github.com/ymcui/Chinese-PreTrained-XLNet)
200 | 2. [ERNIE_base](https://github.com/PaddlePaddle/ERNIE)
201 | 
202 | ## Supported task types
203 | 
204 | ### Classification tasks
205 | 
206 | **The following datasets are currently supported:**
207 | 
208 | #### CLUEBenchmark tasks
209 | 
210 | Reference: https://github.com/CLUEbenchmark/CLUE
211 | 
212 | 1. **AFQMC: Ant Financial Semantic Similarity**
213 | 
214 |    ```
215 |    Data size: train (34,334), dev (4,316), test (3,861)
216 |    Example:
217 |    {"sentence1": "双十一花呗提额在哪", "sentence2": "里可以提花呗额度", "label": "0"}
218 |    Each record has three fields, in order: sentence 1, sentence 2, and a similarity label, where 1 means sentence1 and sentence2 are similar in meaning and 0 means they are not.
219 |    ```
220 | 
221 | 2. **TNEWS': Toutiao Chinese News (Short Text) Classification**
222 | 
223 |    ```
224 |    Data size: train (266,000), dev (57,000), test (57,000)
225 |    Example:
226 |    {"label": "102", "label_des": "news_entertainment", "sentence": "江疏影甜甜圈自拍,迷之角度竟这么好看,美吸引一切事物"}
227 |    Each record has three fields, in order: category ID, category name, and the news string (title only).
228 |    ```
229 | 
230 | 3. **IFLYTEK': Long Text Classification**
231 | 
232 |    This dataset contains over 17,000 labeled long texts describing apps, covering application topics related to daily life in 119 categories: "打车":0, "地图导航":1, "免费WIFI":2, "租车":3, …, "女性":115, "经营":116, "收款":117, "其他":118 (represented by 0-118).
233 | 
234 |    ```
235 |    Data size: train (12,133), dev (2,599), test (2,600)
236 |    Example:
237 |    {"label": "110", "label_des": "社区超市", "sentence": "朴朴快送超市创立于2016年,专注于打造移动端30分钟即时配送一站式购物平台,商品品类包含水果、蔬菜、肉禽蛋奶、海鲜水产、粮油调味、酒水饮料、休闲食品、日用品、外卖等。朴朴公司希望能以全新的商业模式,更高效快捷的仓储配送模式,致力于成为更快、更好、更多、更省的在线零售平台,带给消费者更好的消费体验,同时推动中国食品安全进程,成为一家让社会尊敬的互联网公司。,朴朴一下,又好又快,1.配送时间提示更加清晰友好2.保障用户隐私的一些优化3.其他提高使用体验的调整4.修复了一些已知bug"}
238 |    Each record has three fields, in order: category ID, category name, and text content.
239 |    ```
240 | 
241 | 4. **CMNLI: Chinese Multi-Genre NLI**
242 | 
243 |    CMNLI consists of two parts, XNLI and MNLI, with data drawn from fiction, telephone, travel, government, slate, and so on. The original MNLI and XNLI data were converted between English and Chinese; the original training sets are kept, XNLI dev is merged with MNLI matched as the CMNLI dev set, XNLI test is merged with MNLI mismatched as the CMNLI test set, and the order is shuffled. The dataset is used to judge whether two given sentences stand in an entailment, neutral, or contradiction relation.
244 | 
245 |    ```
246 |    Data size: train (391,782), matched (12,426), mismatched (13,880)
247 |    Example:
248 |    {"sentence1": "新的权利已经足够好了", "sentence2": "每个人都很喜欢最新的福利", "label": "neutral"}
249 |    Each record has three fields, in order: sentence 1, sentence 2, and an entailment label, one of neutral, entailment, or contradiction.
250 |    ```
251 | 
252 | 5. **COPA: Choice of Plausible Alternatives, Chinese Version**
253 | 
254 |    A natural language reasoning dataset: given a premise and a question indicating whether a cause or an effect is asked for, choose the more plausible of two alternatives. Following the original dataset, we use accuracy as the evaluation metric.
255 | 
256 |    ```
257 |    Data size: train (400), dev (100), test (500)
258 |    Example:
259 |    {"idx": 7, "premise": "那人在杂货店买东西时打折了。", "choice0": "他向收银员打招呼。", "choice1": "他用了一张优惠券。", "question": "cause", "label": 1}
260 |    In the labels, 0 denotes choice0 and 1 denotes choice1. The original COPA dataset is in English; this dataset was produced through machine and human translation, with slight idiomatic adjustments and annotation adapted to Chinese usage.
261 |    ```
262 | 
263 | 6. **WSC: The Winograd Schema Challenge, Chinese Version**
264 | 
265 |    The Winograd Schema Challenge is a variant of the Turing test designed to assess the commonsense reasoning ability of AI systems. Participating programs answer a special but simple kind of commonsense question, pronoun disambiguation: deciding whether a given noun and pronoun refer to the same entity.
266 | 
267 |    ```
268 |    Data size: train (532), dev (104), test (143)
269 |    Example:
270 |    {"target":
271 |        {"span2_index": 28,
272 |         "span1_index": 0,
273 |         "span1_text": "马克",
274 |         "span2_text": "他"
275 |        },
276 |     "idx": 0,
277 |     "label": "false",
278 |     "text": "马克告诉皮特许多关于他自己的谎言,皮特也把这些谎言写进了他的书里。他应该多怀疑。"
279 |    }
280 |    In the labels, true means the reference is consistent and false means it is not.
281 |    ```
282 | 
283 | 7. **CSL: Paper Keyword Recognition**
284 | 
285 |    A Chinese scientific literature dataset containing abstracts of Chinese core journal papers together with their keywords. Fake keywords generated with tf-idf are mixed with the papers' real keywords to form abstract-keyword pairs; a pair whose keywords include fakes is labeled 0.
286 | 
287 |    ```
288 |    Data size: train (20,000), dev (3,000), test (3,000)
289 |    Example:
290 |    {"id": 1, "abst": "为解决传统均匀FFT波束形成算法引起的3维声呐成像分辨率降低的问题,该文提出分区域FFT波束形成算法.远场条件下,以保证成像分辨率为约束条件,以划分数量最少为目标,采用遗传算法作为优化手段将成像区域划分为多个区域.在每个区域内选取一个波束方向,获得每一个接收阵元收到该方向回波时的解调输出,以此为原始数据在该区域内进行传统均匀FFT波束形成.对FFT计算过程进行优化,降低新算法的计算量,使其满足3维成像声呐实时性的要求.仿真与实验结果表明,采用分区域FFT波束形成算法的成像分辨率较传统均匀FFT波束形成算法有显著提高,且满足实时性要求.", "keyword": ["水声学", "FFT", "波束形成", "3维成像声呐"], "label": "1"}
291 |    Each record has four fields, in order: record ID, paper abstract, keywords, and a real/fake label.
292 |    ```
293 | 
294 | #### ChineseGLUE tasks
295 | 
296 | Reference: https://github.com/ChineseGLUE/ChineseGLUE
297 | 
298 | 1. **LCQMC: Semantic Similarity for Colloquial Descriptions**
299 | 
300 |    The input is two sentences and the output is 0 or 1, where 0 means semantically dissimilar and 1 means semantically similar.
301 | 
302 |    ```
303 |    Data size: train (238,766), dev (8,802), test (12,500)
304 |    Example:
305 |    1.聊天室都有哪些好的 [分隔符] 聊天室哪个好 [分隔符] 1
306 |    2.飞行员没钱买房怎么办? [分隔符] 父母没钱买房子 [分隔符] 0
307 |    ```
308 | 
309 | 2. **XNLI: Natural Language Inference**
310 | 
311 |    A cross-lingual understanding dataset: given a premise and a hypothesis, judge whether the hypothesis stands in an entailment, contradiction, or neutral relation to the premise.
312 | 
313 |    ```
314 |    Data size: train (392,703), dev (2,491), test (5,011)
315 |    Example:
316 |    1.从 概念 上 看 , 奶油 收入 有 两 个 基本 方面 产品 和 地理 .[分隔符] 产品 和 地理 是 什么 使 奶油 抹 霜 工作 . [分隔符] neutral
317 |    2.我们 的 一个 号码 会 非常 详细 地 执行 你 的 指示 [分隔符] 我 团队 的 一个 成员 将 非常 精确 地 执行 你 的 命令 [分隔符] entailment
318 | 
319 |    The original XNLI covers 15 languages (including low-resource ones). We take the Chinese portion and convert its format so that it slots easily into the training and testing stages.
320 |    ```
321 | 
322 | 3. **INEWS: Sentiment Analysis for Internet News**
323 | 
324 |    ```
325 |    Data size: train (5,356), dev (1,000), test (1,000)
326 |    Example:
327 |    1_!_00005a3efe934a19adc0b69b05faeae7_!_九江办好人民满意教育_!_近3年来,九江市紧紧围绕“人本教育、公平教育、优质教育、幸福教育”的目标,努力办好人民满意教育,促进了义务教育均衡发展,农村贫困地区办学条件改善。目前,该市特色教育学校有70所 ......
328 |    Each line is one record with four fields separated by _!_, in order: sentiment category, record id, news title, news content.
329 |    ```
330 | 
331 | 4. **BQ: Question Matching for Customer Service**
332 | 
333 |    This dataset is a corpus from an automated question-answering system, with 120,000 sentence pairs annotated with a similarity value of 0 or 1 (0 means dissimilar, 1 similar). The data contain typos, non-standard grammar, and similar noise, which makes them closer to industrial scenarios.
334 | 
335 |    ```
336 |    Data size: train (100,000), dev (10,000), test (10,000)
337 |    Example:
338 |    1.我存钱还不扣的 [分隔符] 借了每天都要还利息吗 [分隔符] 0
339 |    2.为什么我的还没有额度 [分隔符] 为啥没有额度!! [分隔符] 1
340 |    ```
341 | 
342 | 5. **THUCNEWS: Long Text Classification**
343 | 
344 |    This dataset has over 40,000 labeled Chinese long news texts in 14 categories: "体育":0, "娱乐":1, "家居":2, "彩票":3, "房产":4, "教育":5, "时尚":6, "时政":7, "星座":8, "游戏":9, "社会":10, "科技":11, "股票":12, "财经":13.
345 | 
346 |    ```
347 |    Data size: train (33,437), dev (4,180), test (4,180)
348 |    Example:
349 |    11_!_科技_!_493337.txt_!_爱国者A-Touch MK3533高清播放器试用  爱国者MP5简介:  "爱国者"北京华旗资讯,作为国内知名数码产品制>造商。1993年创立于北京中关村,是一家致力于......
350 |    Each line is one record with four fields separated by _!_, in order: category ID, category name, text ID, text content.
351 |    ```
352 | 
353 | ### Reading comprehension tasks
354 | 
355 | Coming soon.
356 | 
357 | ### Named entity recognition tasks
358 | 
359 | Coming soon.
360 | 
--------------------------------------------------------------------------------
/dist/PyCLUE-2019.12.5-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/dist/PyCLUE-2019.12.5-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/PyCLUE-2019.12.5.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChineseGLUE/PyCLUE/0088f97f5da5903e720cbd48c7a558a7f1d7e836/dist/PyCLUE-2019.12.5.tar.gz
--------------------------------------------------------------------------------
/examples/classifications/run_clue_task.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append("../..")
4 | from PyCLUE.tasks.run_classifier import clue_tasks, configs
5 | 
6 | # assign GPU devices or CPU devices
7 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8 | 
9 | # default configs: see PyCLUE.utils.classifier_utils.core
10 | # below are some necessary parameters required to run this task
11 | 
12 | # task_name:
13 | # Support:
14 | # chineseGLUE: bq, xnli, lcqmc, inews, thucnews,
15 | # CLUE: afqmc, cmnli, copa, csl, iflytek, tnews, wsc
16 | configs["task_name"] = "afqmc"
17 | 
18 | # pretrained_lm_name:
19 | # If None, you should assign `vocab_file`, `bert_config_file`, `init_checkpoint`.
20 | # Or you can choose the following models:
21 | # bert, bert_wwm_ext, albert_xlarge, albert_large, albert_base, albert_base_ext,
22 | # albert_small, albert_tiny, roberta, roberta_wwm_ext, roberta_wwm_ext_large
23 | configs["pretrained_lm_name"] = "bert"
24 | 
25 | # actions
26 | configs["do_train"] = True
27 | configs["do_eval"] = True
28 | configs["do_predict"] = True
29 | 
30 | # train parameters
31 | configs["max_seq_length"] = 128
32 | configs["train_batch_size"] = 32
33 | configs["learning_rate"] = 2e-5
34 | configs["warmup_proportion"] = 0.1
35 | configs["num_train_epochs"] = 3.0
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     clue_tasks(configs)
40 | 
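A note on running this example: the `sys.path.append("../..")` above assumes the script is launched from inside `examples/classifications`, so from a source checkout something like the following should work:

```bash
cd examples/classifications
python run_clue_task.py
```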
--------------------------------------------------------------------------------
/examples/classifications/run_clue_task_tpu.py:
--------------------------------------------------------------------------------
1 | import os
2 | from PyCLUE.tasks.run_classifier import clue_tasks, configs
3 | 
4 | # default configs: see PyCLUE.utils.classifier_utils.core
5 | # below are some necessary parameters required to run this task
6 | 
7 | # task_name:
8 | # Support:
9 | # chineseGLUE: bq, xnli, lcqmc, inews, thucnews,
10 | # CLUE: afqmc, cmnli, copa, csl, iflytek, tnews, wsc
11 | configs["task_name"] = "bq"
12 | 
13 | # pretrained_lm_name:
14 | # If None, you should assign `vocab_file`, `bert_config_file`, `init_checkpoint`.
15 | # Or you can choose the following models:
16 | # bert, bert_wwm_ext, albert_xlarge, albert_large, albert_base, albert_base_ext,
17 | # albert_small, albert_tiny, roberta, roberta_wwm_ext, roberta_wwm_ext_large
18 | configs["pretrained_lm_name"] = "bert"
19 | 
20 | # actions
21 | configs["do_train"] = True
22 | configs["do_eval"] = True
23 | configs["do_predict"] = True
24 | 
25 | # train parameters
26 | configs["max_seq_length"] = 128
27 | configs["train_batch_size"] = 32
28 | configs["learning_rate"] = 2e-5
29 | configs["warmup_proportion"] = 0.1
30 | configs["num_train_epochs"] = 3.0
31 | 
32 | # tpu configs
33 | configs["use_tpu"] = True
34 | configs["tpu_name"] = "grpc://10.1.101.2:8470"
35 | configs["num_tpu_cores"] = 8
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     clue_tasks(configs)
40 | 
--------------------------------------------------------------------------------
/examples/classifications/run_user_task.py:
--------------------------------------------------------------------------------
1 | import os
2 | from PyCLUE.tasks.run_classifier import user_tasks, configs
3 | 
4 | # assign GPU devices or CPU devices
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
6 | 
7 | # default configs: see PyCLUE.utils.classifier_utils.core
8 | # below are some necessary parameters required to run this task
9 | 
10 | # task_name: default is "user_defined_task"
11 | configs["task_name"] = ""
12 | 
13 | # pretrained_lm_name:
14 | # If None, you should assign `vocab_file`, `bert_config_file`, `init_checkpoint`.
15 | # Or you can choose the following models:
16 | # bert, bert_wwm_ext, albert_xlarge, albert_large, albert_base, albert_base_ext,
17 | # albert_small, albert_tiny, roberta, roberta_wwm_ext, roberta_wwm_ext_large
18 | configs["pretrained_lm_name"] = None
19 | 
20 | # actions
21 | configs["do_train"] = True
22 | configs["do_eval"] = True
23 | configs["do_predict"] = True
24 | 
25 | # data_dir: your own data path
26 | # If `do_train` = True, should contain at least train.txt
27 | # If `do_eval` = True, should contain at least dev.txt
28 | # If `do_predict` = True, should contain at least test.txt
29 | configs["data_dir"] = ""
30 | 
31 | # data configs:
32 | # below are some examples
33 | configs["labels"] = ["0", "1"]
34 | # label_position, text_a_position, text_b_position & delimiter:
35 | # examples_1:
36 | # 0_!_我想要回家_!_我准备回家
37 | # 1_!_我想要回家_!_我准备吃饭
38 | # >> label_position = 0, text_a_position = 1, text_b_position = 2, delimiter = "_!_"
39 | # examples_2:
40 | # 0_!_我很生气
41 | # 1_!_我很开心
42 | # >> label_position = 0, text_a_position = 1, text_b_position = None, delimiter = "_!_"
43 | configs["label_position"] = 0
44 | configs["text_a_position"] = 1
45 | configs["text_b_position"] = 2
46 | configs["delimiter"] = "_!_"
47 | # ignore_header:
48 | # Whether to drop the first line of each file.
49 | configs["ignore_header"] = True
50 | # min_seq_length:
51 | # Drop sequences shorter than `min_seq_length`.
52 | configs["min_seq_length"] = 3
53 | # file_type:
54 | # train, dev, test file type, can be "txt" or "tsv"
55 | configs["file_type"] = "txt"
56 | 
57 | # output_dir: save trained model, evaluation results and tf_records data
58 | configs["output_dir"] = ""
59 | 
60 | # your pretrained language model components
61 | # If `pretrained_lm_name` is not None, these components will be set up automatically.
62 | configs["vocab_file"] = "vocab.txt" 63 | configs["bert_config_file"] = "XXX_config.json" 64 | configs["init_checkpoint"] = "XXX_model.ckpt" 65 | 66 | configs["max_seq_length"] = 128 67 | configs["train_batch_size"] = 32 68 | configs["learning_rate"] = 2e-5 69 | configs["warmup_proportion"] = 0.1 70 | configs["num_train_epochs"] = 3.0 71 | 72 | 73 | if __name__ == "__main__": 74 | user_tasks(configs) 75 | -------------------------------------------------------------------------------- /examples/classifications/run_user_task_tpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PyCLUE.tasks.run_classifier import user_tasks, configs 3 | 4 | 5 | # default configs: see PyCLUE.utils.classifier_utils.core 6 | # below are some necessary paramters required in running this task 7 | 8 | # task_name: default is "user_defined_task" 9 | configs["task_name"] = "" 10 | 11 | # pretrained_lm_name: 12 | # If None, should assign `vocab_file`, `bert_config_file`, `init_checkpoint`. 13 | # Or you can choose the following models: 14 | # bert, bert_wwm_ext, albert_xlarge, albert_large, albert_base, albert_base_ext, 15 | # albert_small, albert_tiny, roberta, roberta_wwm_ext, roberta_wwm_ext_large 16 | configs["pretrained_lm_name"] = None 17 | 18 | # actions 19 | configs["do_train"] = True 20 | configs["do_eval"] = True 21 | configs["do_predict"] = True 22 | 23 | # data_dir: your own data path 24 | # If `do_train` = True, should contain at least train.txt 25 | # If `do_eval` = True, should contain at least dev.txt 26 | # If `do_predict` = True, should contain at least test.txt 27 | configs["data_dir"] = "" 28 | 29 | # data configs: 30 | # below are some examples 31 | configs["labels"] = ["0", "1"] 32 | # label_position, text_a_position , text_b_position & delimiter: 33 | # examples_1: 34 | # 0_!_我想要回家_!_我准备回家 35 | # 1_!_我想要回家_!_我准备吃饭 36 | # >> label_position = 0, text_a_position = 1, text_b_position = 2, delimiter = "_!_" 37 | # examples_2: 38 | # 0_!_我很生气 39 | # 1_!_我很开心 40 | # >> label_position = 0, text_a_position = 1, text_b_position = None, delimiter = "_!_" 41 | configs["label_position"] = 0 42 | configs["text_a_position"] = 1 43 | configs["text_b_position"] = 2 44 | configs["delimiter"] = "_!_" 45 | # ignore_header: 46 | # If to drop the first line of each file. 47 | configs["ignore_header"] = True 48 | # min_seq_length: 49 | # If to drop sequence that has length less than `min_seq_length` 50 | configs["min_seq_length"] = 3 51 | # file_type: 52 | # train, dev, test file type, can be "txt" or "tsv" 53 | configs["file_type"] = "txt" 54 | 55 | # output_dir: save trained model, evaluation results and tf_records data 56 | configs["output_dir"] = "" 57 | 58 | # your pretrained language model components 59 | # If `pretrained_lm_name` is not None, these components will auto installed. 
60 | configs["vocab_file"] = "vocab.txt" 61 | configs["bert_config_file"] = "XXX_config.json" 62 | configs["init_checkpoint"] = "XXX_model.ckpt" 63 | 64 | configs["max_seq_length"] = 128 65 | configs["train_batch_size"] = 32 66 | configs["learning_rate"] = 2e-5 67 | configs["warmup_proportion"] = 0.1 68 | configs["num_train_epochs"] = 3.0 69 | 70 | # tpu configs 71 | configs["use_tpu"] = True 72 | configs["tpu_name"] = "grpc://10.1.101.2:8470" 73 | configs["num_tpu_cores"] = 8 74 | 75 | 76 | if __name__ == "__main__": 77 | user_tasks(configs) 78 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow <= 1.15 2 | requests == 2.21.0 3 | numpy == 1.16.4 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | from setuptools import setup 4 | from setuptools.command.develop import develop 5 | from setuptools.command.install import install 6 | from subprocess import call 7 | 8 | with open("README.md","r") as f: 9 | long_description = f.read() 10 | 11 | class Installation(install): 12 | def run(self): 13 | call(["pip install -r requirements.txt --no-clean"], shell=True) 14 | install.run(self) 15 | 16 | setuptools.setup( 17 | name="PyCLUE", 18 | version="2019.12.05", 19 | author="Liu Shaoweihua", 20 | author_email="liushaoweihua@126.com", 21 | maintainer="CLUE", 22 | maintainer_email="chineseGLUE@163.com", 23 | description="Python toolkit for Chinese Language Understanding Evaluation benchmark.", 24 | long_description=long_description, 25 | long_description_content_type="text/markdown", 26 | url="https://github.com/ChineseGLUE/PyCLUE", 27 | include_package_data=True, 28 | packages=setuptools.find_packages(), 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "License :: OSI Approved :: MIT License", 32 | "Operating System :: OS Independent", 33 | ], 34 | setup_requires=["tensorflow","requests","numpy"], 35 | install_requires=["tensorflow","requests","numpy"], 36 | cmdclass={'install':Installation}, 37 | ) 38 | -------------------------------------------------------------------------------- /upload.sh: -------------------------------------------------------------------------------- 1 | python3 setup.py sdist bdist_wheel && twine upload dist/* 2 | --------------------------------------------------------------------------------