├── .gitignore ├── README.md ├── README_en.md ├── __init__.py ├── baselines ├── CONTRIBUTING.md ├── __init__.py ├── baseline_lxj │ ├── .gitignore │ ├── baseline_data │ │ ├── all.json │ │ ├── dev_1.json │ │ ├── dev_2.json │ │ ├── dev_3.json │ │ ├── dev_4.json │ │ ├── dev_5.json │ │ ├── dev_6.json │ │ ├── train_1.json │ │ ├── train_2.json │ │ ├── train_3.json │ │ ├── train_4.json │ │ ├── train_5.json │ │ └── train_6.json │ ├── dataclue.py │ ├── dataclue_change_label.py │ ├── requirements.txt │ ├── run.sh │ ├── run_multi_classify.py │ └── run_multi_classify_bert_multi_seed.sh ├── models_pytorch │ └── classifier_pytorch │ │ ├── README.md │ │ ├── compute_f1.py │ │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ │ ├── metrics │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── clue_compute_metrics.cpython-36.pyc │ │ │ └── clue_compute_metrics.cpython-37.pyc │ │ └── clue_compute_metrics.py │ │ ├── notebook │ │ └── rbt3_iflytek_gpu.ipynb │ │ ├── outputs │ │ ├── afqmc_output │ │ │ └── .gitignore │ │ ├── cmnli_output │ │ │ └── .gitignore │ │ ├── copa_output │ │ │ └── .gitignore │ │ ├── csl_output │ │ │ └── .gitignore │ │ ├── iflytek_output │ │ │ └── .gitignore │ │ ├── tnews_output │ │ │ └── .gitignore │ │ └── wsc_output │ │ │ └── .gitignore │ │ ├── processors │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── clue.cpython-36.pyc │ │ │ ├── clue.cpython-37.pyc │ │ │ ├── utils.cpython-36.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── clue.py │ │ └── utils.py │ │ ├── run_classifier.py │ │ ├── run_classifier_afqmc.sh │ │ ├── run_classifier_cic.sh │ │ ├── run_classifier_cic_torch12_py36.sh │ │ ├── run_classifier_iflytek.sh │ │ ├── run_classifier_iflytek_original.sh │ │ ├── run_classifier_qbqtc.sh │ │ ├── run_classifier_tnews.sh │ │ ├── run_classifier_triclue.sh │ │ ├── tools │ │ ├── __pycache__ │ │ │ ├── common.cpython-36.pyc │ │ │ ├── common.cpython-37.pyc │ │ │ ├── progressbar.cpython-36.pyc │ │ │ └── progressbar.cpython-37.pyc │ │ ├── common.py │ │ └── progressbar.py │ │ └── transformers │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── configuration_auto.cpython-36.pyc │ │ ├── configuration_auto.cpython-37.pyc │ │ ├── configuration_bert.cpython-36.pyc │ │ ├── configuration_bert.cpython-37.pyc │ │ ├── configuration_ctrl.cpython-36.pyc │ │ ├── configuration_ctrl.cpython-37.pyc │ │ ├── configuration_distilbert.cpython-36.pyc │ │ ├── configuration_distilbert.cpython-37.pyc │ │ ├── configuration_gpt2.cpython-36.pyc │ │ ├── configuration_gpt2.cpython-37.pyc │ │ ├── configuration_openai.cpython-36.pyc │ │ ├── configuration_openai.cpython-37.pyc │ │ ├── configuration_roberta.cpython-36.pyc │ │ ├── configuration_roberta.cpython-37.pyc │ │ ├── configuration_transfo_xl.cpython-36.pyc │ │ ├── configuration_transfo_xl.cpython-37.pyc │ │ ├── configuration_utils.cpython-36.pyc │ │ ├── configuration_utils.cpython-37.pyc │ │ ├── configuration_xlm.cpython-36.pyc │ │ ├── configuration_xlm.cpython-37.pyc │ │ ├── configuration_xlnet.cpython-36.pyc │ │ ├── configuration_xlnet.cpython-37.pyc │ │ ├── file_utils.cpython-36.pyc │ │ ├── file_utils.cpython-37.pyc │ │ ├── modeling_albert.cpython-36.pyc │ │ ├── modeling_albert.cpython-37.pyc │ │ ├── modeling_auto.cpython-36.pyc │ │ ├── modeling_auto.cpython-37.pyc │ │ ├── modeling_bert.cpython-36.pyc │ │ ├── modeling_bert.cpython-37.pyc │ │ ├── 
modeling_ctrl.cpython-36.pyc │ │ ├── modeling_ctrl.cpython-37.pyc │ │ ├── modeling_distilbert.cpython-36.pyc │ │ ├── modeling_distilbert.cpython-37.pyc │ │ ├── modeling_gpt2.cpython-36.pyc │ │ ├── modeling_gpt2.cpython-37.pyc │ │ ├── modeling_openai.cpython-36.pyc │ │ ├── modeling_openai.cpython-37.pyc │ │ ├── modeling_roberta.cpython-36.pyc │ │ ├── modeling_roberta.cpython-37.pyc │ │ ├── modeling_transfo_xl.cpython-36.pyc │ │ ├── modeling_transfo_xl.cpython-37.pyc │ │ ├── modeling_transfo_xl_utilities.cpython-36.pyc │ │ ├── modeling_transfo_xl_utilities.cpython-37.pyc │ │ ├── modeling_utils.cpython-36.pyc │ │ ├── modeling_utils.cpython-37.pyc │ │ ├── modeling_xlm.cpython-36.pyc │ │ ├── modeling_xlm.cpython-37.pyc │ │ ├── modeling_xlnet.cpython-36.pyc │ │ ├── modeling_xlnet.cpython-37.pyc │ │ ├── optimization.cpython-36.pyc │ │ ├── optimization.cpython-37.pyc │ │ ├── tokenization_auto.cpython-36.pyc │ │ ├── tokenization_auto.cpython-37.pyc │ │ ├── tokenization_bert.cpython-36.pyc │ │ ├── tokenization_bert.cpython-37.pyc │ │ ├── tokenization_ctrl.cpython-36.pyc │ │ ├── tokenization_ctrl.cpython-37.pyc │ │ ├── tokenization_distilbert.cpython-36.pyc │ │ ├── tokenization_distilbert.cpython-37.pyc │ │ ├── tokenization_gpt2.cpython-36.pyc │ │ ├── tokenization_gpt2.cpython-37.pyc │ │ ├── tokenization_openai.cpython-36.pyc │ │ ├── tokenization_openai.cpython-37.pyc │ │ ├── tokenization_roberta.cpython-36.pyc │ │ ├── tokenization_roberta.cpython-37.pyc │ │ ├── tokenization_transfo_xl.cpython-36.pyc │ │ ├── tokenization_transfo_xl.cpython-37.pyc │ │ ├── tokenization_utils.cpython-36.pyc │ │ ├── tokenization_utils.cpython-37.pyc │ │ ├── tokenization_xlm.cpython-36.pyc │ │ ├── tokenization_xlm.cpython-37.pyc │ │ ├── tokenization_xlnet.cpython-36.pyc │ │ └── tokenization_xlnet.cpython-37.pyc │ │ ├── configuration_auto.py │ │ ├── configuration_bert.py │ │ ├── configuration_ctrl.py │ │ ├── configuration_distilbert.py │ │ ├── configuration_gpt2.py │ │ ├── configuration_openai.py │ │ ├── configuration_roberta.py │ │ ├── configuration_transfo_xl.py │ │ ├── configuration_utils.py │ │ ├── configuration_xlm.py │ │ ├── configuration_xlnet.py │ │ ├── file_utils.py │ │ ├── modeling_albert.py │ │ ├── modeling_auto.py │ │ ├── modeling_bert.py │ │ ├── modeling_ctrl.py │ │ ├── modeling_distilbert.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_openai.py │ │ ├── modeling_roberta.py │ │ ├── modeling_transfo_xl.py │ │ ├── modeling_transfo_xl_utilities.py │ │ ├── modeling_utils.py │ │ ├── modeling_xlm.py │ │ ├── modeling_xlnet.py │ │ ├── optimization.py │ │ ├── tokenization_auto.py │ │ ├── tokenization_bert.py │ │ ├── tokenization_ctrl.py │ │ ├── tokenization_distilbert.py │ │ ├── tokenization_gpt2.py │ │ ├── tokenization_openai.py │ │ ├── tokenization_roberta.py │ │ ├── tokenization_transfo_xl.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_xlm.py │ │ └── tokenization_xlnet.py ├── multi │ ├── README.md │ └── simple_baseline │ │ ├── README.md │ │ └── simple_baseline.py └── single │ ├── README.md │ ├── __init__.py │ ├── data_aug │ ├── README.md │ ├── __init__.py │ ├── data_aug.py │ └── parallel_textda.py │ ├── data_mixup │ ├── README.md │ ├── __init__.py │ └── data_mixup.py │ ├── def_aug │ ├── README.md │ ├── __init__.py │ └── def_aug.py │ ├── delete_noise │ ├── README.md │ ├── __init__.py │ ├── classifier.py │ └── delete_noise.py │ └── template │ ├── README.md │ └── template.py ├── datasets ├── afqmc │ └── test_public.json ├── cic │ ├── README.txt │ └── test_public.json ├── iflytek │ └── test_public.json ├── qbqtc │ 
└── test_public.json ├── raw_afqmc │ ├── dev.json │ ├── readme.md │ ├── test_public.json │ └── train.json ├── raw_cic │ ├── README.txt │ ├── dev.json │ ├── labels.json │ ├── labels.txt │ ├── test_public.json │ └── train.json ├── raw_iflytek │ ├── dev.json │ ├── labels.json │ ├── test_public.json │ └── train.json ├── raw_qbqtc │ ├── dev.json │ ├── readme.md │ ├── test_public.json │ └── train.json ├── raw_tnews │ ├── dev.json │ ├── labels.json │ ├── test_public.json │ └── train.json ├── raw_triclue │ ├── dev.json │ ├── test_public.json │ └── train.json ├── tnews │ └── test_public.json └── triclue │ └── test_public.json ├── dckit ├── README.md ├── __init__.py ├── evaluate.py └── utils.py ├── resources ├── dataclue_submit_examples │ └── dataclue_submit_examples.zip ├── dataclue_submit_examples_old_nouse_iflytek │ └── dataclue_submit_examples.zip └── img │ ├── bxu.jpg │ ├── improve.jpeg │ ├── lifec.jpeg │ ├── takeaway2.jpeg │ └── teamgroup.jpeg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # Pyre type checker 109 | .pyre/ 110 | data/ 111 | 112 | 113 | .idea/ 114 | .vscode/ 115 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/__init__.py -------------------------------------------------------------------------------- /baselines/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # 如何给DataCLUE做贡献 2 | 3 | ## 如果你发现了问题 4 | 5 | 请直接提一个 issue 6 | 7 | ## 如果你采用了一个新的方法 8 | 9 | 请首先确认类似方法没有被提出。 10 | 11 | 如果没有,那么恭喜你发现了一个新方法,请以文件夹的方式添加你的方法添加到`baselines/single` 
12 | 我们推荐文件夹包含如下两个文件 13 | 14 | - README.md 描述算法的主要思路和测试的结果,也可以涵盖参数设置、参考文献等内容 15 | - xxxx.py 程序入口文件,最好和方法名一致 16 | 17 | 另外如果有用到特殊的库的话,请添加 18 | 19 | - requirements.txt 20 | 21 | 同时请添加你的方法在test_public上的结果和相应的链接到`baselines/single/README.md` 22 | 23 | ## 如果你尝试了一个方法组合 24 | 25 | 我们推荐先将单独的算法放到single文件夹中,然后请在`baselines/multi`中添加你组合实验的代码 26 | 27 | - README.md 描述算法的主要思路和测试的结果,也可以涵盖参数设置等内容 28 | - xxxx.py 程序入口文件,最好和方法名一致 29 | 30 | ## 编码规范 31 | 32 | 我们提供了一个简单接口用于读取中的数据和进行验证。为了使得大家的代码可以在方便进行整合,请大家采用统一的接口。 33 | 34 | 代码尽量遵循PEP8等规范。 35 | 36 | 37 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/__init__.py -------------------------------------------------------------------------------- /baselines/baseline_lxj/.gitignore: -------------------------------------------------------------------------------- 1 | *output* 2 | *lock 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/dataclue.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """TODO: Add a description here.""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import csv 21 | import json 22 | import os 23 | import pdb 24 | import json 25 | from tqdm import tqdm 26 | from ali_data_util import weibo_data_process 27 | 28 | import datasets 29 | 30 | label_file=open("../../datasets/cic/labels.txt") 31 | label_list=[line.strip() for line in label_file] 32 | label_file.close() 33 | 34 | # TODO: Add BibTeX citation 35 | _CITATION = """\ 36 | @InProceedings{huggingface:dataset, 37 | title = {A great new dataset}, 38 | authors={huggingface, Inc. 39 | }, 40 | year={2020} 41 | } 42 | """ 43 | 44 | # TODO: Add description of the dataset here 45 | _DESCRIPTION = """\ 46 | This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 47 | """ 48 | 49 | # _URL = "https://huggingface.co/great-new-dataset.zip" 50 | 51 | 52 | # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case 53 | # Using a specific configuration class is optional, you can also use the base class if you don't need 54 | # to add specific attributes. 55 | # here we give an example for three sub-set of the dataset with difference sizes. 
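# NOTE: as written, AliDatasetConfig.__init__ below stores `data_size` but never calls
# super().__init__(**kwargs), even though its docstring says the keyword arguments are
# forwarded to the parent. A minimal corrected sketch (keeping the same attribute) would be:
#
#     class AliDatasetConfig(datasets.BuilderConfig):
#         def __init__(self, data_size, **kwargs):
#             super().__init__(**kwargs)  # forward name/version/etc. to BuilderConfig
#             self.data_size = data_size
#
# This only matters if the configurations are re-enabled: BUILDER_CONFIG_CLASS and
# BUILDER_CONFIGS are commented out further down, so the class is currently unused.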
56 | class AliDatasetConfig(datasets.BuilderConfig): 57 | """ BuilderConfig for AliDataset""" 58 | 59 | def __init__(self, data_size, **kwargs): 60 | """ 61 | 62 | Args: 63 | data_size: the size of the training set we want to us (xs, s, m, l, xl) 64 | **kwargs: keyword arguments forwarded to super. 65 | """ 66 | self.data_size = data_size 67 | 68 | 69 | class AliDataset(datasets.GeneratorBasedBuilder): 70 | """TODO: Short description of my dataset.""" 71 | 72 | VERSION = datasets.Version("0.0.1") 73 | 74 | # This is an example of a dataset with multiple configurations. 75 | # If you don't want/need to define several sub-sets in your dataset, 76 | # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. 77 | # BUILDER_CONFIG_CLASS = AliDatasetConfig 78 | # BUILDER_CONFIGS = [ 79 | # AliDatasetConfig(name="my_dataset_" + size, description="A small dataset", data_size=size) for size in ["small", "medium", "large"] 80 | # ] 81 | 82 | def _info(self): 83 | # TODO: Specifies the datasets.DatasetInfo object 84 | return datasets.DatasetInfo( 85 | # This is the description that will appear on the datasets page. 86 | description=_DESCRIPTION, 87 | # This defines the different columns of the dataset and their types 88 | features=datasets.Features( 89 | { 90 | "sentence": datasets.Value("string"), 91 | "label": datasets.features.ClassLabel(names=label_list) 92 | # These are the features of your dataset like images, labels ... 93 | } 94 | ), 95 | # If there's a common (input, target) tuple from the features, 96 | # specify them here. They'll be used if as_supervised=True in 97 | # builder.as_dataset. 98 | supervised_keys=None, 99 | # Homepage of the dataset for documentation 100 | homepage="xiaoling@30.54.209.130:/media2/xiaoling/multi_classifier_model/yewu_classify", 101 | citation=_CITATION, 102 | ) 103 | 104 | def _split_generators(self, dl_manager): 105 | """Returns SplitGenerators.""" 106 | # TODO: Downloads the data and defines the splits 107 | # dl_manager is a datasets.download.DownloadManager that can be used to 108 | # download and extract URLs 109 | # dl_dir = dl_manager.download_and_extract(_URL) 110 | # data_dir = os.path.join(dl_dir, "great-new-dataset") 111 | if not self.config.data_files: 112 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 113 | data_files = dl_manager.download_and_extract(self.config.data_files) 114 | if isinstance(data_files,(list,tuple)): 115 | raise ValueError("not right input") 116 | if isinstance(data_files, str): 117 | return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_files})] 118 | if isinstance(data_files,dict): 119 | splits = [] 120 | for split_name, files in data_files.items(): 121 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"filepath": files})) 122 | return splits 123 | 124 | def preprocess_text(self, text, MAX_LENGTH=256): 125 | if len(text) > 2 * MAX_LENGTH: 126 | text = text[:MAX_LENGTH] + text[-MAX_LENGTH:] 127 | # obj前预处理是为了obj处理太耗时引起的,后处理的目的是防止源码截断的时候只取前max_length个token,而这边是前后各取half_max_length个token 128 | text = weibo_data_process(text) 129 | 130 | if len(text) > MAX_LENGTH: 131 | half_max_length = int(MAX_LENGTH/2) 132 | return text[:half_max_length] + text[-half_max_length:] 133 | else: 134 | return text 135 | 136 | def _generate_examples(self, filepath): 137 | """ Yields examples. 
""" 138 | # TODO: Yields (key, example) tuples from the dataset 139 | with open(filepath) as f: 140 | for id_, row in tqdm(enumerate(f)): 141 | # data = json.loads(row) 142 | # data=row.strip().split(',') 143 | data=json.loads(row.strip()) 144 | label=int(data["label"]) if "label" in data else 0 145 | sentence=data["sentence"] 146 | yield id_, { 147 | "sentence": sentence, 148 | # "sentence": self.preprocess_text(data[0]), 149 | # "sentence": data[0], 150 | "label": label, 151 | } 152 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/dataclue_change_label.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import os 3 | import sys 4 | import pdb 5 | import json 6 | import collections 7 | 8 | label_file=open("../../datasets/cic/labels.txt") 9 | label_lines=label_file.readlines() 10 | id_label_map={index:str(label_lines[index].strip()) for index in range(len(label_lines))} 11 | label_id_map={str(label_lines[index].strip()):index for index in range(len(label_lines))} 12 | 13 | all_sentences=[] 14 | all_labels=[] 15 | all_ids=[] 16 | all_label_des=[] 17 | 18 | all_source_lines={} 19 | for split_file_index in range(1,7): 20 | 21 | dev_file=open("./baseline_data/dev_{}.json".format(split_file_index),'r',encoding="utf-8") 22 | dev_lines=[json.loads(line.strip()) for line in dev_file] 23 | 24 | sentences=[line["sentence"] for line in dev_lines] 25 | labels=[line["label"] for line in dev_lines] 26 | ids=[line["id"] for line in dev_lines] 27 | label_des=[line["label_des"] for line in dev_lines] 28 | 29 | all_source_lines.update({ids[index]:dev_lines[index] for index in range(len(dev_lines))}) 30 | 31 | all_sentences.extend(sentences) 32 | all_labels.extend(labels) 33 | all_ids.extend(ids) 34 | all_label_des.extend(label_des) 35 | 36 | dev_result_map={} 37 | for seed in [8,9,10]: 38 | 39 | dev_result_map[seed]=[] 40 | 41 | for split_file_index in range(1,7): 42 | dev_result_file=open("./output_dir/dataclue_{}_{}/eval_preds_{}.txt".format(split_file_index,seed,seed),'r',encoding="utf-8") 43 | dev_results=[str(line.strip()) for line in dev_result_file] 44 | dev_result_map[seed].extend(dev_results) 45 | 46 | assert len(all_sentences)==len(all_labels)==len(all_ids)==len(dev_result_map[seed]) 47 | 48 | dev_result_map_prob={} 49 | for seed in [8,9,10]: 50 | 51 | dev_result_map_prob[seed]=[] 52 | 53 | for split_file_index in range(1,7): 54 | dev_result_file=open("./output_dir/dataclue_{}_{}/eval_probility_{}.txt".format(split_file_index,seed,seed),'r',encoding="utf-8") 55 | dev_results=[str(line.strip()) for line in dev_result_file] 56 | dev_result_map_prob[seed].extend(dev_results) 57 | 58 | assert len(all_sentences)==len(all_labels)==len(all_ids)==len(dev_result_map_prob[seed]) 59 | 60 | result_map={} 61 | average_score_list=[] 62 | for index in range(len(all_sentences)): 63 | average_score=str((float(dev_result_map_prob[8][index])+float(dev_result_map_prob[9][index])+float(dev_result_map_prob[10][index]))/3) 64 | average_score_list.append(average_score) 65 | result_map[average_score]=all_sentences[index]+"\t"+all_label_des[index]+"\t"+all_labels[index]+"\t"+str(all_ids[index])+"\t"+dev_result_map[8][index]+"\t"+dev_result_map[9][index]+"\t"+dev_result_map[10][index]+"\t"+dev_result_map_prob[8][index]+"\t"+dev_result_map_prob[9][index]+"\t"+dev_result_map_prob[10][index] 66 | 67 | need_change_sentence_index=[] 68 | 69 | count=0 70 | for index in range(len(all_sentences)): 71 | if 
float(average_score_list[index])>0.6 and dev_result_map[8][index]==dev_result_map[9][index]==dev_result_map[10][index] and dev_result_map[8][index]!=all_label_des[index]: 72 | need_change_id=all_ids[index] 73 | all_source_lines[need_change_id]["label_des"]=dev_result_map[8][index] 74 | all_source_lines[need_change_id]["label"]=str(label_id_map[dev_result_map[8][index]]) 75 | count+=1 76 | 77 | for id_,line in all_source_lines.items(): 78 | print(json.dumps(line, ensure_ascii=False)) 79 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.12.0.dev0 2 | datasets>=1.10.2 3 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | bash -x ./run_multi_classify_bert_multi_seed.sh 3 | python ./dataclue_change_label.py > result.json 4 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/run_multi_classify_bert_multi_seed.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | model_type=bert 3 | epoch=6 4 | ttime=`date +"%Y-%m-%d-%H-%M"` 5 | echo $ttime 6 | 7 | for seed in $(seq 8 10);do 8 | for i in $(seq 1 6);do 9 | CUDA_VISIBLE_DEVICES=0 python ./run_multi_classify.py \ 10 | --model_name_or_path=bert-base-chinese \ 11 | --output_dir=./output_dir/dataclue_$i\_$seed \ 12 | --model_type=$model_type \ 13 | --train_file=./baseline_data/train_$i.json \ 14 | --validation_file=./baseline_data/dev_$i.json \ 15 | --test_file=../../datasets/cic/test_public.json \ 16 | --task_name=dataclue \ 17 | --per_device_train_batch_size=16 \ 18 | --num_train_epochs=$epoch \ 19 | --max_seq_length=64 \ 20 | --learning_rate=2e-5 \ 21 | --seed=$seed \ 22 | --overwrite_output_dir \ 23 | --overwrite_cache \ 24 | --do_train \ 25 | --do_eval \ 26 | --do_predict \ 27 | --evaluation_strategy=epoch \ 28 | --save_strategy=epoch 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # CLUE_pytorch 2 | 3 | 中文语言理解测评基准(Language Understanding Evaluation benchmark for Chinese) 4 | 5 | **备注**:此版本为个人开发版(目前支持所有的分类型任务),正式版见https://github.com/CLUEbenchmark/CLUE 6 | 7 | ## 代码目录说明 8 | 9 | ```text 10 | ├── CLUEdatasets # 存放数据 11 | | └── tnews    12 | | └── wsc  13 | | └── ... 14 | ├── metrics         # metric计算 15 | | └── clue_compute_metrics.py    16 | ├── outputs # 模型输出保存 17 | | └── tnews_output 18 | | └── wsc_output  19 | | └── ... 20 | ├── prev_trained_model # 预训练模型 21 | | └── albert_base 22 | | └── bert-wwm 23 | | └── ... 24 | ├── processors     # 数据处理 25 | | └── clue.py 26 | | └── ... 27 | ├── tools        # 通用脚本 28 | | └── progressbar.py 29 | | └── ... 30 | ├── transformers   # 主模型 31 | | └── modeling_albert.py 32 | | └── modeling_bert.py 33 | | └── ... 34 | ├── convert_albert_original_tf_checkpoint_to_pytorch.py # 模型文件转换 35 | ├── run_classifier.py # 主程序 36 | ├── run_classifier_tnews.sh # 任务运行脚本 37 | ├── download_clue_data.py # 数据集下载 38 | ``` 39 | ### 依赖模块 40 | 41 | - pytorch=1.1.0 42 | - boto3=1.9 43 | - regex 44 | - sacremoses 45 | - sentencepiece 46 | - python3.7+ 47 | 48 | ### 运行方式 49 | 50 | **1. 
下载CLUE数据集,运行以下命令:** 51 | ```python 52 | python download_clue_data.py --data_dir=./CLUEdatasets --tasks=all 53 | ``` 54 | 上述命令默认下载全CLUE数据集,你也可以指定`--tasks`进行下载对应任务数据集,默认存在在`./CLUEdatasets/{对应task}`目录下。 55 | 56 | **2. 若下载对应tf模型权重(若下载为pytorch权重,则跳过该步),运行转换脚本,比如转换`albert_base_tf`:** 57 | 58 | ```python 59 | python convert_albert_original_tf_checkpoint_to_pytorch.py \ 60 | --tf_checkpoint_path=./prev_trained_model/albert_base_tf \ 61 | --bert_config_file=./prev_trained_model/albert_base_tf/albert_config_base.json \ 62 | --pytorch_dump_path=./prev_trained_model/albert_base/pytorch_model.bin 63 | ``` 64 | **注意**: 当转换完模型(包括下载的pytorch模型权重)之后,需要在对应的文件夹内存放`config.json`和`vocab.txt`文件,比如: 65 | 66 | ```text 67 | ├── prev_trained_model # 预训练模型 68 | | └── bert-base 69 | | | └── vocab.txt 70 | | | └── config.json 71 | | | └── pytorch_model.bin 72 | 73 | ``` 74 | **3. 直接运行对应任务sh脚本,如:** 75 | 76 | ```shell 77 | sh run_classifier_tnews.sh 78 | ``` 79 | **4. 评估** 80 | 81 | 当前默认使用最后一个checkpoint模型作为评估模型,你也可以指定`--predict_checkpoints`参数进行对应的checkpoint进行评估,比如: 82 | ```python 83 | CURRENT_DIR=`pwd` 84 | export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base 85 | export GLUE_DIR=$CURRENT_DIR/CLUEdatasets 86 | export OUTPUR_DIR=$CURRENT_DIR/outputs 87 | TASK_NAME="copa" 88 | 89 | python run_classifier.py \ 90 | --model_type=bert \ 91 | --model_name_or_path=$BERT_BASE_DIR \ 92 | --task_name=$TASK_NAME \ 93 | --do_predict \ 94 | --predict_checkpoints=100 \ 95 | --do_lower_case \ 96 | --data_dir=$GLUE_DIR/${TASK_NAME}/ \ 97 | --max_seq_length=128 \ 98 | --per_gpu_train_batch_size=16 \ 99 | --per_gpu_eval_batch_size=16 \ 100 | --learning_rate=1e-5 \ 101 | --num_train_epochs=2.0 \ 102 | --logging_steps=50 \ 103 | --save_steps=50 \ 104 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 105 | --overwrite_output_dir \ 106 | --seed=42 107 | ``` 108 | 109 | ### 模型列表 110 | 111 | ``` 112 | MODEL_CLASSES = { 113 | ## bert ernie bert_wwm bert_wwwm_ext 114 | 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 115 | # xlnet_base xlnet_mid xlnet_large 116 | 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 117 | # roberta_base roberta_wwm roberta_wwm_ext roberta_wwm_large_ext 118 | 'roberta': (BertConfig, BertForSequenceClassification, BertTokenizer), 119 | # albert_tiny albert_base albert_large albert_xlarge 120 | 'albert': (BertConfig, AlbertForSequenceClassification, BertTokenizer) 121 | } 122 | ``` 123 | **注意**: bert ernie bert_wwm bert_wwwm_ext等模型只是权重不一样,而模型本身主体一样,因此参数`model_type=bert`其余同理。 124 | 125 | ### 结果 126 | 127 | 当前按照https://github.com/CLUEbenchmark/CLUE 提供的参数,除了**COPA**任务无法复现,其余任务基本保持一致。 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/compute_f1.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | from sklearn.metrics import f1_score 5 | 6 | def compute_f1_score_by_list(y_true_list,y_pred_list): 7 | #y_true = [1, 1, 1, 1, 2, 2, 2, 3, 3] 8 | # y_pred = [1, 1, 2, 3, 2, 2, 3, 2, 3] 9 | f1_micro = f1_score(y_true_list, y_pred_list, average='micro') 10 | f1_macro = f1_score(y_true_list, y_pred_list, average='macro') 11 | print('f1_micro: {0}'.format(f1_micro)) 12 | print('f1_macro: {0}'.format(f1_macro)) 13 | 14 | def compute_score_fn(target_file, predict_file): 15 | predict_object=open(predict_file,'r') 16 | predict_lines=predict_object.readlines() 17 | 18 | target_object=open(target_file,'r') 19 | 
target_lines=target_object.readlines() 20 | countt=0 21 | total_ignore=0 22 | y_pred_list=[] 23 | y_true_list=[] 24 | for i, source_line in enumerate(predict_lines): 25 | source_line_json=json.loads(source_line) 26 | predict_label=source_line_json['label'] 27 | y_pred_list.append(predict_label) 28 | target_line_json=json.loads(target_lines[i]) 29 | target_label=target_line_json['label'] 30 | y_true_list.append(target_label) 31 | if str(target_label)=='-1': 32 | total_ignore=total_ignore+1 33 | continue 34 | if predict_label==target_label: 35 | countt=countt+1 36 | 37 | compute_f1_score_by_list(y_true_list, y_pred_list) 38 | avg=float(countt)/float(len(target_lines)-total_ignore) 39 | print("avg:",avg,";total_ignore:",total_ignore,";target_lines:",len(target_lines)) 40 | 41 | 42 | target_file='test_public.json' 43 | predict_file='test_public_preidct.json' 44 | compute_score_fn(target_file, predict_file) 45 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | """Convert BERT checkpoint.""" 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import argparse 8 | import torch 9 | 10 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 11 | 12 | import logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 16 | # Initialise PyTorch model 17 | config = BertConfig.from_json_file(bert_config_file) 18 | print("Building PyTorch model from configuration: {}".format(str(config))) 19 | model = BertForPreTraining(config) 20 | 21 | # Load weights from tf checkpoint 22 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 23 | 24 | # Save pytorch-model 25 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 26 | torch.save(model.state_dict(), pytorch_dump_path) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | ## Required parameters 32 | parser.add_argument("--tf_checkpoint_path", 33 | default = None, 34 | type = str, 35 | required = True, 36 | help = "Path to the TensorFlow checkpoint path.") 37 | parser.add_argument("--bert_config_file", 38 | default = None, 39 | type = str, 40 | required = True, 41 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 42 | "This specifies the model architecture.") 43 | parser.add_argument("--pytorch_dump_path", 44 | default = None, 45 | type = str, 46 | required = True, 47 | help = "Path to the output PyTorch model.") 48 | args = parser.parse_args() 49 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 50 | args.bert_config_file, 51 | args.pytorch_dump_path) 52 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__init__.py -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/clue_compute_metrics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | try: 8 | from scipy.stats import pearsonr, spearmanr 9 | from sklearn.metrics import matthews_corrcoef, f1_score 10 | _has_sklearn = True 11 | except (AttributeError, ImportError) as e: 12 | logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") 13 | _has_sklearn = False 14 | 15 | def simple_accuracy(preds, labels): 16 | return (preds == labels).mean() 17 | 18 | def acc_and_f1(preds, labels): 19 | acc = simple_accuracy(preds, labels) 20 | f1 = f1_score(y_true=labels, y_pred=preds,average="macro") 21 | return { 22 | "acc": acc, 23 | "f1": f1, 24 | "acc_and_f1": (acc + f1) / 2, 25 | } 26 | 27 | 28 | def pearson_and_spearman(preds, labels): 29 | pearson_corr = pearsonr(preds, labels)[0] 30 | spearman_corr = spearmanr(preds, labels)[0] 31 | return { 32 | "pearson": pearson_corr, 33 | "spearmanr": spearman_corr, 34 | "corr": (pearson_corr + spearman_corr) / 2, 35 | } 36 | 37 | def compute_metrics(task_name, preds, labels): 38 | assert len(preds) == len(labels) 39 | if task_name == "cls": 40 | return {"acc": simple_accuracy(preds, labels)} 41 | elif task_name == "cmnli": 42 | return {"acc": simple_accuracy(preds, labels)} 43 | elif task_name == "ocnli": 44 | return {"acc": simple_accuracy(preds, labels)} 45 | elif task_name == "iflytek": 46 | return {"acc": simple_accuracy(preds, labels)} 47 | elif task_name == "wsc": 48 | return {"acc": simple_accuracy(preds, labels)} 49 | elif task_name == "tnews": 50 | return {"acc": simple_accuracy(preds, labels)} 51 | elif task_name == "afqmc": 52 | return {"acc": simple_accuracy(preds, labels)} 53 | elif task_name == "copa": 54 | return {"acc": simple_accuracy(preds, labels)} 55 | elif task_name == "cic": 56 | return {"acc": acc_and_f1(preds, labels)} 57 | else: 58 | raise KeyError(task_name) 59 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/afqmc_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/cmnli_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/copa_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/csl_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/iflytek_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/tnews_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/wsc_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import InputExample, InputFeatures, DataProcessor 2 | from .clue import (clue_output_modes, clue_processors, clue_tasks_num_labels, 3 | clue_convert_examples_to_features, collate_fn, xlnet_collate_fn) 4 | 5 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import copy 4 | import json 5 | 6 | class InputExample(object): 7 | """ 8 | A single training/test example for simple sequence classification. 9 | 10 | Args: 11 | guid: Unique id for the example. 12 | text_a: string. The untokenized text of the first sequence. For single 13 | sequence tasks, only this sequence must be specified. 14 | text_b: (Optional) string. The untokenized text of the second sequence. 15 | Only must be specified for sequence pair tasks. 16 | label: (Optional) string. The label of the example. This should be 17 | specified for train and dev examples, but not for test examples. 18 | """ 19 | def __init__(self, guid, text_a, text_b=None, label=None): 20 | self.guid = guid 21 | self.text_a = text_a 22 | self.text_b = text_b 23 | self.label = label 24 | 25 | def __repr__(self): 26 | return str(self.to_json_string()) 27 | 28 | def to_dict(self): 29 | """Serializes this instance to a Python dictionary.""" 30 | output = copy.deepcopy(self.__dict__) 31 | return output 32 | 33 | def to_json_string(self): 34 | """Serializes this instance to a JSON string.""" 35 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 36 | 37 | 38 | class InputFeatures(object): 39 | """ 40 | A single set of features of data. 41 | 42 | Args: 43 | input_ids: Indices of input sequence tokens in the vocabulary. 44 | attention_mask: Mask to avoid performing attention on padding token indices. 45 | Mask values selected in ``[0, 1]``: 46 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 47 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 
48 | label: Label corresponding to the input 49 | """ 50 | 51 | def __init__(self, input_ids, attention_mask, token_type_ids, label,input_len): 52 | self.input_ids = input_ids 53 | self.attention_mask = attention_mask 54 | self.token_type_ids = token_type_ids 55 | self.input_len = input_len 56 | self.label = label 57 | 58 | def __repr__(self): 59 | return str(self.to_json_string()) 60 | 61 | def to_dict(self): 62 | """Serializes this instance to a Python dictionary.""" 63 | output = copy.deepcopy(self.__dict__) 64 | return output 65 | 66 | def to_json_string(self): 67 | """Serializes this instance to a JSON string.""" 68 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 69 | 70 | 71 | class DataProcessor(object): 72 | """Base class for data converters for sequence classification data sets.""" 73 | 74 | def get_train_examples(self, data_dir): 75 | """Gets a collection of `InputExample`s for the train set.""" 76 | raise NotImplementedError() 77 | 78 | def get_dev_examples(self, data_dir): 79 | """Gets a collection of `InputExample`s for the dev set.""" 80 | raise NotImplementedError() 81 | 82 | def get_labels(self): 83 | """Gets the list of labels for this data set.""" 84 | raise NotImplementedError() 85 | 86 | @classmethod 87 | def _read_tsv(cls, input_file, quotechar=None): 88 | """Reads a tab separated value file.""" 89 | with open(input_file, "r", encoding="utf-8-sig") as f: 90 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 91 | lines = [] 92 | for line in reader: 93 | lines.append(line) 94 | return lines 95 | 96 | @classmethod 97 | def _read_json(cls, input_file): 98 | """Reads a json list file.""" 99 | with open(input_file, "r") as f: 100 | reader = f.readlines() 101 | lines = [] 102 | for line in reader: 103 | lines.append(json.loads(line.strip())) 104 | return lines 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_afqmc.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="afqmc" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 
34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_cic.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="cic" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
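# The call below fine-tunes the checkpoint in $ROBERTA_WWM_SMALL_DIR on the files under
# $CLUE_DATA_DIR/$TASK_NAME/: --do_train --do_eval trains and then evaluates on the dev split,
# --max_seq_length=32 truncates or pads each input to 32 tokens, and training runs with a
# per-GPU batch size of 64 at learning rate 2e-5 for 15 epochs, logging and saving checkpoints
# to $OUTPUT_DIR every 300 steps. Run the script with no arguments to train and evaluate, or
# with "predict" (handled further below) to write predictions for the test set.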
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_cic_torch12_py36.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | source /root/anaconda/bin/activate /root/anaconda/envs/torch_1.2_cu10.0_py36 8 | conda init torch_1.2_cu10.0_py36 9 | 10 | export LC_ALL="en_US.utf8" 11 | TASK_NAME="cic" 12 | MODEL_NAME="chinese_rbtl3_pytorch" 13 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 14 | echo "CURRENT_DIR:"+$CURRENT_DIR 15 | export CUDA_VISIBLE_DEVICES="0" 16 | export CLUE_DATA_DIR=../../../datasets # that is under project path 17 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 18 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 19 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 20 | 21 | # download base model if not exists 22 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 23 | mkdir -p $ROBERTA_WWM_SMALL_DIR 24 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 25 | fi 26 | cd $ROBERTA_WWM_SMALL_DIR 27 | 28 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 29 | echo "Model not exists, will downloda it now..." 30 | # rm * 31 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 32 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 33 | unzip chinese_rbtl3_pytorch.zip 34 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 35 | else 36 | echo "Model exists, will reuse it." 37 | fi 38 | 39 | # run task 40 | cd $CURRENT_DIR 41 | echo "Start running..." 42 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 43 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 44 | 45 | if [ $# == 0 ]; then 46 | echo "Start training..." 
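# Same invocation as in run_classifier_cic.sh, but this variant first activates a pinned
# conda environment (PyTorch 1.2, CUDA 10.0, Python 3.6) and trains with a smaller per-GPU
# batch size (32) for only 2 epochs. The predict branch below still passes
# --num_train_epochs=15, which is harmless because no training happens when only
# --do_predict is set.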
47 | python run_classifier.py \ 48 | --model_type=bert \ 49 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 50 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 51 | --task_name=$TASK_NAME \ 52 | --do_train \ 53 | --do_eval \ 54 | --do_lower_case \ 55 | --max_seq_length=32 \ 56 | --per_gpu_train_batch_size=32 \ 57 | --per_gpu_eval_batch_size=32 \ 58 | --learning_rate=2e-5 \ 59 | --num_train_epochs=2 \ 60 | --logging_steps=300 \ 61 | --save_steps=300 \ 62 | --output_dir=$OUTPUT_DIR \ 63 | --overwrite_output_dir \ 64 | --seed=42 65 | 66 | # run below lines to generate predicted file on test.json 67 | elif [ $1 == "predict" ]; then 68 | echo "Start predict..." 69 | python run_classifier.py \ 70 | --model_type=bert \ 71 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 72 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 73 | --task_name=$TASK_NAME \ 74 | --do_predict \ 75 | --do_lower_case \ 76 | --max_seq_length=32 \ 77 | --per_gpu_train_batch_size=32 \ 78 | --per_gpu_eval_batch_size=32 \ 79 | --learning_rate=2e-5 \ 80 | --num_train_epochs=15 \ 81 | --logging_steps=300 \ 82 | --save_steps=300 \ 83 | --output_dir=$OUTPUT_DIR \ 84 | --overwrite_output_dir \ 85 | --seed=42 86 | fi 87 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_iflytek.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="iflytek" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
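# iflytek examples are long app descriptions, so this script uses --max_seq_length=256 instead
# of the 32 tokens used by the short-text tasks above, and trains for 6 epochs rather than 15.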
44 |   python run_classifier.py \
45 |     --model_type=bert \
46 |     --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \
47 |     --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \
48 |     --task_name=$TASK_NAME \
49 |     --do_train \
50 |     --do_eval \
51 |     --do_lower_case \
52 |     --max_seq_length=256 \
53 |     --per_gpu_train_batch_size=64 \
54 |     --per_gpu_eval_batch_size=32 \
55 |     --learning_rate=2e-5 \
56 |     --num_train_epochs=6 \
57 |     --logging_steps=300 \
58 |     --save_steps=300 \
59 |     --output_dir=$OUTPUT_DIR \
60 |     --overwrite_output_dir \
61 |     --seed=42
62 | 
63 | # run the lines below to generate a prediction file on test.json
64 | elif [ $1 == "predict" ]; then
65 |   echo "Start predicting..."
66 |   python run_classifier.py \
67 |     --model_type=bert \
68 |     --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \
69 |     --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \
70 |     --task_name=$TASK_NAME \
71 |     --do_predict \
72 |     --do_lower_case \
73 |     --max_seq_length=256 \
74 |     --per_gpu_train_batch_size=32 \
75 |     --per_gpu_eval_batch_size=32 \
76 |     --learning_rate=2e-5 \
77 |     --num_train_epochs=6 \
78 |     --logging_steps=300 \
79 |     --save_steps=300 \
80 |     --output_dir=$OUTPUT_DIR \
81 |     --overwrite_output_dir \
82 |     --seed=42
83 | fi
84 | 
--------------------------------------------------------------------------------
/baselines/models_pytorch/classifier_pytorch/run_classifier_iflytek_original.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # @Author: bo.shi
3 | # @Date: 2019-11-04 09:56:36
4 | # @Last Modified by: bo.shi
5 | # @Last Modified time: 2020-01-01 11:43:42
6 | 
7 | TASK_NAME="iflytek"
8 | MODEL_NAME="../../../../local_models/chinese_rbtl3_pytorch"
9 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
10 | export CUDA_VISIBLE_DEVICES="0"
11 | export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
12 | export BERT_WWM_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME
13 | export CLUE_DATA_DIR=$CURRENT_DIR/CLUEdatasets
14 | export pretrained_model_url=https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip
15 | 
16 | # download and unzip dataset
17 | if [ ! -d $CLUE_DATA_DIR ]; then
18 |   mkdir -p $CLUE_DATA_DIR
19 |   echo "makedir $CLUE_DATA_DIR"
20 | fi
21 | cd $CLUE_DATA_DIR
22 | if [ ! -d $TASK_NAME ]; then
23 |   mkdir $TASK_NAME
24 |   echo "makedir $CLUE_DATA_DIR/$TASK_NAME"
25 | fi
26 | cd $TASK_NAME
27 | if [ ! -f "train.json" ] || [ ! -f "dev.json" ] || [ ! -f "test.json" ]; then
28 |   rm *
29 |   wget https://storage.googleapis.com/cluebenchmark/tasks/iflytek_public.zip
30 |   unzip iflytek_public.zip
31 |   rm iflytek_public.zip
32 | else
33 |   echo "data exists"
34 | fi
35 | echo "Finished downloading dataset."
36 | 
37 | # make output dir
38 | if [ ! -d $CURRENT_DIR/${TASK_NAME}_output ]; then
39 |   mkdir -p $CURRENT_DIR/${TASK_NAME}_output
40 |   echo "makedir $CURRENT_DIR/${TASK_NAME}_output"
41 | fi
42 | 
43 | # run task
44 | cd $CURRENT_DIR
45 | echo "Start running..."
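# With no arguments the block below fine-tunes on the freshly downloaded train.json and
# evaluates on dev.json; passing "predict" as the first argument instead generates predictions
# for test.json under ${TASK_NAME}_output/. Note that MODEL_NAME points at a local checkpoint
# directory and $pretrained_model_url is exported for reference only; this script does not
# download the pretrained model itself.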
46 | if [ $# == 0 ]; then 47 | python run_classifier.py \ 48 | --model_type=bert \ 49 | --model_name_or_path=$MODEL_NAME \ 50 | --task_name=$TASK_NAME \ 51 | --do_train \ 52 | --do_eval \ 53 | --do_lower_case \ 54 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 55 | --max_seq_length=128 \ 56 | --per_gpu_train_batch_size=16 \ 57 | --per_gpu_eval_batch_size=16 \ 58 | --learning_rate=2e-5 \ 59 | --num_train_epochs=3.0 \ 60 | --logging_steps=759 \ 61 | --save_steps=759 \ 62 | --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ 63 | --overwrite_output_dir \ 64 | --seed=42 65 | elif [ $1 == "predict" ]; then 66 | echo "Start predict..." 67 | python run_classifier.py \ 68 | --model_type=bert \ 69 | --model_name_or_path=$MODEL_NAME \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 74 | --max_seq_length=128 \ 75 | --per_gpu_train_batch_size=16 \ 76 | --per_gpu_eval_batch_size=16 \ 77 | --learning_rate=2e-5 \ 78 | --num_train_epochs=3.0 \ 79 | --logging_steps=759 \ 80 | --save_steps=759 \ 81 | --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ 82 | --overwrite_output_dir \ 83 | --seed=42 84 | fi 85 | 86 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_qbqtc.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="qbqtc" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_tnews.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="tnews" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_triclue.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="triclue" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class ProgressBar(object): 4 | ''' 5 | custom progress bar 6 | Example: 7 | >>> pbar = ProgressBar(n_total=30,desc='Training') 8 | >>> step = 2 9 | >>> pbar(step=step) 10 | ''' 11 | def __init__(self, n_total,width=30,desc = 'Training'): 12 | self.width = width 13 | self.n_total = n_total 14 | self.start_time = time.time() 15 | self.desc = desc 16 | 17 | def 
__call__(self, step, info={}): 18 | now = time.time() 19 | current = step + 1 20 | recv_per = current / self.n_total 21 | bar = f'[{self.desc}] {current}/{self.n_total} [' 22 | if recv_per >= 1: 23 | recv_per = 1 24 | prog_width = int(self.width * recv_per) 25 | if prog_width > 0: 26 | bar += '=' * (prog_width - 1) 27 | if current< self.n_total: 28 | bar += ">" 29 | else: 30 | bar += '=' 31 | bar += '.' * (self.width - prog_width) 32 | bar += ']' 33 | show_bar = f"\r{bar}" 34 | time_per_unit = (now - self.start_time) / current 35 | if current < self.n_total: 36 | eta = time_per_unit * (self.n_total - current) 37 | if eta > 3600: 38 | eta_format = ('%d:%02d:%02d' % 39 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 40 | elif eta > 60: 41 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 42 | else: 43 | eta_format = '%ds' % eta 44 | time_info = f' - ETA: {eta_format}' 45 | else: 46 | if time_per_unit >= 1: 47 | time_info = f' {time_per_unit:.1f}s/step' 48 | elif time_per_unit >= 1e-3: 49 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 50 | else: 51 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 52 | 53 | show_bar += time_info 54 | if len(info) != 0: 55 | show_info = f'{show_bar} ' + \ 56 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 57 | print(show_info, end='') 58 | else: 59 | print(show_bar, end='') 60 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.1.1" 2 | 3 | # Work around to update TensorFlow's absl.logging threshold which alters the 4 | # default Python logging output behavior when present. 5 | # see: https://github.com/abseil/abseil-py/issues/99 6 | # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 7 | try: 8 | import absl.logging 9 | absl.logging.set_verbosity('info') 10 | absl.logging.set_stderrthreshold('info') 11 | absl.logging._warn_preinit_stderr = False 12 | except: 13 | pass 14 | 15 | import logging 16 | 17 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 18 | 19 | # Files and general utilities 20 | from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, 21 | cached_path, add_start_docstrings, add_end_docstrings, 22 | WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, 23 | is_tf_available, is_torch_available) 24 | 25 | # Tokenizers 26 | from .tokenization_utils import (PreTrainedTokenizer) 27 | from .tokenization_auto import AutoTokenizer 28 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 29 | from .tokenization_openai import OpenAIGPTTokenizer 30 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 31 | from .tokenization_gpt2 import GPT2Tokenizer 32 | from .tokenization_ctrl import CTRLTokenizer 33 | from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE 34 | from .tokenization_xlm import XLMTokenizer 35 | from .tokenization_roberta import RobertaTokenizer 36 | from .tokenization_distilbert import DistilBertTokenizer 37 | 38 | # Configurations 39 | from .configuration_utils import PretrainedConfig 40 | from .configuration_auto import AutoConfig 41 | from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 42 | from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | from .configuration_transfo_xl 
import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 44 | from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 45 | from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 46 | from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 47 | from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 48 | from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP 49 | from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 50 | from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 51 | 52 | # Modeling 53 | if is_torch_available(): 54 | from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) 55 | from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, 56 | AutoModelWithLMHead) 57 | 58 | from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, 59 | BertForMaskedLM, BertForNextSentencePrediction, 60 | BertForSequenceClassification, BertForMultipleChoice, 61 | BertForTokenClassification, BertForQuestionAnswering, 62 | load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) 63 | from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, 64 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 65 | load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) 66 | from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, 67 | load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) 68 | from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, 69 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 70 | load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) 71 | from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel, 72 | CTRLLMHeadModel, 73 | CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) 74 | from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, 75 | XLNetForSequenceClassification, XLNetForMultipleChoice, 76 | XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering, 77 | load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) 78 | from .modeling_xlm import (XLMPreTrainedModel , XLMModel, 79 | XLMWithLMHeadModel, XLMForSequenceClassification, 80 | XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, 81 | XLM_PRETRAINED_MODEL_ARCHIVE_MAP) 82 | from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, 83 | RobertaForSequenceClassification, RobertaForMultipleChoice, 84 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) 85 | from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, 86 | DistilBertForSequenceClassification, DistilBertForQuestionAnswering, 87 | DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) 88 | from .modeling_albert import AlbertForSequenceClassification 89 | 90 | # Optimization 91 | from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, 92 | WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 93 | if not is_tf_available() and not is_torch_available(): 94 | logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found." 
95 | "Models won't be available and only tokenizers, configuration" 96 | "and file/data utilities can be used.") 97 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" 7 | "It should be used as one of: \n" 8 | ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 9 | ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 10 | ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 11 | ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 12 | ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 13 | ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 14 | else: 15 | if sys.argv[1] == "bert": 16 | try: 17 | from convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 18 | except ImportError: 19 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 20 | "In that case, it requires TensorFlow to be installed. Please see " 21 | "https://www.tensorflow.org/install/ for installation instructions.") 22 | raise 23 | 24 | if len(sys.argv) != 5: 25 | # pylint: disable=line-too-long 26 | print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 27 | else: 28 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 29 | TF_CONFIG = sys.argv.pop() 30 | TF_CHECKPOINT = sys.argv.pop() 31 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 32 | elif sys.argv[1] == "gpt": 33 | from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 34 | if len(sys.argv) < 4 or len(sys.argv) > 5: 35 | # pylint: disable=line-too-long 36 | print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") 37 | else: 38 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 39 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 40 | if len(sys.argv) == 5: 41 | OPENAI_GPT_CONFIG = sys.argv[4] 42 | else: 43 | OPENAI_GPT_CONFIG = "" 44 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 45 | OPENAI_GPT_CONFIG, 46 | PYTORCH_DUMP_OUTPUT) 47 | elif sys.argv[1] == "transfo_xl": 48 | try: 49 | from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 50 | except ImportError: 51 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 52 | "In that case, it requires TensorFlow to be installed. 
Please see " 53 | "https://www.tensorflow.org/install/ for installation instructions.") 54 | raise 55 | if len(sys.argv) < 4 or len(sys.argv) > 5: 56 | # pylint: disable=line-too-long 57 | print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 58 | else: 59 | if 'ckpt' in sys.argv[2].lower(): 60 | TF_CHECKPOINT = sys.argv[2] 61 | TF_DATASET_FILE = "" 62 | else: 63 | TF_DATASET_FILE = sys.argv[2] 64 | TF_CHECKPOINT = "" 65 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 66 | if len(sys.argv) == 5: 67 | TF_CONFIG = sys.argv[4] 68 | else: 69 | TF_CONFIG = "" 70 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 71 | elif sys.argv[1] == "gpt2": 72 | try: 73 | from convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 74 | except ImportError: 75 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 76 | "In that case, it requires TensorFlow to be installed. Please see " 77 | "https://www.tensorflow.org/install/ for installation instructions.") 78 | raise 79 | 80 | if len(sys.argv) < 4 or len(sys.argv) > 5: 81 | # pylint: disable=line-too-long 82 | print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 83 | else: 84 | TF_CHECKPOINT = sys.argv[2] 85 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 86 | if len(sys.argv) == 5: 87 | TF_CONFIG = sys.argv[4] 88 | else: 89 | TF_CONFIG = "" 90 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 91 | elif sys.argv[1] == "xlnet": 92 | try: 93 | from convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch 94 | except ImportError: 95 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 96 | "In that case, it requires TensorFlow to be installed. 
Please see " 97 | "https://www.tensorflow.org/install/ for installation instructions.") 98 | raise 99 | 100 | if len(sys.argv) < 5 or len(sys.argv) > 6: 101 | # pylint: disable=line-too-long 102 | print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") 103 | else: 104 | TF_CHECKPOINT = sys.argv[2] 105 | TF_CONFIG = sys.argv[3] 106 | PYTORCH_DUMP_OUTPUT = sys.argv[4] 107 | if len(sys.argv) == 6: 108 | FINETUNING_TASK = sys.argv[5] 109 | else: 110 | FINETUNING_TASK = None 111 | 112 | convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, 113 | TF_CONFIG, 114 | PYTORCH_DUMP_OUTPUT, 115 | FINETUNING_TASK) 116 | elif sys.argv[1] == "xlm": 117 | from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch 118 | 119 | if len(sys.argv) != 4: 120 | # pylint: disable=line-too-long 121 | print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") 122 | else: 123 | XLM_CHECKPOINT_PATH = sys.argv[2] 124 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 125 | 126 | convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-36.pyc 
-------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-36.pyc -------------------------------------------------------------------------------- 
/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-37.pyc -------------------------------------------------------------------------------- 
/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-36.pyc -------------------------------------------------------------------------------- 
/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-37.pyc 
-------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 44 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", 45 | } 46 | 47 | 48 | class BertConfig(PretrainedConfig): 49 | r""" 50 | :class:`~transformers.BertConfig` is the configuration class to store the configuration of a 51 | `BertModel`. 52 | 53 | 54 | Arguments: 55 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 56 | hidden_size: Size of the encoder layers and the pooler layer. 57 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 58 | num_attention_heads: Number of attention heads for each attention layer in 59 | the Transformer encoder. 60 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 61 | layer in the Transformer encoder. 62 | hidden_act: The non-linear activation function (function or string) in the 63 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 64 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 65 | layers in the embeddings, encoder, and pooler. 66 | attention_probs_dropout_prob: The dropout ratio for the attention 67 | probabilities. 68 | max_position_embeddings: The maximum sequence length that this model might 69 | ever be used with. 
Typically set this to something large just in case 70 | (e.g., 512 or 1024 or 2048). 71 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 72 | `BertModel`. 73 | initializer_range: The sttdev of the truncated_normal_initializer for 74 | initializing all weight matrices. 75 | layer_norm_eps: The epsilon used by LayerNorm. 76 | """ 77 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 78 | 79 | def __init__(self, 80 | vocab_size_or_config_json_file=30522, 81 | hidden_size=768, 82 | num_hidden_layers=12, 83 | num_attention_heads=12, 84 | intermediate_size=3072, 85 | hidden_act="gelu", 86 | hidden_dropout_prob=0.1, 87 | attention_probs_dropout_prob=0.1, 88 | max_position_embeddings=512, 89 | type_vocab_size=2, 90 | initializer_range=0.02, 91 | layer_norm_eps=1e-12, 92 | **kwargs): 93 | super(BertConfig, self).__init__(**kwargs) 94 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 95 | and isinstance(vocab_size_or_config_json_file, unicode)): 96 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 97 | json_config = json.loads(reader.read()) 98 | for key, value in json_config.items(): 99 | self.__dict__[key] = value 100 | elif isinstance(vocab_size_or_config_json_file, int): 101 | self.vocab_size = vocab_size_or_config_json_file 102 | self.hidden_size = hidden_size 103 | self.num_hidden_layers = num_hidden_layers 104 | self.num_attention_heads = num_attention_heads 105 | self.hidden_act = hidden_act 106 | self.intermediate_size = intermediate_size 107 | self.hidden_dropout_prob = hidden_dropout_prob 108 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 109 | self.max_position_embeddings = max_position_embeddings 110 | self.type_vocab_size = type_vocab_size 111 | self.initializer_range = initializer_range 112 | self.layer_norm_eps = layer_norm_eps 113 | else: 114 | raise ValueError("First argument must be either a vocabulary size (int)" 115 | " or the path to a pretrained model config file (str)") 116 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ Salesforce CTRL configuration """ 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} 29 | 30 | class CTRLConfig(PretrainedConfig): 31 | """Configuration class to store the configuration of a `CTRLModel`. 32 | 33 | Args: 34 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. 35 | n_positions: Number of positional embeddings. 36 | n_ctx: Size of the causal mask (usually same as n_positions). 37 | dff: Size of the inner dimension of the FFN. 38 | n_embd: Dimensionality of the embeddings and hidden states. 39 | n_layer: Number of hidden layers in the Transformer encoder. 40 | n_head: Number of attention heads for each attention layer in 41 | the Transformer encoder. 42 | layer_norm_epsilon: epsilon to use in the layer norm layers 43 | resid_pdrop: The dropout probabilitiy for all fully connected 44 | layers in the embeddings, encoder, and pooler. 45 | attn_pdrop: The dropout ratio for the attention 46 | probabilities. 47 | embd_pdrop: The dropout ratio for the embeddings. 48 | initializer_range: The sttdev of the truncated_normal_initializer for 49 | initializing all weight matrices. 50 | """ 51 | pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 52 | 53 | def __init__( 54 | self, 55 | vocab_size_or_config_json_file=246534, 56 | n_positions=256, 57 | n_ctx=256, 58 | n_embd=1280, 59 | dff=8192, 60 | n_layer=48, 61 | n_head=16, 62 | resid_pdrop=0.1, 63 | embd_pdrop=0.1, 64 | attn_pdrop=0.1, 65 | layer_norm_epsilon=1e-6, 66 | initializer_range=0.02, 67 | 68 | num_labels=1, 69 | summary_type='cls_index', 70 | summary_use_proj=True, 71 | summary_activation=None, 72 | summary_proj_to_labels=True, 73 | summary_first_dropout=0.1, 74 | **kwargs 75 | ): 76 | """Constructs CTRLConfig. 77 | 78 | Args: 79 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. 80 | n_positions: Number of positional embeddings. 81 | n_ctx: Size of the causal mask (usually same as n_positions). 82 | dff: Size of the inner dimension of the FFN. 83 | n_embd: Dimensionality of the embeddings and hidden states. 84 | n_layer: Number of hidden layers in the Transformer encoder. 85 | n_head: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | layer_norm_epsilon: epsilon to use in the layer norm layers 88 | resid_pdrop: The dropout probabilitiy for all fully connected 89 | layers in the embeddings, encoder, and pooler. 90 | attn_pdrop: The dropout ratio for the attention 91 | probabilities. 92 | embd_pdrop: The dropout ratio for the embeddings. 93 | initializer_range: The sttdev of the truncated_normal_initializer for 94 | initializing all weight matrices. 
95 | """ 96 | super(CTRLConfig, self).__init__(**kwargs) 97 | 98 | self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 99 | self.n_ctx = n_ctx 100 | self.n_positions = n_positions 101 | self.n_embd = n_embd 102 | self.n_layer = n_layer 103 | self.n_head = n_head 104 | self.dff = dff 105 | self.resid_pdrop = resid_pdrop 106 | self.embd_pdrop = embd_pdrop 107 | self.attn_pdrop = attn_pdrop 108 | self.layer_norm_epsilon = layer_norm_epsilon 109 | self.initializer_range = initializer_range 110 | 111 | self.num_labels = num_labels 112 | self.summary_type = summary_type 113 | self.summary_use_proj = summary_use_proj 114 | self.summary_activation = summary_activation 115 | self.summary_first_dropout = summary_first_dropout 116 | self.summary_proj_to_labels = summary_proj_to_labels 117 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 118 | and isinstance(vocab_size_or_config_json_file, unicode)): 119 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 120 | json_config = json.loads(reader.read()) 121 | for key, value in json_config.items(): 122 | self.__dict__[key] = value 123 | elif not isinstance(vocab_size_or_config_json_file, int): 124 | raise ValueError( 125 | "First argument must be either a vocabulary size (int)" 126 | "or the path to a pretrained model config file (str)" 127 | ) 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 31 | } 32 | 33 | 34 | class DistilBertConfig(PretrainedConfig): 35 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | 37 | def __init__(self, 38 | vocab_size_or_config_json_file=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=False, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4*768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation='gelu', 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs): 53 | super(DistilBertConfig, self).__init__(**kwargs) 54 | 55 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 56 | and isinstance(vocab_size_or_config_json_file, unicode)): 57 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 58 | json_config = json.loads(reader.read()) 59 | for key, value in json_config.items(): 60 | self.__dict__[key] = value 61 | elif isinstance(vocab_size_or_config_json_file, int): 62 | self.vocab_size = vocab_size_or_config_json_file 63 | self.max_position_embeddings = max_position_embeddings 64 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 65 | self.n_layers = n_layers 66 | self.n_heads = n_heads 67 | self.dim = dim 68 | self.hidden_dim = hidden_dim 69 | self.dropout = dropout 70 | self.attention_dropout = attention_dropout 71 | self.activation = activation 72 | self.initializer_range = initializer_range 73 | self.tie_weights_ = tie_weights_ 74 | self.qa_dropout = qa_dropout 75 | self.seq_classif_dropout = seq_classif_dropout 76 | else: 77 | raise ValueError("First argument must be either a vocabulary size (int)" 78 | " or the path to a pretrained model config file (str)") 79 | @property 80 | def hidden_size(self): 81 | return self.dim 82 | 83 | @property 84 | def num_attention_heads(self): 85 | return self.n_heads 86 | 87 | @property 88 | def num_hidden_layers(self): 89 | return self.n_layers 90 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT-2 configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", 30 | "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", 31 | "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", 32 | "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",} 33 | 34 | class GPT2Config(PretrainedConfig): 35 | """Configuration class to store the configuration of a `GPT2Model`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 39 | n_positions: Number of positional embeddings. 40 | n_ctx: Size of the causal mask (usually same as n_positions). 41 | n_embd: Dimensionality of the embeddings and hidden states. 42 | n_layer: Number of hidden layers in the Transformer encoder. 43 | n_head: Number of attention heads for each attention layer in 44 | the Transformer encoder. 45 | layer_norm_epsilon: epsilon to use in the layer norm layers 46 | resid_pdrop: The dropout probabilitiy for all fully connected 47 | layers in the embeddings, encoder, and pooler. 48 | attn_pdrop: The dropout ratio for the attention 49 | probabilities. 50 | embd_pdrop: The dropout ratio for the embeddings. 51 | initializer_range: The sttdev of the truncated_normal_initializer for 52 | initializing all weight matrices. 53 | """ 54 | pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 55 | 56 | def __init__( 57 | self, 58 | vocab_size_or_config_json_file=50257, 59 | n_positions=1024, 60 | n_ctx=1024, 61 | n_embd=768, 62 | n_layer=12, 63 | n_head=12, 64 | resid_pdrop=0.1, 65 | embd_pdrop=0.1, 66 | attn_pdrop=0.1, 67 | layer_norm_epsilon=1e-5, 68 | initializer_range=0.02, 69 | 70 | num_labels=1, 71 | summary_type='cls_index', 72 | summary_use_proj=True, 73 | summary_activation=None, 74 | summary_proj_to_labels=True, 75 | summary_first_dropout=0.1, 76 | **kwargs 77 | ): 78 | """Constructs GPT2Config. 79 | 80 | Args: 81 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 82 | n_positions: Number of positional embeddings. 83 | n_ctx: Size of the causal mask (usually same as n_positions). 84 | n_embd: Dimensionality of the embeddings and hidden states. 85 | n_layer: Number of hidden layers in the Transformer encoder. 86 | n_head: Number of attention heads for each attention layer in 87 | the Transformer encoder. 88 | layer_norm_epsilon: epsilon to use in the layer norm layers 89 | resid_pdrop: The dropout probabilitiy for all fully connected 90 | layers in the embeddings, encoder, and pooler. 91 | attn_pdrop: The dropout ratio for the attention 92 | probabilities. 93 | embd_pdrop: The dropout ratio for the embeddings. 94 | initializer_range: The sttdev of the truncated_normal_initializer for 95 | initializing all weight matrices. 
96 | """ 97 | super(GPT2Config, self).__init__(**kwargs) 98 | 99 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 100 | and isinstance(vocab_size_or_config_json_file, unicode)): 101 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 102 | json_config = json.loads(reader.read()) 103 | for key, value in json_config.items(): 104 | self.__dict__[key] = value 105 | elif isinstance(vocab_size_or_config_json_file, int): 106 | self.vocab_size = vocab_size_or_config_json_file 107 | self.n_ctx = n_ctx 108 | self.n_positions = n_positions 109 | self.n_embd = n_embd 110 | self.n_layer = n_layer 111 | self.n_head = n_head 112 | self.resid_pdrop = resid_pdrop 113 | self.embd_pdrop = embd_pdrop 114 | self.attn_pdrop = attn_pdrop 115 | self.layer_norm_epsilon = layer_norm_epsilon 116 | self.initializer_range = initializer_range 117 | 118 | self.num_labels = num_labels 119 | self.summary_type = summary_type 120 | self.summary_use_proj = summary_use_proj 121 | self.summary_activation = summary_activation 122 | self.summary_first_dropout = summary_first_dropout 123 | self.summary_proj_to_labels = summary_proj_to_labels 124 | else: 125 | raise ValueError( 126 | "First argument must be either a vocabulary size (int)" 127 | "or the path to a pretrained model config file (str)" 128 | ) 129 | 130 | @property 131 | def max_position_embeddings(self): 132 | return self.n_positions 133 | 134 | @property 135 | def hidden_size(self): 136 | return self.n_embd 137 | 138 | @property 139 | def num_attention_heads(self): 140 | return self.n_head 141 | 142 | @property 143 | def num_hidden_layers(self): 144 | return self.n_layer 145 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" 31 | } 32 | 33 | class OpenAIGPTConfig(PretrainedConfig): 34 | """ 35 | Configuration class to store the configuration of a `OpenAIGPTModel`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. 39 | n_positions: Number of positional embeddings. 40 | n_ctx: Size of the causal mask (usually same as n_positions). 
41 | n_embd: Dimensionality of the embeddings and hidden states. 42 | n_layer: Number of hidden layers in the Transformer encoder. 43 | n_head: Number of attention heads for each attention layer in 44 | the Transformer encoder. 45 | afn: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | resid_pdrop: The dropout probabilitiy for all fully connected 48 | layers in the embeddings, encoder, and pooler. 49 | attn_pdrop: The dropout ratio for the attention 50 | probabilities. 51 | embd_pdrop: The dropout ratio for the embeddings. 52 | layer_norm_epsilon: epsilon to use in the layer norm layers 53 | initializer_range: The sttdev of the truncated_normal_initializer for 54 | initializing all weight matrices. 55 | predict_special_tokens: should we predict special tokens (when the model has a LM head) 56 | """ 57 | pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 58 | 59 | def __init__( 60 | self, 61 | vocab_size_or_config_json_file=40478, 62 | n_positions=512, 63 | n_ctx=512, 64 | n_embd=768, 65 | n_layer=12, 66 | n_head=12, 67 | afn="gelu", 68 | resid_pdrop=0.1, 69 | embd_pdrop=0.1, 70 | attn_pdrop=0.1, 71 | layer_norm_epsilon=1e-5, 72 | initializer_range=0.02, 73 | predict_special_tokens=True, 74 | 75 | num_labels=1, 76 | summary_type='cls_index', 77 | summary_use_proj=True, 78 | summary_activation=None, 79 | summary_proj_to_labels=True, 80 | summary_first_dropout=0.1, 81 | **kwargs 82 | ): 83 | """Constructs OpenAIGPTConfig. 84 | """ 85 | super(OpenAIGPTConfig, self).__init__(**kwargs) 86 | 87 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 88 | and isinstance(vocab_size_or_config_json_file, unicode)): 89 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 90 | json_config = json.loads(reader.read()) 91 | for key, value in json_config.items(): 92 | self.__dict__[key] = value 93 | elif isinstance(vocab_size_or_config_json_file, int): 94 | self.vocab_size = vocab_size_or_config_json_file 95 | self.n_ctx = n_ctx 96 | self.n_positions = n_positions 97 | self.n_embd = n_embd 98 | self.n_layer = n_layer 99 | self.n_head = n_head 100 | self.afn = afn 101 | self.resid_pdrop = resid_pdrop 102 | self.embd_pdrop = embd_pdrop 103 | self.attn_pdrop = attn_pdrop 104 | self.layer_norm_epsilon = layer_norm_epsilon 105 | self.initializer_range = initializer_range 106 | self.predict_special_tokens = predict_special_tokens 107 | 108 | self.num_labels = num_labels 109 | self.summary_type = summary_type 110 | self.summary_use_proj = summary_use_proj 111 | self.summary_activation = summary_activation 112 | self.summary_first_dropout = summary_first_dropout 113 | self.summary_proj_to_labels = summary_proj_to_labels 114 | else: 115 | raise ValueError( 116 | "First argument must be either a vocabulary size (int)" 117 | "or the path to a pretrained model config file (str)" 118 | ) 119 | 120 | @property 121 | def max_position_embeddings(self): 122 | return self.n_positions 123 | 124 | @property 125 | def hidden_size(self): 126 | return self.n_embd 127 | 128 | @property 129 | def num_attention_heads(self): 130 | return self.n_head 131 | 132 | @property 133 | def num_hidden_layers(self): 134 | return self.n_layer 135 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_roberta.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Transformer XL configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", 31 | } 32 | 33 | class TransfoXLConfig(PretrainedConfig): 34 | """Configuration class to store the configuration of a `TransfoXLModel`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. 38 | cutoffs: cutoffs for the adaptive softmax 39 | d_model: Dimensionality of the model's hidden states. 
40 | d_embed: Dimensionality of the embeddings 41 | d_head: Dimensionality of the model's heads. 42 | div_val: divident value for adapative input and softmax 43 | pre_lnorm: apply LayerNorm to the input instead of the output 44 | d_inner: Inner dimension in FF 45 | n_layer: Number of hidden layers in the Transformer encoder. 46 | n_head: Number of attention heads for each attention layer in 47 | the Transformer encoder. 48 | tgt_len: number of tokens to predict 49 | ext_len: length of the extended context 50 | mem_len: length of the retained previous heads 51 | same_length: use the same attn length for all tokens 52 | proj_share_all_but_first: True to share all but first projs, False not to share. 53 | attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 54 | clamp_len: use the same pos embeddings after clamp_len 55 | sample_softmax: number of samples in sampled softmax 56 | adaptive: use adaptive softmax 57 | tie_weight: tie the word embedding and softmax weights 58 | dropout: The dropout probabilitiy for all fully connected 59 | layers in the embeddings, encoder, and pooler. 60 | dropatt: The dropout ratio for the attention probabilities. 61 | untie_r: untie relative position biases 62 | embd_pdrop: The dropout ratio for the embeddings. 63 | init: parameter initializer to use 64 | init_range: parameters initialized by U(-init_range, init_range). 65 | proj_init_std: parameters initialized by N(0, init_std) 66 | init_std: parameters initialized by N(0, init_std) 67 | """ 68 | pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 69 | 70 | def __init__(self, 71 | vocab_size_or_config_json_file=267735, 72 | cutoffs=[20000, 40000, 200000], 73 | d_model=1024, 74 | d_embed=1024, 75 | n_head=16, 76 | d_head=64, 77 | d_inner=4096, 78 | div_val=4, 79 | pre_lnorm=False, 80 | n_layer=18, 81 | tgt_len=128, 82 | ext_len=0, 83 | mem_len=1600, 84 | clamp_len=1000, 85 | same_length=True, 86 | proj_share_all_but_first=True, 87 | attn_type=0, 88 | sample_softmax=-1, 89 | adaptive=True, 90 | tie_weight=True, 91 | dropout=0.1, 92 | dropatt=0.0, 93 | untie_r=True, 94 | init="normal", 95 | init_range=0.01, 96 | proj_init_std=0.01, 97 | init_std=0.02, 98 | layer_norm_epsilon=1e-5, 99 | **kwargs): 100 | """Constructs TransfoXLConfig. 
101 | """ 102 | super(TransfoXLConfig, self).__init__(**kwargs) 103 | self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 104 | self.cutoffs = [] 105 | self.cutoffs.extend(cutoffs) 106 | self.tie_weight = tie_weight 107 | if proj_share_all_but_first: 108 | self.tie_projs = [False] + [True] * len(self.cutoffs) 109 | else: 110 | self.tie_projs = [False] + [False] * len(self.cutoffs) 111 | self.d_model = d_model 112 | self.d_embed = d_embed 113 | self.d_head = d_head 114 | self.d_inner = d_inner 115 | self.div_val = div_val 116 | self.pre_lnorm = pre_lnorm 117 | self.n_layer = n_layer 118 | self.n_head = n_head 119 | self.tgt_len = tgt_len 120 | self.ext_len = ext_len 121 | self.mem_len = mem_len 122 | self.same_length = same_length 123 | self.attn_type = attn_type 124 | self.clamp_len = clamp_len 125 | self.sample_softmax = sample_softmax 126 | self.adaptive = adaptive 127 | self.dropout = dropout 128 | self.dropatt = dropatt 129 | self.untie_r = untie_r 130 | self.init = init 131 | self.init_range = init_range 132 | self.proj_init_std = proj_init_std 133 | self.init_std = init_std 134 | self.layer_norm_epsilon = layer_norm_epsilon 135 | 136 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 137 | and isinstance(vocab_size_or_config_json_file, unicode)): 138 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 139 | json_config = json.loads(reader.read()) 140 | for key, value in json_config.items(): 141 | self.__dict__[key] = value 142 | elif not isinstance(vocab_size_or_config_json_file, int): 143 | raise ValueError("First argument must be either a vocabulary size (int)" 144 | " or the path to a pretrained model config file (str)") 145 | 146 | @property 147 | def max_position_embeddings(self): 148 | return self.tgt_len + self.ext_len + self.mem_len 149 | 150 | @property 151 | def vocab_size(self): 152 | return self.n_token 153 | 154 | @vocab_size.setter 155 | def vocab_size(self, value): 156 | self.n_token = value 157 | 158 | @property 159 | def hidden_size(self): 160 | return self.d_model 161 | 162 | @property 163 | def num_attention_heads(self): 164 | return self.n_head 165 | 166 | @property 167 | def num_hidden_layers(self): 168 | return self.n_layer 169 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLNet configuration """ 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", 30 | 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", 31 | } 32 | 33 | 34 | class XLNetConfig(PretrainedConfig): 35 | """Configuration class to store the configuration of a ``XLNetModel``. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. 39 | d_model: Size of the encoder layers and the pooler layer. 40 | n_layer: Number of hidden layers in the Transformer encoder. 41 | n_head: Number of attention heads for each attention layer in 42 | the Transformer encoder. 43 | d_inner: The size of the "intermediate" (i.e., feed-forward) 44 | layer in the Transformer encoder. 45 | ff_activation: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | untie_r: untie relative position biases 48 | attn_type: 'bi' for XLNet, 'uni' for Transformer-XL 49 | 50 | dropout: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | initializer_range: The sttdev of the truncated_normal_initializer for 53 | initializing all weight matrices. 54 | layer_norm_eps: The epsilon used by LayerNorm. 55 | 56 | dropout: float, dropout rate. 57 | init: str, the initialization scheme, either "normal" or "uniform". 58 | init_range: float, initialize the parameters with a uniform distribution 59 | in [-init_range, init_range]. Only effective when init="uniform". 60 | init_std: float, initialize the parameters with a normal distribution 61 | with mean 0 and stddev init_std. Only effective when init="normal". 62 | mem_len: int, the number of tokens to cache. 63 | reuse_len: int, the number of tokens in the currect batch to be cached 64 | and reused in the future. 65 | bi_data: bool, whether to use bidirectional input pipeline. 66 | Usually set to True during pretraining and False during finetuning. 67 | clamp_len: int, clamp all relative distances larger than clamp_len. 68 | -1 means no clamping. 69 | same_length: bool, whether to use the same attention length for each token. 70 | finetuning_task: name of the glue task on which the model was fine-tuned if any 71 | """ 72 | pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 73 | 74 | def __init__(self, 75 | vocab_size_or_config_json_file=32000, 76 | d_model=1024, 77 | n_layer=24, 78 | n_head=16, 79 | d_inner=4096, 80 | max_position_embeddings=512, 81 | ff_activation="gelu", 82 | untie_r=True, 83 | attn_type="bi", 84 | 85 | initializer_range=0.02, 86 | layer_norm_eps=1e-12, 87 | 88 | dropout=0.1, 89 | mem_len=None, 90 | reuse_len=None, 91 | bi_data=False, 92 | clamp_len=-1, 93 | same_length=False, 94 | 95 | finetuning_task=None, 96 | num_labels=2, 97 | summary_type='last', 98 | summary_use_proj=True, 99 | summary_activation='tanh', 100 | summary_last_dropout=0.1, 101 | start_n_top=5, 102 | end_n_top=5, 103 | **kwargs): 104 | """Constructs XLNetConfig. 
 105 | """ 106 | super(XLNetConfig, self).__init__(**kwargs) 107 | 108 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 109 | and isinstance(vocab_size_or_config_json_file, unicode)): 110 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 111 | json_config = json.loads(reader.read()) 112 | for key, value in json_config.items(): 113 | self.__dict__[key] = value 114 | elif isinstance(vocab_size_or_config_json_file, int): 115 | self.n_token = vocab_size_or_config_json_file 116 | self.d_model = d_model 117 | self.n_layer = n_layer 118 | self.n_head = n_head 119 | assert d_model % n_head == 0 120 | self.d_head = d_model // n_head 121 | self.ff_activation = ff_activation 122 | self.d_inner = d_inner 123 | self.untie_r = untie_r 124 | self.attn_type = attn_type 125 | 126 | self.initializer_range = initializer_range 127 | self.layer_norm_eps = layer_norm_eps 128 | 129 | self.dropout = dropout 130 | self.mem_len = mem_len 131 | self.reuse_len = reuse_len 132 | self.bi_data = bi_data 133 | self.clamp_len = clamp_len 134 | self.same_length = same_length 135 | 136 | self.finetuning_task = finetuning_task 137 | self.num_labels = num_labels 138 | self.summary_type = summary_type 139 | self.summary_use_proj = summary_use_proj 140 | self.summary_activation = summary_activation 141 | self.summary_last_dropout = summary_last_dropout 142 | self.start_n_top = start_n_top 143 | self.end_n_top = end_n_top 144 | else: 145 | raise ValueError("First argument must be either a vocabulary size (int)" 146 | " or the path to a pretrained model config file (str)") 147 | 148 | @property 149 | def max_position_embeddings(self): 150 | return -1 151 | 152 | @property 153 | def vocab_size(self): 154 | return self.n_token 155 | 156 | @vocab_size.setter 157 | def vocab_size(self, value): 158 | self.n_token = value 159 | 160 | @property 161 | def hidden_size(self): 162 | return self.d_model 163 | 164 | @property 165 | def num_attention_heads(self): 166 | return self.n_head 167 | 168 | @property 169 | def num_hidden_layers(self): 170 | return self.n_layer 171 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_bert import BertTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | } 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | 'distilbert-base-uncased': 512, 41 | 'distilbert-base-uncased-distilled-squad': 512, 42 | } 43 | 44 | 45 | class DistilBertTokenizer(BertTokenizer): 46 | r""" 47 | Constructs a DistilBertTokenizer. 48 | :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 49 | 50 | Args: 51 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 52 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 53 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 54 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 55 | minimum of this value (if specified) and the underlying BERT model's sequence length. 56 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 57 | do_wordpiece_only=False 58 | """ 59 | 60 | vocab_files_names = VOCAB_FILES_NAMES 61 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 62 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 63 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/tokenization_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from .tokenization_gpt2 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | }, 50 | 'merges_file': 51 | { 52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingco.co/bert/roberta-large-mnli-merges.txt".replace("huggingco.co", "huggingface.co"), 55 | }, 56 | } 57 | 58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 59 | 'roberta-base': 512, 60 | 'roberta-large': 512, 61 | 'roberta-large-mnli': 512, 62 | } 63 | 64 | 65 | class RobertaTokenizer(GPT2Tokenizer): 66 | """ 67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 68 | - Byte-level Byte-Pair-Encoding 69 | - Requires a space to start the input string => the encoding methods should be called with the 70 | ``add_prefix_space`` flag set to ``True``. 71 | Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve 72 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` 73 | """ 74 | vocab_files_names = VOCAB_FILES_NAMES 75 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 76 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 77 | 78 | def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", 79 | cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): 80 | super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, 81 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, 82 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, 83 | mask_token=mask_token, **kwargs) 84 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens 85 | self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens 86 | 87 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 88 | """ 89 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 90 | by concatenating and adding special tokens. 91 | A RoBERTa sequence has the following format: 92 | single sequence: <s> X </s> 93 | pair of sequences: <s> A </s></s> B </s> 94 | """ 95 | if token_ids_1 is None: 96 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 97 | cls = [self.cls_token_id] 98 | sep = [self.sep_token_id] 99 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 100 | 101 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 102 | """ 103 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 104 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
105 | 106 | Args: 107 | token_ids_0: list of ids (must not contain special tokens) 108 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids 109 | for sequence pairs 110 | already_has_special_tokens: (default False) Set to True if the token list is already formatted with 111 | special tokens for the model 112 | 113 | Returns: 114 | A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 115 | """ 116 | if already_has_special_tokens: 117 | if token_ids_1 is not None: 118 | raise ValueError("You should not supply a second sequence if the provided sequence of " 119 | "ids is already formatted with special tokens for the model.") 120 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 121 | 122 | if token_ids_1 is None: 123 | return [1] + ([0] * len(token_ids_0)) + [1] 124 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 125 | 126 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 127 | """ 128 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 129 | A RoBERTa sequence pair mask has the following format: 130 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 131 | | first sequence | second sequence 132 | 133 | if token_ids_1 is None, only returns the first portion of the mask (0's). 134 | """ 135 | sep = [self.sep_token_id] 136 | cls = [self.cls_token_id] 137 | 138 | if token_ids_1 is None: 139 | return len(cls + token_ids_0 + sep) * [0] 140 | return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] 141 | -------------------------------------------------------------------------------- /baselines/multi/README.md: -------------------------------------------------------------------------------- 1 | # 多策略方法汇总 2 | 3 | |组合编号| 策略名称(包含策略id) | 亦可用于英文? | 亦可用于CV? | 借助人力? | CIC (F1 macro) | 4 | |---|---|---|---|---|---| 5 | |0| 基线(2,3,4) | yes | | | 0.7849 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /baselines/multi/simple_baseline/README.md: -------------------------------------------------------------------------------- 1 | ### 算法描述 2 | 3 | 这里我们组合了如下策略: 4 | 5 | 1. 通过训练一个分类模型根据预测的熵找出数据中最有可能标签错误的样本,并丢弃; 6 | 2. 使用数据增强提升数据量,即对输入文本的增强; 7 | 3. 
将标签定义增强后添加到训练集中增加数据量。比如标签定义买家抱怨商品了;标签定义增强->买家抱怨商品涨价了。 8 | 9 | -------------------------------------------------------------------------------- /baselines/multi/simple_baseline/simple_baseline.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from textda.data_expansion import data_expansion 3 | import os 4 | import sys 5 | path = os.path.split(os.getcwd()) 6 | sys.path.append(path) 7 | 8 | from dckit import read_datasets, random_split_data, evaluate 9 | from baselines.single.data_aug.data_aug import data_aug 10 | from baselines.single.def_aug.def_aug import def_aug 11 | from baselines.single.delete_noise.delete_noise import delete_noise 12 | 13 | 14 | def simple_baseline(data, use_delete=False, use_aug=False, use_def=False): 15 | if use_delete: 16 | data = delete_noise(data) 17 | if use_aug: 18 | data = data_aug(data) 19 | if use_def: 20 | data = def_aug(data) 21 | return data 22 | 23 | 24 | def main(): 25 | data = read_datasets() 26 | data = simple_baseline(data) 27 | random_split_data(data) 28 | f1 = evaluate() 29 | print('Macro-F1=', f1) 30 | return f1 31 | 32 | 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /baselines/single/README.md: -------------------------------------------------------------------------------- 1 | # 单策略方法汇总 2 | 3 | |策略编号| 策略名称 | 亦可用于英文? | 亦可用于CV? | 借助人力? | CIC Marco-F1 | TNEWS Marco-F1 | IFLYTEK Marco-F1 |AFQMC Marco-F1| 4 | |---|---|---|---|---|---|---|---|---| 5 | |0| 基线 | yes | yes | | 0.7278 |0.4683|0.3097|0.5904| 6 | |1| 人类表现 (Accuracy) | | | | 0.8740 |0.71|0.66|0.81| 7 | |2| [数据增强](data_aug) | yes | yes | | 0.7462 |0.4805|0.4015|| 8 | |3| [噪声标签删除](delete_noise) | yes | yes | | 0.7332 |0.4934|0.2941|| 9 | |4| [定义增强](definition_aug) | yes | | | 0.7822 |0.4570|0.3371|| 10 | -------------------------------------------------------------------------------- /baselines/single/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/__init__.py -------------------------------------------------------------------------------- /baselines/single/data_aug/README.md: -------------------------------------------------------------------------------- 1 | # 数据增强 2 | 3 | 对输入数据进行增强 4 | 5 | # 参数说明 6 | 7 | 这里只用了增强次数作为参数 8 | 9 | # 参数选择实验 10 | 11 | |增强次数 | 0 | 1 | 3 | 5 |10| 12 | |---|---|---|---|---|---| 13 | | Marco-F1| .7278 | .7388 | .7462 | .7363 | .6694 | 14 | 15 | 16 | # 可能问题 17 | 这里的数据增强部分用了[synonyms](https://github.com/chatopera/Synonyms),其中资源下载可能存在问题。如果存在问题请按照如下设置: 18 | ```bash 19 | export SYNONYMS_WORD2VEC_BIN_URL_ZH_CN=https://gitee.com/chatopera/cskefu/attach_files/610602/download/words.vector.gz 20 | pip install -U synonyms 21 | python -c "import synonyms" # download word vectors file 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /baselines/single/data_aug/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/data_aug/__init__.py -------------------------------------------------------------------------------- /baselines/single/data_aug/data_aug.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from tqdm import tqdm 3 | from 
textda.data_expansion import data_expansion 4 | import sys 5 | sys.path.append('../../../') 6 | sys.path.append('../../../dckit') 7 | from dckit import read_datasets, random_split_data, evaluate 8 | import swifter 9 | import pandas as pd 10 | import numpy as np 11 | 12 | 13 | def aug_function(sentence, alpha_ri=0.1, alpha_rs=0, num_aug=3): 14 | aug_list = data_expansion(sentence, alpha_ri, alpha_rs, p_rd=0.2, num_aug=num_aug) 15 | if len(aug_list) != num_aug: 16 | l = len(aug_list) 17 | if l < num_aug: 18 | for i in range(num_aug-l): 19 | aug_list.append(None) 20 | else: 21 | aug_list = aug_list[:num_aug] 22 | return aug_list 23 | 24 | 25 | def data_aug(data, num_aug=3): 26 | json_data = data['json'] 27 | df = pd.DataFrame.from_records(json_data) 28 | df.columns = json_data[0].keys() 29 | aug_lists = df['sentence'].swifter.apply(aug_function) 30 | aug_lens = [len(aug_list) for aug_list in aug_lists] 31 | flatten_list = [j for sub in aug_lists for j in sub] 32 | newdf = pd.DataFrame(np.repeat(df.values, num_aug, axis=0), columns=df.columns) 33 | newdf['sentence'] = flatten_list 34 | # remove none 35 | newdf.dropna(inplace=True) 36 | data["json"] = newdf.to_dict(orient='records') 37 | return data 38 | 39 | 40 | 41 | def main(): 42 | data = read_datasets() 43 | data = data_aug(data) 44 | random_split_data(data) 45 | f1 = evaluate() 46 | print('Macro-F1=', f1) 47 | return f1 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /baselines/single/data_aug/parallel_textda.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor, as_completed 2 | from textda.data_expansion import data_expansion 3 | 4 | 5 | def expand(sentence, label, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.2, num_aug=9): 6 | res = data_expansion(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, 7 | p_rd=p_rd, num_aug=num_aug) 8 | return res, [label] * len(res) 9 | 10 | 11 | def parallel_expansion(sentences, labels, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.2, num_aug=9, workers=16): 12 | ''' 13 | if you set alpha_ri and alpha_rs is 0 that means use linear classifier for it, and insensitive to word location 14 | :param sentences: input sentence list 15 | :param labels: input label list 16 | :param alpha_sr: Replace synonym control param. bigger means more words are Replace 17 | :param alpha_ri: Random insert. bigger means more words are Insert 18 | :param alpha_rs: Random swap. bigger means more words are swap 19 | :param p_rd: Random delete. 
bigger means more words are deleted 20 | :param num_aug: How many times do you repeat each method 21 | :param workers: Number of process 22 | :return: 23 | ''' 24 | assert len(sentences) == len(labels) 25 | res_sentences = [] 26 | res_labels = [] 27 | with ProcessPoolExecutor(max_workers=workers) as t: 28 | obj_list = [] 29 | for idx, sentence in enumerate(sentences): 30 | obj = t.submit(expand, sentence, labels[idx], alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, 31 | p_rd=p_rd, num_aug=num_aug) 32 | obj_list.append(obj) 33 | 34 | for future in as_completed(obj_list): 35 | res = future.result() 36 | res_sentences.extend(res[0]) 37 | res_labels.extend(res[1]) 38 | return res_sentences, res_labels 39 | -------------------------------------------------------------------------------- /baselines/single/data_mixup/README.md: -------------------------------------------------------------------------------- 1 | # 数据组合 2 | 3 | 对输入数据进行组合,生成复合数据,减少其中噪声的影响。 4 | 比如对于句子:“我希望换一个地址”和“你们这怎么换地址”可以合并成“我希望换一个地址。你们这怎么换地址” 5 | 6 | # 参数说明 7 | 8 | 这里只用了复合次数作为参数 9 | 10 | # 参数选择实验 11 | 12 | 分析显示 13 | |复合生成次数 | 0 | 2 | 14 | |---|---|---| 15 | | Marco-F1| .7278 | .7376 | 16 | -------------------------------------------------------------------------------- /baselines/single/data_mixup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/data_mixup/__init__.py -------------------------------------------------------------------------------- /baselines/single/data_mixup/data_mixup.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from dckit import read_datasets, random_split_data, evaluate 3 | import random 4 | import numpy as np 5 | 6 | 7 | def data_mix(data, num_mix=3): 8 | json_data = data['json'] 9 | new_json = json_data 10 | # 按类聚合数据 11 | sentence_by_class = {} 12 | label_desc_map = {} 13 | for idx, tmp in enumerate(tqdm(json_data)): 14 | if tmp['label'] not in label_desc_map: 15 | label_desc_map[tmp['label']] = tmp['label_des'] 16 | if tmp['label'] not in sentence_by_class: 17 | sentence_by_class[tmp['label']] = [] 18 | sentence_by_class[tmp['label']].append(tmp['sentence']) 19 | idx = 0 20 | for classes, sentences in tqdm(sentence_by_class.items()): 21 | for _ in range(len(json_data)//len(data['info'])): 22 | random.shuffle(sentences) 23 | sentence = '。'.join(sentences[:num_mix]) 24 | dic = {'id': idx, 'sentence': sentence, 'label': classes, 'label_des': label_desc_map[classes]} 25 | idx += 1 26 | new_json.append(dic) 27 | data['json'] = new_json 28 | return data 29 | 30 | 31 | def main(): 32 | res = [] 33 | for i in range(5): 34 | data = read_datasets() 35 | data = data_mix(data) 36 | random_split_data(data, seed=i) 37 | f1 = evaluate() 38 | res.append(f1) 39 | print('Macro-F1=', np.mean(res), np.std(res)) 40 | return f1 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /baselines/single/def_aug/README.md: -------------------------------------------------------------------------------- 1 | # 标签定义增强 2 | 3 | 将标签定义加入进行增强 4 | 5 | # 参数说明 6 | 7 | 这里只用了增强次数作为参数 8 | 9 | # 参数选择实验 10 | 11 | |增强次数 | 0 | 1 | 3 | 5 |10| 12 | |---|---|---|---|---|---| 13 | | Marco-F1| .7278 | .7388 | .7462 | .7363 | .6694 | 14 | 15 | # 可能问题 16 | 
这里的数据增强部分用了[synonyms](https://github.com/chatopera/Synonyms),其中资源下载可能存在问题。如果存在问题请按照如下设置: 17 | ```bash 18 | export SYNONYMS_WORD2VEC_BIN_URL_ZH_CN=https://gitee.com/chatopera/cskefu/attach_files/610602/download/words.vector.gz 19 | pip install -U synonyms 20 | python -c "import synonyms" # download word vectors file 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /baselines/single/def_aug/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/def_aug/__init__.py -------------------------------------------------------------------------------- /baselines/single/def_aug/def_aug.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from tqdm import tqdm 4 | from textda.data_expansion import data_expansion 5 | from dckit import read_datasets, random_split_data, evaluate 6 | 7 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 8 | 9 | 10 | def def_aug(data, num_aug=50): 11 | json_data = data['json'] 12 | label_info = data['info'] 13 | for idx, line in label_info.items(): 14 | if num_aug > 0: 15 | sen_list = data_expansion(line, alpha_ri=0.2, alpha_rs=0, num_aug=num_aug) 16 | if num_aug == 0: 17 | sen_list = [line] 18 | for sen in sen_list: 19 | tmp = {} 20 | tmp['id'] = -1 21 | tmp['sentence'] = sen 22 | tmp['label_des'] = line 23 | tmp['label'] = idx 24 | json_data.append(tmp) 25 | 26 | data['json'] = json_data 27 | return data 28 | 29 | 30 | def main(): 31 | data = read_datasets() 32 | data = def_aug(data) 33 | random_split_data(data) 34 | f1 = evaluate() 35 | print('Macro-F1=', f1) 36 | return f1 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /baselines/single/delete_noise/README.md: -------------------------------------------------------------------------------- 1 | # 噪声删除 2 | 3 | 交叉验证训练多个模型,根据熵判断最有可能错误的 4 | 5 | # 参数说明 6 | 7 | 这里只用了删除次数作为参数 8 | 9 | # 参数选择实验 10 | 11 | |增强次数 | 0 | 1 | 3 | 5 |10| 12 | |---|---|---|---|---|---| 13 | | Marco-F1| .7278 | .7388 | .7462 | .7363 | .6694 | 14 | -------------------------------------------------------------------------------- /baselines/single/delete_noise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/delete_noise/__init__.py -------------------------------------------------------------------------------- /baselines/single/delete_noise/classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICE"] = '1' 3 | from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer 4 | import torch 5 | import numpy as np 6 | from datasets import load_metric 7 | from sklearn.model_selection import StratifiedKFold # StratifiedKFold划分数据集的原理:划分后的训练集和验证集中类别分布尽量和原数据集一样 8 | 9 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 10 | from baselines.single.data_aug.parallel_textda import parallel_expansion 11 | os.environ["TOKENIZERS_PARALLELISM"] = 'false' 12 | PRETRAIN = 'hfl/rbtl3' # 加载的预训练模型的名称 13 | metric = load_metric("f1") # 使用f1 score作为指标 14 | 15 | 16 | # 计算标签与预测值在给定的指标上的效果 17 | def 
compute_metrics(eval_pred): 18 | predictions, labels = eval_pred 19 | predictions = np.argmax(predictions, axis=1) 20 | return metric.compute(predictions=predictions, references=labels, average='macro') 21 | 22 | 23 | class MyDataset(torch.utils.data.Dataset): 24 | def __init__(self, encodings, labels): 25 | self.encodings = encodings 26 | self.labels = labels 27 | 28 | def __getitem__(self, idx): 29 | item = {key: torch.tensor(val[idx]) 30 | for key, val in self.encodings.items()} 31 | item['labels'] = torch.tensor(self.labels[idx]) 32 | return item 33 | 34 | def __len__(self): 35 | return len(self.labels) 36 | 37 | 38 | def get_prediction(data): 39 | """ 40 | 训练一个模型,得到数据点上的标签预测: 41 | 1) 加载数据; 42 | 2)使用K折交叉验证训练,并在验证集上做预测; 43 | 3) 合并交叉验证的结果,并得到整个数据集上模型的预测的概率分布 44 | train a model to get estimation of each data point 45 | """ 46 | # 1、加载所有数据、标签到列表 all_text, all_label,all_id 47 | all_text, all_label, all_id = [], [], [] 48 | for idx, line in enumerate(data['json']): 49 | all_text.append(line['sentence']) 50 | all_label.append(int(line['label'])) 51 | all_id.append(idx) 52 | # 加载标签定义增强后的数据 53 | # label_data.json--->{"id": -1, "sentence": "买家抱怨商品了", "label_des": "买家抱怨商品涨价了\n", "label": 0} 54 | label_text, label_label = [], [] 55 | # for line in open('../../datasets/cic/label_data.json', 'r', encoding='utf-8'): 56 | # label_text.append(json.loads(line)['sentence']) 57 | # label_label.append(int(json.loads(line)['label'])) 58 | 59 | # 2、使用K折交叉验证训练,并在验证集上做预测:遍历每一折得到训练集和验证子集、数据增强、设置训练参数和数据进行训练、在验证集上进行预测 60 | dev_out = {} # 带索引(index)的验证子集的列表 61 | dev_index = {} # 带索引(index)的验证集的列表 62 | kf = StratifiedKFold(n_splits=6, shuffle=True) # StratifiedKFold划分数据集的原理:划分后的训练集和验证集中类别分布尽量和原数据集一样 63 | # kf.get_n_splits(all_text, all_label) 64 | for kf_id, (train_index, test_index) in enumerate(kf.split(all_text, all_label)): 65 | # 2.1 得到训练和验证子集 66 | # kf_id:第几折;train_index, test_index这一折的训练、验证集。 67 | train_text = [all_text[i] for i in train_index][:] + label_text # 训练集的文本 68 | train_label = [all_label[i] for i in train_index][:] + label_label # 训练集的标签 69 | dev_text = [all_text[i] for i in test_index] 70 | dev_label = [all_label[i] for i in test_index] 71 | dev_index[kf_id] = test_index 72 | 73 | # 2.2 对训练数据进行数据扩增 74 | # new_train_text = [] 75 | # new_train_label = [] 76 | # for idx, tmp_text in enumerate(train_text): 77 | # sen_list = data_expansion(tmp_text, alpha_ri=0.1, alpha_rs=0, num_aug=5) 78 | # new_train_text.extend(sen_list) 79 | # new_train_label.extend([train_label[idx]] * len(sen_list)) 80 | # 81 | # train_text = new_train_text 82 | # train_label = new_train_label 83 | 84 | sen_list, label_list = parallel_expansion(train_text, train_label, alpha_ri=0.1, alpha_rs=0, num_aug=5) 85 | train_text = sen_list 86 | train_label = label_list 87 | assert len(train_text) == len(train_label) 88 | # 2.3 设置使用的预训练模型,并设置tokenizer、数据集对象 89 | tokenizer = AutoTokenizer.from_pretrained(PRETRAIN, do_lower_case=True) 90 | train_encodings = tokenizer(train_text, truncation=True, padding=True, max_length=32) 91 | val_encodings = tokenizer(dev_text, truncation=True, padding=True, max_length=32) 92 | 93 | train_dataset = MyDataset(train_encodings, train_label) 94 | val_dataset = MyDataset(val_encodings, dev_label) 95 | 96 | # 2.4 实例化训练参数 97 | training_args = TrainingArguments( 98 | # output directory 99 | output_dir='../../tmpresults/tmpresult{}'.format(kf_id), 100 | num_train_epochs=50, # total number of training epochs 101 | per_device_train_batch_size=256, # batch size per device during training 102 | 
per_device_eval_batch_size=32, # batch size for evaluation 103 | warmup_steps=500, # number of warmup steps for learning rate scheduler 104 | learning_rate=3e-4 if 'electra' in PRETRAIN else 2e-5, 105 | weight_decay=0.01, # strength of weight decay 106 | save_total_limit=1, 107 | # logging_dir='../../tmplogs', # directory for storing logs 108 | # logging_steps=10, 109 | # evaluation_strategy="epoch", 110 | ) 111 | model = AutoModelForSequenceClassification.from_pretrained(PRETRAIN, num_labels=len(data['info'])) 112 | 113 | # 2.5 利用实例化的训练对象进行训练(模型、训练参数、训练集、验证集、评价指标) 114 | trainer = Trainer( 115 | # the instantiated 🤗 Transformers model to be trained 116 | model=model, 117 | args=training_args, # training arguments, defined above 118 | train_dataset=train_dataset, # training dataset 119 | eval_dataset=val_dataset, # evaluation dataset 120 | compute_metrics=compute_metrics, 121 | ) 122 | trainer.train() # 训练模型 123 | 124 | # 2.6 利用训练好的模型在验证集上进行预测 125 | dev_outputs = trainer.predict(val_dataset).predictions 126 | dev_out[kf_id] = dev_outputs # 将预测结果保存在列表中 127 | 128 | # 3、合并交叉验证的结果,并得到整个数据集上模型的预测的概率分布 129 | alls = [0] * len(all_label) 130 | for kfid in range(6): 131 | for idx, item in enumerate(dev_index[kfid]): 132 | # dev_index[0]:第0折的验证数据的索引的列表 133 | alls[item - 1] = dev_out[kfid][idx] 134 | outputs = np.array(alls) 135 | return outputs 136 | 137 | 138 | if __name__ == '__main__': 139 | get_prediction() 140 | -------------------------------------------------------------------------------- /baselines/single/delete_noise/delete_noise.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from dckit import read_datasets, random_split_data, evaluate 3 | from scipy.stats import entropy 4 | import numpy as np 5 | import os 6 | import sys 7 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 8 | sys.path.append(path) 9 | from baselines.single.delete_noise.classifier import get_prediction 10 | 11 | 12 | def find_max_entropy(predicted_probabilities): 13 | entros = entropy(predicted_probabilities, axis=1) 14 | return np.argsort(entros)[::-1] 15 | 16 | 17 | def delete_noise(data, delete_num=100): 18 | numpy_array_of_predicted_probabilities = get_prediction(data) 19 | ordered_label_errors = find_max_entropy(numpy_array_of_predicted_probabilities) 20 | 21 | json_data = data['json'] 22 | new_json = [] 23 | for idx, tmp in enumerate(tqdm(json_data)): 24 | # 每一句都给他扩展 25 | if idx in ordered_label_errors[:delete_num]: # and idx not in correct_id: 26 | # print(tmp['sentence'], tmp['label_des']) 27 | continue 28 | new_json.append(tmp) 29 | data['json'] = new_json 30 | return data 31 | 32 | 33 | def main(): 34 | data = read_datasets() 35 | data = delete_noise(data) 36 | random_split_data(data) 37 | f1 = evaluate() 38 | print('Macro-F1=', f1) 39 | return f1 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /baselines/single/template/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/template/README.md -------------------------------------------------------------------------------- /baselines/single/template/template.py: -------------------------------------------------------------------------------- 1 | from dckit import read_datasets 2 | from dckit.evaluate import evaluate 3 | 4 | 5 | def 
template(data): 6 | """ 7 | 输入读取的字典,输出还是这个字典,但是修改其内容,如果修改了标签请注意同时修改label_des和label字段 8 | """ 9 | # TODO add your code here 10 | return data 11 | 12 | 13 | def main(): 14 | data = read_datasets() 15 | template(data) 16 | f1 = evaluate() 17 | print('Macro-F1=', f1) 18 | return f1 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /datasets/cic/README.txt: -------------------------------------------------------------------------------- 1 | 这里存储你的结果 2 | -------------------------------------------------------------------------------- /datasets/raw_afqmc/readme.md: -------------------------------------------------------------------------------- 1 | afqmc用于DataCLUE的版本 | afqmc for dataclue 2 | 3 | 标签含义: 4 | 0: 语义相似 5 | 1: 语义不同 6 | 7 | 数据量: 8 | train.json(32,334)/dev.json(4,316)/test_public.json(2,000) 9 | 10 | 更多信息: 11 | for more detail, check: https://github.com/CLUEbenchmark/DataCLUE -------------------------------------------------------------------------------- /datasets/raw_cic/README.txt: -------------------------------------------------------------------------------- 1 | DataCLUE: 以数据为中心的AI测评 2 | 项目地址:https://github.com/CLUEbenchmark/DataCLUE 3 | 4 | 意图识别任务,类别数:118 5 | train.json:1万,包含部分有问题的数据 6 | dev.json:2000条,包含部分有问题的数据 7 | test_public.json:2000条,仅用于学术和实验,和作为训练完后的效果评估。不能用于训练;高质量数据(标注准确率95%或以上) 8 | 9 | train.json/dev.json: 含有噪声的数据,都含有一定比例有标注错误的标签; 10 | 11 | 12 | 1) 请做实验,报告你的方法下改进后的数据集(train.json/dev.json),在test_public.json上做最终测试(2个数值,f1_macro & f1_micro);或 13 | 2) 你也可以提交到CLUE平台:www.CLUEbenchmarks.com, 或发送邮件包含训练集和验证集的压缩包到邮箱。联系邮箱:CLUEbenchmark@163.com 14 | 15 | www.CLUEbenchmarks.com 16 | -------------------------------------------------------------------------------- /datasets/raw_cic/labels.json: -------------------------------------------------------------------------------- 1 | {"label_des": "买家抱怨商品涨价了", "label": 0} 2 | {"label_des": "买家表达不满/生气", "label": 1} 3 | {"label_des": "买家表达赞美/满意", "label": 2} 4 | {"label_des": "买家表示不想要了", "label": 3} 5 | {"label_des": "买家表示地址正确", "label": 4} 6 | {"label_des": "买家表示好的", "label": 5} 7 | {"label_des": "买家表示具体时间寄回去", "label": 6} 8 | {"label_des": "买家表示麻烦卖家,表达感谢", "label": 7} 9 | {"label_des": "买家表示马上下单付款", "label": 8} 10 | {"label_des": "买家表示拍错了", "label": 9} 11 | {"label_des": "买家表示伤心难过", "label": 10} 12 | {"label_des": "买家表示商家发错地址", "label": 11} 13 | {"label_des": "买家表示商家发错货了", "label": 12} 14 | {"label_des": "买家表示商品已经寄回去了", "label": 13} 15 | {"label_des": "买家表示稍等", "label": 14} 16 | {"label_des": "买家表示是老顾客", "label": 15} 17 | {"label_des": "买家表示收件信息不需要修改了", "label": 16} 18 | {"label_des": "买家表示送人的", "label": 17} 19 | {"label_des": "买家表示无法申请退款", "label": 18} 20 | {"label_des": "买家表示无法下单", "label": 19} 21 | {"label_des": "买家表示物流太慢了", "label": 20} 22 | {"label_des": "买家表示先买来试试", "label": 21} 23 | {"label_des": "买家表示需要退货退款", "label": 22} 24 | {"label_des": "买家表示要重拍", "label": 23} 25 | {"label_des": "买家表示已经收到货了", "label": 24} 26 | {"label_des": "买家表示已下单付款", "label": 25} 27 | {"label_des": "买家表示优惠券无法使用", "label": 26} 28 | {"label_des": "买家表示有疑问/不理解", "label": 27} 29 | {"label_des": "买家表示再看看", "label": 28} 30 | {"label_des": "买家催促发货", "label": 29} 31 | {"label_des": "买家催促回复", "label": 30} 32 | {"label_des": "买家打招呼", "label": 31} 33 | {"label_des": "买家对特定款式/颜色表达喜好", "label": 32} 34 | {"label_des": "买家发送结束语", "label": 33} 35 | {"label_des": "买家发送开票信息", "label": 34} 36 | {"label_des": "买家发送支付宝账号", "label": 35} 37 | {"label_des": "买家反馈收到的商品有问题", "label": 36} 38 | 
{"label_des": "买家确认地址是否修改成功", "label": 37} 39 | {"label_des": "买家讨价还价", "label": 38} 40 | {"label_des": "买家向商家表达不好意思", "label": 39} 41 | {"label_des": "买家需要商品推荐", "label": 40} 42 | {"label_des": "买家要求补偿", "label": 41} 43 | {"label_des": "买家要求发货检查", "label": 42} 44 | {"label_des": "买家要求改商品信息", "label": 43} 45 | {"label_des": "买家要求改运费", "label": 44} 46 | {"label_des": "买家要求核对订单信息", "label": 45} 47 | {"label_des": "买家要求取消退换货退款", "label": 46} 48 | {"label_des": "买家要求添加商家微信", "label": 47} 49 | {"label_des": "买家要求退运费", "label": 48} 50 | {"label_des": "买家要求修改收件信息", "label": 49} 51 | {"label_des": "买家要求延迟发货", "label": 50} 52 | {"label_des": "买家要求延长收货时间", "label": 51} 53 | {"label_des": "买家咨询补货/上货时间", "label": 52} 54 | {"label_des": "买家咨询到货时间", "label": 53} 55 | {"label_des": "买家咨询发货时间", "label": 54} 56 | {"label_des": "买家咨询发什么快递", "label": 55} 57 | {"label_des": "买家咨询返现方式", "label": 56} 58 | {"label_des": "买家咨询返现金额", "label": 57} 59 | {"label_des": "买家咨询返现时间", "label": 58} 60 | {"label_des": "买家咨询换货发货时间", "label": 59} 61 | {"label_des": "买家咨询活动规则", "label": 60} 62 | {"label_des": "买家咨询快递单号", "label": 61} 63 | {"label_des": "买家咨询买多件能否优惠", "label": 62} 64 | {"label_des": "买家咨询哪款更好", "label": 63} 65 | {"label_des": "买家咨询能否到付", "label": 64} 66 | {"label_des": "买家咨询能否定制", "label": 65} 67 | {"label_des": "买家咨询能否分开/一起发货", "label": 66} 68 | {"label_des": "买家咨询能否开发票及发票类型和寄送时间", "label": 67} 69 | {"label_des": "买家咨询能否提前享受优惠", "label": 68} 70 | {"label_des": "买家咨询偏远地区是否发货", "label": 69} 71 | {"label_des": "买家咨询如何加入会员", "label": 70} 72 | {"label_des": "买家咨询如何领取优惠券", "label": 71} 73 | {"label_des": "买家咨询如何下单", "label": 72} 74 | {"label_des": "买家咨询商家发货地", "label": 73} 75 | {"label_des": "买家咨询商家是否收到寄回的商品", "label": 74} 76 | {"label_des": "买家咨询商品包装", "label": 75} 77 | {"label_des": "买家咨询商品产地", "label": 76} 78 | {"label_des": "买家咨询商品的材质/面料", "label": 77} 79 | {"label_des": "买家咨询商品的色差问题", "label": 78} 80 | {"label_des": "买家咨询商品规格数量", "label": 79} 81 | {"label_des": "买家咨询商品价格", "label": 80} 82 | {"label_des": "买家咨询商品具体尺码尺寸", "label": 81} 83 | {"label_des": "买家咨询商品区别", "label": 82} 84 | {"label_des": "买家咨询商品上新时间", "label": 83} 85 | {"label_des": "买家咨询商品是不是预售款", "label": 84} 86 | {"label_des": "买家咨询商品是否可以单买/单卖", "label": 85} 87 | {"label_des": "买家咨询商品是否有赠品", "label": 86} 88 | {"label_des": "买家咨询商品是否有质保,质保多久", "label": 87} 89 | {"label_des": "买家咨询商品是什么颜色", "label": 88} 90 | {"label_des": "买家咨询商品质量是否有保障", "label": 89} 91 | {"label_des": "买家咨询商品重量", "label": 90} 92 | {"label_des": "买家咨询什么颜色好看", "label": 91} 93 | {"label_des": "买家咨询是否可以打折", "label": 92} 94 | {"label_des": "买家咨询是否可以发顺丰", "label": 93} 95 | {"label_des": "买家咨询是否可以改价", "label": 94} 96 | {"label_des": "买家咨询是否可以拒收", "label": 95} 97 | {"label_des": "买家咨询是否可以微信支付", "label": 96} 98 | {"label_des": "买家咨询是否可以指定快递", "label": 97} 99 | {"label_des": "买家咨询是否有好评返现", "label": 98} 100 | {"label_des": "买家咨询是否有活动", "label": 99} 101 | {"label_des": "买家咨询是否有买家秀", "label": 100} 102 | {"label_des": "买家咨询是否有实体店", "label": 101} 103 | {"label_des": "买家咨询是否有味道", "label": 102} 104 | {"label_des": "买家咨询退换货地址", "label": 103} 105 | {"label_des": "买家咨询退换货规则", "label": 104} 106 | {"label_des": "买家咨询退换货运费由谁承担", "label": 105} 107 | {"label_des": "买家咨询退换货支持快递", "label": 106} 108 | {"label_des": "买家咨询退货退款原因选什么", "label": 107} 109 | {"label_des": "买家咨询退款金额", "label": 108} 110 | {"label_des": "买家咨询退款去向", "label": 109} 111 | {"label_des": "买家咨询退款时间", "label": 110} 112 | {"label_des": "买家咨询物流信息", "label": 111} 113 | {"label_des": "买家咨询优惠券使用规则", "label": 112} 114 | {"label_des": "买家咨询运费金额", 
"label": 113} 115 | {"label_des": "买家咨询运费险赔付规则", "label": 114} 116 | {"label_des": "买家咨询赠品何时发货", "label": 115} 117 | {"label_des": "买家咨询赠品是否可以自选", "label": 116} 118 | {"label_des": "买家咨询自己的旺旺昵称", "label": 117} 119 | -------------------------------------------------------------------------------- /datasets/raw_cic/labels.txt: -------------------------------------------------------------------------------- 1 | 买家抱怨商品涨价了 2 | 买家表达不满/生气 3 | 买家表达赞美/满意 4 | 买家表示不想要了 5 | 买家表示地址正确 6 | 买家表示好的 7 | 买家表示具体时间寄回去 8 | 买家表示麻烦卖家,表达感谢 9 | 买家表示马上下单付款 10 | 买家表示拍错了 11 | 买家表示伤心难过 12 | 买家表示商家发错地址 13 | 买家表示商家发错货了 14 | 买家表示商品已经寄回去了 15 | 买家表示稍等 16 | 买家表示是老顾客 17 | 买家表示收件信息不需要修改了 18 | 买家表示送人的 19 | 买家表示无法申请退款 20 | 买家表示无法下单 21 | 买家表示物流太慢了 22 | 买家表示先买来试试 23 | 买家表示需要退货退款 24 | 买家表示要重拍 25 | 买家表示已经收到货了 26 | 买家表示已下单付款 27 | 买家表示优惠券无法使用 28 | 买家表示有疑问/不理解 29 | 买家表示再看看 30 | 买家催促发货 31 | 买家催促回复 32 | 买家打招呼 33 | 买家对特定款式/颜色表达喜好 34 | 买家发送结束语 35 | 买家发送开票信息 36 | 买家发送支付宝账号 37 | 买家反馈收到的商品有问题 38 | 买家确认地址是否修改成功 39 | 买家讨价还价 40 | 买家向商家表达不好意思 41 | 买家需要商品推荐 42 | 买家要求补偿 43 | 买家要求发货检查 44 | 买家要求改商品信息 45 | 买家要求改运费 46 | 买家要求核对订单信息 47 | 买家要求取消退换货退款 48 | 买家要求添加商家微信 49 | 买家要求退运费 50 | 买家要求修改收件信息 51 | 买家要求延迟发货 52 | 买家要求延长收货时间 53 | 买家咨询补货/上货时间 54 | 买家咨询到货时间 55 | 买家咨询发货时间 56 | 买家咨询发什么快递 57 | 买家咨询返现方式 58 | 买家咨询返现金额 59 | 买家咨询返现时间 60 | 买家咨询换货发货时间 61 | 买家咨询活动规则 62 | 买家咨询快递单号 63 | 买家咨询买多件能否优惠 64 | 买家咨询哪款更好 65 | 买家咨询能否到付 66 | 买家咨询能否定制 67 | 买家咨询能否分开/一起发货 68 | 买家咨询能否开发票及发票类型和寄送时间 69 | 买家咨询能否提前享受优惠 70 | 买家咨询偏远地区是否发货 71 | 买家咨询如何加入会员 72 | 买家咨询如何领取优惠券 73 | 买家咨询如何下单 74 | 买家咨询商家发货地 75 | 买家咨询商家是否收到寄回的商品 76 | 买家咨询商品包装 77 | 买家咨询商品产地 78 | 买家咨询商品的材质/面料 79 | 买家咨询商品的色差问题 80 | 买家咨询商品规格数量 81 | 买家咨询商品价格 82 | 买家咨询商品具体尺码尺寸 83 | 买家咨询商品区别 84 | 买家咨询商品上新时间 85 | 买家咨询商品是不是预售款 86 | 买家咨询商品是否可以单买/单卖 87 | 买家咨询商品是否有赠品 88 | 买家咨询商品是否有质保,质保多久 89 | 买家咨询商品是什么颜色 90 | 买家咨询商品质量是否有保障 91 | 买家咨询商品重量 92 | 买家咨询什么颜色好看 93 | 买家咨询是否可以打折 94 | 买家咨询是否可以发顺丰 95 | 买家咨询是否可以改价 96 | 买家咨询是否可以拒收 97 | 买家咨询是否可以微信支付 98 | 买家咨询是否可以指定快递 99 | 买家咨询是否有好评返现 100 | 买家咨询是否有活动 101 | 买家咨询是否有买家秀 102 | 买家咨询是否有实体店 103 | 买家咨询是否有味道 104 | 买家咨询退换货地址 105 | 买家咨询退换货规则 106 | 买家咨询退换货运费由谁承担 107 | 买家咨询退换货支持快递 108 | 买家咨询退货退款原因选什么 109 | 买家咨询退款金额 110 | 买家咨询退款去向 111 | 买家咨询退款时间 112 | 买家咨询物流信息 113 | 买家咨询优惠券使用规则 114 | 买家咨询运费金额 115 | 买家咨询运费险赔付规则 116 | 买家咨询赠品何时发货 117 | 买家咨询赠品是否可以自选 118 | 买家咨询自己的旺旺昵称 -------------------------------------------------------------------------------- /datasets/raw_iflytek/labels.json: -------------------------------------------------------------------------------- 1 | {"label": "0", "label_des": "打车"} 2 | {"label": "1", "label_des": "地图导航"} 3 | {"label": "2", "label_des": "免费WIFI"} 4 | {"label": "3", "label_des": "租车"} 5 | {"label": "4", "label_des": "同城服务"} 6 | {"label": "5", "label_des": "快递物流"} 7 | {"label": "6", "label_des": "婚庆"} 8 | {"label": "7", "label_des": "家政"} 9 | {"label": "8", "label_des": "公共交通"} 10 | {"label": "9", "label_des": "政务"} 11 | {"label": "10", "label_des": "社区服务"} 12 | {"label": "11", "label_des": "薅羊毛"} 13 | {"label": "12", "label_des": "魔幻"} 14 | {"label": "13", "label_des": "仙侠"} 15 | {"label": "14", "label_des": "卡牌"} 16 | {"label": "15", "label_des": "飞行空战"} 17 | {"label": "16", "label_des": "射击游戏"} 18 | {"label": "17", "label_des": "休闲益智"} 19 | {"label": "18", "label_des": "动作类"} 20 | {"label": "19", "label_des": "体育竞技"} 21 | {"label": "20", "label_des": "棋牌中心"} 22 | {"label": "21", "label_des": "经营养成"} 23 | {"label": "22", "label_des": "策略"} 24 | {"label": "23", "label_des": "MOBA"} 25 | {"label": "24", "label_des": "辅助工具"} 26 | {"label": "25", "label_des": "约会社交"} 27 | {"label": "26", "label_des": 
"即时通讯"} 28 | {"label": "27", "label_des": "工作社交"} 29 | {"label": "28", "label_des": "论坛圈子"} 30 | {"label": "29", "label_des": "婚恋社交"} 31 | {"label": "30", "label_des": "情侣社交"} 32 | {"label": "31", "label_des": "社交工具"} 33 | {"label": "32", "label_des": "生活社交"} 34 | {"label": "33", "label_des": "微博博客"} 35 | {"label": "34", "label_des": "新闻"} 36 | {"label": "35", "label_des": "漫画"} 37 | {"label": "36", "label_des": "小说"} 38 | {"label": "37", "label_des": "技术"} 39 | {"label": "38", "label_des": "教辅"} 40 | {"label": "39", "label_des": "问答交流"} 41 | {"label": "40", "label_des": "搞笑"} 42 | {"label": "41", "label_des": "杂志"} 43 | {"label": "42", "label_des": "百科"} 44 | {"label": "43", "label_des": "影视娱乐"} 45 | {"label": "44", "label_des": "求职"} 46 | {"label": "45", "label_des": "兼职"} 47 | {"label": "46", "label_des": "视频"} 48 | {"label": "47", "label_des": "短视频"} 49 | {"label": "48", "label_des": "音乐"} 50 | {"label": "49", "label_des": "直播"} 51 | {"label": "50", "label_des": "电台"} 52 | {"label": "51", "label_des": "K歌"} 53 | {"label": "52", "label_des": "成人"} 54 | {"label": "53", "label_des": "中小学"} 55 | {"label": "54", "label_des": "职考"} 56 | {"label": "55", "label_des": "公务员"} 57 | {"label": "56", "label_des": "英语"} 58 | {"label": "57", "label_des": "视频教育"} 59 | {"label": "58", "label_des": "高等教育"} 60 | {"label": "59", "label_des": "成人教育"} 61 | {"label": "60", "label_des": "艺术"} 62 | {"label": "61", "label_des": "语言(非英语)"} 63 | {"label": "62", "label_des": "旅游资讯"} 64 | {"label": "63", "label_des": "综合预定"} 65 | {"label": "64", "label_des": "民航"} 66 | {"label": "65", "label_des": "铁路"} 67 | {"label": "66", "label_des": "酒店"} 68 | {"label": "67", "label_des": "行程管理"} 69 | {"label": "68", "label_des": "民宿短租"} 70 | {"label": "69", "label_des": "出国"} 71 | {"label": "70", "label_des": "工具"} 72 | {"label": "71", "label_des": "亲子儿童"} 73 | {"label": "72", "label_des": "母婴"} 74 | {"label": "73", "label_des": "驾校"} 75 | {"label": "74", "label_des": "违章"} 76 | {"label": "75", "label_des": "汽车咨询"} 77 | {"label": "76", "label_des": "汽车交易"} 78 | {"label": "77", "label_des": "日常养车"} 79 | {"label": "78", "label_des": "行车辅助"} 80 | {"label": "79", "label_des": "租房"} 81 | {"label": "80", "label_des": "买房"} 82 | {"label": "81", "label_des": "装修家居"} 83 | {"label": "82", "label_des": "电子产品"} 84 | {"label": "83", "label_des": "问诊挂号"} 85 | {"label": "84", "label_des": "养生保健"} 86 | {"label": "85", "label_des": "医疗服务"} 87 | {"label": "86", "label_des": "减肥瘦身"} 88 | {"label": "87", "label_des": "美妆美业"} 89 | {"label": "88", "label_des": "菜谱"} 90 | {"label": "89", "label_des": "餐饮店"} 91 | {"label": "90", "label_des": "体育咨讯"} 92 | {"label": "91", "label_des": "运动健身"} 93 | {"label": "92", "label_des": "支付"} 94 | {"label": "93", "label_des": "保险"} 95 | {"label": "94", "label_des": "股票"} 96 | {"label": "95", "label_des": "借贷"} 97 | {"label": "96", "label_des": "理财"} 98 | {"label": "97", "label_des": "彩票"} 99 | {"label": "98", "label_des": "记账"} 100 | {"label": "99", "label_des": "银行"} 101 | {"label": "100", "label_des": "美颜"} 102 | {"label": "101", "label_des": "影像剪辑"} 103 | {"label": "102", "label_des": "摄影修图"} 104 | {"label": "103", "label_des": "相机"} 105 | {"label": "104", "label_des": "绘画"} 106 | {"label": "105", "label_des": "二手"} 107 | {"label": "106", "label_des": "电商"} 108 | {"label": "107", "label_des": "团购"} 109 | {"label": "108", "label_des": "外卖"} 110 | {"label": "109", "label_des": "电影票务"} 111 | {"label": "110", "label_des": "社区超市"} 112 | {"label": "111", "label_des": "购物咨询"} 113 | {"label": "112", "label_des": "笔记"} 
114 | {"label": "113", "label_des": "办公"} 115 | {"label": "114", "label_des": "日程管理"} 116 | {"label": "115", "label_des": "女性"} 117 | {"label": "116", "label_des": "经营"} 118 | {"label": "117", "label_des": "收款"} 119 | {"label": "118", "label_des": "其他"} -------------------------------------------------------------------------------- /datasets/raw_qbqtc/readme.md: -------------------------------------------------------------------------------- 1 | # QBQTC 2 | QBQTC: QQ Browser Query Title Corpus 3 | 4 | QQ浏览器搜索相关性数据集 5 | 6 | 7 | # 数据集介绍 8 | QQ浏览器搜索相关性数据集(QBQTC,QQ Browser Query Title Corpus),是QQ浏览器搜索引擎目前针对大搜场景构建的一个融合了相关性、权威性、内容质量、 9 | 时效性等维度标注的学习排序(LTR)数据集,广泛应用在搜索引擎业务场景中。 10 | 11 | 相关性的含义:0,相关程度差;1,有一定相关性;2,非常相关。数字越大相关性越高。 12 | 13 | #### 数据量统计 14 | | 训练集(train) | 验证集(dev) | 公开测试集(test) | 私有测试集 | 15 | | :----: | :----: | :----: | :----: | 16 | | 180,000| 20,000| 5,000 | >=10,0000| 17 | 18 | # baseline效果对比 19 | 20 | | 模型 | 训练集(train) | 验证集(dev) | 测试集(test) | 训练参数 | 21 | | :----:| :----: | :----: | :----: | :----: | 22 | |BERT-base | F1:80.3 Acc:84.3 | F1: 64.9 Acc:72.4 | F1: 64.1 Acc:71.8 | batch=64, length=52, epoch=7, lr=2e-5, warmup=0.9 | 23 | | RoBERTa-wwm-ext | F1:67.9 Acc:76.2 | F1:64.9 Acc:71.5 | F1:64.0 Acc:71.0 | batch=64, length=52, epoch=7, lr=2e-5, warmup=0.9| 24 | |RoBERTa-wwm-large-ext | F1:79.8 Acc:84.2 | F1:65.1 Acc:72.4 | F1:66.3 Acc:73.1 | batch=64, length=52, epoch=7, lr=2e-5, warmup=0.9| 25 | 26 | f1_score来自于sklearn.metrics,计算公式如下: 27 | `F1 = 2 * (precision * recall) / (precision + recall)` 28 | 29 | 30 | # 数据集例子 31 | {"id": 0, "query": "小孩咳嗽感冒", "title": "小孩感冒过后久咳嗽该吃什么药育儿问答宝宝树", "label": "1"} 32 | {"id": 1, "query": "前列腺癌根治术后能活多久", "title": "前列腺癌转移能活多久前列腺癌治疗方法盘点-家庭医生在线肿瘤频道", "label": "1"} 33 | {"id": 3, "query": "如何将一个文件复制到另一个文件里", "title": "怎么把布局里的图纸复制到另外一个文件中去百度文库", "label": "0"} 34 | {"id": 214, "query": "免费观看电影速度与激情1", "title": "《速度与激情1》全集-高清电影完整版-在线观看", "label": "2"} 35 | {"id": 98, "query": "昆明公积金", "title": "昆明异地购房不能用住房公积金中新网", "label": "2"} 36 | {"id": 217, "query": "多张图片怎么排版好看", "title": "怎么排版图片", "label": "2"} 37 | 38 | # 更多内容见 39 | QBQTC项目 -------------------------------------------------------------------------------- /datasets/raw_qbqtc/train.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/datasets/raw_qbqtc/train.json -------------------------------------------------------------------------------- /datasets/raw_tnews/labels.json: -------------------------------------------------------------------------------- 1 | {"label_des": "教育", "label": "0"} 2 | {"label_des": "财经", "label": "1"} 3 | {"label_des": "房产", "label": "2"} 4 | {"label_des": "旅游", "label": "3"} 5 | {"label_des": "科技", "label": "4"} 6 | {"label_des": "体育", "label": "5"} 7 | {"label_des": "电竞", "label": "6"} 8 | {"label_des": "文化", "label": "7"} 9 | {"label_des": "汽车", "label": "8"} 10 | {"label_des": "故事", "label": "9"} 11 | {"label_des": "娱乐", "label": "10"} 12 | {"label_des": "军事", "label": "11"} 13 | {"label_des": "农业", "label": "12"} 14 | {"label_des": "国际", "label": "13"} 15 | {"label_des": "股票", "label": "14"} 16 | -------------------------------------------------------------------------------- /dckit/README.md: -------------------------------------------------------------------------------- 1 | # DataCLUE Toolkit 2 | 3 | [安装](#安装) | [使用](#使用) | [示例](#示例) | [贡献](#贡献) | [**References**](#references) 4 | 5 | 为了方便各个算法之间的整合,这里提供了一套统一的输入输出接口。 6 | 
并且提供了一些辅助函数,帮助大家更方便地使用DataCLUE。 7 | (我们鼓励大家用dckit进行开发,以更好的实现不同算法的共享。但是你也完全可以自己实现相应功能完成DataCLUE的任务)。 8 | 9 | # Updates 10 | [Nov 16, 2021] First version of dckit is released. 11 | 12 | # 安装 13 | 在DataCLUE目录下 14 | 15 | `pip install -e .` 16 | 17 | # 使用 18 | ```python 19 | from dckit import read_datasets, random_split_data, evaluate 20 | 21 | data = read_datasets(dataset='CIC') # 读取数据 22 | # TODO 对数据进行处理,这里example_transform 是你需要实现的变换 23 | data = example_transform(data) 24 | 25 | random_split_data(data, test_size=2000, seed=0) # 随机切分数据到训练、测试集 26 | f1 = evaluate() # 运行模型并返回相应的结果 27 | ``` 28 | 29 | # 示例 30 | 我们在中`baseline`实现了几个策略都用到了dckit,比如你可以看`baseline/single/data_aug`或其它相应baseline代码中的实现 31 | 32 | 33 | # 贡献 34 | - 如果你觉得dckit缺少一些通用的基本功能,你可以提一个issue。 35 | - 如果你已经实现了dckit的扩展功能,欢迎开启一个PR。 36 | 37 | # References 38 | ```bib 39 | @article{xu2021dataclue, 40 | title={DataCLUE: A Benchmark Suite for Data-centric NLP}, 41 | author={Liang Xu and Jiacheng Liu and Xiang Pan and Xiaojing Lu and Xiaofeng Hou}, 42 | year={2021}, 43 | eprint={2111.08647}, 44 | archivePrefix={arXiv}, 45 | primaryClass={cs.CL} 46 | } 47 | ``` 48 | -------------------------------------------------------------------------------- /dckit/__init__.py: -------------------------------------------------------------------------------- 1 | from dckit.utils import read_datasets, random_split_data 2 | from dckit.evaluate import evaluate 3 | -------------------------------------------------------------------------------- /dckit/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics import f1_score 3 | import json 4 | import numpy as np 5 | 6 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 7 | 8 | 9 | def calc_f1(dataset='cic'): 10 | y_true = [] 11 | for line in open('{}/datasets/raw_{}/test_public.json'.format(path, dataset.lower()), 'r', encoding='utf-8'): 12 | y_true.append(json.loads(line)['label']) 13 | y_pred = [] 14 | for line in open('{}/output_dir/bert/test_prediction.json'.format(path), 'r', encoding='utf-8'): 15 | y_pred.append(json.loads(line)['label']) 16 | 17 | f1_macro = f1_score(y_true, y_pred, average='macro') 18 | return f1_macro 19 | 20 | 21 | def evaluate(dataset='cic'): 22 | cmds = [ 23 | 'rm -rf {}/output_dir/bert'.format(path), 24 | 'rm -f {}/datasets/{}/cached*'.format(path, dataset), 25 | 'cd {}/baselines/models_pytorch/classifier_pytorch'.format(path), 26 | 'bash run_classifier_{}.sh'.format(dataset), 27 | 'bash run_classifier_{}.sh predict'.format(dataset), 28 | ] 29 | os.system('&&'.join(cmds)) 30 | return calc_f1(dataset.lower()) 31 | -------------------------------------------------------------------------------- /dckit/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | 6 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 7 | 8 | 9 | def read_datasets(dataset='cic'): 10 | """ 11 | 根据输入的数据名称读取数据 12 | 参数: 13 | dataset: 数据集名称 14 | 输出: 15 | full_data: 字典形式存储的数据,包括: 16 | - 'json': json数据的每一行,如 {"id": 13, "label": "79", "sentence": "一斤大概有多少个", "label_des": "买家咨询商品规格数量"} 17 | 这里为了统一输入输出没有区分train和dev了 18 | - 'info': 标签号好描述的对应关系,如{79:'买家咨询商品规格数量'} 19 | """ 20 | dataset = dataset.lower() 21 | if dataset in ['cic', 'tnews', 'iflytek']: 22 | json_data = [] 23 | for data_type in ['train', 'dev']: 24 | for line in 
open('{}/datasets/raw_{}/{}.json'.format(path, dataset, data_type), 'r', encoding='utf-8'): 25 | # line = {"id": 13, "label": "79", "sentence": "一斤大概有多少个", "label_des": "买家咨询商品规格数量"} 26 | one = json.loads(line) 27 | json_data.append(one) 28 | 29 | label_info = {} 30 | for line in open('{}/datasets/raw_{}/{}.json'.format(path, dataset, 'labels'), 'r', encoding='utf-8'): 31 | one = json.loads(line) 32 | label_info[one['label']] = one['label_des'] 33 | full_data = {'json': json_data, 'info': label_info} 34 | return full_data 35 | elif dataset in ['afqmc', 'qbqtc', 'triclue']: 36 | json_data = [] 37 | for data_type in ['train', 'dev']: 38 | for line in open('{}/datasets/raw_{}/{}.json'.format(path, dataset, data_type), 'r', encoding='utf-8'): 39 | # line = {"label": "79", "sentence1": "一斤大概有多少个", "sentence2": "买家咨询商品规格数量"} 40 | one = json.loads(line) 41 | json_data.append(one) 42 | label_info = {} 43 | full_data = {'json': json_data, 'info': label_info} 44 | return full_data 45 | elif dataset in ['cluener']: 46 | """ 47 | {"text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为,对目前国内商业银行而言,", 48 | "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}}} 49 | {"text": "生生不息CSOL生化狂潮让你填弹狂扫", "label": {"game": {"CSOL": [[4, 7]]}}} 50 | """ 51 | json_data = [] 52 | for data_type in ['train', 'dev']: 53 | for line in open('{}/datasets/raw_{}/{}.json'.format(path, dataset, data_type), 'r', encoding='utf-8'): 54 | # line = {"label": "79", "sentence1": "一斤大概有多少个", "sentence2": "买家咨询商品规格数量"} 55 | one = json.loads(line) 56 | json_data.append(one) 57 | label_info = {} 58 | full_data = {'json': json_data, 'info': label_info} 59 | return full_data 60 | else: 61 | raise NotImplementedError 62 | 63 | 64 | def random_split_data(data, test_size=2000, seed=0, dataset='cic'): 65 | if dataset == 'cluener': 66 | raise NotImplementedError 67 | json_data = data['json'] 68 | labels = [] 69 | for line in json_data: 70 | labels.append(int(line['label'])) 71 | train_idx, test_idx, _, _ = train_test_split(range(len(labels)), labels, stratify=labels, 72 | shuffle=True, test_size=test_size, random_state=seed) 73 | 74 | f = open('{}/datasets/{}/train.json'.format(path, dataset), 'w', encoding='utf-8') 75 | for idx in train_idx: 76 | dic = json_data[idx] 77 | str_sen = json.dumps(dic, ensure_ascii=False) 78 | f.write(str_sen + '\n') 79 | 80 | f = open('{}/datasets/{}/dev.json'.format(path, dataset), 'w', encoding='utf-8') 81 | for idx in test_idx: 82 | dic = json_data[idx] 83 | str_sen = json.dumps(dic, ensure_ascii=False) 84 | f.write(str_sen + '\n') 85 | -------------------------------------------------------------------------------- /resources/dataclue_submit_examples/dataclue_submit_examples.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/dataclue_submit_examples/dataclue_submit_examples.zip -------------------------------------------------------------------------------- /resources/dataclue_submit_examples_old_nouse_iflytek/dataclue_submit_examples.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/dataclue_submit_examples_old_nouse_iflytek/dataclue_submit_examples.zip -------------------------------------------------------------------------------- /resources/img/bxu.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/bxu.jpg -------------------------------------------------------------------------------- /resources/img/improve.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/improve.jpeg -------------------------------------------------------------------------------- /resources/img/lifec.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/lifec.jpeg -------------------------------------------------------------------------------- /resources/img/takeaway2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/takeaway2.jpeg -------------------------------------------------------------------------------- /resources/img/teamgroup.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/teamgroup.jpeg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import setuptools 6 | 7 | from setuptools import setup 8 | from setuptools.command.develop import develop 9 | from setuptools.command.install import install 10 | from subprocess import call 11 | 12 | with open('dckit/README.md', 'r') as f: 13 | long_description = f.read() 14 | 15 | 16 | class Installation(install): 17 | def run(self): 18 | call(['pip install -r requirements.txt --no-clean'], shell=True) 19 | install.run(self) 20 | 21 | 22 | setuptools.setup( 23 | name='dckit', 24 | version='0.0.1', 25 | author='JC Liu', 26 | author_email='CLUE@CLUEbenchmarks.com', 27 | maintainer='DataCLUE', 28 | maintainer_email='CLUE@CLUEbenchmarks.com', 29 | description='Python toolkit for Data-centric Chinese Language Understanding Evaluation benchmark.', 30 | long_description=long_description, 31 | long_description_content_type='text/markdown', 32 | url='https://github.com/CLUEBenchmark/DataCLUE', 33 | include_package_data=True, 34 | packages=setuptools.find_packages(), 35 | classifiers=[ 36 | 'Programming Language :: Python :: 3', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Operating System :: OS Independent'], 39 | install_requires=[], 40 | cmdclass={'install': Installation}) 41 | --------------------------------------------------------------------------------
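
附:下面是一个基于 dckit 的最小示意(a minimal sketch)。其中 `drop_exact_duplicates` 的函数名与去重逻辑均为本仓库之外的假设示例,并非官方实现,仅用于展示如何按照 `baselines/single/template/template.py` 的模式编写自己的数据变换,并通过 dckit 提供的 `read_datasets` / `random_split_data` / `evaluate` 完成一轮评估:

```python
from dckit import read_datasets, random_split_data, evaluate


def drop_exact_duplicates(data):
    """示例变换(假设的实现):去掉 sentence 与 label 完全相同的重复样本,只保留首次出现的一条。"""
    seen = set()
    deduped = []
    for item in data['json']:
        key = (item['sentence'], item['label'])
        if key not in seen:
            seen.add(key)
            deduped.append(item)
    data['json'] = deduped
    return data


def main():
    data = read_datasets(dataset='cic')  # 读取合并后的 train/dev 数据
    data = drop_exact_duplicates(data)   # 在这里替换成你自己的数据变换
    random_split_data(data)              # 重新随机切分并写回 train.json / dev.json
    f1 = evaluate()                      # 训练基线模型并计算 Macro-F1
    print('Macro-F1=', f1)
    return f1


if __name__ == '__main__':
    main()
```

与 template.py 的约定一致:变换函数接收并返回 `data` 字典;如果修改了标签,请同时更新 `label` 和 `label_des` 字段。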