├── .gitignore ├── README.md ├── README_en.md ├── __init__.py ├── baselines ├── CONTRIBUTING.md ├── __init__.py ├── baseline_lxj │ ├── .gitignore │ ├── baseline_data │ │ ├── all.json │ │ ├── dev_1.json │ │ ├── dev_2.json │ │ ├── dev_3.json │ │ ├── dev_4.json │ │ ├── dev_5.json │ │ ├── dev_6.json │ │ ├── train_1.json │ │ ├── train_2.json │ │ ├── train_3.json │ │ ├── train_4.json │ │ ├── train_5.json │ │ └── train_6.json │ ├── dataclue.py │ ├── dataclue_change_label.py │ ├── requirements.txt │ ├── run.sh │ ├── run_multi_classify.py │ └── run_multi_classify_bert_multi_seed.sh ├── models_pytorch │ └── classifier_pytorch │ │ ├── README.md │ │ ├── compute_f1.py │ │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ │ ├── metrics │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── clue_compute_metrics.cpython-36.pyc │ │ │ └── clue_compute_metrics.cpython-37.pyc │ │ └── clue_compute_metrics.py │ │ ├── notebook │ │ └── rbt3_iflytek_gpu.ipynb │ │ ├── outputs │ │ ├── afqmc_output │ │ │ └── .gitignore │ │ ├── cmnli_output │ │ │ └── .gitignore │ │ ├── copa_output │ │ │ └── .gitignore │ │ ├── csl_output │ │ │ └── .gitignore │ │ ├── iflytek_output │ │ │ └── .gitignore │ │ ├── tnews_output │ │ │ └── .gitignore │ │ └── wsc_output │ │ │ └── .gitignore │ │ ├── processors │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── clue.cpython-36.pyc │ │ │ ├── clue.cpython-37.pyc │ │ │ ├── utils.cpython-36.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── clue.py │ │ └── utils.py │ │ ├── run_classifier.py │ │ ├── run_classifier_afqmc.sh │ │ ├── run_classifier_cic.sh │ │ ├── run_classifier_cic_torch12_py36.sh │ │ ├── run_classifier_iflytek.sh │ │ ├── run_classifier_iflytek_original.sh │ │ ├── run_classifier_qbqtc.sh │ │ ├── run_classifier_tnews.sh │ │ ├── run_classifier_triclue.sh │ │ ├── tools │ │ ├── __pycache__ │ │ │ ├── common.cpython-36.pyc │ │ │ ├── common.cpython-37.pyc │ │ │ ├── progressbar.cpython-36.pyc │ │ │ └── progressbar.cpython-37.pyc │ │ ├── common.py │ │ └── progressbar.py │ │ └── transformers │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── configuration_auto.cpython-36.pyc │ │ ├── configuration_auto.cpython-37.pyc │ │ ├── configuration_bert.cpython-36.pyc │ │ ├── configuration_bert.cpython-37.pyc │ │ ├── configuration_ctrl.cpython-36.pyc │ │ ├── configuration_ctrl.cpython-37.pyc │ │ ├── configuration_distilbert.cpython-36.pyc │ │ ├── configuration_distilbert.cpython-37.pyc │ │ ├── configuration_gpt2.cpython-36.pyc │ │ ├── configuration_gpt2.cpython-37.pyc │ │ ├── configuration_openai.cpython-36.pyc │ │ ├── configuration_openai.cpython-37.pyc │ │ ├── configuration_roberta.cpython-36.pyc │ │ ├── configuration_roberta.cpython-37.pyc │ │ ├── configuration_transfo_xl.cpython-36.pyc │ │ ├── configuration_transfo_xl.cpython-37.pyc │ │ ├── configuration_utils.cpython-36.pyc │ │ ├── configuration_utils.cpython-37.pyc │ │ ├── configuration_xlm.cpython-36.pyc │ │ ├── configuration_xlm.cpython-37.pyc │ │ ├── configuration_xlnet.cpython-36.pyc │ │ ├── configuration_xlnet.cpython-37.pyc │ │ ├── file_utils.cpython-36.pyc │ │ ├── file_utils.cpython-37.pyc │ │ ├── modeling_albert.cpython-36.pyc │ │ ├── modeling_albert.cpython-37.pyc │ │ ├── modeling_auto.cpython-36.pyc │ │ ├── modeling_auto.cpython-37.pyc │ │ ├── modeling_bert.cpython-36.pyc │ │ ├── modeling_bert.cpython-37.pyc │ │ ├── 
modeling_ctrl.cpython-36.pyc │ │ ├── modeling_ctrl.cpython-37.pyc │ │ ├── modeling_distilbert.cpython-36.pyc │ │ ├── modeling_distilbert.cpython-37.pyc │ │ ├── modeling_gpt2.cpython-36.pyc │ │ ├── modeling_gpt2.cpython-37.pyc │ │ ├── modeling_openai.cpython-36.pyc │ │ ├── modeling_openai.cpython-37.pyc │ │ ├── modeling_roberta.cpython-36.pyc │ │ ├── modeling_roberta.cpython-37.pyc │ │ ├── modeling_transfo_xl.cpython-36.pyc │ │ ├── modeling_transfo_xl.cpython-37.pyc │ │ ├── modeling_transfo_xl_utilities.cpython-36.pyc │ │ ├── modeling_transfo_xl_utilities.cpython-37.pyc │ │ ├── modeling_utils.cpython-36.pyc │ │ ├── modeling_utils.cpython-37.pyc │ │ ├── modeling_xlm.cpython-36.pyc │ │ ├── modeling_xlm.cpython-37.pyc │ │ ├── modeling_xlnet.cpython-36.pyc │ │ ├── modeling_xlnet.cpython-37.pyc │ │ ├── optimization.cpython-36.pyc │ │ ├── optimization.cpython-37.pyc │ │ ├── tokenization_auto.cpython-36.pyc │ │ ├── tokenization_auto.cpython-37.pyc │ │ ├── tokenization_bert.cpython-36.pyc │ │ ├── tokenization_bert.cpython-37.pyc │ │ ├── tokenization_ctrl.cpython-36.pyc │ │ ├── tokenization_ctrl.cpython-37.pyc │ │ ├── tokenization_distilbert.cpython-36.pyc │ │ ├── tokenization_distilbert.cpython-37.pyc │ │ ├── tokenization_gpt2.cpython-36.pyc │ │ ├── tokenization_gpt2.cpython-37.pyc │ │ ├── tokenization_openai.cpython-36.pyc │ │ ├── tokenization_openai.cpython-37.pyc │ │ ├── tokenization_roberta.cpython-36.pyc │ │ ├── tokenization_roberta.cpython-37.pyc │ │ ├── tokenization_transfo_xl.cpython-36.pyc │ │ ├── tokenization_transfo_xl.cpython-37.pyc │ │ ├── tokenization_utils.cpython-36.pyc │ │ ├── tokenization_utils.cpython-37.pyc │ │ ├── tokenization_xlm.cpython-36.pyc │ │ ├── tokenization_xlm.cpython-37.pyc │ │ ├── tokenization_xlnet.cpython-36.pyc │ │ └── tokenization_xlnet.cpython-37.pyc │ │ ├── configuration_auto.py │ │ ├── configuration_bert.py │ │ ├── configuration_ctrl.py │ │ ├── configuration_distilbert.py │ │ ├── configuration_gpt2.py │ │ ├── configuration_openai.py │ │ ├── configuration_roberta.py │ │ ├── configuration_transfo_xl.py │ │ ├── configuration_utils.py │ │ ├── configuration_xlm.py │ │ ├── configuration_xlnet.py │ │ ├── file_utils.py │ │ ├── modeling_albert.py │ │ ├── modeling_auto.py │ │ ├── modeling_bert.py │ │ ├── modeling_ctrl.py │ │ ├── modeling_distilbert.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_openai.py │ │ ├── modeling_roberta.py │ │ ├── modeling_transfo_xl.py │ │ ├── modeling_transfo_xl_utilities.py │ │ ├── modeling_utils.py │ │ ├── modeling_xlm.py │ │ ├── modeling_xlnet.py │ │ ├── optimization.py │ │ ├── tokenization_auto.py │ │ ├── tokenization_bert.py │ │ ├── tokenization_ctrl.py │ │ ├── tokenization_distilbert.py │ │ ├── tokenization_gpt2.py │ │ ├── tokenization_openai.py │ │ ├── tokenization_roberta.py │ │ ├── tokenization_transfo_xl.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_xlm.py │ │ └── tokenization_xlnet.py ├── multi │ ├── README.md │ └── simple_baseline │ │ ├── README.md │ │ └── simple_baseline.py └── single │ ├── README.md │ ├── __init__.py │ ├── data_aug │ ├── README.md │ ├── __init__.py │ ├── data_aug.py │ └── parallel_textda.py │ ├── data_mixup │ ├── README.md │ ├── __init__.py │ └── data_mixup.py │ ├── def_aug │ ├── README.md │ ├── __init__.py │ └── def_aug.py │ ├── delete_noise │ ├── README.md │ ├── __init__.py │ ├── classifier.py │ └── delete_noise.py │ └── template │ ├── README.md │ └── template.py ├── datasets ├── afqmc │ └── test_public.json ├── cic │ ├── README.txt │ └── test_public.json ├── iflytek │ └── test_public.json ├── qbqtc │ 
└── test_public.json ├── raw_afqmc │ ├── dev.json │ ├── readme.md │ ├── test_public.json │ └── train.json ├── raw_cic │ ├── README.txt │ ├── dev.json │ ├── labels.json │ ├── labels.txt │ ├── test_public.json │ └── train.json ├── raw_iflytek │ ├── dev.json │ ├── labels.json │ ├── test_public.json │ └── train.json ├── raw_qbqtc │ ├── dev.json │ ├── readme.md │ ├── test_public.json │ └── train.json ├── raw_tnews │ ├── dev.json │ ├── labels.json │ ├── test_public.json │ └── train.json ├── raw_triclue │ ├── dev.json │ ├── test_public.json │ └── train.json ├── tnews │ └── test_public.json └── triclue │ └── test_public.json ├── dckit ├── README.md ├── __init__.py ├── evaluate.py └── utils.py ├── resources ├── dataclue_submit_examples │ └── dataclue_submit_examples.zip ├── dataclue_submit_examples_old_nouse_iflytek │ └── dataclue_submit_examples.zip └── img │ ├── bxu.jpg │ ├── improve.jpeg │ ├── lifec.jpeg │ ├── takeaway2.jpeg │ └── teamgroup.jpeg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # Pyre type checker 109 | .pyre/ 110 | data/ 111 | 112 | 113 | .idea/ 114 | .vscode/ 115 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/__init__.py -------------------------------------------------------------------------------- /baselines/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # 如何给DataCLUE做贡献 2 | 3 | ## 如果你发现了问题 4 | 5 | 请直接提一个 issue 6 | 7 | ## 如果你采用了一个新的方法 8 | 9 | 请首先确认类似方法没有被提出。 10 | 11 | 如果没有,那么恭喜你发现了一个新方法,请以文件夹的方式添加你的方法添加到`baselines/single` 
12 | 我们推荐文件夹包含如下两个文件 13 | 14 | - README.md 描述算法的主要思路和测试的结果,也可以涵盖参数设置、参考文献等内容 15 | - xxxx.py 程序入口文件,最好和方法名一致 16 | 17 | 另外如果有用到特殊的库的话,请添加 18 | 19 | - requirements.txt 20 | 21 | 同时请添加你的方法在test_public上的结果和相应的链接到`baselines/single/README.md` 22 | 23 | ## 如果你尝试了一个方法组合 24 | 25 | 我们推荐先将单独的算法放到single文件夹中,然后请在`baselines/multi`中添加你组合实验的代码 26 | 27 | - README.md 描述算法的主要思路和测试的结果,也可以涵盖参数设置等内容 28 | - xxxx.py 程序入口文件,最好和方法名一致 29 | 30 | ## 编码规范 31 | 32 | 我们提供了一个简单接口用于读取中的数据和进行验证。为了使得大家的代码可以在方便进行整合,请大家采用统一的接口。 33 | 34 | 代码尽量遵循PEP8等规范。 35 | 36 | 37 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/__init__.py -------------------------------------------------------------------------------- /baselines/baseline_lxj/.gitignore: -------------------------------------------------------------------------------- 1 | *output* 2 | *lock 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/dataclue.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """TODO: Add a description here.""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import csv 21 | import json 22 | import os 23 | import pdb 24 | import json 25 | from tqdm import tqdm 26 | from ali_data_util import weibo_data_process 27 | 28 | import datasets 29 | 30 | label_file=open("../../datasets/cic/labels.txt") 31 | label_list=[line.strip() for line in label_file] 32 | label_file.close() 33 | 34 | # TODO: Add BibTeX citation 35 | _CITATION = """\ 36 | @InProceedings{huggingface:dataset, 37 | title = {A great new dataset}, 38 | authors={huggingface, Inc. 39 | }, 40 | year={2020} 41 | } 42 | """ 43 | 44 | # TODO: Add description of the dataset here 45 | _DESCRIPTION = """\ 46 | This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 47 | """ 48 | 49 | # _URL = "https://huggingface.co/great-new-dataset.zip" 50 | 51 | 52 | # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case 53 | # Using a specific configuration class is optional, you can also use the base class if you don't need 54 | # to add specific attributes. 55 | # here we give an example for three sub-set of the dataset with difference sizes. 
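# NOTE: as written, AliDatasetConfig.__init__ below stores `data_size` but never calls
# super().__init__(**kwargs), even though its docstring says the keyword arguments are
# forwarded to the parent. A minimal corrected sketch (keeping the same attribute) would be:
#
#     class AliDatasetConfig(datasets.BuilderConfig):
#         def __init__(self, data_size, **kwargs):
#             super().__init__(**kwargs)  # forward name/version/etc. to BuilderConfig
#             self.data_size = data_size
#
# This only matters if the configurations are re-enabled: BUILDER_CONFIG_CLASS and
# BUILDER_CONFIGS are commented out further down, so the class is currently unused.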
56 | class AliDatasetConfig(datasets.BuilderConfig): 57 | """ BuilderConfig for AliDataset""" 58 | 59 | def __init__(self, data_size, **kwargs): 60 | """ 61 | 62 | Args: 63 | data_size: the size of the training set we want to us (xs, s, m, l, xl) 64 | **kwargs: keyword arguments forwarded to super. 65 | """ 66 | self.data_size = data_size 67 | 68 | 69 | class AliDataset(datasets.GeneratorBasedBuilder): 70 | """TODO: Short description of my dataset.""" 71 | 72 | VERSION = datasets.Version("0.0.1") 73 | 74 | # This is an example of a dataset with multiple configurations. 75 | # If you don't want/need to define several sub-sets in your dataset, 76 | # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. 77 | # BUILDER_CONFIG_CLASS = AliDatasetConfig 78 | # BUILDER_CONFIGS = [ 79 | # AliDatasetConfig(name="my_dataset_" + size, description="A small dataset", data_size=size) for size in ["small", "medium", "large"] 80 | # ] 81 | 82 | def _info(self): 83 | # TODO: Specifies the datasets.DatasetInfo object 84 | return datasets.DatasetInfo( 85 | # This is the description that will appear on the datasets page. 86 | description=_DESCRIPTION, 87 | # This defines the different columns of the dataset and their types 88 | features=datasets.Features( 89 | { 90 | "sentence": datasets.Value("string"), 91 | "label": datasets.features.ClassLabel(names=label_list) 92 | # These are the features of your dataset like images, labels ... 93 | } 94 | ), 95 | # If there's a common (input, target) tuple from the features, 96 | # specify them here. They'll be used if as_supervised=True in 97 | # builder.as_dataset. 98 | supervised_keys=None, 99 | # Homepage of the dataset for documentation 100 | homepage="xiaoling@30.54.209.130:/media2/xiaoling/multi_classifier_model/yewu_classify", 101 | citation=_CITATION, 102 | ) 103 | 104 | def _split_generators(self, dl_manager): 105 | """Returns SplitGenerators.""" 106 | # TODO: Downloads the data and defines the splits 107 | # dl_manager is a datasets.download.DownloadManager that can be used to 108 | # download and extract URLs 109 | # dl_dir = dl_manager.download_and_extract(_URL) 110 | # data_dir = os.path.join(dl_dir, "great-new-dataset") 111 | if not self.config.data_files: 112 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 113 | data_files = dl_manager.download_and_extract(self.config.data_files) 114 | if isinstance(data_files,(list,tuple)): 115 | raise ValueError("not right input") 116 | if isinstance(data_files, str): 117 | return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_files})] 118 | if isinstance(data_files,dict): 119 | splits = [] 120 | for split_name, files in data_files.items(): 121 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"filepath": files})) 122 | return splits 123 | 124 | def preprocess_text(self, text, MAX_LENGTH=256): 125 | if len(text) > 2 * MAX_LENGTH: 126 | text = text[:MAX_LENGTH] + text[-MAX_LENGTH:] 127 | # obj前预处理是为了obj处理太耗时引起的,后处理的目的是防止源码截断的时候只取前max_length个token,而这边是前后各取half_max_length个token 128 | text = weibo_data_process(text) 129 | 130 | if len(text) > MAX_LENGTH: 131 | half_max_length = int(MAX_LENGTH/2) 132 | return text[:half_max_length] + text[-half_max_length:] 133 | else: 134 | return text 135 | 136 | def _generate_examples(self, filepath): 137 | """ Yields examples. 
""" 138 | # TODO: Yields (key, example) tuples from the dataset 139 | with open(filepath) as f: 140 | for id_, row in tqdm(enumerate(f)): 141 | # data = json.loads(row) 142 | # data=row.strip().split(',') 143 | data=json.loads(row.strip()) 144 | label=int(data["label"]) if "label" in data else 0 145 | sentence=data["sentence"] 146 | yield id_, { 147 | "sentence": sentence, 148 | # "sentence": self.preprocess_text(data[0]), 149 | # "sentence": data[0], 150 | "label": label, 151 | } 152 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/dataclue_change_label.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import os 3 | import sys 4 | import pdb 5 | import json 6 | import collections 7 | 8 | label_file=open("../../datasets/cic/labels.txt") 9 | label_lines=label_file.readlines() 10 | id_label_map={index:str(label_lines[index].strip()) for index in range(len(label_lines))} 11 | label_id_map={str(label_lines[index].strip()):index for index in range(len(label_lines))} 12 | 13 | all_sentences=[] 14 | all_labels=[] 15 | all_ids=[] 16 | all_label_des=[] 17 | 18 | all_source_lines={} 19 | for split_file_index in range(1,7): 20 | 21 | dev_file=open("./baseline_data/dev_{}.json".format(split_file_index),'r',encoding="utf-8") 22 | dev_lines=[json.loads(line.strip()) for line in dev_file] 23 | 24 | sentences=[line["sentence"] for line in dev_lines] 25 | labels=[line["label"] for line in dev_lines] 26 | ids=[line["id"] for line in dev_lines] 27 | label_des=[line["label_des"] for line in dev_lines] 28 | 29 | all_source_lines.update({ids[index]:dev_lines[index] for index in range(len(dev_lines))}) 30 | 31 | all_sentences.extend(sentences) 32 | all_labels.extend(labels) 33 | all_ids.extend(ids) 34 | all_label_des.extend(label_des) 35 | 36 | dev_result_map={} 37 | for seed in [8,9,10]: 38 | 39 | dev_result_map[seed]=[] 40 | 41 | for split_file_index in range(1,7): 42 | dev_result_file=open("./output_dir/dataclue_{}_{}/eval_preds_{}.txt".format(split_file_index,seed,seed),'r',encoding="utf-8") 43 | dev_results=[str(line.strip()) for line in dev_result_file] 44 | dev_result_map[seed].extend(dev_results) 45 | 46 | assert len(all_sentences)==len(all_labels)==len(all_ids)==len(dev_result_map[seed]) 47 | 48 | dev_result_map_prob={} 49 | for seed in [8,9,10]: 50 | 51 | dev_result_map_prob[seed]=[] 52 | 53 | for split_file_index in range(1,7): 54 | dev_result_file=open("./output_dir/dataclue_{}_{}/eval_probility_{}.txt".format(split_file_index,seed,seed),'r',encoding="utf-8") 55 | dev_results=[str(line.strip()) for line in dev_result_file] 56 | dev_result_map_prob[seed].extend(dev_results) 57 | 58 | assert len(all_sentences)==len(all_labels)==len(all_ids)==len(dev_result_map_prob[seed]) 59 | 60 | result_map={} 61 | average_score_list=[] 62 | for index in range(len(all_sentences)): 63 | average_score=str((float(dev_result_map_prob[8][index])+float(dev_result_map_prob[9][index])+float(dev_result_map_prob[10][index]))/3) 64 | average_score_list.append(average_score) 65 | result_map[average_score]=all_sentences[index]+"\t"+all_label_des[index]+"\t"+all_labels[index]+"\t"+str(all_ids[index])+"\t"+dev_result_map[8][index]+"\t"+dev_result_map[9][index]+"\t"+dev_result_map[10][index]+"\t"+dev_result_map_prob[8][index]+"\t"+dev_result_map_prob[9][index]+"\t"+dev_result_map_prob[10][index] 66 | 67 | need_change_sentence_index=[] 68 | 69 | count=0 70 | for index in range(len(all_sentences)): 71 | if 
float(average_score_list[index])>0.6 and dev_result_map[8][index]==dev_result_map[9][index]==dev_result_map[10][index] and dev_result_map[8][index]!=all_label_des[index]: 72 | need_change_id=all_ids[index] 73 | all_source_lines[need_change_id]["label_des"]=dev_result_map[8][index] 74 | all_source_lines[need_change_id]["label"]=str(label_id_map[dev_result_map[8][index]]) 75 | count+=1 76 | 77 | for id_,line in all_source_lines.items(): 78 | print(json.dumps(line, ensure_ascii=False)) 79 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.12.0.dev0 2 | datasets>=1.10.2 3 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | bash -x ./run_multi_classify_bert_multi_seed.sh 3 | python ./dataclue_change_label.py > result.json 4 | -------------------------------------------------------------------------------- /baselines/baseline_lxj/run_multi_classify_bert_multi_seed.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | model_type=bert 3 | epoch=6 4 | ttime=`date +"%Y-%m-%d-%H-%M"` 5 | echo $ttime 6 | 7 | for seed in $(seq 8 10);do 8 | for i in $(seq 1 6);do 9 | CUDA_VISIBLE_DEVICES=0 python ./run_multi_classify.py \ 10 | --model_name_or_path=bert-base-chinese \ 11 | --output_dir=./output_dir/dataclue_$i\_$seed \ 12 | --model_type=$model_type \ 13 | --train_file=./baseline_data/train_$i.json \ 14 | --validation_file=./baseline_data/dev_$i.json \ 15 | --test_file=../../datasets/cic/test_public.json \ 16 | --task_name=dataclue \ 17 | --per_device_train_batch_size=16 \ 18 | --num_train_epochs=$epoch \ 19 | --max_seq_length=64 \ 20 | --learning_rate=2e-5 \ 21 | --seed=$seed \ 22 | --overwrite_output_dir \ 23 | --overwrite_cache \ 24 | --do_train \ 25 | --do_eval \ 26 | --do_predict \ 27 | --evaluation_strategy=epoch \ 28 | --save_strategy=epoch 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # CLUE_pytorch 2 | 3 | 中文语言理解测评基准(Language Understanding Evaluation benchmark for Chinese) 4 | 5 | **备注**:此版本为个人开发版(目前支持所有的分类型任务),正式版见https://github.com/CLUEbenchmark/CLUE 6 | 7 | ## 代码目录说明 8 | 9 | ```text 10 | ├── CLUEdatasets # 存放数据 11 | | └── tnews    12 | | └── wsc  13 | | └── ... 14 | ├── metrics         # metric计算 15 | | └── clue_compute_metrics.py    16 | ├── outputs # 模型输出保存 17 | | └── tnews_output 18 | | └── wsc_output  19 | | └── ... 20 | ├── prev_trained_model # 预训练模型 21 | | └── albert_base 22 | | └── bert-wwm 23 | | └── ... 24 | ├── processors     # 数据处理 25 | | └── clue.py 26 | | └── ... 27 | ├── tools        # 通用脚本 28 | | └── progressbar.py 29 | | └── ... 30 | ├── transformers   # 主模型 31 | | └── modeling_albert.py 32 | | └── modeling_bert.py 33 | | └── ... 34 | ├── convert_albert_original_tf_checkpoint_to_pytorch.py # 模型文件转换 35 | ├── run_classifier.py # 主程序 36 | ├── run_classifier_tnews.sh # 任务运行脚本 37 | ├── download_clue_data.py # 数据集下载 38 | ``` 39 | ### 依赖模块 40 | 41 | - pytorch=1.1.0 42 | - boto3=1.9 43 | - regex 44 | - sacremoses 45 | - sentencepiece 46 | - python3.7+ 47 | 48 | ### 运行方式 49 | 50 | **1. 
下载CLUE数据集,运行以下命令:** 51 | ```python 52 | python download_clue_data.py --data_dir=./CLUEdatasets --tasks=all 53 | ``` 54 | 上述命令默认下载全CLUE数据集,你也可以指定`--tasks`进行下载对应任务数据集,默认存在在`./CLUEdatasets/{对应task}`目录下。 55 | 56 | **2. 若下载对应tf模型权重(若下载为pytorch权重,则跳过该步),运行转换脚本,比如转换`albert_base_tf`:** 57 | 58 | ```python 59 | python convert_albert_original_tf_checkpoint_to_pytorch.py \ 60 | --tf_checkpoint_path=./prev_trained_model/albert_base_tf \ 61 | --bert_config_file=./prev_trained_model/albert_base_tf/albert_config_base.json \ 62 | --pytorch_dump_path=./prev_trained_model/albert_base/pytorch_model.bin 63 | ``` 64 | **注意**: 当转换完模型(包括下载的pytorch模型权重)之后,需要在对应的文件夹内存放`config.json`和`vocab.txt`文件,比如: 65 | 66 | ```text 67 | ├── prev_trained_model # 预训练模型 68 | | └── bert-base 69 | | | └── vocab.txt 70 | | | └── config.json 71 | | | └── pytorch_model.bin 72 | 73 | ``` 74 | **3. 直接运行对应任务sh脚本,如:** 75 | 76 | ```shell 77 | sh run_classifier_tnews.sh 78 | ``` 79 | **4. 评估** 80 | 81 | 当前默认使用最后一个checkpoint模型作为评估模型,你也可以指定`--predict_checkpoints`参数进行对应的checkpoint进行评估,比如: 82 | ```python 83 | CURRENT_DIR=`pwd` 84 | export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base 85 | export GLUE_DIR=$CURRENT_DIR/CLUEdatasets 86 | export OUTPUR_DIR=$CURRENT_DIR/outputs 87 | TASK_NAME="copa" 88 | 89 | python run_classifier.py \ 90 | --model_type=bert \ 91 | --model_name_or_path=$BERT_BASE_DIR \ 92 | --task_name=$TASK_NAME \ 93 | --do_predict \ 94 | --predict_checkpoints=100 \ 95 | --do_lower_case \ 96 | --data_dir=$GLUE_DIR/${TASK_NAME}/ \ 97 | --max_seq_length=128 \ 98 | --per_gpu_train_batch_size=16 \ 99 | --per_gpu_eval_batch_size=16 \ 100 | --learning_rate=1e-5 \ 101 | --num_train_epochs=2.0 \ 102 | --logging_steps=50 \ 103 | --save_steps=50 \ 104 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 105 | --overwrite_output_dir \ 106 | --seed=42 107 | ``` 108 | 109 | ### 模型列表 110 | 111 | ``` 112 | MODEL_CLASSES = { 113 | ## bert ernie bert_wwm bert_wwwm_ext 114 | 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 115 | # xlnet_base xlnet_mid xlnet_large 116 | 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 117 | # roberta_base roberta_wwm roberta_wwm_ext roberta_wwm_large_ext 118 | 'roberta': (BertConfig, BertForSequenceClassification, BertTokenizer), 119 | # albert_tiny albert_base albert_large albert_xlarge 120 | 'albert': (BertConfig, AlbertForSequenceClassification, BertTokenizer) 121 | } 122 | ``` 123 | **注意**: bert ernie bert_wwm bert_wwwm_ext等模型只是权重不一样,而模型本身主体一样,因此参数`model_type=bert`其余同理。 124 | 125 | ### 结果 126 | 127 | 当前按照https://github.com/CLUEbenchmark/CLUE 提供的参数,除了**COPA**任务无法复现,其余任务基本保持一致。 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/compute_f1.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | from sklearn.metrics import f1_score 5 | 6 | def compute_f1_score_by_list(y_true_list,y_pred_list): 7 | #y_true = [1, 1, 1, 1, 2, 2, 2, 3, 3] 8 | # y_pred = [1, 1, 2, 3, 2, 2, 3, 2, 3] 9 | f1_micro = f1_score(y_true_list, y_pred_list, average='micro') 10 | f1_macro = f1_score(y_true_list, y_pred_list, average='macro') 11 | print('f1_micro: {0}'.format(f1_micro)) 12 | print('f1_macro: {0}'.format(f1_macro)) 13 | 14 | def compute_score_fn(target_file, predict_file): 15 | predict_object=open(predict_file,'r') 16 | predict_lines=predict_object.readlines() 17 | 18 | target_object=open(target_file,'r') 19 | 
target_lines=target_object.readlines() 20 | countt=0 21 | total_ignore=0 22 | y_pred_list=[] 23 | y_true_list=[] 24 | for i, source_line in enumerate(predict_lines): 25 | source_line_json=json.loads(source_line) 26 | predict_label=source_line_json['label'] 27 | y_pred_list.append(predict_label) 28 | target_line_json=json.loads(target_lines[i]) 29 | target_label=target_line_json['label'] 30 | y_true_list.append(target_label) 31 | if str(target_label)=='-1': 32 | total_ignore=total_ignore+1 33 | continue 34 | if predict_label==target_label: 35 | countt=countt+1 36 | 37 | compute_f1_score_by_list(y_true_list, y_pred_list) 38 | avg=float(countt)/float(len(target_lines)-total_ignore) 39 | print("avg:",avg,";total_ignore:",total_ignore,";target_lines:",len(target_lines)) 40 | 41 | 42 | target_file='test_public.json' 43 | predict_file='test_public_preidct.json' 44 | compute_score_fn(target_file, predict_file) 45 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | """Convert BERT checkpoint.""" 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import argparse 8 | import torch 9 | 10 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 11 | 12 | import logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 16 | # Initialise PyTorch model 17 | config = BertConfig.from_json_file(bert_config_file) 18 | print("Building PyTorch model from configuration: {}".format(str(config))) 19 | model = BertForPreTraining(config) 20 | 21 | # Load weights from tf checkpoint 22 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 23 | 24 | # Save pytorch-model 25 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 26 | torch.save(model.state_dict(), pytorch_dump_path) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | ## Required parameters 32 | parser.add_argument("--tf_checkpoint_path", 33 | default = None, 34 | type = str, 35 | required = True, 36 | help = "Path to the TensorFlow checkpoint path.") 37 | parser.add_argument("--bert_config_file", 38 | default = None, 39 | type = str, 40 | required = True, 41 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 42 | "This specifies the model architecture.") 43 | parser.add_argument("--pytorch_dump_path", 44 | default = None, 45 | type = str, 46 | required = True, 47 | help = "Path to the output PyTorch model.") 48 | args = parser.parse_args() 49 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 50 | args.bert_config_file, 51 | args.pytorch_dump_path) 52 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__init__.py -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/metrics/__pycache__/clue_compute_metrics.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/metrics/clue_compute_metrics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | try: 8 | from scipy.stats import pearsonr, spearmanr 9 | from sklearn.metrics import matthews_corrcoef, f1_score 10 | _has_sklearn = True 11 | except (AttributeError, ImportError) as e: 12 | logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") 13 | _has_sklearn = False 14 | 15 | def simple_accuracy(preds, labels): 16 | return (preds == labels).mean() 17 | 18 | def acc_and_f1(preds, labels): 19 | acc = simple_accuracy(preds, labels) 20 | f1 = f1_score(y_true=labels, y_pred=preds,average="macro") 21 | return { 22 | "acc": acc, 23 | "f1": f1, 24 | "acc_and_f1": (acc + f1) / 2, 25 | } 26 | 27 | 28 | def pearson_and_spearman(preds, labels): 29 | pearson_corr = pearsonr(preds, labels)[0] 30 | spearman_corr = spearmanr(preds, labels)[0] 31 | return { 32 | "pearson": pearson_corr, 33 | "spearmanr": spearman_corr, 34 | "corr": (pearson_corr + spearman_corr) / 2, 35 | } 36 | 37 | def compute_metrics(task_name, preds, labels): 38 | assert len(preds) == len(labels) 39 | if task_name == "cls": 40 | return {"acc": simple_accuracy(preds, labels)} 41 | elif task_name == "cmnli": 42 | return {"acc": simple_accuracy(preds, labels)} 43 | elif task_name == "ocnli": 44 | return {"acc": simple_accuracy(preds, labels)} 45 | elif task_name == "iflytek": 46 | return {"acc": simple_accuracy(preds, labels)} 47 | elif task_name == "wsc": 48 | return {"acc": simple_accuracy(preds, labels)} 49 | elif task_name == "tnews": 50 | return {"acc": simple_accuracy(preds, labels)} 51 | elif task_name == "afqmc": 52 | return {"acc": simple_accuracy(preds, labels)} 53 | elif task_name == "copa": 54 | return {"acc": simple_accuracy(preds, labels)} 55 | elif task_name == "cic": 56 | return {"acc": acc_and_f1(preds, labels)} 57 | else: 58 | raise KeyError(task_name) 59 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/afqmc_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/cmnli_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/copa_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/csl_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/iflytek_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/tnews_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/outputs/wsc_output/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import InputExample, InputFeatures, DataProcessor 2 | from .clue import (clue_output_modes, clue_processors, clue_tasks_num_labels, 3 | clue_convert_examples_to_features, collate_fn, xlnet_collate_fn) 4 | 5 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/clue.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/processors/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/processors/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import copy 4 | import json 5 | 6 | class InputExample(object): 7 | """ 8 | A single training/test example for simple sequence classification. 9 | 10 | Args: 11 | guid: Unique id for the example. 12 | text_a: string. The untokenized text of the first sequence. For single 13 | sequence tasks, only this sequence must be specified. 14 | text_b: (Optional) string. The untokenized text of the second sequence. 15 | Only must be specified for sequence pair tasks. 16 | label: (Optional) string. The label of the example. This should be 17 | specified for train and dev examples, but not for test examples. 18 | """ 19 | def __init__(self, guid, text_a, text_b=None, label=None): 20 | self.guid = guid 21 | self.text_a = text_a 22 | self.text_b = text_b 23 | self.label = label 24 | 25 | def __repr__(self): 26 | return str(self.to_json_string()) 27 | 28 | def to_dict(self): 29 | """Serializes this instance to a Python dictionary.""" 30 | output = copy.deepcopy(self.__dict__) 31 | return output 32 | 33 | def to_json_string(self): 34 | """Serializes this instance to a JSON string.""" 35 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 36 | 37 | 38 | class InputFeatures(object): 39 | """ 40 | A single set of features of data. 41 | 42 | Args: 43 | input_ids: Indices of input sequence tokens in the vocabulary. 44 | attention_mask: Mask to avoid performing attention on padding token indices. 45 | Mask values selected in ``[0, 1]``: 46 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 47 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 
48 | label: Label corresponding to the input 49 | """ 50 | 51 | def __init__(self, input_ids, attention_mask, token_type_ids, label,input_len): 52 | self.input_ids = input_ids 53 | self.attention_mask = attention_mask 54 | self.token_type_ids = token_type_ids 55 | self.input_len = input_len 56 | self.label = label 57 | 58 | def __repr__(self): 59 | return str(self.to_json_string()) 60 | 61 | def to_dict(self): 62 | """Serializes this instance to a Python dictionary.""" 63 | output = copy.deepcopy(self.__dict__) 64 | return output 65 | 66 | def to_json_string(self): 67 | """Serializes this instance to a JSON string.""" 68 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 69 | 70 | 71 | class DataProcessor(object): 72 | """Base class for data converters for sequence classification data sets.""" 73 | 74 | def get_train_examples(self, data_dir): 75 | """Gets a collection of `InputExample`s for the train set.""" 76 | raise NotImplementedError() 77 | 78 | def get_dev_examples(self, data_dir): 79 | """Gets a collection of `InputExample`s for the dev set.""" 80 | raise NotImplementedError() 81 | 82 | def get_labels(self): 83 | """Gets the list of labels for this data set.""" 84 | raise NotImplementedError() 85 | 86 | @classmethod 87 | def _read_tsv(cls, input_file, quotechar=None): 88 | """Reads a tab separated value file.""" 89 | with open(input_file, "r", encoding="utf-8-sig") as f: 90 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 91 | lines = [] 92 | for line in reader: 93 | lines.append(line) 94 | return lines 95 | 96 | @classmethod 97 | def _read_json(cls, input_file): 98 | """Reads a json list file.""" 99 | with open(input_file, "r") as f: 100 | reader = f.readlines() 101 | lines = [] 102 | for line in reader: 103 | lines.append(json.loads(line.strip())) 104 | return lines 105 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_afqmc.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="afqmc" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 
34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_cic.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="cic" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
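# The call below fine-tunes the checkpoint in $ROBERTA_WWM_SMALL_DIR on the files under
# $CLUE_DATA_DIR/$TASK_NAME/: --do_train --do_eval trains and then evaluates on the dev split,
# --max_seq_length=32 truncates or pads each input to 32 tokens, and training runs with a
# per-GPU batch size of 64 at learning rate 2e-5 for 15 epochs, logging and saving checkpoints
# to $OUTPUT_DIR every 300 steps. Run the script with no arguments to train and evaluate, or
# with "predict" (handled further below) to write predictions for the test set.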
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_cic_torch12_py36.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | source /root/anaconda/bin/activate /root/anaconda/envs/torch_1.2_cu10.0_py36 8 | conda init torch_1.2_cu10.0_py36 9 | 10 | export LC_ALL="en_US.utf8" 11 | TASK_NAME="cic" 12 | MODEL_NAME="chinese_rbtl3_pytorch" 13 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 14 | echo "CURRENT_DIR:"+$CURRENT_DIR 15 | export CUDA_VISIBLE_DEVICES="0" 16 | export CLUE_DATA_DIR=../../../datasets # that is under project path 17 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 18 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 19 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 20 | 21 | # download base model if not exists 22 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 23 | mkdir -p $ROBERTA_WWM_SMALL_DIR 24 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 25 | fi 26 | cd $ROBERTA_WWM_SMALL_DIR 27 | 28 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 29 | echo "Model not exists, will downloda it now..." 30 | # rm * 31 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 32 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 33 | unzip chinese_rbtl3_pytorch.zip 34 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 35 | else 36 | echo "Model exists, will reuse it." 37 | fi 38 | 39 | # run task 40 | cd $CURRENT_DIR 41 | echo "Start running..." 42 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 43 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 44 | 45 | if [ $# == 0 ]; then 46 | echo "Start training..." 
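# Same invocation as in run_classifier_cic.sh, but this variant first activates a pinned
# conda environment (PyTorch 1.2, CUDA 10.0, Python 3.6) and trains with a smaller per-GPU
# batch size (32) for only 2 epochs. The predict branch below still passes
# --num_train_epochs=15, which is harmless because no training happens when only
# --do_predict is set.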
47 | python run_classifier.py \ 48 | --model_type=bert \ 49 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 50 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 51 | --task_name=$TASK_NAME \ 52 | --do_train \ 53 | --do_eval \ 54 | --do_lower_case \ 55 | --max_seq_length=32 \ 56 | --per_gpu_train_batch_size=32 \ 57 | --per_gpu_eval_batch_size=32 \ 58 | --learning_rate=2e-5 \ 59 | --num_train_epochs=2 \ 60 | --logging_steps=300 \ 61 | --save_steps=300 \ 62 | --output_dir=$OUTPUT_DIR \ 63 | --overwrite_output_dir \ 64 | --seed=42 65 | 66 | # run below lines to generate predicted file on test.json 67 | elif [ $1 == "predict" ]; then 68 | echo "Start predict..." 69 | python run_classifier.py \ 70 | --model_type=bert \ 71 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 72 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 73 | --task_name=$TASK_NAME \ 74 | --do_predict \ 75 | --do_lower_case \ 76 | --max_seq_length=32 \ 77 | --per_gpu_train_batch_size=32 \ 78 | --per_gpu_eval_batch_size=32 \ 79 | --learning_rate=2e-5 \ 80 | --num_train_epochs=15 \ 81 | --logging_steps=300 \ 82 | --save_steps=300 \ 83 | --output_dir=$OUTPUT_DIR \ 84 | --overwrite_output_dir \ 85 | --seed=42 86 | fi 87 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_iflytek.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="iflytek" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
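# iflytek examples are long app descriptions, so this script uses --max_seq_length=256 instead
# of the 32 tokens used by the short-text tasks above, and trains for 6 epochs rather than 15.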
44 |   python run_classifier.py \
45 |     --model_type=bert \
46 |     --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \
47 |     --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \
48 |     --task_name=$TASK_NAME \
49 |     --do_train \
50 |     --do_eval \
51 |     --do_lower_case \
52 |     --max_seq_length=256 \
53 |     --per_gpu_train_batch_size=64 \
54 |     --per_gpu_eval_batch_size=32 \
55 |     --learning_rate=2e-5 \
56 |     --num_train_epochs=6 \
57 |     --logging_steps=300 \
58 |     --save_steps=300 \
59 |     --output_dir=$OUTPUT_DIR \
60 |     --overwrite_output_dir \
61 |     --seed=42
62 | 
63 | # run the lines below to generate a prediction file on test.json
64 | elif [ $1 == "predict" ]; then
65 |   echo "Start predicting..."
66 |   python run_classifier.py \
67 |     --model_type=bert \
68 |     --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \
69 |     --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \
70 |     --task_name=$TASK_NAME \
71 |     --do_predict \
72 |     --do_lower_case \
73 |     --max_seq_length=256 \
74 |     --per_gpu_train_batch_size=32 \
75 |     --per_gpu_eval_batch_size=32 \
76 |     --learning_rate=2e-5 \
77 |     --num_train_epochs=6 \
78 |     --logging_steps=300 \
79 |     --save_steps=300 \
80 |     --output_dir=$OUTPUT_DIR \
81 |     --overwrite_output_dir \
82 |     --seed=42
83 | fi
84 | 
--------------------------------------------------------------------------------
/baselines/models_pytorch/classifier_pytorch/run_classifier_iflytek_original.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # @Author: bo.shi
3 | # @Date: 2019-11-04 09:56:36
4 | # @Last Modified by: bo.shi
5 | # @Last Modified time: 2020-01-01 11:43:42
6 | 
7 | TASK_NAME="iflytek"
8 | MODEL_NAME="../../../../local_models/chinese_rbtl3_pytorch"
9 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
10 | export CUDA_VISIBLE_DEVICES="0"
11 | export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
12 | export BERT_WWM_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME
13 | export CLUE_DATA_DIR=$CURRENT_DIR/CLUEdatasets
14 | export pretrained_model_url=https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip
15 | 
16 | # download and unzip dataset
17 | if [ ! -d $CLUE_DATA_DIR ]; then
18 |   mkdir -p $CLUE_DATA_DIR
19 |   echo "makedir $CLUE_DATA_DIR"
20 | fi
21 | cd $CLUE_DATA_DIR
22 | if [ ! -d $TASK_NAME ]; then
23 |   mkdir $TASK_NAME
24 |   echo "makedir $CLUE_DATA_DIR/$TASK_NAME"
25 | fi
26 | cd $TASK_NAME
27 | if [ ! -f "train.json" ] || [ ! -f "dev.json" ] || [ ! -f "test.json" ]; then
28 |   rm *
29 |   wget https://storage.googleapis.com/cluebenchmark/tasks/iflytek_public.zip
30 |   unzip iflytek_public.zip
31 |   rm iflytek_public.zip
32 | else
33 |   echo "data exists"
34 | fi
35 | echo "Finished downloading dataset."
36 | 
37 | # make output dir
38 | if [ ! -d $CURRENT_DIR/${TASK_NAME}_output ]; then
39 |   mkdir -p $CURRENT_DIR/${TASK_NAME}_output
40 |   echo "makedir $CURRENT_DIR/${TASK_NAME}_output"
41 | fi
42 | 
43 | # run task
44 | cd $CURRENT_DIR
45 | echo "Start running..."
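# With no arguments the block below fine-tunes on the freshly downloaded train.json and
# evaluates on dev.json; passing "predict" as the first argument instead generates predictions
# for test.json under ${TASK_NAME}_output/. Note that MODEL_NAME points at a local checkpoint
# directory and $pretrained_model_url is exported for reference only; this script does not
# download the pretrained model itself.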
46 | if [ $# == 0 ]; then 47 | python run_classifier.py \ 48 | --model_type=bert \ 49 | --model_name_or_path=$MODEL_NAME \ 50 | --task_name=$TASK_NAME \ 51 | --do_train \ 52 | --do_eval \ 53 | --do_lower_case \ 54 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 55 | --max_seq_length=128 \ 56 | --per_gpu_train_batch_size=16 \ 57 | --per_gpu_eval_batch_size=16 \ 58 | --learning_rate=2e-5 \ 59 | --num_train_epochs=3.0 \ 60 | --logging_steps=759 \ 61 | --save_steps=759 \ 62 | --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ 63 | --overwrite_output_dir \ 64 | --seed=42 65 | elif [ $1 == "predict" ]; then 66 | echo "Start predict..." 67 | python run_classifier.py \ 68 | --model_type=bert \ 69 | --model_name_or_path=$MODEL_NAME \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 74 | --max_seq_length=128 \ 75 | --per_gpu_train_batch_size=16 \ 76 | --per_gpu_eval_batch_size=16 \ 77 | --learning_rate=2e-5 \ 78 | --num_train_epochs=3.0 \ 79 | --logging_steps=759 \ 80 | --save_steps=759 \ 81 | --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ 82 | --overwrite_output_dir \ 83 | --seed=42 84 | fi 85 | 86 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_qbqtc.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="qbqtc" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_tnews.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="tnews" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/run_classifier_triclue.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | # @Author: bo.shi 4 | # @Date: 2019-11-04 09:56:36 5 | # @Last Modified by: bo.shi 6 | # @Last Modified time: 2019-12-05 11:23:45 7 | 8 | TASK_NAME="triclue" 9 | MODEL_NAME="chinese_rbtl3_pytorch" 10 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 11 | echo "CURRENT_DIR:"+$CURRENT_DIR 12 | export CUDA_VISIBLE_DEVICES="0" 13 | export CLUE_DATA_DIR=../../../datasets # that is under project path 14 | export OUTPUT_DIR=../../../output_dir/ # # that is under project path 15 | export PRETRAINED_MODELS_DIR=../../../pre_trained_model # that is project model 16 | export ROBERTA_WWM_SMALL_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME 17 | 18 | # download base model if not exists 19 | if [ ! -d $ROBERTA_WWM_SMALL_DIR ]; then 20 | mkdir -p $ROBERTA_WWM_SMALL_DIR 21 | echo "makedir $ROBERTA_WWM_SMALL_DIR" 22 | fi 23 | cd $ROBERTA_WWM_SMALL_DIR 24 | 25 | if [ ! -f "config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "pytorch_model.bin" ] ; then 26 | echo "Model not exists, will downloda it now..." 27 | # rm * 28 | # you can find detail of the base model from here: https://github.com/ymcui/Chinese-BERT-wwm 29 | wget -c https://storage.googleapis.com/cluebenchmark/pretrained_models/chinese_rbtl3_pytorch.zip 30 | unzip chinese_rbtl3_pytorch.zip 31 | rm chinese_rbtl3_pytorch.zip # chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip 32 | else 33 | echo "Model exists, will reuse it." 34 | fi 35 | 36 | # run task 37 | cd $CURRENT_DIR 38 | echo "Start running..." 39 | echo "Data folder.CLUE_DATA_DIR:"$CLUE_DATA_DIR 40 | echo "Model folder.ROBERTA_WWM_SMALL_DIR:"$ROBERTA_WWM_SMALL_DIR 41 | 42 | if [ $# == 0 ]; then 43 | echo "Start training..." 
44 | python run_classifier.py \ 45 | --model_type=bert \ 46 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 47 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 48 | --task_name=$TASK_NAME \ 49 | --do_train \ 50 | --do_eval \ 51 | --do_lower_case \ 52 | --max_seq_length=32 \ 53 | --per_gpu_train_batch_size=64 \ 54 | --per_gpu_eval_batch_size=32 \ 55 | --learning_rate=2e-5 \ 56 | --num_train_epochs=15 \ 57 | --logging_steps=300 \ 58 | --save_steps=300 \ 59 | --output_dir=$OUTPUT_DIR \ 60 | --overwrite_output_dir \ 61 | --seed=42 62 | 63 | # run below lines to generate predicted file on test.json 64 | elif [ $1 == "predict" ]; then 65 | echo "Start predict..." 66 | python run_classifier.py \ 67 | --model_type=bert \ 68 | --model_name_or_path=$ROBERTA_WWM_SMALL_DIR \ 69 | --data_dir=$CLUE_DATA_DIR/${TASK_NAME}/ \ 70 | --task_name=$TASK_NAME \ 71 | --do_predict \ 72 | --do_lower_case \ 73 | --max_seq_length=32 \ 74 | --per_gpu_train_batch_size=64 \ 75 | --per_gpu_eval_batch_size=32 \ 76 | --learning_rate=2e-5 \ 77 | --num_train_epochs=15 \ 78 | --logging_steps=300 \ 79 | --save_steps=300 \ 80 | --output_dir=$OUTPUT_DIR \ 81 | --overwrite_output_dir \ 82 | --seed=42 83 | fi 84 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/tools/__pycache__/progressbar.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/tools/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class ProgressBar(object): 4 | ''' 5 | custom progress bar 6 | Example: 7 | >>> pbar = ProgressBar(n_total=30,desc='Training') 8 | >>> step = 2 9 | >>> pbar(step=step) 10 | ''' 11 | def __init__(self, n_total,width=30,desc = 'Training'): 12 | self.width = width 13 | self.n_total = n_total 14 | self.start_time = time.time() 15 | self.desc = desc 16 | 17 | def 
__call__(self, step, info={}): 18 | now = time.time() 19 | current = step + 1 20 | recv_per = current / self.n_total 21 | bar = f'[{self.desc}] {current}/{self.n_total} [' 22 | if recv_per >= 1: 23 | recv_per = 1 24 | prog_width = int(self.width * recv_per) 25 | if prog_width > 0: 26 | bar += '=' * (prog_width - 1) 27 | if current< self.n_total: 28 | bar += ">" 29 | else: 30 | bar += '=' 31 | bar += '.' * (self.width - prog_width) 32 | bar += ']' 33 | show_bar = f"\r{bar}" 34 | time_per_unit = (now - self.start_time) / current 35 | if current < self.n_total: 36 | eta = time_per_unit * (self.n_total - current) 37 | if eta > 3600: 38 | eta_format = ('%d:%02d:%02d' % 39 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 40 | elif eta > 60: 41 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 42 | else: 43 | eta_format = '%ds' % eta 44 | time_info = f' - ETA: {eta_format}' 45 | else: 46 | if time_per_unit >= 1: 47 | time_info = f' {time_per_unit:.1f}s/step' 48 | elif time_per_unit >= 1e-3: 49 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 50 | else: 51 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 52 | 53 | show_bar += time_info 54 | if len(info) != 0: 55 | show_info = f'{show_bar} ' + \ 56 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 57 | print(show_info, end='') 58 | else: 59 | print(show_bar, end='') 60 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.1.1" 2 | 3 | # Work around to update TensorFlow's absl.logging threshold which alters the 4 | # default Python logging output behavior when present. 5 | # see: https://github.com/abseil/abseil-py/issues/99 6 | # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 7 | try: 8 | import absl.logging 9 | absl.logging.set_verbosity('info') 10 | absl.logging.set_stderrthreshold('info') 11 | absl.logging._warn_preinit_stderr = False 12 | except: 13 | pass 14 | 15 | import logging 16 | 17 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 18 | 19 | # Files and general utilities 20 | from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, 21 | cached_path, add_start_docstrings, add_end_docstrings, 22 | WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, 23 | is_tf_available, is_torch_available) 24 | 25 | # Tokenizers 26 | from .tokenization_utils import (PreTrainedTokenizer) 27 | from .tokenization_auto import AutoTokenizer 28 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 29 | from .tokenization_openai import OpenAIGPTTokenizer 30 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 31 | from .tokenization_gpt2 import GPT2Tokenizer 32 | from .tokenization_ctrl import CTRLTokenizer 33 | from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE 34 | from .tokenization_xlm import XLMTokenizer 35 | from .tokenization_roberta import RobertaTokenizer 36 | from .tokenization_distilbert import DistilBertTokenizer 37 | 38 | # Configurations 39 | from .configuration_utils import PretrainedConfig 40 | from .configuration_auto import AutoConfig 41 | from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 42 | from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | from .configuration_transfo_xl 
import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 44 | from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 45 | from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 46 | from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 47 | from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 48 | from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP 49 | from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 50 | from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 51 | 52 | # Modeling 53 | if is_torch_available(): 54 | from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) 55 | from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, 56 | AutoModelWithLMHead) 57 | 58 | from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, 59 | BertForMaskedLM, BertForNextSentencePrediction, 60 | BertForSequenceClassification, BertForMultipleChoice, 61 | BertForTokenClassification, BertForQuestionAnswering, 62 | load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) 63 | from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, 64 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 65 | load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) 66 | from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, 67 | load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) 68 | from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, 69 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 70 | load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) 71 | from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel, 72 | CTRLLMHeadModel, 73 | CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) 74 | from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, 75 | XLNetForSequenceClassification, XLNetForMultipleChoice, 76 | XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering, 77 | load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) 78 | from .modeling_xlm import (XLMPreTrainedModel , XLMModel, 79 | XLMWithLMHeadModel, XLMForSequenceClassification, 80 | XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, 81 | XLM_PRETRAINED_MODEL_ARCHIVE_MAP) 82 | from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, 83 | RobertaForSequenceClassification, RobertaForMultipleChoice, 84 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) 85 | from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, 86 | DistilBertForSequenceClassification, DistilBertForQuestionAnswering, 87 | DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) 88 | from .modeling_albert import AlbertForSequenceClassification 89 | 90 | # Optimization 91 | from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, 92 | WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 93 | if not is_tf_available() and not is_torch_available(): 94 | logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found." 
95 | "Models won't be available and only tokenizers, configuration" 96 | "and file/data utilities can be used.") 97 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" 7 | "It should be used as one of: \n" 8 | ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 9 | ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 10 | ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 11 | ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 12 | ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 13 | ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 14 | else: 15 | if sys.argv[1] == "bert": 16 | try: 17 | from convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 18 | except ImportError: 19 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 20 | "In that case, it requires TensorFlow to be installed. Please see " 21 | "https://www.tensorflow.org/install/ for installation instructions.") 22 | raise 23 | 24 | if len(sys.argv) != 5: 25 | # pylint: disable=line-too-long 26 | print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 27 | else: 28 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 29 | TF_CONFIG = sys.argv.pop() 30 | TF_CHECKPOINT = sys.argv.pop() 31 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 32 | elif sys.argv[1] == "gpt": 33 | from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 34 | if len(sys.argv) < 4 or len(sys.argv) > 5: 35 | # pylint: disable=line-too-long 36 | print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") 37 | else: 38 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 39 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 40 | if len(sys.argv) == 5: 41 | OPENAI_GPT_CONFIG = sys.argv[4] 42 | else: 43 | OPENAI_GPT_CONFIG = "" 44 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 45 | OPENAI_GPT_CONFIG, 46 | PYTORCH_DUMP_OUTPUT) 47 | elif sys.argv[1] == "transfo_xl": 48 | try: 49 | from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 50 | except ImportError: 51 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 52 | "In that case, it requires TensorFlow to be installed. 
Please see " 53 | "https://www.tensorflow.org/install/ for installation instructions.") 54 | raise 55 | if len(sys.argv) < 4 or len(sys.argv) > 5: 56 | # pylint: disable=line-too-long 57 | print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 58 | else: 59 | if 'ckpt' in sys.argv[2].lower(): 60 | TF_CHECKPOINT = sys.argv[2] 61 | TF_DATASET_FILE = "" 62 | else: 63 | TF_DATASET_FILE = sys.argv[2] 64 | TF_CHECKPOINT = "" 65 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 66 | if len(sys.argv) == 5: 67 | TF_CONFIG = sys.argv[4] 68 | else: 69 | TF_CONFIG = "" 70 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 71 | elif sys.argv[1] == "gpt2": 72 | try: 73 | from convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 74 | except ImportError: 75 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 76 | "In that case, it requires TensorFlow to be installed. Please see " 77 | "https://www.tensorflow.org/install/ for installation instructions.") 78 | raise 79 | 80 | if len(sys.argv) < 4 or len(sys.argv) > 5: 81 | # pylint: disable=line-too-long 82 | print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 83 | else: 84 | TF_CHECKPOINT = sys.argv[2] 85 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 86 | if len(sys.argv) == 5: 87 | TF_CONFIG = sys.argv[4] 88 | else: 89 | TF_CONFIG = "" 90 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 91 | elif sys.argv[1] == "xlnet": 92 | try: 93 | from convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch 94 | except ImportError: 95 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 96 | "In that case, it requires TensorFlow to be installed. 
Please see " 97 | "https://www.tensorflow.org/install/ for installation instructions.") 98 | raise 99 | 100 | if len(sys.argv) < 5 or len(sys.argv) > 6: 101 | # pylint: disable=line-too-long 102 | print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") 103 | else: 104 | TF_CHECKPOINT = sys.argv[2] 105 | TF_CONFIG = sys.argv[3] 106 | PYTORCH_DUMP_OUTPUT = sys.argv[4] 107 | if len(sys.argv) == 6: 108 | FINETUNING_TASK = sys.argv[5] 109 | else: 110 | FINETUNING_TASK = None 111 | 112 | convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, 113 | TF_CONFIG, 114 | PYTORCH_DUMP_OUTPUT, 115 | FINETUNING_TASK) 116 | elif sys.argv[1] == "xlm": 117 | from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch 118 | 119 | if len(sys.argv) != 4: 120 | # pylint: disable=line-too-long 121 | print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") 122 | else: 123 | XLM_CHECKPOINT_PATH = sys.argv[2] 124 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 125 | 126 | convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_auto.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-36.pyc 
-------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_bert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_openai.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/configuration_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/file_utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-36.pyc -------------------------------------------------------------------------------- 
/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_albert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_auto.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_bert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_openai.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_roberta.cpython-37.pyc -------------------------------------------------------------------------------- 
/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_utils.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/modeling_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/optimization.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_auto.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-36.pyc -------------------------------------------------------------------------------- 
/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_bert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_openai.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_utils.cpython-37.pyc 
-------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-36.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/models_pytorch/classifier_pytorch/transformers/__pycache__/tokenization_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 44 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", 45 | } 46 | 47 | 48 | class BertConfig(PretrainedConfig): 49 | r""" 50 | :class:`~transformers.BertConfig` is the configuration class to store the configuration of a 51 | `BertModel`. 52 | 53 | 54 | Arguments: 55 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 56 | hidden_size: Size of the encoder layers and the pooler layer. 57 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 58 | num_attention_heads: Number of attention heads for each attention layer in 59 | the Transformer encoder. 60 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 61 | layer in the Transformer encoder. 62 | hidden_act: The non-linear activation function (function or string) in the 63 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 64 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 65 | layers in the embeddings, encoder, and pooler. 66 | attention_probs_dropout_prob: The dropout ratio for the attention 67 | probabilities. 68 | max_position_embeddings: The maximum sequence length that this model might 69 | ever be used with. 
Typically set this to something large just in case 70 | (e.g., 512 or 1024 or 2048). 71 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 72 | `BertModel`. 73 | initializer_range: The sttdev of the truncated_normal_initializer for 74 | initializing all weight matrices. 75 | layer_norm_eps: The epsilon used by LayerNorm. 76 | """ 77 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 78 | 79 | def __init__(self, 80 | vocab_size_or_config_json_file=30522, 81 | hidden_size=768, 82 | num_hidden_layers=12, 83 | num_attention_heads=12, 84 | intermediate_size=3072, 85 | hidden_act="gelu", 86 | hidden_dropout_prob=0.1, 87 | attention_probs_dropout_prob=0.1, 88 | max_position_embeddings=512, 89 | type_vocab_size=2, 90 | initializer_range=0.02, 91 | layer_norm_eps=1e-12, 92 | **kwargs): 93 | super(BertConfig, self).__init__(**kwargs) 94 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 95 | and isinstance(vocab_size_or_config_json_file, unicode)): 96 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 97 | json_config = json.loads(reader.read()) 98 | for key, value in json_config.items(): 99 | self.__dict__[key] = value 100 | elif isinstance(vocab_size_or_config_json_file, int): 101 | self.vocab_size = vocab_size_or_config_json_file 102 | self.hidden_size = hidden_size 103 | self.num_hidden_layers = num_hidden_layers 104 | self.num_attention_heads = num_attention_heads 105 | self.hidden_act = hidden_act 106 | self.intermediate_size = intermediate_size 107 | self.hidden_dropout_prob = hidden_dropout_prob 108 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 109 | self.max_position_embeddings = max_position_embeddings 110 | self.type_vocab_size = type_vocab_size 111 | self.initializer_range = initializer_range 112 | self.layer_norm_eps = layer_norm_eps 113 | else: 114 | raise ValueError("First argument must be either a vocabulary size (int)" 115 | " or the path to a pretrained model config file (str)") 116 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ Salesforce CTRL configuration """ 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} 29 | 30 | class CTRLConfig(PretrainedConfig): 31 | """Configuration class to store the configuration of a `CTRLModel`. 32 | 33 | Args: 34 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. 35 | n_positions: Number of positional embeddings. 36 | n_ctx: Size of the causal mask (usually same as n_positions). 37 | dff: Size of the inner dimension of the FFN. 38 | n_embd: Dimensionality of the embeddings and hidden states. 39 | n_layer: Number of hidden layers in the Transformer encoder. 40 | n_head: Number of attention heads for each attention layer in 41 | the Transformer encoder. 42 | layer_norm_epsilon: epsilon to use in the layer norm layers 43 | resid_pdrop: The dropout probabilitiy for all fully connected 44 | layers in the embeddings, encoder, and pooler. 45 | attn_pdrop: The dropout ratio for the attention 46 | probabilities. 47 | embd_pdrop: The dropout ratio for the embeddings. 48 | initializer_range: The sttdev of the truncated_normal_initializer for 49 | initializing all weight matrices. 50 | """ 51 | pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 52 | 53 | def __init__( 54 | self, 55 | vocab_size_or_config_json_file=246534, 56 | n_positions=256, 57 | n_ctx=256, 58 | n_embd=1280, 59 | dff=8192, 60 | n_layer=48, 61 | n_head=16, 62 | resid_pdrop=0.1, 63 | embd_pdrop=0.1, 64 | attn_pdrop=0.1, 65 | layer_norm_epsilon=1e-6, 66 | initializer_range=0.02, 67 | 68 | num_labels=1, 69 | summary_type='cls_index', 70 | summary_use_proj=True, 71 | summary_activation=None, 72 | summary_proj_to_labels=True, 73 | summary_first_dropout=0.1, 74 | **kwargs 75 | ): 76 | """Constructs CTRLConfig. 77 | 78 | Args: 79 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. 80 | n_positions: Number of positional embeddings. 81 | n_ctx: Size of the causal mask (usually same as n_positions). 82 | dff: Size of the inner dimension of the FFN. 83 | n_embd: Dimensionality of the embeddings and hidden states. 84 | n_layer: Number of hidden layers in the Transformer encoder. 85 | n_head: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | layer_norm_epsilon: epsilon to use in the layer norm layers 88 | resid_pdrop: The dropout probabilitiy for all fully connected 89 | layers in the embeddings, encoder, and pooler. 90 | attn_pdrop: The dropout ratio for the attention 91 | probabilities. 92 | embd_pdrop: The dropout ratio for the embeddings. 93 | initializer_range: The sttdev of the truncated_normal_initializer for 94 | initializing all weight matrices. 
95 | """ 96 | super(CTRLConfig, self).__init__(**kwargs) 97 | 98 | self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 99 | self.n_ctx = n_ctx 100 | self.n_positions = n_positions 101 | self.n_embd = n_embd 102 | self.n_layer = n_layer 103 | self.n_head = n_head 104 | self.dff = dff 105 | self.resid_pdrop = resid_pdrop 106 | self.embd_pdrop = embd_pdrop 107 | self.attn_pdrop = attn_pdrop 108 | self.layer_norm_epsilon = layer_norm_epsilon 109 | self.initializer_range = initializer_range 110 | 111 | self.num_labels = num_labels 112 | self.summary_type = summary_type 113 | self.summary_use_proj = summary_use_proj 114 | self.summary_activation = summary_activation 115 | self.summary_first_dropout = summary_first_dropout 116 | self.summary_proj_to_labels = summary_proj_to_labels 117 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 118 | and isinstance(vocab_size_or_config_json_file, unicode)): 119 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 120 | json_config = json.loads(reader.read()) 121 | for key, value in json_config.items(): 122 | self.__dict__[key] = value 123 | elif not isinstance(vocab_size_or_config_json_file, int): 124 | raise ValueError( 125 | "First argument must be either a vocabulary size (int)" 126 | "or the path to a pretrained model config file (str)" 127 | ) 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 31 | } 32 | 33 | 34 | class DistilBertConfig(PretrainedConfig): 35 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | 37 | def __init__(self, 38 | vocab_size_or_config_json_file=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=False, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4*768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation='gelu', 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs): 53 | super(DistilBertConfig, self).__init__(**kwargs) 54 | 55 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 56 | and isinstance(vocab_size_or_config_json_file, unicode)): 57 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 58 | json_config = json.loads(reader.read()) 59 | for key, value in json_config.items(): 60 | self.__dict__[key] = value 61 | elif isinstance(vocab_size_or_config_json_file, int): 62 | self.vocab_size = vocab_size_or_config_json_file 63 | self.max_position_embeddings = max_position_embeddings 64 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 65 | self.n_layers = n_layers 66 | self.n_heads = n_heads 67 | self.dim = dim 68 | self.hidden_dim = hidden_dim 69 | self.dropout = dropout 70 | self.attention_dropout = attention_dropout 71 | self.activation = activation 72 | self.initializer_range = initializer_range 73 | self.tie_weights_ = tie_weights_ 74 | self.qa_dropout = qa_dropout 75 | self.seq_classif_dropout = seq_classif_dropout 76 | else: 77 | raise ValueError("First argument must be either a vocabulary size (int)" 78 | " or the path to a pretrained model config file (str)") 79 | @property 80 | def hidden_size(self): 81 | return self.dim 82 | 83 | @property 84 | def num_attention_heads(self): 85 | return self.n_heads 86 | 87 | @property 88 | def num_hidden_layers(self): 89 | return self.n_layers 90 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT-2 configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", 30 | "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", 31 | "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", 32 | "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",} 33 | 34 | class GPT2Config(PretrainedConfig): 35 | """Configuration class to store the configuration of a `GPT2Model`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 39 | n_positions: Number of positional embeddings. 40 | n_ctx: Size of the causal mask (usually same as n_positions). 41 | n_embd: Dimensionality of the embeddings and hidden states. 42 | n_layer: Number of hidden layers in the Transformer encoder. 43 | n_head: Number of attention heads for each attention layer in 44 | the Transformer encoder. 45 | layer_norm_epsilon: epsilon to use in the layer norm layers 46 | resid_pdrop: The dropout probabilitiy for all fully connected 47 | layers in the embeddings, encoder, and pooler. 48 | attn_pdrop: The dropout ratio for the attention 49 | probabilities. 50 | embd_pdrop: The dropout ratio for the embeddings. 51 | initializer_range: The sttdev of the truncated_normal_initializer for 52 | initializing all weight matrices. 53 | """ 54 | pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 55 | 56 | def __init__( 57 | self, 58 | vocab_size_or_config_json_file=50257, 59 | n_positions=1024, 60 | n_ctx=1024, 61 | n_embd=768, 62 | n_layer=12, 63 | n_head=12, 64 | resid_pdrop=0.1, 65 | embd_pdrop=0.1, 66 | attn_pdrop=0.1, 67 | layer_norm_epsilon=1e-5, 68 | initializer_range=0.02, 69 | 70 | num_labels=1, 71 | summary_type='cls_index', 72 | summary_use_proj=True, 73 | summary_activation=None, 74 | summary_proj_to_labels=True, 75 | summary_first_dropout=0.1, 76 | **kwargs 77 | ): 78 | """Constructs GPT2Config. 79 | 80 | Args: 81 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 82 | n_positions: Number of positional embeddings. 83 | n_ctx: Size of the causal mask (usually same as n_positions). 84 | n_embd: Dimensionality of the embeddings and hidden states. 85 | n_layer: Number of hidden layers in the Transformer encoder. 86 | n_head: Number of attention heads for each attention layer in 87 | the Transformer encoder. 88 | layer_norm_epsilon: epsilon to use in the layer norm layers 89 | resid_pdrop: The dropout probabilitiy for all fully connected 90 | layers in the embeddings, encoder, and pooler. 91 | attn_pdrop: The dropout ratio for the attention 92 | probabilities. 93 | embd_pdrop: The dropout ratio for the embeddings. 94 | initializer_range: The sttdev of the truncated_normal_initializer for 95 | initializing all weight matrices. 
96 | """ 97 | super(GPT2Config, self).__init__(**kwargs) 98 | 99 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 100 | and isinstance(vocab_size_or_config_json_file, unicode)): 101 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 102 | json_config = json.loads(reader.read()) 103 | for key, value in json_config.items(): 104 | self.__dict__[key] = value 105 | elif isinstance(vocab_size_or_config_json_file, int): 106 | self.vocab_size = vocab_size_or_config_json_file 107 | self.n_ctx = n_ctx 108 | self.n_positions = n_positions 109 | self.n_embd = n_embd 110 | self.n_layer = n_layer 111 | self.n_head = n_head 112 | self.resid_pdrop = resid_pdrop 113 | self.embd_pdrop = embd_pdrop 114 | self.attn_pdrop = attn_pdrop 115 | self.layer_norm_epsilon = layer_norm_epsilon 116 | self.initializer_range = initializer_range 117 | 118 | self.num_labels = num_labels 119 | self.summary_type = summary_type 120 | self.summary_use_proj = summary_use_proj 121 | self.summary_activation = summary_activation 122 | self.summary_first_dropout = summary_first_dropout 123 | self.summary_proj_to_labels = summary_proj_to_labels 124 | else: 125 | raise ValueError( 126 | "First argument must be either a vocabulary size (int)" 127 | "or the path to a pretrained model config file (str)" 128 | ) 129 | 130 | @property 131 | def max_position_embeddings(self): 132 | return self.n_positions 133 | 134 | @property 135 | def hidden_size(self): 136 | return self.n_embd 137 | 138 | @property 139 | def num_attention_heads(self): 140 | return self.n_head 141 | 142 | @property 143 | def num_hidden_layers(self): 144 | return self.n_layer 145 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" 31 | } 32 | 33 | class OpenAIGPTConfig(PretrainedConfig): 34 | """ 35 | Configuration class to store the configuration of a `OpenAIGPTModel`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. 39 | n_positions: Number of positional embeddings. 40 | n_ctx: Size of the causal mask (usually same as n_positions). 
41 | n_embd: Dimensionality of the embeddings and hidden states. 42 | n_layer: Number of hidden layers in the Transformer encoder. 43 | n_head: Number of attention heads for each attention layer in 44 | the Transformer encoder. 45 | afn: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | resid_pdrop: The dropout probabilitiy for all fully connected 48 | layers in the embeddings, encoder, and pooler. 49 | attn_pdrop: The dropout ratio for the attention 50 | probabilities. 51 | embd_pdrop: The dropout ratio for the embeddings. 52 | layer_norm_epsilon: epsilon to use in the layer norm layers 53 | initializer_range: The sttdev of the truncated_normal_initializer for 54 | initializing all weight matrices. 55 | predict_special_tokens: should we predict special tokens (when the model has a LM head) 56 | """ 57 | pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 58 | 59 | def __init__( 60 | self, 61 | vocab_size_or_config_json_file=40478, 62 | n_positions=512, 63 | n_ctx=512, 64 | n_embd=768, 65 | n_layer=12, 66 | n_head=12, 67 | afn="gelu", 68 | resid_pdrop=0.1, 69 | embd_pdrop=0.1, 70 | attn_pdrop=0.1, 71 | layer_norm_epsilon=1e-5, 72 | initializer_range=0.02, 73 | predict_special_tokens=True, 74 | 75 | num_labels=1, 76 | summary_type='cls_index', 77 | summary_use_proj=True, 78 | summary_activation=None, 79 | summary_proj_to_labels=True, 80 | summary_first_dropout=0.1, 81 | **kwargs 82 | ): 83 | """Constructs OpenAIGPTConfig. 84 | """ 85 | super(OpenAIGPTConfig, self).__init__(**kwargs) 86 | 87 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 88 | and isinstance(vocab_size_or_config_json_file, unicode)): 89 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 90 | json_config = json.loads(reader.read()) 91 | for key, value in json_config.items(): 92 | self.__dict__[key] = value 93 | elif isinstance(vocab_size_or_config_json_file, int): 94 | self.vocab_size = vocab_size_or_config_json_file 95 | self.n_ctx = n_ctx 96 | self.n_positions = n_positions 97 | self.n_embd = n_embd 98 | self.n_layer = n_layer 99 | self.n_head = n_head 100 | self.afn = afn 101 | self.resid_pdrop = resid_pdrop 102 | self.embd_pdrop = embd_pdrop 103 | self.attn_pdrop = attn_pdrop 104 | self.layer_norm_epsilon = layer_norm_epsilon 105 | self.initializer_range = initializer_range 106 | self.predict_special_tokens = predict_special_tokens 107 | 108 | self.num_labels = num_labels 109 | self.summary_type = summary_type 110 | self.summary_use_proj = summary_use_proj 111 | self.summary_activation = summary_activation 112 | self.summary_first_dropout = summary_first_dropout 113 | self.summary_proj_to_labels = summary_proj_to_labels 114 | else: 115 | raise ValueError( 116 | "First argument must be either a vocabulary size (int)" 117 | "or the path to a pretrained model config file (str)" 118 | ) 119 | 120 | @property 121 | def max_position_embeddings(self): 122 | return self.n_positions 123 | 124 | @property 125 | def hidden_size(self): 126 | return self.n_embd 127 | 128 | @property 129 | def num_attention_heads(self): 130 | return self.n_head 131 | 132 | @property 133 | def num_hidden_layers(self): 134 | return self.n_layer 135 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_roberta.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Transformer XL configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", 31 | } 32 | 33 | class TransfoXLConfig(PretrainedConfig): 34 | """Configuration class to store the configuration of a `TransfoXLModel`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. 38 | cutoffs: cutoffs for the adaptive softmax 39 | d_model: Dimensionality of the model's hidden states. 
40 | d_embed: Dimensionality of the embeddings 41 | d_head: Dimensionality of the model's heads. 42 | div_val: divident value for adapative input and softmax 43 | pre_lnorm: apply LayerNorm to the input instead of the output 44 | d_inner: Inner dimension in FF 45 | n_layer: Number of hidden layers in the Transformer encoder. 46 | n_head: Number of attention heads for each attention layer in 47 | the Transformer encoder. 48 | tgt_len: number of tokens to predict 49 | ext_len: length of the extended context 50 | mem_len: length of the retained previous heads 51 | same_length: use the same attn length for all tokens 52 | proj_share_all_but_first: True to share all but first projs, False not to share. 53 | attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 54 | clamp_len: use the same pos embeddings after clamp_len 55 | sample_softmax: number of samples in sampled softmax 56 | adaptive: use adaptive softmax 57 | tie_weight: tie the word embedding and softmax weights 58 | dropout: The dropout probabilitiy for all fully connected 59 | layers in the embeddings, encoder, and pooler. 60 | dropatt: The dropout ratio for the attention probabilities. 61 | untie_r: untie relative position biases 62 | embd_pdrop: The dropout ratio for the embeddings. 63 | init: parameter initializer to use 64 | init_range: parameters initialized by U(-init_range, init_range). 65 | proj_init_std: parameters initialized by N(0, init_std) 66 | init_std: parameters initialized by N(0, init_std) 67 | """ 68 | pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 69 | 70 | def __init__(self, 71 | vocab_size_or_config_json_file=267735, 72 | cutoffs=[20000, 40000, 200000], 73 | d_model=1024, 74 | d_embed=1024, 75 | n_head=16, 76 | d_head=64, 77 | d_inner=4096, 78 | div_val=4, 79 | pre_lnorm=False, 80 | n_layer=18, 81 | tgt_len=128, 82 | ext_len=0, 83 | mem_len=1600, 84 | clamp_len=1000, 85 | same_length=True, 86 | proj_share_all_but_first=True, 87 | attn_type=0, 88 | sample_softmax=-1, 89 | adaptive=True, 90 | tie_weight=True, 91 | dropout=0.1, 92 | dropatt=0.0, 93 | untie_r=True, 94 | init="normal", 95 | init_range=0.01, 96 | proj_init_std=0.01, 97 | init_std=0.02, 98 | layer_norm_epsilon=1e-5, 99 | **kwargs): 100 | """Constructs TransfoXLConfig. 
101 | """ 102 | super(TransfoXLConfig, self).__init__(**kwargs) 103 | self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 104 | self.cutoffs = [] 105 | self.cutoffs.extend(cutoffs) 106 | self.tie_weight = tie_weight 107 | if proj_share_all_but_first: 108 | self.tie_projs = [False] + [True] * len(self.cutoffs) 109 | else: 110 | self.tie_projs = [False] + [False] * len(self.cutoffs) 111 | self.d_model = d_model 112 | self.d_embed = d_embed 113 | self.d_head = d_head 114 | self.d_inner = d_inner 115 | self.div_val = div_val 116 | self.pre_lnorm = pre_lnorm 117 | self.n_layer = n_layer 118 | self.n_head = n_head 119 | self.tgt_len = tgt_len 120 | self.ext_len = ext_len 121 | self.mem_len = mem_len 122 | self.same_length = same_length 123 | self.attn_type = attn_type 124 | self.clamp_len = clamp_len 125 | self.sample_softmax = sample_softmax 126 | self.adaptive = adaptive 127 | self.dropout = dropout 128 | self.dropatt = dropatt 129 | self.untie_r = untie_r 130 | self.init = init 131 | self.init_range = init_range 132 | self.proj_init_std = proj_init_std 133 | self.init_std = init_std 134 | self.layer_norm_epsilon = layer_norm_epsilon 135 | 136 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 137 | and isinstance(vocab_size_or_config_json_file, unicode)): 138 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 139 | json_config = json.loads(reader.read()) 140 | for key, value in json_config.items(): 141 | self.__dict__[key] = value 142 | elif not isinstance(vocab_size_or_config_json_file, int): 143 | raise ValueError("First argument must be either a vocabulary size (int)" 144 | " or the path to a pretrained model config file (str)") 145 | 146 | @property 147 | def max_position_embeddings(self): 148 | return self.tgt_len + self.ext_len + self.mem_len 149 | 150 | @property 151 | def vocab_size(self): 152 | return self.n_token 153 | 154 | @vocab_size.setter 155 | def vocab_size(self, value): 156 | self.n_token = value 157 | 158 | @property 159 | def hidden_size(self): 160 | return self.d_model 161 | 162 | @property 163 | def num_attention_heads(self): 164 | return self.n_head 165 | 166 | @property 167 | def num_hidden_layers(self): 168 | return self.n_layer 169 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLNet configuration """ 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", 30 | 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", 31 | } 32 | 33 | 34 | class XLNetConfig(PretrainedConfig): 35 | """Configuration class to store the configuration of a ``XLNetModel``. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. 39 | d_model: Size of the encoder layers and the pooler layer. 40 | n_layer: Number of hidden layers in the Transformer encoder. 41 | n_head: Number of attention heads for each attention layer in 42 | the Transformer encoder. 43 | d_inner: The size of the "intermediate" (i.e., feed-forward) 44 | layer in the Transformer encoder. 45 | ff_activation: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | untie_r: untie relative position biases 48 | attn_type: 'bi' for XLNet, 'uni' for Transformer-XL 49 | 50 | dropout: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | initializer_range: The sttdev of the truncated_normal_initializer for 53 | initializing all weight matrices. 54 | layer_norm_eps: The epsilon used by LayerNorm. 55 | 56 | dropout: float, dropout rate. 57 | init: str, the initialization scheme, either "normal" or "uniform". 58 | init_range: float, initialize the parameters with a uniform distribution 59 | in [-init_range, init_range]. Only effective when init="uniform". 60 | init_std: float, initialize the parameters with a normal distribution 61 | with mean 0 and stddev init_std. Only effective when init="normal". 62 | mem_len: int, the number of tokens to cache. 63 | reuse_len: int, the number of tokens in the currect batch to be cached 64 | and reused in the future. 65 | bi_data: bool, whether to use bidirectional input pipeline. 66 | Usually set to True during pretraining and False during finetuning. 67 | clamp_len: int, clamp all relative distances larger than clamp_len. 68 | -1 means no clamping. 69 | same_length: bool, whether to use the same attention length for each token. 70 | finetuning_task: name of the glue task on which the model was fine-tuned if any 71 | """ 72 | pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 73 | 74 | def __init__(self, 75 | vocab_size_or_config_json_file=32000, 76 | d_model=1024, 77 | n_layer=24, 78 | n_head=16, 79 | d_inner=4096, 80 | max_position_embeddings=512, 81 | ff_activation="gelu", 82 | untie_r=True, 83 | attn_type="bi", 84 | 85 | initializer_range=0.02, 86 | layer_norm_eps=1e-12, 87 | 88 | dropout=0.1, 89 | mem_len=None, 90 | reuse_len=None, 91 | bi_data=False, 92 | clamp_len=-1, 93 | same_length=False, 94 | 95 | finetuning_task=None, 96 | num_labels=2, 97 | summary_type='last', 98 | summary_use_proj=True, 99 | summary_activation='tanh', 100 | summary_last_dropout=0.1, 101 | start_n_top=5, 102 | end_n_top=5, 103 | **kwargs): 104 | """Constructs XLNetConfig. 
 105 | """ 106 | super(XLNetConfig, self).__init__(**kwargs) 107 | 108 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 109 | and isinstance(vocab_size_or_config_json_file, unicode)): 110 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 111 | json_config = json.loads(reader.read()) 112 | for key, value in json_config.items(): 113 | self.__dict__[key] = value 114 | elif isinstance(vocab_size_or_config_json_file, int): 115 | self.n_token = vocab_size_or_config_json_file 116 | self.d_model = d_model 117 | self.n_layer = n_layer 118 | self.n_head = n_head 119 | assert d_model % n_head == 0 120 | self.d_head = d_model // n_head 121 | self.ff_activation = ff_activation 122 | self.d_inner = d_inner 123 | self.untie_r = untie_r 124 | self.attn_type = attn_type 125 | 126 | self.initializer_range = initializer_range 127 | self.layer_norm_eps = layer_norm_eps 128 | 129 | self.dropout = dropout 130 | self.mem_len = mem_len 131 | self.reuse_len = reuse_len 132 | self.bi_data = bi_data 133 | self.clamp_len = clamp_len 134 | self.same_length = same_length 135 | 136 | self.finetuning_task = finetuning_task 137 | self.num_labels = num_labels 138 | self.summary_type = summary_type 139 | self.summary_use_proj = summary_use_proj 140 | self.summary_activation = summary_activation 141 | self.summary_last_dropout = summary_last_dropout 142 | self.start_n_top = start_n_top 143 | self.end_n_top = end_n_top 144 | else: 145 | raise ValueError("First argument must be either a vocabulary size (int)" 146 | " or the path to a pretrained model config file (str)") 147 | 148 | @property 149 | def max_position_embeddings(self): 150 | return -1 151 | 152 | @property 153 | def vocab_size(self): 154 | return self.n_token 155 | 156 | @vocab_size.setter 157 | def vocab_size(self, value): 158 | self.n_token = value 159 | 160 | @property 161 | def hidden_size(self): 162 | return self.d_model 163 | 164 | @property 165 | def num_attention_heads(self): 166 | return self.n_head 167 | 168 | @property 169 | def num_hidden_layers(self): 170 | return self.n_layer 171 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_bert import BertTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | } 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | 'distilbert-base-uncased': 512, 41 | 'distilbert-base-uncased-distilled-squad': 512, 42 | } 43 | 44 | 45 | class DistilBertTokenizer(BertTokenizer): 46 | r""" 47 | Constructs a DistilBertTokenizer. 48 | :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 49 | 50 | Args: 51 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 52 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 53 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 54 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 55 | minimum of this value (if specified) and the underlying BERT model's sequence length. 56 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 57 | do_wordpiece_only=False 58 | """ 59 | 60 | vocab_files_names = VOCAB_FILES_NAMES 61 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 62 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 63 | -------------------------------------------------------------------------------- /baselines/models_pytorch/classifier_pytorch/transformers/tokenization_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from .tokenization_gpt2 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | }, 50 | 'merges_file': 51 | { 52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingco.co/bert/roberta-large-mnli-merges.txt".replace("huggingco.co", "huggingface.co"), 55 | }, 56 | } 57 | 58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 59 | 'roberta-base': 512, 60 | 'roberta-large': 512, 61 | 'roberta-large-mnli': 512, 62 | } 63 | 64 | 65 | class RobertaTokenizer(GPT2Tokenizer): 66 | """ 67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 68 | - Byte-level Byte-Pair-Encoding 69 | - Requires a space to start the input string => the encoding methods should be called with the 70 | ``add_prefix_space`` flag set to ``True``. 71 | Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve 72 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` 73 | """ 74 | vocab_files_names = VOCAB_FILES_NAMES 75 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 76 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 77 | 78 | def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", 79 | cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): 80 | super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, 81 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, 82 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, 83 | mask_token=mask_token, **kwargs) 84 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens 85 | self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens 86 | 87 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 88 | """ 89 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 90 | by concatenating and adding special tokens. 91 | A RoBERTa sequence has the following format: 92 | single sequence: <s> X </s> 93 | pair of sequences: <s> A </s></s> B </s> 94 | """ 95 | if token_ids_1 is None: 96 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 97 | cls = [self.cls_token_id] 98 | sep = [self.sep_token_id] 99 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 100 | 101 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 102 | """ 103 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 104 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
105 | 106 | Args: 107 | token_ids_0: list of ids (must not contain special tokens) 108 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids 109 | for sequence pairs 110 | already_has_special_tokens: (default False) Set to True if the token list is already formatted with 111 | special tokens for the model 112 | 113 | Returns: 114 | A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 115 | """ 116 | if already_has_special_tokens: 117 | if token_ids_1 is not None: 118 | raise ValueError("You should not supply a second sequence if the provided sequence of " 119 | "ids is already formatted with special tokens for the model.") 120 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 121 | 122 | if token_ids_1 is None: 123 | return [1] + ([0] * len(token_ids_0)) + [1] 124 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 125 | 126 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 127 | """ 128 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 129 | A RoBERTa sequence pair mask has the following format: 130 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 131 | | first sequence | second sequence 132 | 133 | if token_ids_1 is None, only returns the first portion of the mask (0's). 134 | """ 135 | sep = [self.sep_token_id] 136 | cls = [self.cls_token_id] 137 | 138 | if token_ids_1 is None: 139 | return len(cls + token_ids_0 + sep) * [0] 140 | return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] 141 | -------------------------------------------------------------------------------- /baselines/multi/README.md: -------------------------------------------------------------------------------- 1 | # 多策略方法汇总 2 | 3 | |组合编号| 策略名称(包含策略id) | 亦可用于英文? | 亦可用于CV? | 借助人力? | CIC (F1 macro) | 4 | |---|---|---|---|---|---| 5 | |0| 基线(2,3,4) | yes | | | 0.7849 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /baselines/multi/simple_baseline/README.md: -------------------------------------------------------------------------------- 1 | ### 算法描述 2 | 3 | 这里我们组合了如下策略: 4 | 5 | 1. 通过训练一个分类模型根据预测的熵找出数据中最有可能标签错误的样本,并丢弃; 6 | 2. 使用数据增强提升数据量,即对输入文本的增强; 7 | 3. 
将标签定义增强后添加到训练集中增加数据量。比如标签定义买家抱怨商品了;标签定义增强->买家抱怨商品涨价了。 8 | 9 | -------------------------------------------------------------------------------- /baselines/multi/simple_baseline/simple_baseline.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from textda.data_expansion import data_expansion 3 | import os 4 | import sys 5 | path = os.path.split(os.getcwd()) 6 | sys.path.append(path) 7 | 8 | from dckit import read_datasets, random_split_data, evaluate 9 | from baselines.single.data_aug.data_aug import data_aug 10 | from baselines.single.def_aug.def_aug import def_aug 11 | from baselines.single.delete_noise.delete_noise import delete_noise 12 | 13 | 14 | def simple_baseline(data, use_delete=False, use_aug=False, use_def=False): 15 | if use_delete: 16 | data = delete_noise(data) 17 | if use_aug: 18 | data = data_aug(data) 19 | if use_def: 20 | data = def_aug(data) 21 | return data 22 | 23 | 24 | def main(): 25 | data = read_datasets() 26 | data = simple_baseline(data) 27 | random_split_data(data) 28 | f1 = evaluate() 29 | print('Macro-F1=', f1) 30 | return f1 31 | 32 | 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /baselines/single/README.md: -------------------------------------------------------------------------------- 1 | # 单策略方法汇总 2 | 3 | |策略编号| 策略名称 | 亦可用于英文? | 亦可用于CV? | 借助人力? | CIC Marco-F1 | TNEWS Marco-F1 | IFLYTEK Marco-F1 |AFQMC Marco-F1| 4 | |---|---|---|---|---|---|---|---|---| 5 | |0| 基线 | yes | yes | | 0.7278 |0.4683|0.3097|0.5904| 6 | |1| 人类表现 (Accuracy) | | | | 0.8740 |0.71|0.66|0.81| 7 | |2| [数据增强](data_aug) | yes | yes | | 0.7462 |0.4805|0.4015|| 8 | |3| [噪声标签删除](delete_noise) | yes | yes | | 0.7332 |0.4934|0.2941|| 9 | |4| [定义增强](definition_aug) | yes | | | 0.7822 |0.4570|0.3371|| 10 | -------------------------------------------------------------------------------- /baselines/single/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/__init__.py -------------------------------------------------------------------------------- /baselines/single/data_aug/README.md: -------------------------------------------------------------------------------- 1 | # 数据增强 2 | 3 | 对输入数据进行增强 4 | 5 | # 参数说明 6 | 7 | 这里只用了增强次数作为参数 8 | 9 | # 参数选择实验 10 | 11 | |增强次数 | 0 | 1 | 3 | 5 |10| 12 | |---|---|---|---|---|---| 13 | | Marco-F1| .7278 | .7388 | .7462 | .7363 | .6694 | 14 | 15 | 16 | # 可能问题 17 | 这里的数据增强部分用了[synonyms](https://github.com/chatopera/Synonyms),其中资源下载可能存在问题。如果存在问题请按照如下设置: 18 | ```bash 19 | export SYNONYMS_WORD2VEC_BIN_URL_ZH_CN=https://gitee.com/chatopera/cskefu/attach_files/610602/download/words.vector.gz 20 | pip install -U synonyms 21 | python -c "import synonyms" # download word vectors file 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /baselines/single/data_aug/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/data_aug/__init__.py -------------------------------------------------------------------------------- /baselines/single/data_aug/data_aug.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from tqdm import tqdm 3 | from 
textda.data_expansion import data_expansion 4 | import sys 5 | sys.path.append('../../../') 6 | sys.path.append('../../../dckit') 7 | from dckit import read_datasets, random_split_data, evaluate 8 | import swifter 9 | import pandas as pd 10 | import numpy as np 11 | 12 | 13 | def aug_function(sentence, alpha_ri=0.1, alpha_rs=0, num_aug=3): 14 | aug_list = data_expansion(sentence, alpha_ri, alpha_rs, p_rd=0.2, num_aug=num_aug) 15 | if len(aug_list) != num_aug: 16 | l = len(aug_list) 17 | if l < num_aug: 18 | for i in range(num_aug-l): 19 | aug_list.append(None) 20 | else: 21 | aug_list = aug_list[:num_aug] 22 | return aug_list 23 | 24 | 25 | def data_aug(data, num_aug=3): 26 | json_data = data['json'] 27 | df = pd.DataFrame.from_records(json_data) 28 | df.columns = json_data[0].keys() 29 | aug_lists = df['sentence'].swifter.apply(aug_function) 30 | aug_lens = [len(aug_list) for aug_list in aug_lists] 31 | flatten_list = [j for sub in aug_lists for j in sub] 32 | newdf = pd.DataFrame(np.repeat(df.values, num_aug, axis=0), columns=df.columns) 33 | newdf['sentence'] = flatten_list 34 | # remove none 35 | newdf.dropna(inplace=True) 36 | data["json"] = newdf.to_dict(orient='records') 37 | return data 38 | 39 | 40 | 41 | def main(): 42 | data = read_datasets() 43 | data = data_aug(data) 44 | random_split_data(data) 45 | f1 = evaluate() 46 | print('Macro-F1=', f1) 47 | return f1 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /baselines/single/data_aug/parallel_textda.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor, as_completed 2 | from textda.data_expansion import data_expansion 3 | 4 | 5 | def expand(sentence, label, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.2, num_aug=9): 6 | res = data_expansion(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, 7 | p_rd=p_rd, num_aug=num_aug) 8 | return res, [label] * len(res) 9 | 10 | 11 | def parallel_expansion(sentences, labels, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.2, num_aug=9, workers=16): 12 | ''' 13 | if you set alpha_ri and alpha_rs is 0 that means use linear classifier for it, and insensitive to word location 14 | :param sentences: input sentence list 15 | :param labels: input label list 16 | :param alpha_sr: Replace synonym control param. bigger means more words are Replace 17 | :param alpha_ri: Random insert. bigger means more words are Insert 18 | :param alpha_rs: Random swap. bigger means more words are swap 19 | :param p_rd: Random delete. 
bigger means more words are deleted 20 | :param num_aug: How many times do you repeat each method 21 | :param workers: Number of process 22 | :return: 23 | ''' 24 | assert len(sentences) == len(labels) 25 | res_sentences = [] 26 | res_labels = [] 27 | with ProcessPoolExecutor(max_workers=workers) as t: 28 | obj_list = [] 29 | for idx, sentence in enumerate(sentences): 30 | obj = t.submit(expand, sentence, labels[idx], alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, 31 | p_rd=p_rd, num_aug=num_aug) 32 | obj_list.append(obj) 33 | 34 | for future in as_completed(obj_list): 35 | res = future.result() 36 | res_sentences.extend(res[0]) 37 | res_labels.extend(res[1]) 38 | return res_sentences, res_labels 39 | -------------------------------------------------------------------------------- /baselines/single/data_mixup/README.md: -------------------------------------------------------------------------------- 1 | # 数据组合 2 | 3 | 对输入数据进行组合,生成复合数据,减少其中噪声的影响。 4 | 比如对于句子:“我希望换一个地址”和“你们这怎么换地址”可以合并成“我希望换一个地址。你们这怎么换地址” 5 | 6 | # 参数说明 7 | 8 | 这里只用了复合次数作为参数 9 | 10 | # 参数选择实验 11 | 12 | 分析显示 13 | |复合生成次数 | 0 | 2 | 14 | |---|---|---| 15 | | Marco-F1| .7278 | .7376 | 16 | -------------------------------------------------------------------------------- /baselines/single/data_mixup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/data_mixup/__init__.py -------------------------------------------------------------------------------- /baselines/single/data_mixup/data_mixup.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from dckit import read_datasets, random_split_data, evaluate 3 | import random 4 | import numpy as np 5 | 6 | 7 | def data_mix(data, num_mix=3): 8 | json_data = data['json'] 9 | new_json = json_data 10 | # 按类聚合数据 11 | sentence_by_class = {} 12 | label_desc_map = {} 13 | for idx, tmp in enumerate(tqdm(json_data)): 14 | if tmp['label'] not in label_desc_map: 15 | label_desc_map[tmp['label']] = tmp['label_des'] 16 | if tmp['label'] not in sentence_by_class: 17 | sentence_by_class[tmp['label']] = [] 18 | sentence_by_class[tmp['label']].append(tmp['sentence']) 19 | idx = 0 20 | for classes, sentences in tqdm(sentence_by_class.items()): 21 | for _ in range(len(json_data)//len(data['info'])): 22 | random.shuffle(sentences) 23 | sentence = '。'.join(sentences[:num_mix]) 24 | dic = {'id': idx, 'sentence': sentence, 'label': classes, 'label_des': label_desc_map[classes]} 25 | idx += 1 26 | new_json.append(dic) 27 | data['json'] = new_json 28 | return data 29 | 30 | 31 | def main(): 32 | res = [] 33 | for i in range(5): 34 | data = read_datasets() 35 | data = data_mix(data) 36 | random_split_data(data, seed=i) 37 | f1 = evaluate() 38 | res.append(f1) 39 | print('Macro-F1=', np.mean(res), np.std(res)) 40 | return f1 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /baselines/single/def_aug/README.md: -------------------------------------------------------------------------------- 1 | # 标签定义增强 2 | 3 | 将标签定义加入进行增强 4 | 5 | # 参数说明 6 | 7 | 这里只用了增强次数作为参数 8 | 9 | # 参数选择实验 10 | 11 | |增强次数 | 0 | 1 | 3 | 5 |10| 12 | |---|---|---|---|---|---| 13 | | Marco-F1| .7278 | .7388 | .7462 | .7363 | .6694 | 14 | 15 | # 可能问题 16 | 
这里的数据增强部分用了[synonyms](https://github.com/chatopera/Synonyms),其中资源下载可能存在问题。如果存在问题请按照如下设置: 17 | ```bash 18 | export SYNONYMS_WORD2VEC_BIN_URL_ZH_CN=https://gitee.com/chatopera/cskefu/attach_files/610602/download/words.vector.gz 19 | pip install -U synonyms 20 | python -c "import synonyms" # download word vectors file 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /baselines/single/def_aug/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/def_aug/__init__.py -------------------------------------------------------------------------------- /baselines/single/def_aug/def_aug.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from tqdm import tqdm 4 | from textda.data_expansion import data_expansion 5 | from dckit import read_datasets, random_split_data, evaluate 6 | 7 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 8 | 9 | 10 | def def_aug(data, num_aug=50): 11 | json_data = data['json'] 12 | label_info = data['info'] 13 | for idx, line in label_info.items(): 14 | if num_aug > 0: 15 | sen_list = data_expansion(line, alpha_ri=0.2, alpha_rs=0, num_aug=num_aug) 16 | if num_aug == 0: 17 | sen_list = [line] 18 | for sen in sen_list: 19 | tmp = {} 20 | tmp['id'] = -1 21 | tmp['sentence'] = sen 22 | tmp['label_des'] = line 23 | tmp['label'] = idx 24 | json_data.append(tmp) 25 | 26 | data['json'] = json_data 27 | return data 28 | 29 | 30 | def main(): 31 | data = read_datasets() 32 | data = def_aug(data) 33 | random_split_data(data) 34 | f1 = evaluate() 35 | print('Macro-F1=', f1) 36 | return f1 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /baselines/single/delete_noise/README.md: -------------------------------------------------------------------------------- 1 | # 噪声删除 2 | 3 | 交叉验证训练多个模型,根据熵判断最有可能错误的 4 | 5 | # 参数说明 6 | 7 | 这里只用了删除次数作为参数 8 | 9 | # 参数选择实验 10 | 11 | |增强次数 | 0 | 1 | 3 | 5 |10| 12 | |---|---|---|---|---|---| 13 | | Marco-F1| .7278 | .7388 | .7462 | .7363 | .6694 | 14 | -------------------------------------------------------------------------------- /baselines/single/delete_noise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/delete_noise/__init__.py -------------------------------------------------------------------------------- /baselines/single/delete_noise/classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICE"] = '1' 3 | from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer 4 | import torch 5 | import numpy as np 6 | from datasets import load_metric 7 | from sklearn.model_selection import StratifiedKFold # StratifiedKFold划分数据集的原理:划分后的训练集和验证集中类别分布尽量和原数据集一样 8 | 9 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 10 | from baselines.single.data_aug.parallel_textda import parallel_expansion 11 | os.environ["TOKENIZERS_PARALLELISM"] = 'false' 12 | PRETRAIN = 'hfl/rbtl3' # 加载的预训练模型的名称 13 | metric = load_metric("f1") # 使用f1 score作为指标 14 | 15 | 16 | # 计算标签与预测值在给定的指标上的效果 17 | def 
compute_metrics(eval_pred): 18 | predictions, labels = eval_pred 19 | predictions = np.argmax(predictions, axis=1) 20 | return metric.compute(predictions=predictions, references=labels, average='macro') 21 | 22 | 23 | class MyDataset(torch.utils.data.Dataset): 24 | def __init__(self, encodings, labels): 25 | self.encodings = encodings 26 | self.labels = labels 27 | 28 | def __getitem__(self, idx): 29 | item = {key: torch.tensor(val[idx]) 30 | for key, val in self.encodings.items()} 31 | item['labels'] = torch.tensor(self.labels[idx]) 32 | return item 33 | 34 | def __len__(self): 35 | return len(self.labels) 36 | 37 | 38 | def get_prediction(data): 39 | """ 40 | 训练一个模型,得到数据点上的标签预测: 41 | 1) 加载数据; 42 | 2)使用K折交叉验证训练,并在验证集上做预测; 43 | 3) 合并交叉验证的结果,并得到整个数据集上模型的预测的概率分布 44 | train a model to get estimation of each data point 45 | """ 46 | # 1、加载所有数据、标签到列表 all_text, all_label,all_id 47 | all_text, all_label, all_id = [], [], [] 48 | for idx, line in enumerate(data['json']): 49 | all_text.append(line['sentence']) 50 | all_label.append(int(line['label'])) 51 | all_id.append(idx) 52 | # 加载标签定义增强后的数据 53 | # label_data.json--->{"id": -1, "sentence": "买家抱怨商品了", "label_des": "买家抱怨商品涨价了\n", "label": 0} 54 | label_text, label_label = [], [] 55 | # for line in open('../../datasets/cic/label_data.json', 'r', encoding='utf-8'): 56 | # label_text.append(json.loads(line)['sentence']) 57 | # label_label.append(int(json.loads(line)['label'])) 58 | 59 | # 2、使用K折交叉验证训练,并在验证集上做预测:遍历每一折得到训练集和验证子集、数据增强、设置训练参数和数据进行训练、在验证集上进行预测 60 | dev_out = {} # 带索引(index)的验证子集的列表 61 | dev_index = {} # 带索引(index)的验证集的列表 62 | kf = StratifiedKFold(n_splits=6, shuffle=True) # StratifiedKFold划分数据集的原理:划分后的训练集和验证集中类别分布尽量和原数据集一样 63 | # kf.get_n_splits(all_text, all_label) 64 | for kf_id, (train_index, test_index) in enumerate(kf.split(all_text, all_label)): 65 | # 2.1 得到训练和验证子集 66 | # kf_id:第几折;train_index, test_index这一折的训练、验证集。 67 | train_text = [all_text[i] for i in train_index][:] + label_text # 训练集的文本 68 | train_label = [all_label[i] for i in train_index][:] + label_label # 训练集的标签 69 | dev_text = [all_text[i] for i in test_index] 70 | dev_label = [all_label[i] for i in test_index] 71 | dev_index[kf_id] = test_index 72 | 73 | # 2.2 对训练数据进行数据扩增 74 | # new_train_text = [] 75 | # new_train_label = [] 76 | # for idx, tmp_text in enumerate(train_text): 77 | # sen_list = data_expansion(tmp_text, alpha_ri=0.1, alpha_rs=0, num_aug=5) 78 | # new_train_text.extend(sen_list) 79 | # new_train_label.extend([train_label[idx]] * len(sen_list)) 80 | # 81 | # train_text = new_train_text 82 | # train_label = new_train_label 83 | 84 | sen_list, label_list = parallel_expansion(train_text, train_label, alpha_ri=0.1, alpha_rs=0, num_aug=5) 85 | train_text = sen_list 86 | train_label = label_list 87 | assert len(train_text) == len(train_label) 88 | # 2.3 设置使用的预训练模型,并设置tokenizer、数据集对象 89 | tokenizer = AutoTokenizer.from_pretrained(PRETRAIN, do_lower_case=True) 90 | train_encodings = tokenizer(train_text, truncation=True, padding=True, max_length=32) 91 | val_encodings = tokenizer(dev_text, truncation=True, padding=True, max_length=32) 92 | 93 | train_dataset = MyDataset(train_encodings, train_label) 94 | val_dataset = MyDataset(val_encodings, dev_label) 95 | 96 | # 2.4 实例化训练参数 97 | training_args = TrainingArguments( 98 | # output directory 99 | output_dir='../../tmpresults/tmpresult{}'.format(kf_id), 100 | num_train_epochs=50, # total number of training epochs 101 | per_device_train_batch_size=256, # batch size per device during training 102 | 
per_device_eval_batch_size=32, # batch size for evaluation 103 | warmup_steps=500, # number of warmup steps for learning rate scheduler 104 | learning_rate=3e-4 if 'electra' in PRETRAIN else 2e-5, 105 | weight_decay=0.01, # strength of weight decay 106 | save_total_limit=1, 107 | # logging_dir='../../tmplogs', # directory for storing logs 108 | # logging_steps=10, 109 | # evaluation_strategy="epoch", 110 | ) 111 | model = AutoModelForSequenceClassification.from_pretrained(PRETRAIN, num_labels=len(data['info'])) 112 | 113 | # 2.5 利用实例化的训练对象进行训练(模型、训练参数、训练集、验证集、评价指标) 114 | trainer = Trainer( 115 | # the instantiated 🤗 Transformers model to be trained 116 | model=model, 117 | args=training_args, # training arguments, defined above 118 | train_dataset=train_dataset, # training dataset 119 | eval_dataset=val_dataset, # evaluation dataset 120 | compute_metrics=compute_metrics, 121 | ) 122 | trainer.train() # 训练模型 123 | 124 | # 2.6 利用训练好的模型在验证集上进行预测 125 | dev_outputs = trainer.predict(val_dataset).predictions 126 | dev_out[kf_id] = dev_outputs # 将预测结果保存在列表中 127 | 128 | # 3、合并交叉验证的结果,并得到整个数据集上模型的预测的概率分布 129 | alls = [0] * len(all_label) 130 | for kfid in range(6): 131 | for idx, item in enumerate(dev_index[kfid]): 132 | # dev_index[0]:第0折的验证数据的索引的列表 133 | alls[item - 1] = dev_out[kfid][idx] 134 | outputs = np.array(alls) 135 | return outputs 136 | 137 | 138 | if __name__ == '__main__': 139 | get_prediction() 140 | -------------------------------------------------------------------------------- /baselines/single/delete_noise/delete_noise.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from dckit import read_datasets, random_split_data, evaluate 3 | from scipy.stats import entropy 4 | import numpy as np 5 | import os 6 | import sys 7 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 8 | sys.path.append(path) 9 | from baselines.single.delete_noise.classifier import get_prediction 10 | 11 | 12 | def find_max_entropy(predicted_probabilities): 13 | entros = entropy(predicted_probabilities, axis=1) 14 | return np.argsort(entros)[::-1] 15 | 16 | 17 | def delete_noise(data, delete_num=100): 18 | numpy_array_of_predicted_probabilities = get_prediction(data) 19 | ordered_label_errors = find_max_entropy(numpy_array_of_predicted_probabilities) 20 | 21 | json_data = data['json'] 22 | new_json = [] 23 | for idx, tmp in enumerate(tqdm(json_data)): 24 | # 每一句都给他扩展 25 | if idx in ordered_label_errors[:delete_num]: # and idx not in correct_id: 26 | # print(tmp['sentence'], tmp['label_des']) 27 | continue 28 | new_json.append(tmp) 29 | data['json'] = new_json 30 | return data 31 | 32 | 33 | def main(): 34 | data = read_datasets() 35 | data = delete_noise(data) 36 | random_split_data(data) 37 | f1 = evaluate() 38 | print('Macro-F1=', f1) 39 | return f1 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /baselines/single/template/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/baselines/single/template/README.md -------------------------------------------------------------------------------- /baselines/single/template/template.py: -------------------------------------------------------------------------------- 1 | from dckit import read_datasets 2 | from dckit.evaluate import evaluate 3 | 4 | 5 | def 
template(data): 6 | """ 7 | 输入读取的字典,输出还是这个字典,但是修改其内容,如果修改了标签请注意同时修改label_des和label字段 8 | """ 9 | # TODO add your code here 10 | return data 11 | 12 | 13 | def main(): 14 | data = read_datasets() 15 | template(data) 16 | f1 = evaluate() 17 | print('Macro-F1=', f1) 18 | return f1 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /datasets/cic/README.txt: -------------------------------------------------------------------------------- 1 | 这里存储你的结果 2 | -------------------------------------------------------------------------------- /datasets/raw_afqmc/readme.md: -------------------------------------------------------------------------------- 1 | afqmc用于DataCLUE的版本 | afqmc for dataclue 2 | 3 | 标签含义: 4 | 0: 语义相似 5 | 1: 语义不同 6 | 7 | 数据量: 8 | train.json(32,334)/dev.json(4,316)/test_public.json(2,000) 9 | 10 | 更多信息: 11 | for more detail, check: https://github.com/CLUEbenchmark/DataCLUE -------------------------------------------------------------------------------- /datasets/raw_cic/README.txt: -------------------------------------------------------------------------------- 1 | DataCLUE: 以数据为中心的AI测评 2 | 项目地址:https://github.com/CLUEbenchmark/DataCLUE 3 | 4 | 意图识别任务,类别数:118 5 | train.json:1万,包含部分有问题的数据 6 | dev.json:2000条,包含部分有问题的数据 7 | test_public.json:2000条,仅用于学术和实验,和作为训练完后的效果评估。不能用于训练;高质量数据(标注准确率95%或以上) 8 | 9 | train.json/dev.json: 含有噪声的数据,都含有一定比例有标注错误的标签; 10 | 11 | 12 | 1) 请做实验,报告你的方法下改进后的数据集(train.json/dev.json),在test_public.json上做最终测试(2个数值,f1_macro & f1_micro);或 13 | 2) 你也可以提交到CLUE平台:www.CLUEbenchmarks.com, 或发送邮件包含训练集和验证集的压缩包到邮箱。联系邮箱:CLUEbenchmark@163.com 14 | 15 | www.CLUEbenchmarks.com 16 | -------------------------------------------------------------------------------- /datasets/raw_cic/labels.json: -------------------------------------------------------------------------------- 1 | {"label_des": "买家抱怨商品涨价了", "label": 0} 2 | {"label_des": "买家表达不满/生气", "label": 1} 3 | {"label_des": "买家表达赞美/满意", "label": 2} 4 | {"label_des": "买家表示不想要了", "label": 3} 5 | {"label_des": "买家表示地址正确", "label": 4} 6 | {"label_des": "买家表示好的", "label": 5} 7 | {"label_des": "买家表示具体时间寄回去", "label": 6} 8 | {"label_des": "买家表示麻烦卖家,表达感谢", "label": 7} 9 | {"label_des": "买家表示马上下单付款", "label": 8} 10 | {"label_des": "买家表示拍错了", "label": 9} 11 | {"label_des": "买家表示伤心难过", "label": 10} 12 | {"label_des": "买家表示商家发错地址", "label": 11} 13 | {"label_des": "买家表示商家发错货了", "label": 12} 14 | {"label_des": "买家表示商品已经寄回去了", "label": 13} 15 | {"label_des": "买家表示稍等", "label": 14} 16 | {"label_des": "买家表示是老顾客", "label": 15} 17 | {"label_des": "买家表示收件信息不需要修改了", "label": 16} 18 | {"label_des": "买家表示送人的", "label": 17} 19 | {"label_des": "买家表示无法申请退款", "label": 18} 20 | {"label_des": "买家表示无法下单", "label": 19} 21 | {"label_des": "买家表示物流太慢了", "label": 20} 22 | {"label_des": "买家表示先买来试试", "label": 21} 23 | {"label_des": "买家表示需要退货退款", "label": 22} 24 | {"label_des": "买家表示要重拍", "label": 23} 25 | {"label_des": "买家表示已经收到货了", "label": 24} 26 | {"label_des": "买家表示已下单付款", "label": 25} 27 | {"label_des": "买家表示优惠券无法使用", "label": 26} 28 | {"label_des": "买家表示有疑问/不理解", "label": 27} 29 | {"label_des": "买家表示再看看", "label": 28} 30 | {"label_des": "买家催促发货", "label": 29} 31 | {"label_des": "买家催促回复", "label": 30} 32 | {"label_des": "买家打招呼", "label": 31} 33 | {"label_des": "买家对特定款式/颜色表达喜好", "label": 32} 34 | {"label_des": "买家发送结束语", "label": 33} 35 | {"label_des": "买家发送开票信息", "label": 34} 36 | {"label_des": "买家发送支付宝账号", "label": 35} 37 | {"label_des": "买家反馈收到的商品有问题", "label": 36} 38 | 
{"label_des": "买家确认地址是否修改成功", "label": 37} 39 | {"label_des": "买家讨价还价", "label": 38} 40 | {"label_des": "买家向商家表达不好意思", "label": 39} 41 | {"label_des": "买家需要商品推荐", "label": 40} 42 | {"label_des": "买家要求补偿", "label": 41} 43 | {"label_des": "买家要求发货检查", "label": 42} 44 | {"label_des": "买家要求改商品信息", "label": 43} 45 | {"label_des": "买家要求改运费", "label": 44} 46 | {"label_des": "买家要求核对订单信息", "label": 45} 47 | {"label_des": "买家要求取消退换货退款", "label": 46} 48 | {"label_des": "买家要求添加商家微信", "label": 47} 49 | {"label_des": "买家要求退运费", "label": 48} 50 | {"label_des": "买家要求修改收件信息", "label": 49} 51 | {"label_des": "买家要求延迟发货", "label": 50} 52 | {"label_des": "买家要求延长收货时间", "label": 51} 53 | {"label_des": "买家咨询补货/上货时间", "label": 52} 54 | {"label_des": "买家咨询到货时间", "label": 53} 55 | {"label_des": "买家咨询发货时间", "label": 54} 56 | {"label_des": "买家咨询发什么快递", "label": 55} 57 | {"label_des": "买家咨询返现方式", "label": 56} 58 | {"label_des": "买家咨询返现金额", "label": 57} 59 | {"label_des": "买家咨询返现时间", "label": 58} 60 | {"label_des": "买家咨询换货发货时间", "label": 59} 61 | {"label_des": "买家咨询活动规则", "label": 60} 62 | {"label_des": "买家咨询快递单号", "label": 61} 63 | {"label_des": "买家咨询买多件能否优惠", "label": 62} 64 | {"label_des": "买家咨询哪款更好", "label": 63} 65 | {"label_des": "买家咨询能否到付", "label": 64} 66 | {"label_des": "买家咨询能否定制", "label": 65} 67 | {"label_des": "买家咨询能否分开/一起发货", "label": 66} 68 | {"label_des": "买家咨询能否开发票及发票类型和寄送时间", "label": 67} 69 | {"label_des": "买家咨询能否提前享受优惠", "label": 68} 70 | {"label_des": "买家咨询偏远地区是否发货", "label": 69} 71 | {"label_des": "买家咨询如何加入会员", "label": 70} 72 | {"label_des": "买家咨询如何领取优惠券", "label": 71} 73 | {"label_des": "买家咨询如何下单", "label": 72} 74 | {"label_des": "买家咨询商家发货地", "label": 73} 75 | {"label_des": "买家咨询商家是否收到寄回的商品", "label": 74} 76 | {"label_des": "买家咨询商品包装", "label": 75} 77 | {"label_des": "买家咨询商品产地", "label": 76} 78 | {"label_des": "买家咨询商品的材质/面料", "label": 77} 79 | {"label_des": "买家咨询商品的色差问题", "label": 78} 80 | {"label_des": "买家咨询商品规格数量", "label": 79} 81 | {"label_des": "买家咨询商品价格", "label": 80} 82 | {"label_des": "买家咨询商品具体尺码尺寸", "label": 81} 83 | {"label_des": "买家咨询商品区别", "label": 82} 84 | {"label_des": "买家咨询商品上新时间", "label": 83} 85 | {"label_des": "买家咨询商品是不是预售款", "label": 84} 86 | {"label_des": "买家咨询商品是否可以单买/单卖", "label": 85} 87 | {"label_des": "买家咨询商品是否有赠品", "label": 86} 88 | {"label_des": "买家咨询商品是否有质保,质保多久", "label": 87} 89 | {"label_des": "买家咨询商品是什么颜色", "label": 88} 90 | {"label_des": "买家咨询商品质量是否有保障", "label": 89} 91 | {"label_des": "买家咨询商品重量", "label": 90} 92 | {"label_des": "买家咨询什么颜色好看", "label": 91} 93 | {"label_des": "买家咨询是否可以打折", "label": 92} 94 | {"label_des": "买家咨询是否可以发顺丰", "label": 93} 95 | {"label_des": "买家咨询是否可以改价", "label": 94} 96 | {"label_des": "买家咨询是否可以拒收", "label": 95} 97 | {"label_des": "买家咨询是否可以微信支付", "label": 96} 98 | {"label_des": "买家咨询是否可以指定快递", "label": 97} 99 | {"label_des": "买家咨询是否有好评返现", "label": 98} 100 | {"label_des": "买家咨询是否有活动", "label": 99} 101 | {"label_des": "买家咨询是否有买家秀", "label": 100} 102 | {"label_des": "买家咨询是否有实体店", "label": 101} 103 | {"label_des": "买家咨询是否有味道", "label": 102} 104 | {"label_des": "买家咨询退换货地址", "label": 103} 105 | {"label_des": "买家咨询退换货规则", "label": 104} 106 | {"label_des": "买家咨询退换货运费由谁承担", "label": 105} 107 | {"label_des": "买家咨询退换货支持快递", "label": 106} 108 | {"label_des": "买家咨询退货退款原因选什么", "label": 107} 109 | {"label_des": "买家咨询退款金额", "label": 108} 110 | {"label_des": "买家咨询退款去向", "label": 109} 111 | {"label_des": "买家咨询退款时间", "label": 110} 112 | {"label_des": "买家咨询物流信息", "label": 111} 113 | {"label_des": "买家咨询优惠券使用规则", "label": 112} 114 | {"label_des": "买家咨询运费金额", 
"label": 113} 115 | {"label_des": "买家咨询运费险赔付规则", "label": 114} 116 | {"label_des": "买家咨询赠品何时发货", "label": 115} 117 | {"label_des": "买家咨询赠品是否可以自选", "label": 116} 118 | {"label_des": "买家咨询自己的旺旺昵称", "label": 117} 119 | -------------------------------------------------------------------------------- /datasets/raw_cic/labels.txt: -------------------------------------------------------------------------------- 1 | 买家抱怨商品涨价了 2 | 买家表达不满/生气 3 | 买家表达赞美/满意 4 | 买家表示不想要了 5 | 买家表示地址正确 6 | 买家表示好的 7 | 买家表示具体时间寄回去 8 | 买家表示麻烦卖家,表达感谢 9 | 买家表示马上下单付款 10 | 买家表示拍错了 11 | 买家表示伤心难过 12 | 买家表示商家发错地址 13 | 买家表示商家发错货了 14 | 买家表示商品已经寄回去了 15 | 买家表示稍等 16 | 买家表示是老顾客 17 | 买家表示收件信息不需要修改了 18 | 买家表示送人的 19 | 买家表示无法申请退款 20 | 买家表示无法下单 21 | 买家表示物流太慢了 22 | 买家表示先买来试试 23 | 买家表示需要退货退款 24 | 买家表示要重拍 25 | 买家表示已经收到货了 26 | 买家表示已下单付款 27 | 买家表示优惠券无法使用 28 | 买家表示有疑问/不理解 29 | 买家表示再看看 30 | 买家催促发货 31 | 买家催促回复 32 | 买家打招呼 33 | 买家对特定款式/颜色表达喜好 34 | 买家发送结束语 35 | 买家发送开票信息 36 | 买家发送支付宝账号 37 | 买家反馈收到的商品有问题 38 | 买家确认地址是否修改成功 39 | 买家讨价还价 40 | 买家向商家表达不好意思 41 | 买家需要商品推荐 42 | 买家要求补偿 43 | 买家要求发货检查 44 | 买家要求改商品信息 45 | 买家要求改运费 46 | 买家要求核对订单信息 47 | 买家要求取消退换货退款 48 | 买家要求添加商家微信 49 | 买家要求退运费 50 | 买家要求修改收件信息 51 | 买家要求延迟发货 52 | 买家要求延长收货时间 53 | 买家咨询补货/上货时间 54 | 买家咨询到货时间 55 | 买家咨询发货时间 56 | 买家咨询发什么快递 57 | 买家咨询返现方式 58 | 买家咨询返现金额 59 | 买家咨询返现时间 60 | 买家咨询换货发货时间 61 | 买家咨询活动规则 62 | 买家咨询快递单号 63 | 买家咨询买多件能否优惠 64 | 买家咨询哪款更好 65 | 买家咨询能否到付 66 | 买家咨询能否定制 67 | 买家咨询能否分开/一起发货 68 | 买家咨询能否开发票及发票类型和寄送时间 69 | 买家咨询能否提前享受优惠 70 | 买家咨询偏远地区是否发货 71 | 买家咨询如何加入会员 72 | 买家咨询如何领取优惠券 73 | 买家咨询如何下单 74 | 买家咨询商家发货地 75 | 买家咨询商家是否收到寄回的商品 76 | 买家咨询商品包装 77 | 买家咨询商品产地 78 | 买家咨询商品的材质/面料 79 | 买家咨询商品的色差问题 80 | 买家咨询商品规格数量 81 | 买家咨询商品价格 82 | 买家咨询商品具体尺码尺寸 83 | 买家咨询商品区别 84 | 买家咨询商品上新时间 85 | 买家咨询商品是不是预售款 86 | 买家咨询商品是否可以单买/单卖 87 | 买家咨询商品是否有赠品 88 | 买家咨询商品是否有质保,质保多久 89 | 买家咨询商品是什么颜色 90 | 买家咨询商品质量是否有保障 91 | 买家咨询商品重量 92 | 买家咨询什么颜色好看 93 | 买家咨询是否可以打折 94 | 买家咨询是否可以发顺丰 95 | 买家咨询是否可以改价 96 | 买家咨询是否可以拒收 97 | 买家咨询是否可以微信支付 98 | 买家咨询是否可以指定快递 99 | 买家咨询是否有好评返现 100 | 买家咨询是否有活动 101 | 买家咨询是否有买家秀 102 | 买家咨询是否有实体店 103 | 买家咨询是否有味道 104 | 买家咨询退换货地址 105 | 买家咨询退换货规则 106 | 买家咨询退换货运费由谁承担 107 | 买家咨询退换货支持快递 108 | 买家咨询退货退款原因选什么 109 | 买家咨询退款金额 110 | 买家咨询退款去向 111 | 买家咨询退款时间 112 | 买家咨询物流信息 113 | 买家咨询优惠券使用规则 114 | 买家咨询运费金额 115 | 买家咨询运费险赔付规则 116 | 买家咨询赠品何时发货 117 | 买家咨询赠品是否可以自选 118 | 买家咨询自己的旺旺昵称 -------------------------------------------------------------------------------- /datasets/raw_iflytek/labels.json: -------------------------------------------------------------------------------- 1 | {"label": "0", "label_des": "打车"} 2 | {"label": "1", "label_des": "地图导航"} 3 | {"label": "2", "label_des": "免费WIFI"} 4 | {"label": "3", "label_des": "租车"} 5 | {"label": "4", "label_des": "同城服务"} 6 | {"label": "5", "label_des": "快递物流"} 7 | {"label": "6", "label_des": "婚庆"} 8 | {"label": "7", "label_des": "家政"} 9 | {"label": "8", "label_des": "公共交通"} 10 | {"label": "9", "label_des": "政务"} 11 | {"label": "10", "label_des": "社区服务"} 12 | {"label": "11", "label_des": "薅羊毛"} 13 | {"label": "12", "label_des": "魔幻"} 14 | {"label": "13", "label_des": "仙侠"} 15 | {"label": "14", "label_des": "卡牌"} 16 | {"label": "15", "label_des": "飞行空战"} 17 | {"label": "16", "label_des": "射击游戏"} 18 | {"label": "17", "label_des": "休闲益智"} 19 | {"label": "18", "label_des": "动作类"} 20 | {"label": "19", "label_des": "体育竞技"} 21 | {"label": "20", "label_des": "棋牌中心"} 22 | {"label": "21", "label_des": "经营养成"} 23 | {"label": "22", "label_des": "策略"} 24 | {"label": "23", "label_des": "MOBA"} 25 | {"label": "24", "label_des": "辅助工具"} 26 | {"label": "25", "label_des": "约会社交"} 27 | {"label": "26", "label_des": 
"即时通讯"} 28 | {"label": "27", "label_des": "工作社交"} 29 | {"label": "28", "label_des": "论坛圈子"} 30 | {"label": "29", "label_des": "婚恋社交"} 31 | {"label": "30", "label_des": "情侣社交"} 32 | {"label": "31", "label_des": "社交工具"} 33 | {"label": "32", "label_des": "生活社交"} 34 | {"label": "33", "label_des": "微博博客"} 35 | {"label": "34", "label_des": "新闻"} 36 | {"label": "35", "label_des": "漫画"} 37 | {"label": "36", "label_des": "小说"} 38 | {"label": "37", "label_des": "技术"} 39 | {"label": "38", "label_des": "教辅"} 40 | {"label": "39", "label_des": "问答交流"} 41 | {"label": "40", "label_des": "搞笑"} 42 | {"label": "41", "label_des": "杂志"} 43 | {"label": "42", "label_des": "百科"} 44 | {"label": "43", "label_des": "影视娱乐"} 45 | {"label": "44", "label_des": "求职"} 46 | {"label": "45", "label_des": "兼职"} 47 | {"label": "46", "label_des": "视频"} 48 | {"label": "47", "label_des": "短视频"} 49 | {"label": "48", "label_des": "音乐"} 50 | {"label": "49", "label_des": "直播"} 51 | {"label": "50", "label_des": "电台"} 52 | {"label": "51", "label_des": "K歌"} 53 | {"label": "52", "label_des": "成人"} 54 | {"label": "53", "label_des": "中小学"} 55 | {"label": "54", "label_des": "职考"} 56 | {"label": "55", "label_des": "公务员"} 57 | {"label": "56", "label_des": "英语"} 58 | {"label": "57", "label_des": "视频教育"} 59 | {"label": "58", "label_des": "高等教育"} 60 | {"label": "59", "label_des": "成人教育"} 61 | {"label": "60", "label_des": "艺术"} 62 | {"label": "61", "label_des": "语言(非英语)"} 63 | {"label": "62", "label_des": "旅游资讯"} 64 | {"label": "63", "label_des": "综合预定"} 65 | {"label": "64", "label_des": "民航"} 66 | {"label": "65", "label_des": "铁路"} 67 | {"label": "66", "label_des": "酒店"} 68 | {"label": "67", "label_des": "行程管理"} 69 | {"label": "68", "label_des": "民宿短租"} 70 | {"label": "69", "label_des": "出国"} 71 | {"label": "70", "label_des": "工具"} 72 | {"label": "71", "label_des": "亲子儿童"} 73 | {"label": "72", "label_des": "母婴"} 74 | {"label": "73", "label_des": "驾校"} 75 | {"label": "74", "label_des": "违章"} 76 | {"label": "75", "label_des": "汽车咨询"} 77 | {"label": "76", "label_des": "汽车交易"} 78 | {"label": "77", "label_des": "日常养车"} 79 | {"label": "78", "label_des": "行车辅助"} 80 | {"label": "79", "label_des": "租房"} 81 | {"label": "80", "label_des": "买房"} 82 | {"label": "81", "label_des": "装修家居"} 83 | {"label": "82", "label_des": "电子产品"} 84 | {"label": "83", "label_des": "问诊挂号"} 85 | {"label": "84", "label_des": "养生保健"} 86 | {"label": "85", "label_des": "医疗服务"} 87 | {"label": "86", "label_des": "减肥瘦身"} 88 | {"label": "87", "label_des": "美妆美业"} 89 | {"label": "88", "label_des": "菜谱"} 90 | {"label": "89", "label_des": "餐饮店"} 91 | {"label": "90", "label_des": "体育咨讯"} 92 | {"label": "91", "label_des": "运动健身"} 93 | {"label": "92", "label_des": "支付"} 94 | {"label": "93", "label_des": "保险"} 95 | {"label": "94", "label_des": "股票"} 96 | {"label": "95", "label_des": "借贷"} 97 | {"label": "96", "label_des": "理财"} 98 | {"label": "97", "label_des": "彩票"} 99 | {"label": "98", "label_des": "记账"} 100 | {"label": "99", "label_des": "银行"} 101 | {"label": "100", "label_des": "美颜"} 102 | {"label": "101", "label_des": "影像剪辑"} 103 | {"label": "102", "label_des": "摄影修图"} 104 | {"label": "103", "label_des": "相机"} 105 | {"label": "104", "label_des": "绘画"} 106 | {"label": "105", "label_des": "二手"} 107 | {"label": "106", "label_des": "电商"} 108 | {"label": "107", "label_des": "团购"} 109 | {"label": "108", "label_des": "外卖"} 110 | {"label": "109", "label_des": "电影票务"} 111 | {"label": "110", "label_des": "社区超市"} 112 | {"label": "111", "label_des": "购物咨询"} 113 | {"label": "112", "label_des": "笔记"} 
114 | {"label": "113", "label_des": "办公"} 115 | {"label": "114", "label_des": "日程管理"} 116 | {"label": "115", "label_des": "女性"} 117 | {"label": "116", "label_des": "经营"} 118 | {"label": "117", "label_des": "收款"} 119 | {"label": "118", "label_des": "其他"} -------------------------------------------------------------------------------- /datasets/raw_qbqtc/readme.md: -------------------------------------------------------------------------------- 1 | # QBQTC 2 | QBQTC: QQ Browser Query Title Corpus 3 | 4 | QQ浏览器搜索相关性数据集 5 | 6 | 7 | # 数据集介绍 8 | QQ浏览器搜索相关性数据集(QBQTC,QQ Browser Query Title Corpus),是QQ浏览器搜索引擎目前针对大搜场景构建的一个融合了相关性、权威性、内容质量、 9 | 时效性等维度标注的学习排序(LTR)数据集,广泛应用在搜索引擎业务场景中。 10 | 11 | 相关性的含义:0,相关程度差;1,有一定相关性;2,非常相关。数字越大相关性越高。 12 | 13 | #### 数据量统计 14 | | 训练集(train) | 验证集(dev) | 公开测试集(test) | 私有测试集 | 15 | | :----: | :----: | :----: | :----: | 16 | | 180,000| 20,000| 5,000 | >=10,0000| 17 | 18 | # baseline效果对比 19 | 20 | | 模型 | 训练集(train) | 验证集(dev) | 测试集(test) | 训练参数 | 21 | | :----:| :----: | :----: | :----: | :----: | 22 | |BERT-base | F1:80.3 Acc:84.3 | F1: 64.9 Acc:72.4 | F1: 64.1 Acc:71.8 | batch=64, length=52, epoch=7, lr=2e-5, warmup=0.9 | 23 | | RoBERTa-wwm-ext | F1:67.9 Acc:76.2 | F1:64.9 Acc:71.5 | F1:64.0 Acc:71.0 | batch=64, length=52, epoch=7, lr=2e-5, warmup=0.9| 24 | |RoBERTa-wwm-large-ext | F1:79.8 Acc:84.2 | F1:65.1 Acc:72.4 | F1:66.3 Acc:73.1 | batch=64, length=52, epoch=7, lr=2e-5, warmup=0.9| 25 | 26 | f1_score来自于sklearn.metrics,计算公式如下: 27 | `F1 = 2 * (precision * recall) / (precision + recall)` 28 | 29 | 30 | # 数据集例子 31 | {"id": 0, "query": "小孩咳嗽感冒", "title": "小孩感冒过后久咳嗽该吃什么药育儿问答宝宝树", "label": "1"} 32 | {"id": 1, "query": "前列腺癌根治术后能活多久", "title": "前列腺癌转移能活多久前列腺癌治疗方法盘点-家庭医生在线肿瘤频道", "label": "1"} 33 | {"id": 3, "query": "如何将一个文件复制到另一个文件里", "title": "怎么把布局里的图纸复制到另外一个文件中去百度文库", "label": "0"} 34 | {"id": 214, "query": "免费观看电影速度与激情1", "title": "《速度与激情1》全集-高清电影完整版-在线观看", "label": "2"} 35 | {"id": 98, "query": "昆明公积金", "title": "昆明异地购房不能用住房公积金中新网", "label": "2"} 36 | {"id": 217, "query": "多张图片怎么排版好看", "title": "怎么排版图片", "label": "2"} 37 | 38 | # 更多内容见 39 | QBQTC项目 -------------------------------------------------------------------------------- /datasets/raw_qbqtc/train.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/datasets/raw_qbqtc/train.json -------------------------------------------------------------------------------- /datasets/raw_tnews/labels.json: -------------------------------------------------------------------------------- 1 | {"label_des": "教育", "label": "0"} 2 | {"label_des": "财经", "label": "1"} 3 | {"label_des": "房产", "label": "2"} 4 | {"label_des": "旅游", "label": "3"} 5 | {"label_des": "科技", "label": "4"} 6 | {"label_des": "体育", "label": "5"} 7 | {"label_des": "电竞", "label": "6"} 8 | {"label_des": "文化", "label": "7"} 9 | {"label_des": "汽车", "label": "8"} 10 | {"label_des": "故事", "label": "9"} 11 | {"label_des": "娱乐", "label": "10"} 12 | {"label_des": "军事", "label": "11"} 13 | {"label_des": "农业", "label": "12"} 14 | {"label_des": "国际", "label": "13"} 15 | {"label_des": "股票", "label": "14"} 16 | -------------------------------------------------------------------------------- /dckit/README.md: -------------------------------------------------------------------------------- 1 | # DataCLUE Toolkit 2 | 3 | [安装](#安装) | [使用](#使用) | [示例](#示例) | [贡献](#贡献) | [**References**](#references) 4 | 5 | 为了方便各个算法之间的整合,这里提供了一套统一的输入输出接口。 6 | 
并且提供了一些辅助函数,帮助大家更方便地使用DataCLUE。 7 | (我们鼓励大家用dckit进行开发,以更好的实现不同算法的共享。但是你也完全可以自己实现相应功能完成DataCLUE的任务)。 8 | 9 | # Updates 10 | [Nov 16, 2021] First version of dckit is released. 11 | 12 | # 安装 13 | 在DataCLUE目录下 14 | 15 | `pip install -e .` 16 | 17 | # 使用 18 | ```python 19 | from dckit import read_datasets, random_split_data, evaluate 20 | 21 | data = read_datasets(dataset='CIC') # 读取数据 22 | # TODO 对数据进行处理,这里example_transform 是你需要实现的变换 23 | data = example_transform(data) 24 | 25 | random_split_data(data, test_size=2000, seed=0) # 随机切分数据到训练、测试集 26 | f1 = evaluate() # 运行模型并返回相应的结果 27 | ``` 28 | 29 | # 示例 30 | 我们在中`baseline`实现了几个策略都用到了dckit,比如你可以看`baseline/single/data_aug`或其它相应baseline代码中的实现 31 | 32 | 33 | # 贡献 34 | - 如果你觉得dckit缺少一些通用的基本功能,你可以提一个issue。 35 | - 如果你已经实现了dckit的扩展功能,欢迎开启一个PR。 36 | 37 | # References 38 | ```bib 39 | @article{xu2021dataclue, 40 | title={DataCLUE: A Benchmark Suite for Data-centric NLP}, 41 | author={Liang Xu and Jiacheng Liu and Xiang Pan and Xiaojing Lu and Xiaofeng Hou}, 42 | year={2021}, 43 | eprint={2111.08647}, 44 | archivePrefix={arXiv}, 45 | primaryClass={cs.CL} 46 | } 47 | ``` 48 | -------------------------------------------------------------------------------- /dckit/__init__.py: -------------------------------------------------------------------------------- 1 | from dckit.utils import read_datasets, random_split_data 2 | from dckit.evaluate import evaluate 3 | -------------------------------------------------------------------------------- /dckit/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics import f1_score 3 | import json 4 | import numpy as np 5 | 6 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 7 | 8 | 9 | def calc_f1(dataset='cic'): 10 | y_true = [] 11 | for line in open('{}/datasets/raw_{}/test_public.json'.format(path, dataset.lower()), 'r', encoding='utf-8'): 12 | y_true.append(json.loads(line)['label']) 13 | y_pred = [] 14 | for line in open('{}/output_dir/bert/test_prediction.json'.format(path), 'r', encoding='utf-8'): 15 | y_pred.append(json.loads(line)['label']) 16 | 17 | f1_macro = f1_score(y_true, y_pred, average='macro') 18 | return f1_macro 19 | 20 | 21 | def evaluate(dataset='cic'): 22 | cmds = [ 23 | 'rm -rf {}/output_dir/bert'.format(path), 24 | 'rm -f {}/datasets/{}/cached*'.format(path, dataset), 25 | 'cd {}/baselines/models_pytorch/classifier_pytorch'.format(path), 26 | 'bash run_classifier_{}.sh'.format(dataset), 27 | 'bash run_classifier_{}.sh predict'.format(dataset), 28 | ] 29 | os.system('&&'.join(cmds)) 30 | return calc_f1(dataset.lower()) 31 | -------------------------------------------------------------------------------- /dckit/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | 6 | path = os.path.split(os.path.split(os.path.realpath(__file__))[0])[0] 7 | 8 | 9 | def read_datasets(dataset='cic'): 10 | """ 11 | 根据输入的数据名称读取数据 12 | 参数: 13 | dataset: 数据集名称 14 | 输出: 15 | full_data: 字典形式存储的数据,包括: 16 | - 'json': json数据的每一行,如 {"id": 13, "label": "79", "sentence": "一斤大概有多少个", "label_des": "买家咨询商品规格数量"} 17 | 这里为了统一输入输出没有区分train和dev了 18 | - 'info': 标签号好描述的对应关系,如{79:'买家咨询商品规格数量'} 19 | """ 20 | dataset = dataset.lower() 21 | if dataset in ['cic', 'tnews', 'iflytek']: 22 | json_data = [] 23 | for data_type in ['train', 'dev']: 24 | for line in 
open('{}/datasets/raw_{}/{}.json'.format(path, dataset, data_type), 'r', encoding='utf-8'): 25 | # line = {"id": 13, "label": "79", "sentence": "一斤大概有多少个", "label_des": "买家咨询商品规格数量"} 26 | one = json.loads(line) 27 | json_data.append(one) 28 | 29 | label_info = {} 30 | for line in open('{}/datasets/raw_{}/{}.json'.format(path, dataset, 'labels'), 'r', encoding='utf-8'): 31 | one = json.loads(line) 32 | label_info[one['label']] = one['label_des'] 33 | full_data = {'json': json_data, 'info': label_info} 34 | return full_data 35 | elif dataset in ['afqmc', 'qbqtc', 'triclue']: 36 | json_data = [] 37 | for data_type in ['train', 'dev']: 38 | for line in open('{}/datasets/raw_{}/{}.json'.format(path, dataset, data_type), 'r', encoding='utf-8'): 39 | # line = {"label": "79", "sentence1": "一斤大概有多少个", "sentence2": "买家咨询商品规格数量"} 40 | one = json.loads(line) 41 | json_data.append(one) 42 | label_info = {} 43 | full_data = {'json': json_data, 'info': label_info} 44 | return full_data 45 | elif dataset in ['cluener']: 46 | """ 47 | {"text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为,对目前国内商业银行而言,", 48 | "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}}} 49 | {"text": "生生不息CSOL生化狂潮让你填弹狂扫", "label": {"game": {"CSOL": [[4, 7]]}}} 50 | """ 51 | json_data = [] 52 | for data_type in ['train', 'dev']: 53 | for line in open('{}/datasets/raw_{}/{}.json'.format(path, dataset, data_type), 'r', encoding='utf-8'): 54 | # line = {"label": "79", "sentence1": "一斤大概有多少个", "sentence2": "买家咨询商品规格数量"} 55 | one = json.loads(line) 56 | json_data.append(one) 57 | label_info = {} 58 | full_data = {'json': json_data, 'info': label_info} 59 | return full_data 60 | else: 61 | raise NotImplementedError 62 | 63 | 64 | def random_split_data(data, test_size=2000, seed=0, dataset='cic'): 65 | if dataset == 'cluener': 66 | raise NotImplementedError 67 | json_data = data['json'] 68 | labels = [] 69 | for line in json_data: 70 | labels.append(int(line['label'])) 71 | train_idx, test_idx, _, _ = train_test_split(range(len(labels)), labels, stratify=labels, 72 | shuffle=True, test_size=test_size, random_state=seed) 73 | 74 | f = open('{}/datasets/{}/train.json'.format(path, dataset), 'w', encoding='utf-8') 75 | for idx in train_idx: 76 | dic = json_data[idx] 77 | str_sen = json.dumps(dic, ensure_ascii=False) 78 | f.write(str_sen + '\n') 79 | 80 | f = open('{}/datasets/{}/dev.json'.format(path, dataset), 'w', encoding='utf-8') 81 | for idx in test_idx: 82 | dic = json_data[idx] 83 | str_sen = json.dumps(dic, ensure_ascii=False) 84 | f.write(str_sen + '\n') 85 | -------------------------------------------------------------------------------- /resources/dataclue_submit_examples/dataclue_submit_examples.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/dataclue_submit_examples/dataclue_submit_examples.zip -------------------------------------------------------------------------------- /resources/dataclue_submit_examples_old_nouse_iflytek/dataclue_submit_examples.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/dataclue_submit_examples_old_nouse_iflytek/dataclue_submit_examples.zip -------------------------------------------------------------------------------- /resources/img/bxu.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/bxu.jpg -------------------------------------------------------------------------------- /resources/img/improve.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/improve.jpeg -------------------------------------------------------------------------------- /resources/img/lifec.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/lifec.jpeg -------------------------------------------------------------------------------- /resources/img/takeaway2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/takeaway2.jpeg -------------------------------------------------------------------------------- /resources/img/teamgroup.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLUEbenchmark/DataCLUE/32425a4621d4614a766c4434b29089d41867dd20/resources/img/teamgroup.jpeg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import setuptools 6 | 7 | from setuptools import setup 8 | from setuptools.command.develop import develop 9 | from setuptools.command.install import install 10 | from subprocess import call 11 | 12 | with open('dckit/README.md', 'r') as f: 13 | long_description = f.read() 14 | 15 | 16 | class Installation(install): 17 | def run(self): 18 | call(['pip install -r requirements.txt --no-clean'], shell=True) 19 | install.run(self) 20 | 21 | 22 | setuptools.setup( 23 | name='dckit', 24 | version='0.0.1', 25 | author='JC Liu', 26 | author_email='CLUE@CLUEbenchmarks.com', 27 | maintainer='DataCLUE', 28 | maintainer_email='CLUE@CLUEbenchmarks.com', 29 | description='Python toolkit for Data-centric Chinese Language Understanding Evaluation benchmark.', 30 | long_description=long_description, 31 | long_description_content_type='text/markdown', 32 | url='https://github.com/CLUEBenchmark/DataCLUE', 33 | include_package_data=True, 34 | packages=setuptools.find_packages(), 35 | classifiers=[ 36 | 'Programming Language :: Python :: 3', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Operating System :: OS Independent'], 39 | install_requires=[], 40 | cmdclass={'install': Installation}) 41 | --------------------------------------------------------------------------------
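
附:下面是一个基于 dckit 的最小示意(a minimal sketch)。其中 `drop_exact_duplicates` 的函数名与去重逻辑均为本仓库之外的假设示例,并非官方实现,仅用于展示如何按照 `baselines/single/template/template.py` 的模式编写自己的数据变换,并通过 dckit 提供的 `read_datasets` / `random_split_data` / `evaluate` 完成一轮评估:

```python
from dckit import read_datasets, random_split_data, evaluate


def drop_exact_duplicates(data):
    """示例变换(假设的实现):去掉 sentence 与 label 完全相同的重复样本,只保留首次出现的一条。"""
    seen = set()
    deduped = []
    for item in data['json']:
        key = (item['sentence'], item['label'])
        if key not in seen:
            seen.add(key)
            deduped.append(item)
    data['json'] = deduped
    return data


def main():
    data = read_datasets(dataset='cic')  # 读取合并后的 train/dev 数据
    data = drop_exact_duplicates(data)   # 在这里替换成你自己的数据变换
    random_split_data(data)              # 重新随机切分并写回 train.json / dev.json
    f1 = evaluate()                      # 训练基线模型并计算 Macro-F1
    print('Macro-F1=', f1)
    return f1


if __name__ == '__main__':
    main()
```

与 template.py 的约定一致:变换函数接收并返回 `data` 字典;如果修改了标签,请同时更新 `label` 和 `label_des` 字段。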