├── .gitignore ├── LICENSE ├── README.md ├── arguments.py ├── model ├── lora │ ├── __init__.py │ ├── layers.py │ └── utils.py ├── multiple_choice.py ├── prefix_encoder.py ├── prompt.py ├── roberta │ ├── __init__.py │ ├── configuration_roberta.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── modeling_flax_roberta.py │ ├── modeling_roberta.py │ ├── modeling_tf_roberta.py │ ├── tokenization_roberta.py │ └── tokenization_roberta_fast.py ├── sequence_classification.py └── utils.py ├── requirements.txt ├── run.py ├── scripts ├── mulitruns_scripts │ ├── adapter │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mnli_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_qnli_roberta_both.sh │ │ ├── run_qqp_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ ├── bitfit │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ ├── finetuning │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mnli_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_qnli_roberta_both.sh │ │ ├── run_qqp_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ ├── lora │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ └── prefixtuning │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mnli_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_qnli_roberta_both.sh │ │ ├── run_qqp_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh └── search_scipts │ ├── glue │ ├── search_adapter.sh │ ├── search_bitfit.sh │ ├── search_ft.sh │ ├── search_lora.sh │ └── search_pt.sh │ ├── search.py │ └── superglue │ ├── search_adapter.sh │ ├── search_bitfit.sh │ ├── search_ft.sh │ ├── search_lora.sh │ └── search_pt.sh ├── tasks ├── glue │ ├── dataset.py │ ├── get_trainer.py │ └── glue.py ├── superglue │ ├── dataset.py │ ├── get_trainer.py │ ├── record_evaluation.py │ ├── super_glue.py │ ├── super_glue_metric.py │ └── utils.py └── utils.py └── training └── trainer_base.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | 
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 guanzhchen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Revisiting Parameter-Efficient Tuning: Are We Really There Yet?
2 |
3 | This is the code implementation of our paper accepted at EMNLP 2022:
4 |
5 | > Guanzheng Chen, Fangyu Liu, Zaiqiao Meng, Shangsong Liang. [Revisiting Parameter-Efficient Tuning: Are We Really There Yet?](https://arxiv.org/abs/2202.07962).
6 |
7 |
8 |
9 | We provide a comprehensive study of popular **P**arameter-**E**fficient **Tuning** (PETuning) methods, i.e., Adapter, Prompt, LoRA, and BitFit, focusing on their performance and stability.
10 |
11 |
12 |
13 | The code structure is based in part on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). (Thanks for their awesome work.)
14 |
15 |
16 |
17 | ## File Structure
18 |
19 | - `model`: code implementing the PETuning methods.
20 | - `tasks`: code to preprocess the datasets and select the model for each task.
21 | - `training`: code defining the trainer used for training.
22 | - `scripts`: scripts to run training, evaluation, and prediction for each task.
23 |   - `search_scripts`: scripts that perform the grid search for each task.
24 |   - `multiruns_scripts`: scripts that conduct multiple runs for each task; each script contains the best hyper-parameters for the corresponding task.
25 | - `arguments.py & run.py`: argument definitions and the entry point for training, evaluation, and prediction.
26 |
27 |
28 |
29 | ## Dependencies
30 |
31 | ```
32 | torch==1.8.1
33 | transformers==4.5.0
34 | adapter-transformers==2.2.0
35 | ```
36 |
37 | Please see `requirements.txt` for more details.
38 |
39 |
40 |
41 | ## Data
42 |
43 | All datasets in GLUE and SuperGLUE are downloaded automatically (via the Huggingface Datasets API) when running the scripts.
44 |
45 |
46 |
47 | ## PETuning for Each Task
48 |
49 | To search for the best hyper-parameters for a task, run the scripts in the `scripts/search_scripts/` folder. For example, you can run the CB task with Adapter via:
50 |
51 | ```bash
52 | bash scripts/search_scripts/superglue/search_adapter.sh cb
53 | ```
54 |
55 |
56 |
57 | To conduct multiple runs for one task, run the scripts in the `scripts/multiruns_scripts/` folder. For example, you can run the CB task with Adapter via:
58 |
59 | ```bash
60 | bash scripts/multiruns_scripts/adapter/run_cb_roberta_both.sh
61 | ```
62 |
63 | We provide the best hyper-parameters for each task in the corresponding multi-run scripts. If you cannot reproduce our reported results, please check your environment (package versions) and rerun the grid search in your own environment.
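
To match the pinned versions listed above before launching any script, the dependencies can be installed in one step:

```bash
pip install -r requirements.txt
```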
64 |
65 |
66 |
67 | ## Acknowledgments
68 |
69 | [P-tuning v2](https://github.com/THUDM/P-tuning-v2)
70 |
71 | [Hugging Face Transformers](https://github.com/huggingface/transformers)
72 |
73 | [Adapter-Hub](https://github.com/Adapter-Hub/adapter-transformers)
74 |
75 | [LoRA](https://github.com/microsoft/LoRA)
76 |
77 | [BitFit](https://github.com/benzakenelad/BitFit)
78 |
79 |
80 | ## Citation
81 |
82 | If you find our paper and resources useful, please kindly cite our paper:
83 |
84 | ```
85 | @inproceedings{chen-etal-2022-revisiting,
86 |     title = "Revisiting Parameter-Efficient Tuning: Are We Really There Yet?",
87 |     author = "Chen, Guanzheng and
88 |       Liu, Fangyu and
89 |       Meng, Zaiqiao and
90 |       Liang, Shangsong",
91 |     booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
92 |     month = dec,
93 |     year = "2022",
94 |     address = "Abu Dhabi, United Arab Emirates",
95 |     publisher = "Association for Computational Linguistics",
96 |     url = "https://aclanthology.org/2022.emnlp-main.168",
97 |     pages = "2612--2626",
98 | }
99 |
100 | ```
101 |
--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | import argparse
3 | import dataclasses
4 | from dataclasses import dataclass, field
5 | from typing import Optional
6 |
7 | from transformers import HfArgumentParser, TrainingArguments, AdapterArguments
8 |
9 | from tasks.utils import *
10 |
11 |
12 | @dataclass
13 | class DataTrainingArguments:
14 |     """
15 |     Arguments pertaining to what data we are going to input our model for training and eval.
16 |
17 |     Using `HfArgumentParser` we can turn this class
18 |     into argparse arguments to be able to specify them on
19 |     the command line.
20 |     """
21 |
22 |     task_name: str = field(
23 |         metadata={
24 |             "help": "The name of the task to train on: " + ", ".join(TASKS),
25 |             "choices": TASKS
26 |         },
27 |     )
28 |     dataset_name: str = field(
29 |         metadata={
30 |             "help": "The name of the dataset to use: " + ", ".join(DATASETS),
31 |             "choices": DATASETS
32 |         }
33 |     )
34 |     dataset_config_name: Optional[str] = field(
35 |         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
36 |     )
37 |     max_seq_length: int = field(
38 |         default=128,
39 |         metadata={
40 |             "help": "The maximum total input sequence length after tokenization. Sequences longer "
41 |             "than this will be truncated, sequences shorter will be padded."
42 |         },
43 |     )
44 |     overwrite_cache: bool = field(
45 |         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
46 |     )
47 |     pad_to_max_length: bool = field(
48 |         default=True,
49 |         metadata={
50 |             "help": "Whether to pad all samples to `max_seq_length`. "
51 |             "If False, will pad the samples dynamically when batching to the maximum length in the batch."
52 |         },
53 |     )
54 |     max_train_samples: Optional[int] = field(
55 |         default=None,
56 |         metadata={
57 |             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
58 |             "value if set."
59 |         },
60 |     )
61 |     max_eval_samples: Optional[int] = field(
62 |         default=None,
63 |         metadata={
64 |             "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
65 |             "value if set."
66 | }, 67 | ) 68 | max_predict_samples: Optional[int] = field( 69 | default=None, 70 | metadata={ 71 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 72 | "value if set." 73 | }, 74 | ) 75 | train_file: Optional[str] = field( 76 | default=None, metadata={"help": "A csv or a json file containing the training data."} 77 | ) 78 | validation_file: Optional[str] = field( 79 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 80 | ) 81 | test_file: Optional[str] = field( 82 | default=None, 83 | metadata={"help": "A csv or a json file containing the test data."} 84 | ) 85 | template_id: Optional[int] = field( 86 | default=0, 87 | metadata={ 88 | "help": "The specific prompt string to use" 89 | } 90 | ) 91 | pilot: Optional[str] = field( 92 | default=None, 93 | metadata={"help": "do the pilot experiments."} 94 | ) 95 | 96 | 97 | @dataclass 98 | class ModelArguments: 99 | """ 100 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 101 | """ 102 | model_name_or_path: str = field( 103 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 104 | ) 105 | config_name: Optional[str] = field( 106 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 107 | ) 108 | tokenizer_name: Optional[str] = field( 109 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 110 | ) 111 | cache_dir: Optional[str] = field( 112 | default=None, 113 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 114 | ) 115 | use_fast_tokenizer: bool = field( 116 | default=True, 117 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 118 | ) 119 | model_revision: str = field( 120 | default="main", 121 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 122 | ) 123 | use_auth_token: bool = field( 124 | default=False, 125 | metadata={ 126 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 127 | "with private models)." 
128 |         },
129 |     )
130 |     prefix: bool = field(
131 |         default=False,
132 |         metadata={
133 |             "help": "Will use P-tuning v2 during training"
134 |         }
135 |     )
136 |     prompt: bool = field(
137 |         default=False,
138 |         metadata={
139 |             "help": "Will use prompt tuning during training"
140 |         }
141 |     )
142 |     pre_seq_len: int = field(
143 |         default=4,
144 |         metadata={
145 |             "help": "The length of prompt"
146 |         }
147 |     )
148 |     prefix_projection: bool = field(
149 |         default=False,
150 |         metadata={
151 |             "help": "Apply a two-layer MLP head over the prefix embeddings"
152 |         }
153 |     )
154 |     prefix_hidden_size: int = field(
155 |         default=512,
156 |         metadata={
157 |             "help": "The hidden size of the MLP projection head in Prefix Encoder if prefix projection is used"
158 |         }
159 |     )
160 |     hidden_dropout_prob: float = field(
161 |         default=0.1,
162 |         metadata={
163 |             "help": "The dropout probability used in the models"
164 |         }
165 |     )
166 |     lora: bool = field(
167 |         default=False,
168 |         metadata={
169 |             "help": "Will use lora during training"
170 |         }
171 |     )
172 |     lora_r: int = field(
173 |         default=8,
174 |         metadata={
175 |             "help": "The rank of lora"
176 |         }
177 |     )
178 |     lora_alpha: int = field(
179 |         default=16,
180 |         metadata={
181 |             "help": "The scaling factor (alpha) of lora"
182 |         }
183 |     )
184 |     model_seed: int = field(
185 |         default=1111,
186 |         metadata={
187 |             "help": "The random seed of model initialization."
188 |         }
189 |     )
190 |     bitfit: bool = field(
191 |         default=False,
192 |         metadata={
193 |             "help": "Will use bitfit during training"
194 |         }
195 |     )
196 |     patient: int = field(
197 |         default=10,
198 |         metadata={
199 |             "help": "The patience of early stopping."
200 |         }
201 |     )
202 |
203 | @dataclass
204 | class QuestionAnwseringArguments:
205 |     n_best_size: int = field(
206 |         default=20,
207 |         metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
208 |     )
209 |     max_answer_length: int = field(
210 |         default=30,
211 |         metadata={
212 |             "help": "The maximum length of an answer that can be generated. This is needed because the start "
213 |             "and end predictions are not conditioned on one another."
214 |         },
215 |     )
216 |     version_2_with_negative: bool = field(
217 |         default=False, metadata={"help": "If true, some of the examples do not have an answer."}
218 |     )
219 |     null_score_diff_threshold: float = field(
220 |         default=0.0,
221 |         metadata={
222 |             "help": "The threshold used to select the null answer: if the best answer has a score that is less than "
223 |             "the score of the null answer minus this threshold, the null answer is selected for this example. "
224 |             "Only useful when `version_2_with_negative=True`."
225 |         },
226 |     )
227 |
228 | def get_args():
229 |     """Parse all the args."""
230 |     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, QuestionAnwseringArguments, AdapterArguments))
231 |
232 |     args = parser.parse_args_into_dataclasses()
233 |
234 |     return args
--------------------------------------------------------------------------------
/model/lora/__init__.py:
--------------------------------------------------------------------------------
1 | from .layers import *
2 | from .utils import *
--------------------------------------------------------------------------------
/model/lora/utils.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 4 | # ------------------------------------------------------------------------------------------ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from typing import Dict 9 | 10 | from .layers import LoRALayer 11 | 12 | 13 | def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: 14 | for n, p in model.named_parameters(): 15 | if 'lora_' not in n: 16 | p.requires_grad = False 17 | if bias == 'none': 18 | return 19 | elif bias == 'all': 20 | for n, p in model.named_parameters(): 21 | if 'bias' in n: 22 | p.requires_grad = True 23 | elif bias == 'lora_only': 24 | for m in model.modules(): 25 | if isinstance(m, LoRALayer) and \ 26 | hasattr(m, 'bias') and \ 27 | m.bias is not None: 28 | m.bias.requires_grad = True 29 | else: 30 | raise NotImplementedError 31 | 32 | 33 | def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: 34 | my_state_dict = model.state_dict() 35 | if bias == 'none': 36 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} 37 | elif bias == 'all': 38 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} 39 | elif bias == 'lora_only': 40 | to_return = {} 41 | for k in my_state_dict: 42 | if 'lora_' in k: 43 | to_return[k] = my_state_dict[k] 44 | bias_name = k.split('lora_')[0]+'bias' 45 | if bias_name in my_state_dict: 46 | to_return[bias_name] = my_state_dict[bias_name] 47 | return to_return 48 | else: 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /model/multiple_choice.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch._C import NoopLogger 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss 7 | 8 | from transformers import BertModel, BertPreTrainedModel 9 | from transformers.modeling_outputs import MultipleChoiceModelOutput, BaseModelOutput, Seq2SeqLMOutput 10 | 11 | from model.prefix_encoder import PrefixEncoder 12 | from transformers.adapters.model_mixin import ModelWithHeadsAdaptersMixin 13 | # from transformers import RobertaModel, RobertaPreTrainedModel 14 | from model.roberta import RobertaModel, RobertaPreTrainedModel 15 | 16 | 17 | 18 | class RobertaPrefixForMultipleChoice(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 19 | _keys_to_ignore_on_load_missing = [r"position_ids"] 20 | 21 | def __init__(self, config): 22 | super().__init__(config) 23 | 24 | self.roberta = RobertaModel(config) 25 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 26 | self.classifier = torch.nn.Linear(config.hidden_size, 1) 27 | 28 | self.init_weights() 29 | 30 | 31 | for param in self.roberta.parameters(): 32 | param.requires_grad = False 33 | 34 | self.pre_seq_len = config.pre_seq_len 35 | self.n_layer = config.num_hidden_layers 36 | self.n_head = config.num_attention_heads 37 | self.n_embd = config.hidden_size // config.num_attention_heads 38 | 39 | self.prefix_tokens = torch.arange(self.pre_seq_len).long() 40 | self.prefix_encoder = PrefixEncoder(config) 41 | 42 | bert_param = 0 43 | for name, param in self.roberta.named_parameters(): 44 | bert_param += param.numel() 45 | all_param = 0 46 | for name, param in self.named_parameters(): 47 | all_param += param.numel() 48 | total_param = all_param - bert_param 49 | print('total 
param is {}'.format(total_param)) 50 | 51 | def get_prompt(self, batch_size): 52 | prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(self.roberta.device) 53 | past_key_values = self.prefix_encoder(prefix_tokens) 54 | past_key_values = past_key_values.view( 55 | batch_size, 56 | self.pre_seq_len, 57 | self.n_layer * 2, 58 | self.n_head, 59 | self.n_embd 60 | ) 61 | past_key_values = self.dropout(past_key_values) 62 | past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2) 63 | return past_key_values 64 | 65 | def forward( 66 | self, 67 | input_ids=None, 68 | token_type_ids=None, 69 | attention_mask=None, 70 | labels=None, 71 | position_ids=None, 72 | head_mask=None, 73 | inputs_embeds=None, 74 | output_attentions=None, 75 | output_hidden_states=None, 76 | return_dict=None, 77 | adapter_names=None, 78 | head=None, 79 | **kwargs 80 | ): 81 | r""" 82 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): 83 | Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., 84 | num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See 85 | :obj:`input_ids` above) 86 | """ 87 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 88 | batch_size, num_choices = input_ids.shape[:2] if input_ids is not None else inputs_embeds.shape[:2] 89 | 90 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None 91 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 92 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 93 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 94 | flat_inputs_embeds = ( 95 | inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) 96 | if inputs_embeds is not None 97 | else None 98 | ) 99 | 100 | past_key_values = self.get_prompt(batch_size=batch_size * num_choices) 101 | prefix_attention_mask = torch.ones(batch_size * num_choices, self.pre_seq_len).to(self.roberta.device) 102 | flat_attention_mask = torch.cat((prefix_attention_mask, flat_attention_mask), dim=1) 103 | 104 | outputs = self.roberta( 105 | flat_input_ids, 106 | position_ids=flat_position_ids, 107 | token_type_ids=flat_token_type_ids, 108 | attention_mask=flat_attention_mask, 109 | head_mask=head_mask, 110 | inputs_embeds=flat_inputs_embeds, 111 | output_attentions=output_attentions, 112 | output_hidden_states=output_hidden_states, 113 | return_dict=return_dict, 114 | adapter_names=adapter_names, 115 | past_key_values=past_key_values, 116 | ) 117 | pooled_output = outputs[1] 118 | 119 | pooled_output = self.dropout(pooled_output) 120 | logits = self.classifier(pooled_output) 121 | reshaped_logits = logits.view(-1, num_choices) 122 | 123 | loss = None 124 | if labels is not None: 125 | loss_fct = CrossEntropyLoss() 126 | loss = loss_fct(reshaped_logits, labels) 127 | 128 | if not return_dict: 129 | output = (reshaped_logits,) + outputs[2:] 130 | return ((loss,) + output) if loss is not None else output 131 | 132 | return MultipleChoiceModelOutput( 133 | loss=loss, 134 | logits=reshaped_logits, 135 | hidden_states=outputs.hidden_states, 136 | attentions=outputs.attentions, 137 | ) 138 | 139 | 140 | 141 | 142 | 143 | 144 | class RobertaLoraForMultipleChoice(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 145 | 
_keys_to_ignore_on_load_missing = [r"position_ids"] 146 | 147 | def __init__(self, config): 148 | super().__init__(config) 149 | 150 | self.roberta = RobertaModel(config) 151 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 152 | self.classifier = torch.nn.Linear(config.hidden_size, 1) 153 | 154 | self.init_weights() 155 | 156 | 157 | for name, param in self.roberta.named_parameters(): 158 | if "lora" not in name.lower(): 159 | param.requires_grad = False 160 | 161 | 162 | bert_param = 0 163 | for name, param in self.roberta.named_parameters(): 164 | bert_param += param.numel() 165 | all_param = 0 166 | for name, param in self.named_parameters(): 167 | all_param += param.numel() 168 | total_param = all_param - bert_param 169 | print('total param is {}'.format(total_param)) 170 | 171 | def forward( 172 | self, 173 | input_ids=None, 174 | token_type_ids=None, 175 | attention_mask=None, 176 | labels=None, 177 | position_ids=None, 178 | head_mask=None, 179 | inputs_embeds=None, 180 | output_attentions=None, 181 | output_hidden_states=None, 182 | return_dict=None, 183 | adapter_names=None, 184 | head=None, 185 | **kwargs 186 | ): 187 | r""" 188 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): 189 | Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., 190 | num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See 191 | :obj:`input_ids` above) 192 | """ 193 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 194 | batch_size, num_choices = input_ids.shape[:2] if input_ids is not None else inputs_embeds.shape[:2] 195 | 196 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None 197 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 198 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 199 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 200 | flat_inputs_embeds = ( 201 | inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) 202 | if inputs_embeds is not None 203 | else None 204 | ) 205 | 206 | outputs = self.roberta( 207 | flat_input_ids, 208 | position_ids=flat_position_ids, 209 | token_type_ids=flat_token_type_ids, 210 | attention_mask=flat_attention_mask, 211 | head_mask=head_mask, 212 | inputs_embeds=flat_inputs_embeds, 213 | output_attentions=output_attentions, 214 | output_hidden_states=output_hidden_states, 215 | return_dict=return_dict, 216 | adapter_names=adapter_names, 217 | ) 218 | pooled_output = outputs[1] 219 | 220 | pooled_output = self.dropout(pooled_output) 221 | logits = self.classifier(pooled_output) 222 | reshaped_logits = logits.view(-1, num_choices) 223 | 224 | loss = None 225 | if labels is not None: 226 | loss_fct = CrossEntropyLoss() 227 | loss = loss_fct(reshaped_logits, labels) 228 | 229 | if not return_dict: 230 | output = (reshaped_logits,) + outputs[2:] 231 | return ((loss,) + output) if loss is not None else output 232 | 233 | return MultipleChoiceModelOutput( 234 | loss=loss, 235 | logits=reshaped_logits, 236 | hidden_states=outputs.hidden_states, 237 | attentions=outputs.attentions, 238 | ) -------------------------------------------------------------------------------- /model/prefix_encoder.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class PrefixEncoder(torch.nn.Module): 5 | r''' 6 | The torch.nn model to encode the prefix 7 | 8 | Input shape: (batch-size, prefix-length) 9 | 10 | Output shape: (batch-size, prefix-length, 2*layers*hidden) 11 | ''' 12 | def __init__(self, config): 13 | super().__init__() 14 | self.prefix_projection = config.prefix_projection 15 | if self.prefix_projection: 16 | # Use a two-layer MLP to encode the prefix 17 | self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size) 18 | self.trans = torch.nn.Sequential( 19 | torch.nn.Linear(config.hidden_size, config.prefix_hidden_size), 20 | torch.nn.Tanh(), 21 | torch.nn.Linear(config.prefix_hidden_size, config.num_hidden_layers * 2 * config.hidden_size) 22 | ) 23 | else: 24 | self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_hidden_layers * 2 * config.hidden_size) 25 | 26 | def forward(self, prefix: torch.Tensor): 27 | if self.prefix_projection: 28 | prefix_tokens = self.embedding(prefix) 29 | past_key_values = self.trans(prefix_tokens) 30 | else: 31 | past_key_values = self.embedding(prefix) 32 | return past_key_values -------------------------------------------------------------------------------- /model/roberta/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
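# This package vendors the RoBERTa implementation from HuggingFace Transformers
# (hence the Apache-2.0 header above). Keeping a local copy allows the PETuning
# code to route prefix past_key_values and LoRA weights through the model
# classes directly, which is why files such as model/sequence_classification.py
# import `from model.roberta import RobertaModel` instead of the installed library.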
18 |
19 | from typing import TYPE_CHECKING
20 |
21 | from transformers.file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
22 |
23 |
24 | _import_structure = {
25 |     "configuration_roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaOnnxConfig"],
26 |     "tokenization_roberta": ["RobertaTokenizer"],
27 | }
28 |
29 | if is_tokenizers_available():
30 |     _import_structure["tokenization_roberta_fast"] = ["RobertaTokenizerFast"]
31 |
32 | if is_torch_available():
33 |     _import_structure["modeling_roberta"] = [
34 |         "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
35 |         "RobertaForCausalLM",
36 |         "RobertaForMaskedLM",
37 |         "RobertaForMultipleChoice",
38 |         "RobertaForQuestionAnswering",
39 |         "RobertaForSequenceClassification",
40 |         "RobertaForTokenClassification",
41 |         "RobertaModel",
42 |         "RobertaModelWithHeads",
43 |         "RobertaPreTrainedModel",
44 |     ]
45 |
46 | if is_tf_available():
47 |     _import_structure["modeling_tf_roberta"] = [
48 |         "TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
49 |         "TFRobertaForMaskedLM",
50 |         "TFRobertaForMultipleChoice",
51 |         "TFRobertaForQuestionAnswering",
52 |         "TFRobertaForSequenceClassification",
53 |         "TFRobertaForTokenClassification",
54 |         "TFRobertaMainLayer",
55 |         "TFRobertaModel",
56 |         "TFRobertaPreTrainedModel",
57 |     ]
58 |
59 | if is_flax_available():
60 |     _import_structure["modeling_flax_roberta"] = [
61 |         "FlaxRobertaForMaskedLM",
62 |         "FlaxRobertaForMultipleChoice",
63 |         "FlaxRobertaForQuestionAnswering",
64 |         "FlaxRobertaForSequenceClassification",
65 |         "FlaxRobertaForTokenClassification",
66 |         "FlaxRobertaModel",
67 |         "FlaxRobertaPreTrainedModel",
68 |     ]
69 |
70 |
71 | if TYPE_CHECKING:
72 |     from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaOnnxConfig
73 |     from .tokenization_roberta import RobertaTokenizer
74 |
75 |     if is_tokenizers_available():
76 |         from .tokenization_roberta_fast import RobertaTokenizerFast
77 |
78 |     if is_torch_available():
79 |         from .modeling_roberta import (
80 |             ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
81 |             RobertaForCausalLM,
82 |             RobertaForMaskedLM,
83 |             RobertaForMultipleChoice,
84 |             RobertaForQuestionAnswering,
85 |             RobertaForSequenceClassification,
86 |             RobertaForTokenClassification,
87 |             RobertaModel,
88 |             RobertaModelWithHeads,
89 |             RobertaPreTrainedModel,
90 |         )
91 |
92 |     if is_tf_available():
93 |         from .modeling_tf_roberta import (
94 |             TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
95 |             TFRobertaForMaskedLM,
96 |             TFRobertaForMultipleChoice,
97 |             TFRobertaForQuestionAnswering,
98 |             TFRobertaForSequenceClassification,
99 |             TFRobertaForTokenClassification,
100 |             TFRobertaMainLayer,
101 |             TFRobertaModel,
102 |             TFRobertaPreTrainedModel,
103 |         )
104 |
105 |     if is_flax_available():
106 |         from .modeling_flax_roberta import (
107 |             FlaxRobertaForMaskedLM,
108 |             FlaxRobertaForMultipleChoice,
109 |             FlaxRobertaForQuestionAnswering,
110 |             FlaxRobertaForSequenceClassification,
111 |             FlaxRobertaForTokenClassification,
112 |             FlaxRobertaModel,
113 |             FlaxRobertaPreTrainedModel,
114 |         )
115 |
116 | else:
117 |     import sys
118 |
119 |     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
120 |
--------------------------------------------------------------------------------
/model/roberta/configuration_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | from collections import OrderedDict 18 | from typing import Mapping 19 | 20 | from transformers.onnx import OnnxConfig 21 | from transformers.utils import logging 22 | from transformers import BertConfig 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", 29 | "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", 30 | "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", 31 | "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", 32 | "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", 33 | "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", 34 | } 35 | 36 | 37 | class RobertaConfig(BertConfig): 38 | r""" 39 | This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a 40 | :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified 41 | arguments, defining the model architecture. 42 | 43 | 44 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model 45 | outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 46 | 47 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the 48 | same defaults. Please check the parent class for more information. 
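    In practice, the defaults that change relative to ``BertConfig`` are the special
    token ids set in ``__init__`` below: ``pad_token_id=1``, ``bos_token_id=0``, and
    ``eos_token_id=2``.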
49 | 50 | Examples:: 51 | 52 | >>> from transformers import RobertaConfig, RobertaModel 53 | 54 | >>> # Initializing a RoBERTa configuration 55 | >>> configuration = RobertaConfig() 56 | 57 | >>> # Initializing a model from the configuration 58 | >>> model = RobertaModel(configuration) 59 | 60 | >>> # Accessing the model configuration 61 | >>> configuration = model.config 62 | """ 63 | model_type = "roberta" 64 | 65 | def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): 66 | """Constructs RobertaConfig.""" 67 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 68 | 69 | 70 | class RobertaOnnxConfig(OnnxConfig): 71 | @property 72 | def inputs(self) -> Mapping[str, Mapping[int, str]]: 73 | return OrderedDict( 74 | [ 75 | ("input_ids", {0: "batch", 1: "sequence"}), 76 | ("attention_mask", {0: "batch", 1: "sequence"}), 77 | ] 78 | ) 79 | 80 | @property 81 | def outputs(self) -> Mapping[str, Mapping[int, str]]: 82 | return OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"}), ("pooler_output", {0: "batch"})]) 83 | -------------------------------------------------------------------------------- /model/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert RoBERTa checkpoint.""" 16 | 17 | 18 | import argparse 19 | import pathlib 20 | 21 | import fairseq 22 | import torch 23 | from fairseq.models.roberta import RobertaModel as FairseqRobertaModel 24 | from fairseq.modules import TransformerSentenceEncoderLayer 25 | from packaging import version 26 | 27 | from transformers import RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification 28 | from transformers.models.bert.modeling_bert import ( 29 | BertIntermediate, 30 | BertLayer, 31 | BertOutput, 32 | BertSelfAttention, 33 | BertSelfOutput, 34 | ) 35 | from transformers.utils import logging 36 | 37 | 38 | if version.parse(fairseq.__version__) < version.parse("0.9.0"): 39 | raise Exception("requires fairseq >= 0.9.0") 40 | 41 | 42 | logging.set_verbosity_info() 43 | logger = logging.get_logger(__name__) 44 | 45 | SAMPLE_TEXT = "Hello world! cécé herlolip" 46 | 47 | 48 | def convert_roberta_checkpoint_to_pytorch( 49 | roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool 50 | ): 51 | """ 52 | Copy/paste/tweak roberta's weights to our BERT structure. 
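    Example invocation (paths are placeholders; the flags are defined by the
    argparse parser at the bottom of this file)::

        python convert_roberta_original_pytorch_checkpoint_to_pytorch.py \
            --roberta_checkpoint_path ./roberta.large \
            --pytorch_dump_folder_path ./converted-roberta-large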
53 | """ 54 | roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) 55 | roberta.eval() # disable dropout 56 | roberta_sent_encoder = roberta.model.encoder.sentence_encoder 57 | config = RobertaConfig( 58 | vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings, 59 | hidden_size=roberta.args.encoder_embed_dim, 60 | num_hidden_layers=roberta.args.encoder_layers, 61 | num_attention_heads=roberta.args.encoder_attention_heads, 62 | intermediate_size=roberta.args.encoder_ffn_embed_dim, 63 | max_position_embeddings=514, 64 | type_vocab_size=1, 65 | layer_norm_eps=1e-5, # PyTorch default used in fairseq 66 | ) 67 | if classification_head: 68 | config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0] 69 | print("Our BERT config:", config) 70 | 71 | model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) 72 | model.eval() 73 | 74 | # Now let's copy all the weights. 75 | # Embeddings 76 | model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight 77 | model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight 78 | model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like( 79 | model.roberta.embeddings.token_type_embeddings.weight 80 | ) # just zero them out b/c RoBERTa doesn't use them. 81 | model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight 82 | model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias 83 | 84 | for i in range(config.num_hidden_layers): 85 | # Encoder: start of layer 86 | layer: BertLayer = model.roberta.encoder.layer[i] 87 | roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] 88 | 89 | # self attention 90 | self_attn: BertSelfAttention = layer.attention.self 91 | assert ( 92 | roberta_layer.self_attn.k_proj.weight.data.shape 93 | == roberta_layer.self_attn.q_proj.weight.data.shape 94 | == roberta_layer.self_attn.v_proj.weight.data.shape 95 | == torch.Size((config.hidden_size, config.hidden_size)) 96 | ) 97 | 98 | self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight 99 | self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias 100 | self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight 101 | self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias 102 | self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight 103 | self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias 104 | 105 | # self-attention output 106 | self_output: BertSelfOutput = layer.attention.output 107 | assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape 108 | self_output.dense.weight = roberta_layer.self_attn.out_proj.weight 109 | self_output.dense.bias = roberta_layer.self_attn.out_proj.bias 110 | self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight 111 | self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias 112 | 113 | # intermediate 114 | intermediate: BertIntermediate = layer.intermediate 115 | assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape 116 | intermediate.dense.weight = roberta_layer.fc1.weight 117 | intermediate.dense.bias = roberta_layer.fc1.bias 118 | 119 | # output 120 | bert_output: BertOutput = layer.output 121 | assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape 122 | bert_output.dense.weight = roberta_layer.fc2.weight 123 | bert_output.dense.bias = 
roberta_layer.fc2.bias 124 | bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight 125 | bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias 126 | # end of layer 127 | 128 | if classification_head: 129 | model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight 130 | model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias 131 | model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight 132 | model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias 133 | else: 134 | # LM Head 135 | model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight 136 | model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias 137 | model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight 138 | model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias 139 | model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight 140 | model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias 141 | 142 | # Let's check that we get the same results. 143 | input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 144 | 145 | our_output = model(input_ids)[0] 146 | if classification_head: 147 | their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids)) 148 | else: 149 | their_output = roberta.model(input_ids)[0] 150 | print(our_output.shape, their_output.shape) 151 | max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() 152 | print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 153 | success = torch.allclose(our_output, their_output, atol=1e-3) 154 | print("Do both models output the same tensors?", "🔥" if success else "💩") 155 | if not success: 156 | raise Exception("Something went wRoNg") 157 | 158 | pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) 159 | print(f"Saving model to {pytorch_dump_folder_path}") 160 | model.save_pretrained(pytorch_dump_folder_path) 161 | 162 | 163 | if __name__ == "__main__": 164 | parser = argparse.ArgumentParser() 165 | # Required parameters 166 | parser.add_argument( 167 | "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 168 | ) 169 | parser.add_argument( 170 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 171 | ) 172 | parser.add_argument( 173 | "--classification_head", action="store_true", help="Whether to convert a final classification head." 174 | ) 175 | args = parser.parse_args() 176 | convert_roberta_checkpoint_to_pytorch( 177 | args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head 178 | ) 179 | -------------------------------------------------------------------------------- /model/roberta/tokenization_roberta_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Fast Tokenization classes for RoBERTa."""
16 |
17 | from typing import List, Optional
18 |
19 | from transformers.tokenization_utils_base import AddedToken
20 | from transformers.utils import logging
21 | from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
22 | from .tokenization_roberta import RobertaTokenizer
23 |
24 |
25 | logger = logging.get_logger(__name__)
26 |
27 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
28 |
29 | PRETRAINED_VOCAB_FILES_MAP = {
30 |     "vocab_file": {
31 |         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
32 |         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
33 |         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
34 |         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
35 |         "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
36 |         "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
37 |     },
38 |     "merges_file": {
39 |         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
40 |         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
41 |         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
42 |         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
43 |         "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
44 |         "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
45 |     },
46 |     "tokenizer_file": {
47 |         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
48 |         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
49 |         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json",
50 |         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json",
51 |         "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/tokenizer.json",
52 |         "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/tokenizer.json",
53 |     },
54 | }
55 |
56 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
57 |     "roberta-base": 512,
58 |     "roberta-large": 512,
59 |     "roberta-large-mnli": 512,
60 |     "distilroberta-base": 512,
61 |     "roberta-base-openai-detector": 512,
62 |     "roberta-large-openai-detector": 512,
63 | }
64 |
65 |
66 | class RobertaTokenizerFast(GPT2TokenizerFast):
67 |     """
68 |     Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
69 |     tokenizer, using byte-level Byte-Pair-Encoding.
70 |
71 |     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
72 |     be encoded differently whether it is at the beginning of the sentence (without space) or not:
73 |
74 |     ::
75 |
76 |         >>> from transformers import RobertaTokenizerFast
77 |         >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
78 |         >>> tokenizer("Hello world")['input_ids']
79 |         [0, 31414, 232, 328, 2]
80 |         >>> tokenizer(" Hello world")['input_ids']
81 |         [0, 20920, 232, 2]
82 |
83 |     You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
84 |     call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
85 |
86 |     .. note::
87 |
88 |         When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
89 |         ``add_prefix_space=True``.
90 |
91 |     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
92 |     methods. Users should refer to this superclass for more information regarding those methods.
93 |
94 |     Args:
95 |         vocab_file (:obj:`str`):
96 |             Path to the vocabulary file.
97 |         merges_file (:obj:`str`):
98 |             Path to the merges file.
99 |         errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
100 |             Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
101 |             <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
102 |         bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
103 |             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
104 |
105 |             .. note::
106 |
107 |                 When building a sequence using special tokens, this is not the token that is used for the beginning of
108 |                 sequence. The token used is the :obj:`cls_token`.
109 |         eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
110 |             The end of sequence token.
111 |
112 |             .. note::
113 |
114 |                 When building a sequence using special tokens, this is not the token that is used for the end of
115 |                 sequence. The token used is the :obj:`sep_token`.
116 |         sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
117 |             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
118 |             sequence classification or for a text and a question for question answering. It is also used as the last
119 |             token of a sequence built with special tokens.
120 |         cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
121 |             The classifier token which is used when doing sequence classification (classification of the whole sequence
122 |             instead of per-token classification). It is the first token of the sequence when built with special tokens.
123 |         unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
124 |             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
125 |             token instead.
126 |         pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
127 |             The token used for padding, for example when batching sequences of different lengths.
128 |         mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
129 |             The token used for masking values. This is the token used when training this model with masked language
130 |             modeling. This is the token which the model will try to predict.
131 |         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
132 |             Whether or not to add an initial space to the input. This makes it possible to treat the leading word
133 |             just like any other word (the RoBERTa tokenizer detects the beginning of a word by the preceding space).
134 |         trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
135 |             Whether the post processing step should trim offsets to avoid including whitespaces.
136 |     """
137 |
138 |     vocab_files_names = VOCAB_FILES_NAMES
139 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
140 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
141 |     model_input_names = ["input_ids", "attention_mask"]
142 |     slow_tokenizer_class = RobertaTokenizer
143 |
144 |     def __init__(
145 |         self,
146 |         vocab_file=None,
147 |         merges_file=None,
148 |         tokenizer_file=None,
149 |         errors="replace",
150 |         bos_token="<s>",
151 |         eos_token="</s>",
152 |         sep_token="</s>",
153 |         cls_token="<s>",
154 |         unk_token="<unk>",
155 |         pad_token="<pad>",
156 |         mask_token="<mask>",
157 |         add_prefix_space=False,
158 |         **kwargs
159 |     ):
160 |         super().__init__(
161 |             vocab_file,
162 |             merges_file,
163 |             tokenizer_file=tokenizer_file,
164 |             errors=errors,
165 |             bos_token=bos_token,
166 |             eos_token=eos_token,
167 |             sep_token=sep_token,
168 |             cls_token=cls_token,
169 |             unk_token=unk_token,
170 |             pad_token=pad_token,
171 |             mask_token=mask_token,
172 |             add_prefix_space=add_prefix_space,
173 |             **kwargs,
174 |         )
175 |
176 |     @property
177 |     def mask_token(self) -> str:
178 |         """
179 |         :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
180 |         not having been set.
181 |
182 |         Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
183 |         comprise the space before the `<mask>`.
184 |         """
185 |         if self._mask_token is None and self.verbose:
186 |             logger.error("Using mask_token, but it is not set yet.")
187 |             return None
188 |         return str(self._mask_token)
189 |
190 |     @mask_token.setter
191 |     def mask_token(self, value):
192 |         """
193 |         Overriding the default behavior of the mask token to have it eat the space before it.
194 |
195 |         This is needed to preserve backward compatibility with all the previously used models based on Roberta.
196 |         """
197 |         # Mask token behave like a normal word, i.e. include the space before it
198 |         # So we set lstrip to True
199 |         value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
200 |         self._mask_token = value
201 |
202 |     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
203 |         output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
204 |         if token_ids_1 is None:
205 |             return output
206 |
207 |         return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
208 |
209 |     def create_token_type_ids_from_sequences(
210 |         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
211 |     ) -> List[int]:
212 |         """
213 |         Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
214 |         make use of token type ids, therefore a list of zeros is returned.
215 |
216 |         Args:
217 |             token_ids_0 (:obj:`List[int]`):
218 |                 List of IDs.
219 |             token_ids_1 (:obj:`List[int]`, `optional`):
220 |                 Optional second list of IDs for sequence pairs.
221 |
222 |         Returns:
223 |             :obj:`List[int]`: List of zeros.
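
        Example (token IDs are illustrative placeholders)::

            >>> # <s> 10 20 </s>  ->  four zeros
            >>> tokenizer.create_token_type_ids_from_sequences([10, 20])
            [0, 0, 0, 0]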
224 | """ 225 | sep = [self.sep_token_id] 226 | cls = [self.cls_token_id] 227 | 228 | if token_ids_1 is None: 229 | return len(cls + token_ids_0 + sep) * [0] 230 | return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] 231 | -------------------------------------------------------------------------------- /model/sequence_classification.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from torch._C import NoopLogger  # unused 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss 7 | 8 | from transformers import BertModel, BertPreTrainedModel 9 | 10 | from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutput, Seq2SeqLMOutput 11 | 12 | from model.prefix_encoder import PrefixEncoder 13 | 14 | from transformers.adapters.models.bert import BertModelHeadsMixin, BertModelAdaptersMixin 15 | from transformers.adapters.model_mixin import ModelWithHeadsAdaptersMixin 16 | # from transformers import RobertaModel, RobertaPreTrainedModel 17 | from model.roberta import RobertaModel, RobertaPreTrainedModel 18 | 19 | import copy 20 | 21 | 22 | 23 | class RobertaPrefixForSequenceClassification(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 24 | def __init__(self, config): 25 | super().__init__(config) 26 | self.num_labels = config.num_labels 27 | self.config = config 28 | self.roberta = RobertaModel(config) 29 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 30 | # self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels) 31 | self.classifier = RobertaClassificationHead(config) 32 | self.init_weights() 33 | 34 | for param in self.roberta.parameters():  # freeze the backbone; only the prefix encoder and head are trained 35 | param.requires_grad = False 36 | 37 | self.pre_seq_len = config.pre_seq_len 38 | self.n_layer = config.num_hidden_layers 39 | self.n_head = config.num_attention_heads 40 | self.n_embd = config.hidden_size // config.num_attention_heads 41 | 42 | self.prefix_tokens = torch.arange(self.pre_seq_len).long() 43 | self.prefix_encoder = PrefixEncoder(config) 44 | 45 | bert_param = 0 46 | for name, param in self.roberta.named_parameters(): 47 | bert_param += param.numel() 48 | all_param = 0 49 | for name, param in self.named_parameters(): 50 | all_param += param.numel() 51 | total_param = all_param - bert_param 52 | print('total param is {}'.format(total_param)) # 9860105 53 | 54 | 55 | def get_prompt(self, batch_size): 56 | prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(self.roberta.device)  # (batch, pre_seq_len) 57 | past_key_values = self.prefix_encoder(prefix_tokens)  # (batch, pre_seq_len, 2 * n_layer * hidden_size) 58 | past_key_values = past_key_values.view( 59 | batch_size, 60 | self.pre_seq_len, 61 | self.n_layer * 2, 62 | self.n_head, 63 | self.n_embd 64 | )  # (batch, pre_seq_len, 2 * n_layer, n_head, head_dim) 65 | past_key_values = self.dropout(past_key_values) 66 | past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2)  # n_layer tensors, each (2, batch, n_head, pre_seq_len, head_dim): one key/value pair per layer 67 | return past_key_values 68 | 69 | def forward( 70 | self, 71 | input_ids=None, 72 | attention_mask=None, 73 | token_type_ids=None, 74 | position_ids=None, 75 | head_mask=None, 76 | inputs_embeds=None, 77 | labels=None, 78 | output_attentions=None, 79 | output_hidden_states=None, 80 | return_dict=None, 81 | adapter_names=None, 82 | head=None, 83 | **kwargs 84 | ): 85 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 86 | 87 | batch_size = input_ids.shape[0] 88 | past_key_values = self.get_prompt(batch_size=batch_size) 89 | prefix_attention_mask =
torch.ones(batch_size, self.pre_seq_len).to(self.roberta.device) 90 | attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) 91 | 92 | outputs = self.roberta( 93 | input_ids, 94 | attention_mask=attention_mask, 95 | token_type_ids=token_type_ids, 96 | position_ids=position_ids, 97 | head_mask=head_mask, 98 | inputs_embeds=inputs_embeds, 99 | output_attentions=output_attentions, 100 | output_hidden_states=output_hidden_states, 101 | return_dict=return_dict, 102 | adapter_names=adapter_names, 103 | past_key_values=past_key_values, 104 | ) 105 | 106 | # pooled_output = outputs[1] 107 | 108 | # pooled_output = self.dropout(pooled_output) 109 | # logits = self.classifier(pooled_output) 110 | 111 | sequence_output = outputs[0] 112 | logits = self.classifier(sequence_output) 113 | 114 | loss = None 115 | if labels is not None: 116 | if self.config.problem_type is None: 117 | if self.num_labels == 1: 118 | self.config.problem_type = "regression" 119 | elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 120 | self.config.problem_type = "single_label_classification" 121 | else: 122 | self.config.problem_type = "multi_label_classification" 123 | 124 | if self.config.problem_type == "regression": 125 | loss_fct = MSELoss() 126 | if self.num_labels == 1: 127 | loss = loss_fct(logits.squeeze(), labels.squeeze()) 128 | else: 129 | loss = loss_fct(logits, labels) 130 | elif self.config.problem_type == "single_label_classification": 131 | loss_fct = CrossEntropyLoss() 132 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 133 | elif self.config.problem_type == "multi_label_classification": 134 | loss_fct = BCEWithLogitsLoss() 135 | loss = loss_fct(logits, labels) 136 | if not return_dict: 137 | output = (logits,) + outputs[2:] 138 | return ((loss,) + output) if loss is not None else output 139 | 140 | return SequenceClassifierOutput( 141 | loss=loss, 142 | logits=logits, 143 | hidden_states=outputs.hidden_states, 144 | attentions=outputs.attentions, 145 | ) 146 | 147 | 148 | 149 | 150 | 151 | 152 | class RobertaClassificationHead(nn.Module): 153 | """Head for sentence-level classification tasks.""" 154 | 155 | def __init__(self, config): 156 | super().__init__() 157 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 158 | classifier_dropout = ( 159 | config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob 160 | ) 161 | self.dropout = nn.Dropout(classifier_dropout) 162 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 163 | 164 | def forward(self, features, **kwargs): 165 | x = features[:, 0, :] # take <s> token (equiv.
to [CLS]) 166 | x = self.dropout(x) 167 | x = self.dense(x) 168 | x = torch.tanh(x) 169 | x = self.dropout(x) 170 | x = self.out_proj(x) 171 | return x 172 | 173 | 174 | class RobertaLoraForSequenceClassification(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 175 | def __init__(self, config): 176 | super().__init__(config) 177 | self.num_labels = config.num_labels 178 | self.config = config 179 | self.roberta = RobertaModel(config) 180 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 181 | # self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels) 182 | self.classifier = RobertaClassificationHead(config) 183 | self.init_weights() 184 | 185 | for name, param in self.roberta.named_parameters(): 186 | if "lora" not in name.lower(): 187 | param.requires_grad = False 188 | 189 | bert_param = 0 190 | for name, param in self.roberta.named_parameters(): 191 | bert_param += param.numel() 192 | all_param = 0 193 | for name, param in self.named_parameters(): 194 | all_param += param.numel() 195 | total_param = all_param - bert_param 196 | print('total param is {}'.format(total_param)) # 9860105 197 | 198 | 199 | 200 | def forward( 201 | self, 202 | input_ids=None, 203 | attention_mask=None, 204 | token_type_ids=None, 205 | position_ids=None, 206 | head_mask=None, 207 | inputs_embeds=None, 208 | labels=None, 209 | output_attentions=None, 210 | output_hidden_states=None, 211 | return_dict=None, 212 | adapter_names=None, 213 | head=None, 214 | **kwargs 215 | ): 216 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 217 | 218 | outputs = self.roberta( 219 | input_ids, 220 | attention_mask=attention_mask, 221 | token_type_ids=token_type_ids, 222 | position_ids=position_ids, 223 | head_mask=head_mask, 224 | inputs_embeds=inputs_embeds, 225 | output_attentions=output_attentions, 226 | output_hidden_states=output_hidden_states, 227 | return_dict=return_dict, 228 | adapter_names=adapter_names 229 | ) 230 | 231 | sequence_output = outputs[0] 232 | logits = self.classifier(sequence_output) 233 | 234 | loss = None 235 | if labels is not None: 236 | if self.config.problem_type is None: 237 | if self.num_labels == 1: 238 | self.config.problem_type = "regression" 239 | elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 240 | self.config.problem_type = "single_label_classification" 241 | else: 242 | self.config.problem_type = "multi_label_classification" 243 | 244 | if self.config.problem_type == "regression": 245 | loss_fct = MSELoss() 246 | if self.num_labels == 1: 247 | loss = loss_fct(logits.squeeze(), labels.squeeze()) 248 | else: 249 | loss = loss_fct(logits, labels) 250 | elif self.config.problem_type == "single_label_classification": 251 | loss_fct = CrossEntropyLoss() 252 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 253 | elif self.config.problem_type == "multi_label_classification": 254 | loss_fct = BCEWithLogitsLoss() 255 | loss = loss_fct(logits, labels) 256 | if not return_dict: 257 | output = (logits,) + outputs[2:] 258 | return ((loss,) + output) if loss is not None else output 259 | 260 | return SequenceClassifierOutput( 261 | loss=loss, 262 | logits=logits, 263 | hidden_states=outputs.hidden_states, 264 | attentions=outputs.attentions, 265 | ) -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | from 
model.sequence_classification import ( 5 | RobertaPrefixForSequenceClassification, 6 | RobertaLoraForSequenceClassification 7 | ) 8 | 9 | 10 | from model.multiple_choice import ( 11 | RobertaPrefixForMultipleChoice, 12 | RobertaLoraForMultipleChoice 13 | ) 14 | 15 | from transformers import ( 16 | AutoConfig, 17 | AutoModelForTokenClassification, 18 | AutoModelForSequenceClassification, 19 | AutoModelForQuestionAnswering, 20 | AutoModelForMultipleChoice 21 | ) 22 | 23 | 24 | 25 | class TaskType(Enum): 26 | TOKEN_CLASSIFICATION = 1 27 | SEQUENCE_CLASSIFICATION = 2 28 | QUESTION_ANSWERING = 3 29 | MULTIPLE_CHOICE = 4 30 | 31 | PREFIX_MODELS = { 32 | 33 | "roberta": { 34 | TaskType.SEQUENCE_CLASSIFICATION: RobertaPrefixForSequenceClassification, 35 | TaskType.MULTIPLE_CHOICE: RobertaPrefixForMultipleChoice, 36 | 37 | }, 38 | } 39 | 40 | 41 | 42 | AUTO_MODELS = { 43 | TaskType.TOKEN_CLASSIFICATION: AutoModelForTokenClassification, 44 | TaskType.SEQUENCE_CLASSIFICATION: AutoModelForSequenceClassification, 45 | TaskType.QUESTION_ANSWERING: AutoModelForQuestionAnswering, 46 | TaskType.MULTIPLE_CHOICE: AutoModelForMultipleChoice, 47 | } 48 | 49 | LORA_MODELS = { 50 | "roberta": { 51 | TaskType.SEQUENCE_CLASSIFICATION: RobertaLoraForSequenceClassification, 52 | TaskType.MULTIPLE_CHOICE: RobertaLoraForMultipleChoice, 53 | }, 54 | 55 | } 56 | 57 | def get_model(model_args, task_type: TaskType, config: AutoConfig, fix_bert: bool = False): 58 | if model_args.prefix: 59 | config.hidden_dropout_prob = model_args.hidden_dropout_prob 60 | config.pre_seq_len = model_args.pre_seq_len 61 | config.prefix_projection = model_args.prefix_projection 62 | config.prefix_hidden_size = model_args.prefix_hidden_size 63 | 64 | model_class = PREFIX_MODELS[config.model_type][task_type] 65 | model = model_class.from_pretrained( 66 | model_args.model_name_or_path, 67 | config=config, 68 | revision=model_args.model_revision, 69 | ) 70 | elif model_args.lora: 71 | config.lora = model_args.lora 72 | config.lora_r = model_args.lora_r 73 | config.lora_alpha = model_args.lora_alpha 74 | model_class = LORA_MODELS[config.model_type][task_type] 75 | model = model_class.from_pretrained( 76 | model_args.model_name_or_path, 77 | config=config, 78 | revision=model_args.model_revision, 79 | ) 80 | else: 81 | model_class = AUTO_MODELS[task_type] 82 | model = model_class.from_pretrained( 83 | model_args.model_name_or_path, 84 | config=config, 85 | revision=model_args.model_revision, 86 | ) 87 | 88 | bert_param = 0 89 | if fix_bert: 90 | if config.model_type == "bert": 91 | for param in model.bert.parameters(): 92 | param.requires_grad = False 93 | for _, param in model.bert.named_parameters(): 94 | bert_param += param.numel() 95 | elif config.model_type == "roberta": 96 | for param in model.roberta.parameters(): 97 | param.requires_grad = False 98 | for _, param in model.roberta.named_parameters(): 99 | bert_param += param.numel() 100 | elif config.model_type == "deberta": 101 | for param in model.deberta.parameters(): 102 | param.requires_grad = False 103 | for _, param in model.deberta.named_parameters(): 104 | bert_param += param.numel() 105 | all_param = 0 106 | for _, param in model.named_parameters(): 107 | all_param += param.numel() 108 | total_param = all_param - bert_param 109 | print('***** total param is {} *****'.format(total_param)) 110 | return model 111 | 112 | 113 | -------------------------------------------------------------------------------- /requirements.txt:
-------------------------------------------------------------------------------- 1 | adapter-transformers==2.2.0 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | albumentations==0.4.3 5 | async-timeout==4.0.1 6 | asynctest==0.13.0 7 | attrs==21.2.0 8 | audioread==2.1.9 9 | backports.shutil-get-terminal-size==1.0.0 10 | blessings==1.7 11 | certifi==2021.10.8 12 | cffi==1.15.0 13 | charset-normalizer==2.0.9 14 | click==8.0.3 15 | configparser==5.2.0 16 | ConfigSpace==0.4.20 17 | cycler==0.11.0 18 | Cython==0.29.26 19 | datasets==1.15.1 20 | decorator==5.1.0 21 | dill==0.3.4 22 | docker-pycreds==0.4.0 23 | docopt==0.6.2 24 | filelock==3.4.0 25 | fonttools==4.28.5 26 | frozenlist==1.2.0 27 | fsspec==2021.11.1 28 | gitdb==4.0.9 29 | GitPython==3.1.24 30 | googledrivedownloader==0.4 31 | gpustat==0.6.0 32 | hpbandster==0.7.4 33 | huggingface-hub==0.2.1 34 | idna==3.3 35 | imageio==2.13.4 36 | imgaug==0.2.6 37 | importlib-metadata==4.8.2 38 | joblib==1.1.0 39 | kiwisolver==1.3.2 40 | librosa==0.7.2 41 | llvmlite==0.37.0 42 | matplotlib==3.5.1 43 | multidict==5.2.0 44 | multiprocess==0.70.12.2 45 | netifaces==0.11.0 46 | networkx==2.6.3 47 | numba==0.49.1 48 | numpy==1.21.5 49 | nvidia-ml-py3==7.352.0 50 | opencv-python==4.5.4.60 51 | packaging==21.3 52 | pandas==1.3.4 53 | pathtools==0.1.2 54 | patsy==0.5.2 55 | Pillow==8.4.0 56 | pipreqs==0.4.11 57 | promise==2.3 58 | protobuf==3.19.1 59 | psutil==5.8.0 60 | pyarrow==6.0.1 61 | pycparser==2.21 62 | pyparsing==3.0.6 63 | Pyro4==4.81 64 | python-dateutil==2.8.2 65 | pytz==2021.3 66 | PyWavelets==1.2.0 67 | PyYAML==6.0 68 | regex==2021.11.10 69 | requests==2.26.0 70 | resampy==0.2.2 71 | sacremoses==0.0.46 72 | scikit-image==0.19.1 73 | scikit-learn==1.0.1 74 | scipy==1.7.3 75 | sentry-sdk==1.5.0 76 | seqeval==1.2.2 77 | serpent==1.40 78 | shortuuid==1.0.8 79 | six==1.16.0 80 | smmap==5.0.0 81 | SoundFile==0.10.3.post1 82 | statsmodels==0.13.1 83 | subprocess32==3.5.4 84 | tabulate==0.8.6 85 | tensorboardX==2.0 86 | termcolor==1.1.0 87 | threadpoolctl==3.0.0 88 | tifffile==2021.11.2 89 | tokenizers==0.10.3 90 | torch==1.8.1 91 | torchvision==0.9.1 92 | tqdm==4.62.3 93 | transformers==4.5.0 94 | typing_extensions==4.0.1 95 | urllib3==1.26.7 96 | wandb==0.12.7 97 | xxhash==2.0.2 98 | yarg==0.1.9 99 | yarl==1.7.2 100 | yaspin==2.1.0 101 | zipp==3.6.0 102 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import numpy as np 5 | from typing import Dict 6 | 7 | import datasets 8 | import transformers 9 | from transformers import set_seed, Trainer 10 | from transformers.trainer_utils import get_last_checkpoint 11 | from transformers import EarlyStoppingCallback 12 | 13 | from arguments import get_args 14 | 15 | from tasks.utils import * 16 | import wandb 17 | os.environ["WANDB_DISABLED"] = "true" 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | def train(trainer, resume_from_checkpoint=None, last_checkpoint=None): 22 | checkpoint = None 23 | if resume_from_checkpoint is not None: 24 | checkpoint = resume_from_checkpoint 25 | elif last_checkpoint is not None: 26 | checkpoint = last_checkpoint 27 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 28 | # trainer.save_model() 29 | 30 | # metrics = train_result.metrics 31 | trainer.save_model() 32 | # trainer.log_metrics("train", metrics) 33 | # trainer.save_metrics("train", metrics) 34 | # trainer.save_state() 35 | 36 | 
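# Note: the launch scripts under scripts/ pass --load_best_model_at_end together with
# --metric_for_best_model loss and --greater_is_better False, so trainer.train() has
# already restored the checkpoint with the lowest eval loss by the time save_model()
# above runs; output_dir therefore ends up holding the best model, not the last epoch.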
# trainer.log_best_metrics() 37 | 38 | def evaluate(trainer): 39 | logger.info("*** Evaluate ***") 40 | metrics = trainer.evaluate() 41 | 42 | trainer.log_metrics("eval", metrics) 43 | trainer.save_metrics("eval", metrics) 44 | 45 | def predict(trainer, predict_dataset=None): 46 | logger.info("*** Predict ***") 47 | predictions = trainer.predict(predict_dataset, metric_key_prefix="predict") 48 | 49 | trainer.log_metrics("predict", predictions.metrics) 50 | trainer.save_metrics("predict", predictions.metrics) 51 | 52 | if __name__ == '__main__': 53 | 54 | args = get_args() 55 | 56 | _, data_args, training_args, _, adapter_args = args 57 | 58 | logging.basicConfig( 59 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 60 | datefmt="%m/%d/%Y %H:%M:%S", 61 | handlers=[logging.StreamHandler(sys.stdout)], 62 | ) 63 | # wandb.init(project=data_args.task_name, name=training_args.run_name) 64 | log_level = training_args.get_process_log_level() 65 | logger.setLevel(log_level) 66 | datasets.utils.logging.set_verbosity(log_level) 67 | transformers.utils.logging.set_verbosity(log_level) 68 | transformers.utils.logging.enable_default_handler() 69 | transformers.utils.logging.enable_explicit_format() 70 | 71 | # Log on each process the small summary: 72 | logger.warning( 73 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 74 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 75 | ) 76 | logger.info(f"Training/evaluation parameters {training_args}") 77 | 78 | # print("***************************", training_args.local_rank) 79 | if not os.path.isdir("checkpoints"): 80 | os.mkdir("checkpoints") 81 | 82 | if data_args.task_name.lower() == "superglue": 83 | assert data_args.dataset_name.lower() in SUPERGLUE_DATASETS 84 | from tasks.superglue.get_trainer import get_trainer 85 | 86 | elif data_args.task_name.lower() == "glue": 87 | assert data_args.dataset_name.lower() in GLUE_DATASETS 88 | from tasks.glue.get_trainer import get_trainer 89 | 90 | else: 91 | raise NotImplementedError('Task {} is not implemented. Please choose a task from: {}'.format(data_args.task_name, ", ".join(TASKS))) 92 | 93 | set_seed(training_args.seed) 94 | 95 | trainer, predict_dataset = get_trainer(args) 96 | 97 | last_checkpoint = None 98 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 99 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 100 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 101 | raise ValueError( 102 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 103 | "Use --overwrite_output_dir to overcome." 104 | ) 105 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 106 | logger.info( 107 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 108 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
109 | ) 110 | 111 | 112 | if training_args.do_train: 113 | train(trainer, training_args.resume_from_checkpoint, last_checkpoint) 114 | 115 | # if training_args.do_eval: 116 | # evaluate(trainer) 117 | 118 | if training_args.do_predict: 119 | predict(trainer, predict_dataset) 120 | 121 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --logging_steps 5 \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --train_adapter\ 34 | --adapter_config pfeiffer \ 35 | --adapter_reduction_factor 64 36 | done 37 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 
| --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_mnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mnli 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 
101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_qnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qnli 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- 
/scripts/mulitruns_scripts/adapter/run_qqp_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qqp 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | 
--load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | 
--num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --logging_steps 5 \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --bitfit 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | 
--do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | 
--task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 9 | do 10 | python3 run.py \ 11 | --model_name_or_path roberta-base \ 12 | --task_name $TASK_NAME \ 13 | --dataset_name $DATASET_NAME \ 14 | --do_train \ 15 | --do_eval \ 16 | --do_predict \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size $bs \ 19 | --learning_rate $lr \ 20 | --num_train_epochs $epoch \ 21 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 22 | --overwrite_output_dir \ 23 | --hidden_dropout_prob $dropout \ 24 | --seed $model_seed \ 25 | --model_seed $model_seed\ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --bitfit 32 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | 
--model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=32 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 
777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=5e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --logging_steps 5 \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --warmup_ratio $wr 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=32 5 | lr=5e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | 
epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_mnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mnli 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=16 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_multirc_roberta_both.sh:
-------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=32 5 | lr=5e-6 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_qnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qnli 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_qqp_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qqp 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 
-------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed\ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --warmup_ratio $wr 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | 
--load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir test/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 
27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 
19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for 
model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed\ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- 
/scripts/mulitruns_scripts/lora/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=16 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | 
--evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --logging_steps 5 \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --prefix \ 34 | --pre_seq_len 8 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=16 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 8 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=16 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | 
--overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_mnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mnli 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=16 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 16 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate 
$lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 8 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_qnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qnli 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_qqp_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qqp 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 
17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed\ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path 
roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 8 34 | done -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_adapter.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for rf in 64 16 2 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchadapter/$DATASET_NAME-$bs-$lr-$rf/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --train_adapter \ 32 | --adapter_config pfeiffer \ 33 | --adapter_reduction_factor $rf 34 | done 35 | done 36 | done 37 | 38 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta adapter -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_bitfit.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | 
python3 run.py \ 10 | --model_name_or_path roberta-base \ 11 | --task_name $TASK_NAME \ 12 | --dataset_name $DATASET_NAME \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --max_seq_length 128 \ 17 | --per_device_train_batch_size $bs \ 18 | --learning_rate $lr \ 19 | --num_train_epochs $epoch \ 20 | --output_dir checkpoints/$DATASET_NAME-roberta-searchbitfit/$DATASET_NAME-$bs-$lr/ \ 21 | --overwrite_output_dir \ 22 | --hidden_dropout_prob 0.1 \ 23 | --seed 1111 \ 24 | --save_strategy epoch \ 25 | --evaluation_strategy epoch \ 26 | --load_best_model_at_end\ 27 | --metric_for_best_model loss\ 28 | --greater_is_better False \ 29 | --bitfit 30 | done 31 | done 32 | 33 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta bitfit -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_ft.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for wr in 0.0 0.06 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchft/$DATASET_NAME-$bs-$lr-$wr/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --warmup_ratio $wr 32 | done 33 | done 34 | done 35 | 36 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta ft -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_lora.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for lora_alpha in 8 16 10 | do 11 | for lora_r in 8 16 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir checkpoints/$DATASET_NAME-roberta-searchlora/$DATASET_NAME-$bs-$lr-$lora_alpha/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob 0.1 \ 27 | --seed 1111 \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --lora \ 34 | --lora_alpha $lora_alpha \ 35 | --lora_r $lora_r 36 | done 37 | done 38 | done 39 | done 40 | 41 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta lora -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_pt.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4
5e-4 1e-3 5e-3 1e-2 5 | do 6 | for psl in 8 16 32 64 7 | do 8 | for bs in 16 32 9 | do 10 | python3 run.py \ 11 | --model_name_or_path roberta-base \ 12 | --task_name $TASK_NAME \ 13 | --dataset_name $DATASET_NAME \ 14 | --do_train \ 15 | --do_eval \ 16 | --do_predict \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size $bs \ 19 | --learning_rate $lr \ 20 | --num_train_epochs $epoch \ 21 | --pre_seq_len $psl \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchpt/$DATASET_NAME-$bs-$lr-$psl/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --prefix 32 | done 33 | done 34 | done 35 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta pt -------------------------------------------------------------------------------- /scripts/search_scipts/search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | from glob import glob 6 | 7 | from tasks.utils import * 8 | 9 | TASK = sys.argv[1] 10 | MODEL = sys.argv[2] 11 | METHOD = sys.argv[3] 12 | 13 | 14 | SPECIAL_METRICS = { 15 | 'cb' : 'f1', 16 | 'mrpc' : 'f1', 17 | 'cola' : 'matthews_correlation', 18 | 'stsb' : 'combined_score' 19 | } 20 | 21 | METRIC = "accuracy" 22 | if TASK in SPECIAL_METRICS: 23 | METRIC = SPECIAL_METRICS[TASK] 24 | 25 | best_score, best_metrics, best_file_name = 0, None, None  # initialize all three so the prints below cannot raise NameError when no results are found 26 | 27 | files = glob(f"./checkpoints/{TASK}-{MODEL}-search{METHOD}/*/predict_results.json") 28 | 29 | for f in files: 30 | metrics = json.load(open(f, 'r')) 31 | if metrics["predict_"+METRIC] > best_score: 32 | best_score = metrics["predict_"+METRIC] 33 | best_metrics = metrics 34 | best_file_name = f 35 | 36 | print(f"best_{METRIC}: {best_score}") 37 | print(f"best_metrics: {best_metrics}") 38 | print(f"best_file: {best_file_name}") 39 | -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_adapter.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for rf in 64 16 2 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchadapter/$DATASET_NAME-$bs-$lr-$rf/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --train_adapter \ 32 | --adapter_config pfeiffer \ 33 | --adapter_reduction_factor $rf 34 | done 35 | done 36 | done 37 | 38 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta adapter 39 | -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_bitfit.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 |
for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | python3 run.py \ 10 | --model_name_or_path roberta-base \ 11 | --task_name $TASK_NAME \ 12 | --dataset_name $DATASET_NAME \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --max_seq_length 128 \ 17 | --per_device_train_batch_size $bs \ 18 | --learning_rate $lr \ 19 | --num_train_epochs $epoch \ 20 | --output_dir checkpoints/$DATASET_NAME-roberta-searchbitfit/$DATASET_NAME-$bs-$lr/ \ 21 | --overwrite_output_dir \ 22 | --hidden_dropout_prob 0.1 \ 23 | --seed 1111 \ 24 | --save_strategy epoch \ 25 | --evaluation_strategy epoch \ 26 | --load_best_model_at_end\ 27 | --metric_for_best_model loss\ 28 | --greater_is_better False \ 29 | --bitfit 30 | done 31 | done 32 | 33 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta bitfit -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_ft.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for wr in 0.0 0.06 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchft/$DATASET_NAME-$bs-$lr-$wr/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --warmup_ratio $wr 32 | done 33 | done 34 | done 35 | 36 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta ft -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_lora.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for lora_alpha in 8 16 10 | do 11 | for lora_r in 8 16 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir checkpoints/$DATASET_NAME-roberta-searchlora/$DATASET_NAME-$bs-$lr-$lora_alpha/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob 0.1 \ 27 | --seed 1111 \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --lora \ 34 | --lora_alpha $lora_alpha \ 35 | --lora_r $lora_r 36 | done 37 | done 38 | done 39 | done 40 | 41 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta lora -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_pt.sh: -------------------------------------------------------------------------------- 1 
| export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for psl in 8 16 32 64 8 | do 9 | for bs in 16 32 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --pre_seq_len $psl \ 23 | --output_dir checkpoints/$DATASET_NAME-roberta-searchpt/$DATASET_NAME-$bs-$lr-$psl/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob 0.1 \ 26 | --seed 1111 \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix 33 | done 34 | done 35 | done 36 | 37 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta pt -------------------------------------------------------------------------------- /tasks/glue/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | from torch.utils.data import Dataset 4 | from datasets.arrow_dataset import Dataset as HFDataset 5 | from datasets.load import load_dataset, load_metric 6 | from transformers import ( 7 | AutoTokenizer, 8 | DataCollatorWithPadding, 9 | EvalPrediction, 10 | default_data_collator, 11 | ) 12 | import numpy as np 13 | import logging 14 | 15 | task_to_keys = { 16 | "cola": ("sentence", None), 17 | "mnli": ("premise", "hypothesis"), 18 | "mrpc": ("sentence1", "sentence2"), 19 | "qnli": ("question", "sentence"), 20 | "qqp": ("question1", "question2"), 21 | "rte": ("sentence1", "sentence2"), 22 | "sst2": ("sentence", None), 23 | "stsb": ("sentence1", "sentence2"), 24 | "wnli": ("sentence1", "sentence2"), 25 | } 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class GlueDataset(): 31 | def __init__(self, tokenizer: AutoTokenizer, data_args, training_args) -> None: 32 | super().__init__() 33 | raw_datasets = load_dataset("glue", data_args.dataset_name) 34 | self.tokenizer = tokenizer 35 | self.data_args = data_args 36 | # labels 37 | self.is_regression = data_args.dataset_name == "stsb" 38 | if not self.is_regression: 39 | self.label_list = raw_datasets["train"].features["label"].names 40 | self.num_labels = len(self.label_list) 41 | else: 42 | self.num_labels = 1 43 | 44 | # Preprocessing the raw_datasets 45 | self.sentence1_key, self.sentence2_key = task_to_keys[data_args.dataset_name] 46 | 47 | # Padding strategy 48 | if data_args.pad_to_max_length: 49 | self.padding = "max_length" 50 | else: 51 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 52 | self.padding = False 53 | 54 | # Some models have set the order of the labels to use, so let's make sure we do use it. 55 | if not self.is_regression: 56 | self.label2id = {l: i for i, l in enumerate(self.label_list)} 57 | self.id2label = {id: label for label, id in self.label2id.items()} 58 | 59 | if data_args.max_seq_length > tokenizer.model_max_length: 60 | logger.warning( 61 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 62 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
63 | ) 64 | self.max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 65 | 66 | raw_datasets = raw_datasets.map( 67 | self.preprocess_function, 68 | batched=True, 69 | load_from_cache_file=not data_args.overwrite_cache, 70 | desc="Running tokenizer on dataset", 71 | ) 72 | if training_args.do_train: 73 | train_dataset = raw_datasets["train"].train_test_split(test_size=0.1, shuffle=False) 74 | self.train_dataset, self.eval_dataset = train_dataset['train'], train_dataset['test'] 75 | if data_args.max_train_samples is not None: 76 | self.train_dataset = self.train_dataset.select(range(data_args.max_train_samples)) 77 | 78 | if training_args.do_predict: 79 | self.predict_dataset = raw_datasets["validation_matched" if data_args.dataset_name == "mnli" else "validation"] 80 | if data_args.max_predict_samples is not None: 81 | self.predict_dataset = self.predict_dataset.select(range(data_args.max_predict_samples)) 82 | 83 | # if training_args.do_train: 84 | # self.train_dataset = raw_datasets["train"] 85 | # if data_args.max_train_samples is not None: 86 | # self.train_dataset = self.train_dataset.select(range(data_args.max_train_samples)) 87 | 88 | # if training_args.do_eval: 89 | # self.eval_dataset = raw_datasets["validation_matched" if data_args.dataset_name == "mnli" else "validation"] 90 | # if data_args.max_eval_samples is not None: 91 | # self.eval_dataset = self.eval_dataset.select(range(data_args.max_eval_samples)) 92 | 93 | # if training_args.do_predict or data_args.dataset_name is not None or data_args.test_file is not None: 94 | # self.predict_dataset = raw_datasets["test_matched" if data_args.dataset_name == "mnli" else "test"] 95 | # if data_args.max_predict_samples is not None: 96 | # self.predict_dataset = self.predict_dataset.select(range(data_args.max_predict_samples)) 97 | 98 | self.metric = load_metric("tasks/glue/glue.py", data_args.dataset_name) 99 | 100 | if data_args.pad_to_max_length: 101 | self.data_collator = default_data_collator 102 | elif training_args.fp16: 103 | self.data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 104 | else: 105 | self.data_collator = None  # Trainer falls back to its default collator; without this branch the attribute was never set 106 | def preprocess_function(self, examples): 107 | # Tokenize the texts 108 | args = ( 109 | (examples[self.sentence1_key],) if self.sentence2_key is None else (examples[self.sentence1_key], examples[self.sentence2_key]) 110 | ) 111 | result = self.tokenizer(*args, padding=self.padding, max_length=self.max_seq_length, truncation=True) 112 | 113 | return result 114 | 115 | def compute_metrics(self, p: EvalPrediction): 116 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 117 | preds = np.squeeze(preds) if self.is_regression else np.argmax(preds, axis=1) 118 | if self.data_args.dataset_name is not None: 119 | result = self.metric.compute(predictions=preds, references=p.label_ids) 120 | if len(result) > 1: 121 | result["combined_score"] = np.mean(list(result.values())).item() 122 | return result 123 | elif self.is_regression: 124 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 125 | else: 126 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 127 | 128 | 129 | -------------------------------------------------------------------------------- /tasks/glue/get_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import sys 5 | 6 | from transformers import ( 7 | AutoConfig, 8 | AutoTokenizer, 9 | AdapterConfig 10 | ) 11 |
12 | from model.utils import get_model, TaskType 13 | from tasks.glue.dataset import GlueDataset 14 | from training.trainer_base import BaseTrainer 15 | from transformers import Trainer, AdapterTrainer, EarlyStoppingCallback 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | def get_trainer(args): 20 | model_args, data_args, training_args, _, adapter_args = args 21 | 22 | tokenizer = AutoTokenizer.from_pretrained( 23 | model_args.model_name_or_path, 24 | use_fast=model_args.use_fast_tokenizer, 25 | revision=model_args.model_revision, 26 | ) 27 | dataset = GlueDataset(tokenizer, data_args, training_args) 28 | 29 | if not dataset.is_regression: 30 | config = AutoConfig.from_pretrained( 31 | model_args.model_name_or_path, 32 | num_labels=dataset.num_labels, 33 | label2id=dataset.label2id, 34 | id2label=dataset.id2label, 35 | finetuning_task=data_args.dataset_name, 36 | revision=model_args.model_revision, 37 | ) 38 | else: 39 | config = AutoConfig.from_pretrained( 40 | model_args.model_name_or_path, 41 | num_labels=dataset.num_labels, 42 | finetuning_task=data_args.dataset_name, 43 | revision=model_args.model_revision, 44 | ) 45 | config.lora = False 46 | model = get_model(model_args, TaskType.SEQUENCE_CLASSIFICATION, config) 47 | 48 | if adapter_args.train_adapter: 49 | logger.info(f"Reduction Factor: {adapter_args.adapter_reduction_factor}") 50 | task_name = data_args.task_name or "glue" # fallback adapter name for GLUE runs 51 | # check if adapter already exists, otherwise add it 52 | if task_name not in model.config.adapters: 53 | # resolve the adapter config 54 | adapter_config = AdapterConfig.load( 55 | adapter_args.adapter_config, 56 | non_linearity=adapter_args.adapter_non_linearity, 57 | reduction_factor=adapter_args.adapter_reduction_factor, 58 | ) 59 | # load a pre-trained adapter from the Hub if specified 60 | # if adapter_args.load_adapter: 61 | # model.load_adapter( 62 | # adapter_args.load_adapter, 63 | # config=adapter_config, 64 | # load_as=task_name, 65 | # ) 66 | # # otherwise, add a fresh adapter 67 | # else: 68 | model.add_adapter(task_name, config=adapter_config) 69 | # Freeze all model weights except those of this adapter 70 | model.train_adapter([task_name]) 71 | # Set the adapters to be used in every forward pass 72 | model.set_active_adapters(task_name) 73 | else: 74 | if adapter_args.load_adapter: 75 | raise ValueError( 76 | "Adapters can only be loaded in adapter training mode. "
77 | "Use --train_adapter to enable adapter training" 78 | ) 79 | if model_args.bitfit: 80 | for name, param in model.named_parameters(): 81 | if name.startswith('roberta') and "bias" not in name.lower(): 82 | param.requires_grad = False 83 | param_optimizer = list(model.named_parameters()) 84 | logger.info("Trainable parameters:") 85 | for n, p in param_optimizer: 86 | if p.requires_grad: 87 | logger.info(f"{n}") 88 | # print(n) 89 | 90 | trainer_cls = AdapterTrainer if adapter_args.train_adapter else Trainer 91 | trainer = trainer_cls( 92 | model=model, 93 | args=training_args, 94 | train_dataset=dataset.train_dataset if training_args.do_train else None, 95 | eval_dataset=dataset.eval_dataset if training_args.do_eval else None, 96 | compute_metrics=dataset.compute_metrics, 97 | tokenizer=tokenizer, 98 | data_collator=dataset.data_collator, 99 | callbacks = [EarlyStoppingCallback(early_stopping_patience=10)] 100 | ) 101 | 102 | return trainer, dataset.predict_dataset -------------------------------------------------------------------------------- /tasks/glue/glue.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ GLUE benchmark metric. """ 16 | 17 | from scipy.stats import pearsonr, spearmanr 18 | from sklearn.metrics import f1_score, matthews_corrcoef 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{wang2019glue, 25 | title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, 26 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, 27 | note={In the Proceedings of ICLR.}, 28 | year={2019} 29 | } 30 | """ 31 | 32 | _DESCRIPTION = """\ 33 | GLUE, the General Language Understanding Evaluation benchmark 34 | (https://gluebenchmark.com/) is a collection of resources for training, 35 | evaluating, and analyzing natural language understanding systems. 36 | """ 37 | 38 | _KWARGS_DESCRIPTION = """ 39 | Compute GLUE evaluation metric associated to each GLUE dataset. 40 | Args: 41 | predictions: list of predictions to score. 42 | Each translation should be tokenized into a list of tokens. 43 | references: list of lists of references for each translation. 44 | Each reference should be tokenized into a list of tokens. 
45 | Returns: depending on the GLUE subset, one or several of: 46 | "accuracy": Accuracy 47 | "f1": F1 score 48 | "pearson": Pearson Correlation 49 | "spearmanr": Spearman Correlation 50 | "matthews_correlation": Matthew Correlation 51 | Examples: 52 | 53 | >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"] 54 | >>> references = [0, 1] 55 | >>> predictions = [0, 1] 56 | >>> results = glue_metric.compute(predictions=predictions, references=references) 57 | >>> print(results) 58 | {'accuracy': 1.0} 59 | 60 | >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp' 61 | >>> references = [0, 1] 62 | >>> predictions = [0, 1] 63 | >>> results = glue_metric.compute(predictions=predictions, references=references) 64 | >>> print(results) 65 | {'accuracy': 1.0, 'f1': 1.0} 66 | 67 | >>> glue_metric = datasets.load_metric('glue', 'stsb') 68 | >>> references = [0., 1., 2., 3., 4., 5.] 69 | >>> predictions = [0., 1., 2., 3., 4., 5.] 70 | >>> results = glue_metric.compute(predictions=predictions, references=references) 71 | >>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)}) 72 | {'pearson': 1.0, 'spearmanr': 1.0} 73 | 74 | >>> glue_metric = datasets.load_metric('glue', 'cola') 75 | >>> references = [0, 1] 76 | >>> predictions = [0, 1] 77 | >>> results = glue_metric.compute(predictions=predictions, references=references) 78 | >>> print(results) 79 | {'matthews_correlation': 1.0} 80 | """ 81 | 82 | 83 | def simple_accuracy(preds, labels): 84 | return float((preds == labels).mean()) 85 | 86 | 87 | def acc_and_f1(preds, labels): 88 | acc = simple_accuracy(preds, labels) 89 | f1 = float(f1_score(y_true=labels, y_pred=preds)) 90 | return { 91 | "accuracy": acc, 92 | "f1": f1, 93 | } 94 | 95 | 96 | def pearson_and_spearman(preds, labels): 97 | pearson_corr = float(pearsonr(preds, labels)[0]) 98 | spearman_corr = float(spearmanr(preds, labels)[0]) 99 | return { 100 | "pearson": pearson_corr, 101 | "spearmanr": spearman_corr, 102 | } 103 | 104 | 105 | @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 106 | class Glue(datasets.Metric): 107 | def _info(self): 108 | if self.config_name not in [ 109 | "sst2", 110 | "mnli", 111 | "mnli_mismatched", 112 | "mnli_matched", 113 | "cola", 114 | "stsb", 115 | "mrpc", 116 | "qqp", 117 | "qnli", 118 | "rte", 119 | "wnli", 120 | "hans", 121 | ]: 122 | raise KeyError( 123 | "You should supply a configuration name selected in " 124 | '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' 125 | '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' 126 | ) 127 | return datasets.MetricInfo( 128 | description=_DESCRIPTION, 129 | citation=_CITATION, 130 | inputs_description=_KWARGS_DESCRIPTION, 131 | features=datasets.Features( 132 | { 133 | "predictions": datasets.Value("int64" if self.config_name != "stsb" else "float32"), 134 | "references": datasets.Value("int64" if self.config_name != "stsb" else "float32"), 135 | } 136 | ), 137 | codebase_urls=[], 138 | reference_urls=[], 139 | format="numpy", 140 | ) 141 | 142 | def _compute(self, predictions, references): 143 | if self.config_name == "cola": 144 | return {"matthews_correlation": matthews_corrcoef(references, predictions)} 145 | elif self.config_name == "stsb": 146 | return pearson_and_spearman(predictions, references) 147 | elif self.config_name in ["mrpc", "qqp"]: 148 | return acc_and_f1(predictions, references) 149 | elif 
self.config_name in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]: 150 | return {"accuracy": simple_accuracy(predictions, references)} 151 | else: 152 | raise KeyError( 153 | "You should supply a configuration name selected in " 154 | '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' 155 | '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' 156 | ) -------------------------------------------------------------------------------- /tasks/superglue/get_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import sys 5 | import torch 6 | from transformers import ( 7 | AutoConfig, 8 | AutoTokenizer, 9 | AutoModelWithHeads, 10 | AdapterConfig 11 | ) 12 | 13 | from model.utils import get_model, TaskType 14 | 15 | from training.trainer_base import BaseTrainer, BaseAdapterTrainer 16 | from transformers import Trainer, AdapterTrainer, EarlyStoppingCallback, set_seed 17 | from tasks.superglue.dataset import SuperGlueDataset 18 | # from training.trainer import Trainer 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | def get_trainer(args): 23 | model_args, data_args, training_args, _, adapter_args = args 24 | print("set model randome seed ", model_args.model_seed) 25 | set_seed(model_args.model_seed) 26 | log_level = training_args.get_process_log_level() 27 | logger.setLevel(log_level) 28 | 29 | 30 | tokenizer = AutoTokenizer.from_pretrained( 31 | model_args.model_name_or_path, 32 | use_fast=model_args.use_fast_tokenizer, 33 | revision=model_args.model_revision, 34 | ) 35 | # if data_args.dataset_name == 'record': 36 | # dataset = SuperGlueDatasetForRecord(tokenizer, data_args, training_args) 37 | # else: 38 | dataset = SuperGlueDataset(tokenizer, data_args, training_args) 39 | 40 | # if training_args.do_train: 41 | # for index in random.sample(range(len(dataset.train_dataset)), 3): 42 | # logger.info(f"Sample {index} of the training set: {dataset.train_dataset[index]}.") 43 | if not dataset.multiple_choice: 44 | config = AutoConfig.from_pretrained( 45 | model_args.model_name_or_path, 46 | num_labels=dataset.num_labels, 47 | label2id=dataset.label2id, 48 | id2label=dataset.id2label, 49 | finetuning_task=data_args.dataset_name, 50 | revision=model_args.model_revision, 51 | ) 52 | else: 53 | config = AutoConfig.from_pretrained( 54 | model_args.model_name_or_path, 55 | num_labels=dataset.num_labels, 56 | finetuning_task=data_args.dataset_name, 57 | revision=model_args.model_revision, 58 | ) 59 | config.lora = False 60 | if not dataset.multiple_choice: 61 | model = get_model(model_args, TaskType.SEQUENCE_CLASSIFICATION, config) 62 | else: 63 | model = get_model(model_args, TaskType.MULTIPLE_CHOICE, config, fix_bert=True) 64 | 65 | 66 | if adapter_args.train_adapter: 67 | logger.info(f"Reduction Factor: {adapter_args.adapter_reduction_factor}") 68 | task_name = data_args.task_name or "superglue" 69 | # check if adapter already exists, otherwise add it 70 | if task_name not in model.config.adapters: 71 | # resolve the adapter config 72 | adapter_config = AdapterConfig.load( 73 | adapter_args.adapter_config, 74 | non_linearity=adapter_args.adapter_non_linearity, 75 | reduction_factor=adapter_args.adapter_reduction_factor, 76 | ) 77 | 78 | model.add_adapter(task_name, config=adapter_config) 79 | # Freeze all model weights except of those of this adapter 80 | model.train_adapter([task_name]) 81 | # Set the adapters to be used in every forward pass 82 | 
model.set_active_adapters(task_name) 83 | else: 84 | if adapter_args.load_adapter: 85 | raise ValueError( 86 | "Adapters can only be loaded in adapter training mode. " 87 | "Use --train_adapter to enable adapter training" 88 | ) 89 | if model_args.bitfit: 90 | for name, param in model.named_parameters(): 91 | if name.startswith('roberta') and "bias" not in name.lower(): 92 | param.requires_grad = False 93 | param_optimizer = list(model.named_parameters()) 94 | logger.info("Trainable parameters:") 95 | trained_param = 0 96 | 97 | for n, p in param_optimizer: 98 | if p.requires_grad: 99 | trained_param += p.numel() 100 | logger.info(f"{n}") 101 | logger.info(f"Total trainable parameters: {trained_param}") 102 | set_seed(training_args.seed) 103 | print("Setting data random seed:", training_args.seed) 104 | trainer_cls = AdapterTrainer if adapter_args.train_adapter else Trainer 105 | trainer = trainer_cls( 106 | model=model, 107 | args=training_args, 108 | train_dataset=dataset.train_dataset if training_args.do_train else None, 109 | eval_dataset=dataset.eval_dataset if training_args.do_eval else None, 110 | compute_metrics=dataset.compute_metrics, 111 | tokenizer=tokenizer, 112 | data_collator=dataset.data_collator, 113 | callbacks=[EarlyStoppingCallback(early_stopping_patience=model_args.patient)] 114 | ) 115 | 116 | 117 | return trainer, dataset.predict_dataset 118 | -------------------------------------------------------------------------------- /tasks/superglue/record_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Official evaluation script for ReCoRD v1.0. 3 | (Some functions are adopted from the SQuAD evaluation script.) 4 | """ 5 | 6 | 7 | import argparse 8 | import json 9 | import re 10 | import string 11 | import sys 12 | from collections import Counter 13 | 14 | 15 | def normalize_answer(s): 16 | """Lower text and remove punctuation, articles and extra whitespace.""" 17 | 18 | def remove_articles(text): 19 | return re.sub(r"\b(a|an|the)\b", " ", text) 20 | 21 | def white_space_fix(text): 22 | return " ".join(text.split()) 23 | 24 | def remove_punc(text): 25 | exclude = set(string.punctuation) 26 | return "".join(ch for ch in text if ch not in exclude) 27 | 28 | def lower(text): 29 | return text.lower() 30 | 31 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 32 | 33 | 34 | def f1_score(prediction, ground_truth): 35 | prediction_tokens = normalize_answer(prediction).split() 36 | ground_truth_tokens = normalize_answer(ground_truth).split() 37 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 38 | num_same = sum(common.values()) 39 | if num_same == 0: 40 | return 0 41 | precision = 1.0 * num_same / len(prediction_tokens) 42 | recall = 1.0 * num_same / len(ground_truth_tokens) 43 | f1 = (2 * precision * recall) / (precision + recall) 44 | return f1 45 | 46 | 47 | def exact_match_score(prediction, ground_truth): 48 | return normalize_answer(prediction) == normalize_answer(ground_truth) 49 | 50 | 51 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 52 | scores_for_ground_truths = [] 53 | for ground_truth in ground_truths: 54 | score = metric_fn(prediction, ground_truth) 55 | scores_for_ground_truths.append(score) 56 | return max(scores_for_ground_truths) 57 | 58 | 59 | def evaluate(dataset, predictions): 60 | f1 = exact_match = total = 0 61 | correct_ids = [] 62 | for passage in dataset: 63 | for qa in passage["qas"]: 64 | total += 1 65 | if qa["id"] not in predictions: 66 | message = "Unanswered question {} will receive score 
0.".format(qa["id"]) 67 | print(message, file=sys.stderr) 68 | continue 69 | 70 | ground_truths = list(map(lambda x: x["text"], qa["answers"])) 71 | prediction = predictions[qa["id"]] 72 | 73 | _exact_match = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths) 74 | if int(_exact_match) == 1: 75 | correct_ids.append(qa["id"]) 76 | exact_match += _exact_match 77 | 78 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) 79 | 80 | exact_match = exact_match / total 81 | f1 = f1 / total 82 | 83 | return {"exact_match": exact_match, "f1": f1}, correct_ids 84 | 85 | 86 | if __name__ == "__main__": 87 | expected_version = "1.0" 88 | parser = argparse.ArgumentParser("Official evaluation script for ReCoRD v1.0.") 89 | parser.add_argument("data_file", help="The dataset file in JSON format.") 90 | parser.add_argument("pred_file", help="The model prediction file in JSON format.") 91 | parser.add_argument("--output_correct_ids", action="store_true", help="Output the correctly answered query IDs.") 92 | args = parser.parse_args() 93 | 94 | with open(args.data_file) as data_file: 95 | dataset_json = json.load(data_file) 96 | if dataset_json["version"] != expected_version: 97 | print( 98 | "Evaluation expects v-{}, but got dataset with v-{}".format(expected_version, dataset_json["version"]), 99 | file=sys.stderr, 100 | ) 101 | dataset = dataset_json["data"] 102 | 103 | with open(args.pred_file) as pred_file: 104 | predictions = json.load(pred_file) 105 | 106 | metrics, correct_ids = evaluate(dataset, predictions) 107 | 108 | if args.output_correct_ids: 109 | print("Output {} correctly answered question IDs.".format(len(correct_ids))) 110 | with open("correct_ids.json", "w") as f: 111 | json.dump(correct_ids, f) -------------------------------------------------------------------------------- /tasks/superglue/super_glue_metric.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """The SuperGLUE benchmark metric.""" 16 | 17 | from sklearn.metrics import f1_score, matthews_corrcoef 18 | 19 | import datasets 20 | 21 | from .record_evaluation import evaluate as evaluate_record 22 | 23 | 24 | _CITATION = """\ 25 | @article{wang2019superglue, 26 | title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems}, 27 | author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R}, 28 | journal={arXiv preprint arXiv:1905.00537}, 29 | year={2019} 30 | } 31 | """ 32 | 33 | _DESCRIPTION = """\ 34 | SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after 35 | GLUE with a new set of more difficult language understanding tasks, improved 36 | resources, and a new public leaderboard. 
37 | """ 38 | 39 | _KWARGS_DESCRIPTION = """ 40 | Compute SuperGLUE evaluation metric associated to each SuperGLUE dataset. 41 | Args: 42 | predictions: list of predictions to score. Depending on the SuperGlUE subset: 43 | - for 'record': list of question-answer dictionaries with the following keys: 44 | - 'idx': index of the question as specified by the dataset 45 | - 'prediction_text': the predicted answer text 46 | - for 'multirc': list of question-answer dictionaries with the following keys: 47 | - 'idx': index of the question-answer pair as specified by the dataset 48 | - 'prediction': the predicted answer label 49 | - otherwise: list of predicted labels 50 | references: list of reference labels. Depending on the SuperGLUE subset: 51 | - for 'record': list of question-answers dictionaries with the following keys: 52 | - 'idx': index of the question as specified by the dataset 53 | - 'answers': list of possible answers 54 | - otherwise: list of reference labels 55 | Returns: depending on the SuperGLUE subset: 56 | - for 'record': 57 | - 'exact_match': Exact match between answer and gold answer 58 | - 'f1': F1 score 59 | - for 'multirc': 60 | - 'exact_match': Exact match between answer and gold answer 61 | - 'f1_m': Per-question macro-F1 score 62 | - 'f1_a': Average F1 score over all answers 63 | - for 'axb': 64 | 'matthews_correlation': Matthew Correlation 65 | - for 'cb': 66 | - 'accuracy': Accuracy 67 | - 'f1': F1 score 68 | - for all others: 69 | - 'accuracy': Accuracy 70 | Examples: 71 | 72 | >>> super_glue_metric = datasets.load_metric('super_glue', 'copa') # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"] 73 | >>> predictions = [0, 1] 74 | >>> references = [0, 1] 75 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 76 | >>> print(results) 77 | {'accuracy': 1.0} 78 | 79 | >>> super_glue_metric = datasets.load_metric('super_glue', 'cb') 80 | >>> predictions = [0, 1] 81 | >>> references = [0, 1] 82 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 83 | >>> print(results) 84 | {'accuracy': 1.0, 'f1': 1.0} 85 | 86 | >>> super_glue_metric = datasets.load_metric('super_glue', 'record') 87 | >>> predictions = [{'idx': {'passage': 0, 'query': 0}, 'prediction_text': 'answer'}] 88 | >>> references = [{'idx': {'passage': 0, 'query': 0}, 'answers': ['answer', 'another_answer']}] 89 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 90 | >>> print(results) 91 | {'exact_match': 1.0, 'f1': 1.0} 92 | 93 | >>> super_glue_metric = datasets.load_metric('super_glue', 'multirc') 94 | >>> predictions = [{'idx': {'answer': 0, 'paragraph': 0, 'question': 0}, 'prediction': 0}, {'idx': {'answer': 1, 'paragraph': 2, 'question': 3}, 'prediction': 1}] 95 | >>> references = [0, 1] 96 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 97 | >>> print(results) 98 | {'exact_match': 1.0, 'f1_m': 1.0, 'f1_a': 1.0} 99 | 100 | >>> super_glue_metric = datasets.load_metric('super_glue', 'axb') 101 | >>> references = [0, 1] 102 | >>> predictions = [0, 1] 103 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 104 | >>> print(results) 105 | {'matthews_correlation': 1.0} 106 | """ 107 | 108 | 109 | def simple_accuracy(preds, labels): 110 | return float((preds == labels).mean()) 111 | 112 | 113 | def acc_and_f1(preds, labels, f1_avg="binary"): 114 | acc = simple_accuracy(preds, labels) 115 | f1 = 
float(f1_score(y_true=labels, y_pred=preds, average=f1_avg)) 116 | return { 117 | "accuracy": acc, 118 | "f1": f1, 119 | } 120 | 121 | 122 | def evaluate_multirc(ids_preds, labels): 123 | """ 124 | Computes F1 score and Exact Match for MultiRC predictions. 125 | """ 126 | question_map = {} 127 | for id_pred, label in zip(ids_preds, labels): 128 | question_id = "{}-{}".format(id_pred["idx"]["paragraph"], id_pred["idx"]["question"]) 129 | pred = id_pred["prediction"] 130 | if question_id in question_map: 131 | question_map[question_id].append((pred, label)) 132 | else: 133 | question_map[question_id] = [(pred, label)] 134 | f1s, ems = [], [] 135 | for question, preds_labels in question_map.items(): 136 | question_preds, question_labels = zip(*preds_labels) 137 | f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro") 138 | f1s.append(f1) 139 | em = int(sum([p == l for p, l in preds_labels]) == len(preds_labels)) 140 | ems.append(em) 141 | f1_m = float((sum(f1s) / len(f1s))) 142 | em = sum(ems) / len(ems) 143 | f1_a = float(f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds])) 144 | return {"exact_match": em, "f1_m": f1_m, "f1_a": f1_a} 145 | 146 | 147 | @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 148 | class SuperGlue(datasets.Metric): 149 | def _info(self): 150 | if self.config_name not in [ 151 | "boolq", 152 | "cb", 153 | "copa", 154 | "multirc", 155 | "record", 156 | "rte", 157 | "wic", 158 | "wsc", 159 | "wsc.fixed", 160 | "axb", 161 | "axg", 162 | ]: 163 | raise KeyError( 164 | "You should supply a configuration name selected in " 165 | '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]' 166 | ) 167 | return datasets.MetricInfo( 168 | description=_DESCRIPTION, 169 | citation=_CITATION, 170 | inputs_description=_KWARGS_DESCRIPTION, 171 | features=datasets.Features(self._get_feature_types()), 172 | codebase_urls=[], 173 | reference_urls=[], 174 | format="numpy" if not self.config_name == "record" and not self.config_name == "multirc" else None, 175 | ) 176 | 177 | def _get_feature_types(self): 178 | if self.config_name == "record": 179 | return { 180 | "predictions": { 181 | "idx": { 182 | "passage": datasets.Value("int64"), 183 | "query": datasets.Value("int64"), 184 | }, 185 | "prediction_text": datasets.Value("string"), 186 | }, 187 | "references": { 188 | "idx": { 189 | "passage": datasets.Value("int64"), 190 | "query": datasets.Value("int64"), 191 | }, 192 | "answers": datasets.Sequence(datasets.Value("string")), 193 | }, 194 | } 195 | elif self.config_name == "multirc": 196 | return { 197 | "predictions": { 198 | "idx": { 199 | "answer": datasets.Value("int64"), 200 | "paragraph": datasets.Value("int64"), 201 | "question": datasets.Value("int64"), 202 | }, 203 | "prediction": datasets.Value("int64"), 204 | }, 205 | "references": datasets.Value("int64"), 206 | } 207 | else: 208 | return { 209 | "predictions": datasets.Value("int64"), 210 | "references": datasets.Value("int64"), 211 | } 212 | 213 | def _compute(self, predictions, references): 214 | if self.config_name == "axb": 215 | return {"matthews_correlation": matthews_corrcoef(references, predictions)} 216 | elif self.config_name == "cb": 217 | return acc_and_f1(predictions, references, f1_avg="macro") 218 | elif self.config_name == "record": 219 | dataset = [ 220 | { 221 | "qas": [ 222 | {"id": ref["idx"]["query"], "answers": [{"text": ans} for ans in ref["answers"]]} 223 | for ref in references 
224 | ] 225 | } 226 | ] 227 | predictions = {pred["idx"]["query"]: pred["prediction_text"] for pred in predictions} 228 | return evaluate_record(dataset, predictions)[0] 229 | elif self.config_name == "multirc": 230 | return evaluate_multirc(predictions, references) 231 | elif self.config_name in ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]: 232 | return {"accuracy": simple_accuracy(predictions, references)} 233 | else: 234 | raise KeyError( 235 | "You should supply a configuration name selected in " 236 | '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]' 237 | ) -------------------------------------------------------------------------------- /tasks/superglue/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | from collections import defaultdict, Counter 4 | 5 | 6 | def normalize_answer(s): 7 | """Lower text and remove punctuation, articles and extra whitespace.""" 8 | 9 | def remove_articles(text): 10 | return re.sub(r'\b(a|an|the)\b', ' ', text) 11 | 12 | def white_space_fix(text): 13 | return ' '.join(text.split()) 14 | 15 | def remove_punc(text): 16 | exclude = set(string.punctuation) 17 | return ''.join(ch for ch in text if ch not in exclude) 18 | 19 | def lower(text): 20 | return text.lower() 21 | 22 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 23 | 24 | def f1_score(prediction, ground_truth): 25 | prediction_tokens = normalize_answer(prediction).split() 26 | ground_truth_tokens = normalize_answer(ground_truth).split() 27 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 28 | num_same = sum(common.values()) 29 | if num_same == 0: 30 | return 0 31 | precision = 1.0 * num_same / len(prediction_tokens) 32 | recall = 1.0 * num_same / len(ground_truth_tokens) 33 | f1 = (2 * precision * recall) / (precision + recall) 34 | return f1 35 | 36 | 37 | def exact_match_score(prediction, ground_truth): 38 | return normalize_answer(prediction) == normalize_answer(ground_truth) 39 | 40 | 41 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 42 | scores_for_ground_truths = [] 43 | for ground_truth in ground_truths: 44 | score = metric_fn(prediction, ground_truth) 45 | scores_for_ground_truths.append(score) 46 | return max(scores_for_ground_truths) -------------------------------------------------------------------------------- /tasks/utils.py: -------------------------------------------------------------------------------- 1 | from tasks.glue.dataset import task_to_keys as glue_tasks 2 | from tasks.superglue.dataset import task_to_keys as superglue_tasks 3 | 4 | GLUE_DATASETS = list(glue_tasks.keys()) 5 | SUPERGLUE_DATASETS = list(superglue_tasks.keys()) 6 | 7 | 8 | TASKS = ["glue", "superglue"] 9 | 10 | DATASETS = GLUE_DATASETS + SUPERGLUE_DATASETS 11 | 12 | ADD_PREFIX_SPACE = { 13 | 'bert': False, 14 | 'roberta': True, 15 | 'deberta': True, 16 | 'gpt2': True, 17 | 'deberta-v2': True, 18 | } 19 | 20 | USE_FAST = { 21 | 'bert': True, 22 | 'roberta': True, 23 | 'deberta': True, 24 | 'gpt2': True, 25 | 'deberta-v2': False, 26 | } -------------------------------------------------------------------------------- /training/trainer_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict, OrderedDict 4 | 5 | from transformers import Trainer, AdapterTrainer 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 
_default_log_level = logging.INFO 10 | logger.setLevel(_default_log_level) 11 | 12 | class BaseTrainer(Trainer): 13 | def __init__(self, *args, predict_dataset = None, test_key = "accuracy", **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.predict_dataset = predict_dataset 16 | self.test_key = test_key 17 | self.best_metrics = OrderedDict({ 18 | "best_epoch": 0, 19 | f"best_eval_{self.test_key}": 0, 20 | }) 21 | 22 | def log_best_metrics(self): 23 | self.log_metrics("best", self.best_metrics) 24 | self.save_metrics("best", self.best_metrics, combined=False) 25 | 26 | 27 | 28 | def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): 29 | if self.control.should_log: 30 | logs: Dict[str, float] = {} 31 | 32 | 33 | tr_loss_scalar = self._nested_gather(tr_loss).mean().item() 34 | 35 | # reset tr_loss to zero 36 | tr_loss -= tr_loss 37 | 38 | logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) 39 | logs["learning_rate"] = self._get_learning_rate() 40 | 41 | self._total_loss_scalar += tr_loss_scalar 42 | self._globalstep_last_logged = self.state.global_step 43 | self.store_flos() 44 | 45 | self.log(logs) 46 | 47 | eval_metrics = None 48 | if self.control.should_evaluate: 49 | eval_metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 50 | self._report_to_hp_search(trial, epoch, eval_metrics) 51 | 52 | if eval_metrics["eval_"+self.test_key] > self.best_metrics["best_eval_"+self.test_key]: 53 | self.best_metrics["best_epoch"] = epoch 54 | self.best_metrics["best_eval_"+self.test_key] = eval_metrics["eval_"+self.test_key] 55 | 56 | if self.predict_dataset is not None: 57 | if isinstance(self.predict_dataset, dict): 58 | for dataset_name, dataset in self.predict_dataset.items(): 59 | _, _, test_metrics = self.predict(dataset, metric_key_prefix="test") 60 | self.best_metrics[f"best_test_{dataset_name}_{self.test_key}"] = test_metrics["test_"+self.test_key] 61 | else: 62 | _, _, test_metrics = self.predict(self.predict_dataset, metric_key_prefix="test") 63 | self.best_metrics["best_test_"+self.test_key] = test_metrics["test_"+self.test_key] 64 | 65 | logger.info(f"***** Epoch {epoch}: Best results *****") 66 | for key, value in self.best_metrics.items(): 67 | logger.info(f"{key} = {value}") 68 | self.log(self.best_metrics) 69 | 70 | if self.control.should_save: 71 | self._save_checkpoint(model, trial, metrics=eval_metrics) 72 | self.control = self.callback_handler.on_save(self.args, self.state, self.control) 73 | 74 | class BaseAdapterTrainer(AdapterTrainer): 75 | def __init__(self, *args, predict_dataset = None, test_key = "accuracy", **kwargs): 76 | super().__init__(*args, **kwargs) 77 | self.predict_dataset = predict_dataset 78 | self.test_key = test_key 79 | self.best_metrics = OrderedDict({ 80 | "best_epoch": 0, 81 | f"best_eval_{self.test_key}": 0, 82 | }) 83 | 84 | def log_best_metrics(self): 85 | self.log_metrics("best", self.best_metrics) 86 | self.save_metrics("best", self.best_metrics, combined=False) 87 | 88 | 89 | 90 | def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): 91 | if self.control.should_log: 92 | logs: Dict[str, float] = {} 93 | 94 | 95 | tr_loss_scalar = self._nested_gather(tr_loss).mean().item() 96 | 97 | # reset tr_loss to zero 98 | tr_loss -= tr_loss 99 | 100 | logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) 101 | logs["learning_rate"] = self._get_learning_rate() 102 | 103 | self._total_loss_scalar 
+= tr_loss_scalar 104 | self._globalstep_last_logged = self.state.global_step 105 | self.store_flos() 106 | 107 | self.log(logs) 108 | 109 | eval_metrics = None 110 | if self.control.should_evaluate: 111 | eval_metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 112 | self._report_to_hp_search(trial, epoch, eval_metrics) 113 | 114 | if eval_metrics["eval_"+self.test_key] > self.best_metrics["best_eval_"+self.test_key]: 115 | self.best_metrics["best_epoch"] = epoch 116 | self.best_metrics["best_eval_"+self.test_key] = eval_metrics["eval_"+self.test_key] 117 | 118 | if self.predict_dataset is not None: 119 | if isinstance(self.predict_dataset, dict): 120 | for dataset_name, dataset in self.predict_dataset.items(): 121 | _, _, test_metrics = self.predict(dataset, metric_key_prefix="test") 122 | self.best_metrics[f"best_test_{dataset_name}_{self.test_key}"] = test_metrics["test_"+self.test_key] 123 | else: 124 | _, _, test_metrics = self.predict(self.predict_dataset, metric_key_prefix="test") 125 | self.best_metrics["best_test_"+self.test_key] = test_metrics["test_"+self.test_key] 126 | 127 | logger.info(f"***** Epoch {epoch}: Best results *****") 128 | for key, value in self.best_metrics.items(): 129 | logger.info(f"{key} = {value}") 130 | self.log(self.best_metrics) 131 | 132 | if self.control.should_save: 133 | self._save_checkpoint(model, trial, metrics=eval_metrics) 134 | self.control = self.callback_handler.on_save(self.args, self.state, self.control) --------------------------------------------------------------------------------
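For orientation, here is a minimal usage sketch of how BaseTrainer above is meant to be wired up. The names model, training_args, and dataset are hypothetical placeholders for the objects assembled by the get_trainer functions earlier in this repo; only BaseTrainer, its predict_dataset/test_key keywords, and log_best_metrics come from training/trainer_base.py.

# Hypothetical usage sketch: `model`, `training_args`, and `dataset` are placeholders
# standing in for the objects built in tasks/*/get_trainer.py, not names defined here.
from training.trainer_base import BaseTrainer

trainer = BaseTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset.train_dataset,
    eval_dataset=dataset.eval_dataset,
    compute_metrics=dataset.compute_metrics,
    tokenizer=dataset.tokenizer,
    data_collator=dataset.data_collator,
    predict_dataset=dataset.predict_dataset,  # re-scored whenever the eval metric improves
    test_key="accuracy",                      # metric name tracked in best_metrics
)
trainer.train()
trainer.log_best_metrics()  # logs and saves best_epoch, best_eval_*, and best_test_*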