├── .gitignore ├── LICENSE ├── README.md ├── arguments.py ├── model ├── lora │ ├── __init__.py │ ├── layers.py │ └── utils.py ├── multiple_choice.py ├── prefix_encoder.py ├── prompt.py ├── roberta │ ├── __init__.py │ ├── configuration_roberta.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── modeling_flax_roberta.py │ ├── modeling_roberta.py │ ├── modeling_tf_roberta.py │ ├── tokenization_roberta.py │ └── tokenization_roberta_fast.py ├── sequence_classification.py └── utils.py ├── requirements.txt ├── run.py ├── scripts ├── mulitruns_scripts │ ├── adapter │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mnli_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_qnli_roberta_both.sh │ │ ├── run_qqp_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ ├── bitfit │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ ├── finetuning │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mnli_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_qnli_roberta_both.sh │ │ ├── run_qqp_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ ├── lora │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh │ └── prefixtuning │ │ ├── run_boolq_roberta_both.sh │ │ ├── run_cb_roberta_both.sh │ │ ├── run_cola_roberta_both.sh │ │ ├── run_copa_roberta_both.sh │ │ ├── run_mnli_roberta_both.sh │ │ ├── run_mrpc_roberta_both.sh │ │ ├── run_multirc_roberta_both.sh │ │ ├── run_qnli_roberta_both.sh │ │ ├── run_qqp_roberta_both.sh │ │ ├── run_rte_roberta_both.sh │ │ ├── run_sst2_roberta_both.sh │ │ ├── run_stsb_roberta_both.sh │ │ ├── run_wic_roberta_both.sh │ │ └── run_wsc_roberta_both.sh └── search_scipts │ ├── glue │ ├── search_adapter.sh │ ├── search_bitfit.sh │ ├── search_ft.sh │ ├── search_lora.sh │ └── search_pt.sh │ ├── search.py │ └── superglue │ ├── search_adapter.sh │ ├── search_bitfit.sh │ ├── search_ft.sh │ ├── search_lora.sh │ └── search_pt.sh ├── tasks ├── glue │ ├── dataset.py │ ├── get_trainer.py │ └── glue.py ├── superglue │ ├── dataset.py │ ├── get_trainer.py │ ├── record_evaluation.py │ ├── super_glue.py │ ├── super_glue_metric.py │ └── utils.py └── utils.py └── training └── trainer_base.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | 
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 guanzhchen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Revisiting Parameter-Efficient Tuning: Are We Really There Yet?
2 |
3 | This is the code implementation of our paper accepted at EMNLP 2022:
4 |
5 | > Guanzheng Chen, Fangyu Liu, Zaiqiao Meng, Shangsong Liang. [Revisiting Parameter-Efficient Tuning: Are We Really There Yet?](https://arxiv.org/abs/2202.07962).
6 |
7 |
8 |
9 | We provide a comprehensive study of popular **P**arameter-**E**fficient **Tuning** (PETuning) methods, i.e., Adapter, Prompt, LoRA, and BitFit, focusing on their performance and stability.
10 |
11 |
12 |
13 | The code structure is based in part on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). (Thanks for their awesome work.)
14 |
15 |
16 |
17 | ## File Structure
18 |
19 | - `model`: code implementing the PETuning methods.
20 | - `tasks`: code to preprocess the datasets and select the model for each task.
21 | - `training`: code defining the trainer used for training.
22 | - `scripts`: scripts to run training, evaluation, and prediction for each task.
23 |   - `search_scripts`: scripts that perform the grid search for each task.
24 |   - `multiruns_scripts`: scripts that conduct multiple runs for each task; each script contains the best hyper-parameters for the corresponding task.
25 | - `arguments.py & run.py`: argument definitions and the entry point for training, evaluation, and prediction.
26 |
27 |
28 |
29 | ## Dependencies
30 |
31 | ```
32 | torch==1.8.1
33 | transformers==4.5.0
34 | adapter-transformers==2.2.0
35 | ```
36 |
37 | Please see `requirements.txt` for more details.
38 |
39 |
40 |
41 | ## Data
42 |
43 | All datasets in GLUE and SuperGLUE are downloaded automatically (via the Huggingface Datasets API) when running the scripts.
44 |
45 |
46 |
47 | ## PETuning for Each Task
48 |
49 | To search for the best hyper-parameters for a task, run the scripts in the `scripts/search_scripts/` folder. For example, you can run the CB task with Adapter via:
50 |
51 | ```bash
52 | bash scripts/search_scripts/superglue/search_adapter.sh cb
53 | ```
54 |
55 |
56 |
57 | To conduct multiple runs for one task, run the scripts in the `scripts/multiruns_scripts/` folder. For example, you can run the CB task with Adapter via:
58 |
59 | ```bash
60 | bash scripts/multiruns_scripts/adapter/run_cb_roberta_both.sh
61 | ```
62 |
63 | We provide the best hyper-parameters for each task in the corresponding multi-run scripts. If you cannot reproduce our reported results, please check your environment (package versions) and rerun the grid search in your own environment.
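
To match the pinned versions listed above before launching any script, the dependencies can be installed in one step:

```bash
pip install -r requirements.txt
```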
64 |
65 |
66 |
67 | ## Acknowledgments
68 |
69 | [P-tuning v2](https://github.com/THUDM/P-tuning-v2)
70 |
71 | [Hugging Face Transformers](https://github.com/huggingface/transformers)
72 |
73 | [Adapter-Hub](https://github.com/Adapter-Hub/adapter-transformers)
74 |
75 | [LoRA](https://github.com/microsoft/LoRA)
76 |
77 | [BitFit](https://github.com/benzakenelad/BitFit)
78 |
79 |
80 | ## Citation
81 |
82 | If you find our paper and resources useful, please kindly cite our paper:
83 |
84 | ```
85 | @inproceedings{chen-etal-2022-revisiting,
86 |     title = "Revisiting Parameter-Efficient Tuning: Are We Really There Yet?",
87 |     author = "Chen, Guanzheng and
88 |       Liu, Fangyu and
89 |       Meng, Zaiqiao and
90 |       Liang, Shangsong",
91 |     booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
92 |     month = dec,
93 |     year = "2022",
94 |     address = "Abu Dhabi, United Arab Emirates",
95 |     publisher = "Association for Computational Linguistics",
96 |     url = "https://aclanthology.org/2022.emnlp-main.168",
97 |     pages = "2612--2626",
98 | }
99 |
100 | ```
101 |
--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | import argparse
3 | import dataclasses
4 | from dataclasses import dataclass, field
5 | from typing import Optional
6 |
7 | from transformers import HfArgumentParser, TrainingArguments, AdapterArguments
8 |
9 | from tasks.utils import *
10 |
11 |
12 | @dataclass
13 | class DataTrainingArguments:
14 |     """
15 |     Arguments pertaining to what data we are going to input our model for training and eval.
16 |
17 |     Using `HfArgumentParser` we can turn this class
18 |     into argparse arguments to be able to specify them on
19 |     the command line.
20 |     """
21 |
22 |     task_name: str = field(
23 |         metadata={
24 |             "help": "The name of the task to train on: " + ", ".join(TASKS),
25 |             "choices": TASKS
26 |         },
27 |     )
28 |     dataset_name: str = field(
29 |         metadata={
30 |             "help": "The name of the dataset to use: " + ", ".join(DATASETS),
31 |             "choices": DATASETS
32 |         }
33 |     )
34 |     dataset_config_name: Optional[str] = field(
35 |         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
36 |     )
37 |     max_seq_length: int = field(
38 |         default=128,
39 |         metadata={
40 |             "help": "The maximum total input sequence length after tokenization. Sequences longer "
41 |             "than this will be truncated, sequences shorter will be padded."
42 |         },
43 |     )
44 |     overwrite_cache: bool = field(
45 |         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
46 |     )
47 |     pad_to_max_length: bool = field(
48 |         default=True,
49 |         metadata={
50 |             "help": "Whether to pad all samples to `max_seq_length`. "
51 |             "If False, will pad the samples dynamically when batching to the maximum length in the batch."
52 |         },
53 |     )
54 |     max_train_samples: Optional[int] = field(
55 |         default=None,
56 |         metadata={
57 |             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
58 |             "value if set."
59 |         },
60 |     )
61 |     max_eval_samples: Optional[int] = field(
62 |         default=None,
63 |         metadata={
64 |             "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
65 |             "value if set."
66 | }, 67 | ) 68 | max_predict_samples: Optional[int] = field( 69 | default=None, 70 | metadata={ 71 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 72 | "value if set." 73 | }, 74 | ) 75 | train_file: Optional[str] = field( 76 | default=None, metadata={"help": "A csv or a json file containing the training data."} 77 | ) 78 | validation_file: Optional[str] = field( 79 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 80 | ) 81 | test_file: Optional[str] = field( 82 | default=None, 83 | metadata={"help": "A csv or a json file containing the test data."} 84 | ) 85 | template_id: Optional[int] = field( 86 | default=0, 87 | metadata={ 88 | "help": "The specific prompt string to use" 89 | } 90 | ) 91 | pilot: Optional[str] = field( 92 | default=None, 93 | metadata={"help": "do the pilot experiments."} 94 | ) 95 | 96 | 97 | @dataclass 98 | class ModelArguments: 99 | """ 100 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 101 | """ 102 | model_name_or_path: str = field( 103 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 104 | ) 105 | config_name: Optional[str] = field( 106 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 107 | ) 108 | tokenizer_name: Optional[str] = field( 109 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 110 | ) 111 | cache_dir: Optional[str] = field( 112 | default=None, 113 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 114 | ) 115 | use_fast_tokenizer: bool = field( 116 | default=True, 117 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 118 | ) 119 | model_revision: str = field( 120 | default="main", 121 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 122 | ) 123 | use_auth_token: bool = field( 124 | default=False, 125 | metadata={ 126 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 127 | "with private models)." 
128 |         },
129 |     )
130 |     prefix: bool = field(
131 |         default=False,
132 |         metadata={
133 |             "help": "Will use P-tuning v2 during training"
134 |         }
135 |     )
136 |     prompt: bool = field(
137 |         default=False,
138 |         metadata={
139 |             "help": "Will use prompt tuning during training"
140 |         }
141 |     )
142 |     pre_seq_len: int = field(
143 |         default=4,
144 |         metadata={
145 |             "help": "The length of prompt"
146 |         }
147 |     )
148 |     prefix_projection: bool = field(
149 |         default=False,
150 |         metadata={
151 |             "help": "Apply a two-layer MLP head over the prefix embeddings"
152 |         }
153 |     )
154 |     prefix_hidden_size: int = field(
155 |         default=512,
156 |         metadata={
157 |             "help": "The hidden size of the MLP projection head in Prefix Encoder if prefix projection is used"
158 |         }
159 |     )
160 |     hidden_dropout_prob: float = field(
161 |         default=0.1,
162 |         metadata={
163 |             "help": "The dropout probability used in the models"
164 |         }
165 |     )
166 |     lora: bool = field(
167 |         default=False,
168 |         metadata={
169 |             "help": "Will use lora during training"
170 |         }
171 |     )
172 |     lora_r: int = field(
173 |         default=8,
174 |         metadata={
175 |             "help": "The rank of lora"
176 |         }
177 |     )
178 |     lora_alpha: int = field(
179 |         default=16,
180 |         metadata={
181 |             "help": "The scaling factor (alpha) of lora"
182 |         }
183 |     )
184 |     model_seed: int = field(
185 |         default=1111,
186 |         metadata={
187 |             "help": "The random seed of model initialization."
188 |         }
189 |     )
190 |     bitfit: bool = field(
191 |         default=False,
192 |         metadata={
193 |             "help": "Will use bitfit during training"
194 |         }
195 |     )
196 |     patient: int = field(
197 |         default=10,
198 |         metadata={
199 |             "help": "The patience of early stopping."
200 |         }
201 |     )
202 |
203 | @dataclass
204 | class QuestionAnwseringArguments:
205 |     n_best_size: int = field(
206 |         default=20,
207 |         metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
208 |     )
209 |     max_answer_length: int = field(
210 |         default=30,
211 |         metadata={
212 |             "help": "The maximum length of an answer that can be generated. This is needed because the start "
213 |             "and end predictions are not conditioned on one another."
214 |         },
215 |     )
216 |     version_2_with_negative: bool = field(
217 |         default=False, metadata={"help": "If true, some of the examples do not have an answer."}
218 |     )
219 |     null_score_diff_threshold: float = field(
220 |         default=0.0,
221 |         metadata={
222 |             "help": "The threshold used to select the null answer: if the best answer has a score that is less than "
223 |             "the score of the null answer minus this threshold, the null answer is selected for this example. "
224 |             "Only useful when `version_2_with_negative=True`."
225 |         },
226 |     )
227 |
228 | def get_args():
229 |     """Parse all the args."""
230 |     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, QuestionAnwseringArguments, AdapterArguments))
231 |
232 |     args = parser.parse_args_into_dataclasses()
233 |
234 |     return args
--------------------------------------------------------------------------------
/model/lora/__init__.py:
--------------------------------------------------------------------------------
1 | from .layers import *
2 | from .utils import *
--------------------------------------------------------------------------------
/model/lora/utils.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 4 | # ------------------------------------------------------------------------------------------ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from typing import Dict 9 | 10 | from .layers import LoRALayer 11 | 12 | 13 | def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: 14 | for n, p in model.named_parameters(): 15 | if 'lora_' not in n: 16 | p.requires_grad = False 17 | if bias == 'none': 18 | return 19 | elif bias == 'all': 20 | for n, p in model.named_parameters(): 21 | if 'bias' in n: 22 | p.requires_grad = True 23 | elif bias == 'lora_only': 24 | for m in model.modules(): 25 | if isinstance(m, LoRALayer) and \ 26 | hasattr(m, 'bias') and \ 27 | m.bias is not None: 28 | m.bias.requires_grad = True 29 | else: 30 | raise NotImplementedError 31 | 32 | 33 | def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: 34 | my_state_dict = model.state_dict() 35 | if bias == 'none': 36 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} 37 | elif bias == 'all': 38 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} 39 | elif bias == 'lora_only': 40 | to_return = {} 41 | for k in my_state_dict: 42 | if 'lora_' in k: 43 | to_return[k] = my_state_dict[k] 44 | bias_name = k.split('lora_')[0]+'bias' 45 | if bias_name in my_state_dict: 46 | to_return[bias_name] = my_state_dict[bias_name] 47 | return to_return 48 | else: 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /model/multiple_choice.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch._C import NoopLogger 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss 7 | 8 | from transformers import BertModel, BertPreTrainedModel 9 | from transformers.modeling_outputs import MultipleChoiceModelOutput, BaseModelOutput, Seq2SeqLMOutput 10 | 11 | from model.prefix_encoder import PrefixEncoder 12 | from transformers.adapters.model_mixin import ModelWithHeadsAdaptersMixin 13 | # from transformers import RobertaModel, RobertaPreTrainedModel 14 | from model.roberta import RobertaModel, RobertaPreTrainedModel 15 | 16 | 17 | 18 | class RobertaPrefixForMultipleChoice(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 19 | _keys_to_ignore_on_load_missing = [r"position_ids"] 20 | 21 | def __init__(self, config): 22 | super().__init__(config) 23 | 24 | self.roberta = RobertaModel(config) 25 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 26 | self.classifier = torch.nn.Linear(config.hidden_size, 1) 27 | 28 | self.init_weights() 29 | 30 | 31 | for param in self.roberta.parameters(): 32 | param.requires_grad = False 33 | 34 | self.pre_seq_len = config.pre_seq_len 35 | self.n_layer = config.num_hidden_layers 36 | self.n_head = config.num_attention_heads 37 | self.n_embd = config.hidden_size // config.num_attention_heads 38 | 39 | self.prefix_tokens = torch.arange(self.pre_seq_len).long() 40 | self.prefix_encoder = PrefixEncoder(config) 41 | 42 | bert_param = 0 43 | for name, param in self.roberta.named_parameters(): 44 | bert_param += param.numel() 45 | all_param = 0 46 | for name, param in self.named_parameters(): 47 | all_param += param.numel() 48 | total_param = all_param - bert_param 49 | print('total 
param is {}'.format(total_param)) 50 | 51 | def get_prompt(self, batch_size): 52 | prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(self.roberta.device) 53 | past_key_values = self.prefix_encoder(prefix_tokens) 54 | past_key_values = past_key_values.view( 55 | batch_size, 56 | self.pre_seq_len, 57 | self.n_layer * 2, 58 | self.n_head, 59 | self.n_embd 60 | ) 61 | past_key_values = self.dropout(past_key_values) 62 | past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2) 63 | return past_key_values 64 | 65 | def forward( 66 | self, 67 | input_ids=None, 68 | token_type_ids=None, 69 | attention_mask=None, 70 | labels=None, 71 | position_ids=None, 72 | head_mask=None, 73 | inputs_embeds=None, 74 | output_attentions=None, 75 | output_hidden_states=None, 76 | return_dict=None, 77 | adapter_names=None, 78 | head=None, 79 | **kwargs 80 | ): 81 | r""" 82 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): 83 | Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., 84 | num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See 85 | :obj:`input_ids` above) 86 | """ 87 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 88 | batch_size, num_choices = input_ids.shape[:2] if input_ids is not None else inputs_embeds.shape[:2] 89 | 90 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None 91 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 92 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 93 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 94 | flat_inputs_embeds = ( 95 | inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) 96 | if inputs_embeds is not None 97 | else None 98 | ) 99 | 100 | past_key_values = self.get_prompt(batch_size=batch_size * num_choices) 101 | prefix_attention_mask = torch.ones(batch_size * num_choices, self.pre_seq_len).to(self.roberta.device) 102 | flat_attention_mask = torch.cat((prefix_attention_mask, flat_attention_mask), dim=1) 103 | 104 | outputs = self.roberta( 105 | flat_input_ids, 106 | position_ids=flat_position_ids, 107 | token_type_ids=flat_token_type_ids, 108 | attention_mask=flat_attention_mask, 109 | head_mask=head_mask, 110 | inputs_embeds=flat_inputs_embeds, 111 | output_attentions=output_attentions, 112 | output_hidden_states=output_hidden_states, 113 | return_dict=return_dict, 114 | adapter_names=adapter_names, 115 | past_key_values=past_key_values, 116 | ) 117 | pooled_output = outputs[1] 118 | 119 | pooled_output = self.dropout(pooled_output) 120 | logits = self.classifier(pooled_output) 121 | reshaped_logits = logits.view(-1, num_choices) 122 | 123 | loss = None 124 | if labels is not None: 125 | loss_fct = CrossEntropyLoss() 126 | loss = loss_fct(reshaped_logits, labels) 127 | 128 | if not return_dict: 129 | output = (reshaped_logits,) + outputs[2:] 130 | return ((loss,) + output) if loss is not None else output 131 | 132 | return MultipleChoiceModelOutput( 133 | loss=loss, 134 | logits=reshaped_logits, 135 | hidden_states=outputs.hidden_states, 136 | attentions=outputs.attentions, 137 | ) 138 | 139 | 140 | 141 | 142 | 143 | 144 | class RobertaLoraForMultipleChoice(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 145 | 
_keys_to_ignore_on_load_missing = [r"position_ids"] 146 | 147 | def __init__(self, config): 148 | super().__init__(config) 149 | 150 | self.roberta = RobertaModel(config) 151 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 152 | self.classifier = torch.nn.Linear(config.hidden_size, 1) 153 | 154 | self.init_weights() 155 | 156 | 157 | for name, param in self.roberta.named_parameters(): 158 | if "lora" not in name.lower(): 159 | param.requires_grad = False 160 | 161 | 162 | bert_param = 0 163 | for name, param in self.roberta.named_parameters(): 164 | bert_param += param.numel() 165 | all_param = 0 166 | for name, param in self.named_parameters(): 167 | all_param += param.numel() 168 | total_param = all_param - bert_param 169 | print('total param is {}'.format(total_param)) 170 | 171 | def forward( 172 | self, 173 | input_ids=None, 174 | token_type_ids=None, 175 | attention_mask=None, 176 | labels=None, 177 | position_ids=None, 178 | head_mask=None, 179 | inputs_embeds=None, 180 | output_attentions=None, 181 | output_hidden_states=None, 182 | return_dict=None, 183 | adapter_names=None, 184 | head=None, 185 | **kwargs 186 | ): 187 | r""" 188 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): 189 | Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., 190 | num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See 191 | :obj:`input_ids` above) 192 | """ 193 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 194 | batch_size, num_choices = input_ids.shape[:2] if input_ids is not None else inputs_embeds.shape[:2] 195 | 196 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None 197 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 198 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 199 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 200 | flat_inputs_embeds = ( 201 | inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) 202 | if inputs_embeds is not None 203 | else None 204 | ) 205 | 206 | outputs = self.roberta( 207 | flat_input_ids, 208 | position_ids=flat_position_ids, 209 | token_type_ids=flat_token_type_ids, 210 | attention_mask=flat_attention_mask, 211 | head_mask=head_mask, 212 | inputs_embeds=flat_inputs_embeds, 213 | output_attentions=output_attentions, 214 | output_hidden_states=output_hidden_states, 215 | return_dict=return_dict, 216 | adapter_names=adapter_names, 217 | ) 218 | pooled_output = outputs[1] 219 | 220 | pooled_output = self.dropout(pooled_output) 221 | logits = self.classifier(pooled_output) 222 | reshaped_logits = logits.view(-1, num_choices) 223 | 224 | loss = None 225 | if labels is not None: 226 | loss_fct = CrossEntropyLoss() 227 | loss = loss_fct(reshaped_logits, labels) 228 | 229 | if not return_dict: 230 | output = (reshaped_logits,) + outputs[2:] 231 | return ((loss,) + output) if loss is not None else output 232 | 233 | return MultipleChoiceModelOutput( 234 | loss=loss, 235 | logits=reshaped_logits, 236 | hidden_states=outputs.hidden_states, 237 | attentions=outputs.attentions, 238 | ) -------------------------------------------------------------------------------- /model/prefix_encoder.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class PrefixEncoder(torch.nn.Module): 5 | r''' 6 | The torch.nn model to encode the prefix 7 | 8 | Input shape: (batch-size, prefix-length) 9 | 10 | Output shape: (batch-size, prefix-length, 2*layers*hidden) 11 | ''' 12 | def __init__(self, config): 13 | super().__init__() 14 | self.prefix_projection = config.prefix_projection 15 | if self.prefix_projection: 16 | # Use a two-layer MLP to encode the prefix 17 | self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size) 18 | self.trans = torch.nn.Sequential( 19 | torch.nn.Linear(config.hidden_size, config.prefix_hidden_size), 20 | torch.nn.Tanh(), 21 | torch.nn.Linear(config.prefix_hidden_size, config.num_hidden_layers * 2 * config.hidden_size) 22 | ) 23 | else: 24 | self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_hidden_layers * 2 * config.hidden_size) 25 | 26 | def forward(self, prefix: torch.Tensor): 27 | if self.prefix_projection: 28 | prefix_tokens = self.embedding(prefix) 29 | past_key_values = self.trans(prefix_tokens) 30 | else: 31 | past_key_values = self.embedding(prefix) 32 | return past_key_values -------------------------------------------------------------------------------- /model/roberta/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
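# This package vendors the RoBERTa implementation from HuggingFace Transformers
# (hence the Apache-2.0 header above). Keeping a local copy allows the PETuning
# code to route prefix past_key_values and LoRA weights through the model
# classes directly, which is why files such as model/sequence_classification.py
# import `from model.roberta import RobertaModel` instead of the installed library.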
18 |
19 | from typing import TYPE_CHECKING
20 |
21 | from transformers.file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
22 |
23 |
24 | _import_structure = {
25 |     "configuration_roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaOnnxConfig"],
26 |     "tokenization_roberta": ["RobertaTokenizer"],
27 | }
28 |
29 | if is_tokenizers_available():
30 |     _import_structure["tokenization_roberta_fast"] = ["RobertaTokenizerFast"]
31 |
32 | if is_torch_available():
33 |     _import_structure["modeling_roberta"] = [
34 |         "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
35 |         "RobertaForCausalLM",
36 |         "RobertaForMaskedLM",
37 |         "RobertaForMultipleChoice",
38 |         "RobertaForQuestionAnswering",
39 |         "RobertaForSequenceClassification",
40 |         "RobertaForTokenClassification",
41 |         "RobertaModel",
42 |         "RobertaModelWithHeads",
43 |         "RobertaPreTrainedModel",
44 |     ]
45 |
46 | if is_tf_available():
47 |     _import_structure["modeling_tf_roberta"] = [
48 |         "TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
49 |         "TFRobertaForMaskedLM",
50 |         "TFRobertaForMultipleChoice",
51 |         "TFRobertaForQuestionAnswering",
52 |         "TFRobertaForSequenceClassification",
53 |         "TFRobertaForTokenClassification",
54 |         "TFRobertaMainLayer",
55 |         "TFRobertaModel",
56 |         "TFRobertaPreTrainedModel",
57 |     ]
58 |
59 | if is_flax_available():
60 |     _import_structure["modeling_flax_roberta"] = [
61 |         "FlaxRobertaForMaskedLM",
62 |         "FlaxRobertaForMultipleChoice",
63 |         "FlaxRobertaForQuestionAnswering",
64 |         "FlaxRobertaForSequenceClassification",
65 |         "FlaxRobertaForTokenClassification",
66 |         "FlaxRobertaModel",
67 |         "FlaxRobertaPreTrainedModel",
68 |     ]
69 |
70 |
71 | if TYPE_CHECKING:
72 |     from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaOnnxConfig
73 |     from .tokenization_roberta import RobertaTokenizer
74 |
75 |     if is_tokenizers_available():
76 |         from .tokenization_roberta_fast import RobertaTokenizerFast
77 |
78 |     if is_torch_available():
79 |         from .modeling_roberta import (
80 |             ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
81 |             RobertaForCausalLM,
82 |             RobertaForMaskedLM,
83 |             RobertaForMultipleChoice,
84 |             RobertaForQuestionAnswering,
85 |             RobertaForSequenceClassification,
86 |             RobertaForTokenClassification,
87 |             RobertaModel,
88 |             RobertaModelWithHeads,
89 |             RobertaPreTrainedModel,
90 |         )
91 |
92 |     if is_tf_available():
93 |         from .modeling_tf_roberta import (
94 |             TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
95 |             TFRobertaForMaskedLM,
96 |             TFRobertaForMultipleChoice,
97 |             TFRobertaForQuestionAnswering,
98 |             TFRobertaForSequenceClassification,
99 |             TFRobertaForTokenClassification,
100 |             TFRobertaMainLayer,
101 |             TFRobertaModel,
102 |             TFRobertaPreTrainedModel,
103 |         )
104 |
105 |     if is_flax_available():
106 |         from .modeling_flax_roberta import (
107 |             FlaxRobertaForMaskedLM,
108 |             FlaxRobertaForMultipleChoice,
109 |             FlaxRobertaForQuestionAnswering,
110 |             FlaxRobertaForSequenceClassification,
111 |             FlaxRobertaForTokenClassification,
112 |             FlaxRobertaModel,
113 |             FlaxRobertaPreTrainedModel,
114 |         )
115 |
116 | else:
117 |     import sys
118 |
119 |     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
120 |
--------------------------------------------------------------------------------
/model/roberta/configuration_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | from collections import OrderedDict 18 | from typing import Mapping 19 | 20 | from transformers.onnx import OnnxConfig 21 | from transformers.utils import logging 22 | from transformers import BertConfig 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", 29 | "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", 30 | "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", 31 | "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", 32 | "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", 33 | "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", 34 | } 35 | 36 | 37 | class RobertaConfig(BertConfig): 38 | r""" 39 | This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a 40 | :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified 41 | arguments, defining the model architecture. 42 | 43 | 44 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model 45 | outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 46 | 47 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the 48 | same defaults. Please check the parent class for more information. 
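    In practice, the defaults that change relative to ``BertConfig`` are the special
    token ids set in ``__init__`` below: ``pad_token_id=1``, ``bos_token_id=0``, and
    ``eos_token_id=2``.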
49 | 50 | Examples:: 51 | 52 | >>> from transformers import RobertaConfig, RobertaModel 53 | 54 | >>> # Initializing a RoBERTa configuration 55 | >>> configuration = RobertaConfig() 56 | 57 | >>> # Initializing a model from the configuration 58 | >>> model = RobertaModel(configuration) 59 | 60 | >>> # Accessing the model configuration 61 | >>> configuration = model.config 62 | """ 63 | model_type = "roberta" 64 | 65 | def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): 66 | """Constructs RobertaConfig.""" 67 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 68 | 69 | 70 | class RobertaOnnxConfig(OnnxConfig): 71 | @property 72 | def inputs(self) -> Mapping[str, Mapping[int, str]]: 73 | return OrderedDict( 74 | [ 75 | ("input_ids", {0: "batch", 1: "sequence"}), 76 | ("attention_mask", {0: "batch", 1: "sequence"}), 77 | ] 78 | ) 79 | 80 | @property 81 | def outputs(self) -> Mapping[str, Mapping[int, str]]: 82 | return OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"}), ("pooler_output", {0: "batch"})]) 83 | -------------------------------------------------------------------------------- /model/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert RoBERTa checkpoint.""" 16 | 17 | 18 | import argparse 19 | import pathlib 20 | 21 | import fairseq 22 | import torch 23 | from fairseq.models.roberta import RobertaModel as FairseqRobertaModel 24 | from fairseq.modules import TransformerSentenceEncoderLayer 25 | from packaging import version 26 | 27 | from transformers import RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification 28 | from transformers.models.bert.modeling_bert import ( 29 | BertIntermediate, 30 | BertLayer, 31 | BertOutput, 32 | BertSelfAttention, 33 | BertSelfOutput, 34 | ) 35 | from transformers.utils import logging 36 | 37 | 38 | if version.parse(fairseq.__version__) < version.parse("0.9.0"): 39 | raise Exception("requires fairseq >= 0.9.0") 40 | 41 | 42 | logging.set_verbosity_info() 43 | logger = logging.get_logger(__name__) 44 | 45 | SAMPLE_TEXT = "Hello world! cécé herlolip" 46 | 47 | 48 | def convert_roberta_checkpoint_to_pytorch( 49 | roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool 50 | ): 51 | """ 52 | Copy/paste/tweak roberta's weights to our BERT structure. 
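    Example invocation (paths are placeholders; the flags are defined by the
    argparse parser at the bottom of this file)::

        python convert_roberta_original_pytorch_checkpoint_to_pytorch.py \
            --roberta_checkpoint_path ./roberta.large \
            --pytorch_dump_folder_path ./converted-roberta-large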
53 | """ 54 | roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) 55 | roberta.eval() # disable dropout 56 | roberta_sent_encoder = roberta.model.encoder.sentence_encoder 57 | config = RobertaConfig( 58 | vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings, 59 | hidden_size=roberta.args.encoder_embed_dim, 60 | num_hidden_layers=roberta.args.encoder_layers, 61 | num_attention_heads=roberta.args.encoder_attention_heads, 62 | intermediate_size=roberta.args.encoder_ffn_embed_dim, 63 | max_position_embeddings=514, 64 | type_vocab_size=1, 65 | layer_norm_eps=1e-5, # PyTorch default used in fairseq 66 | ) 67 | if classification_head: 68 | config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0] 69 | print("Our BERT config:", config) 70 | 71 | model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) 72 | model.eval() 73 | 74 | # Now let's copy all the weights. 75 | # Embeddings 76 | model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight 77 | model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight 78 | model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like( 79 | model.roberta.embeddings.token_type_embeddings.weight 80 | ) # just zero them out b/c RoBERTa doesn't use them. 81 | model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight 82 | model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias 83 | 84 | for i in range(config.num_hidden_layers): 85 | # Encoder: start of layer 86 | layer: BertLayer = model.roberta.encoder.layer[i] 87 | roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] 88 | 89 | # self attention 90 | self_attn: BertSelfAttention = layer.attention.self 91 | assert ( 92 | roberta_layer.self_attn.k_proj.weight.data.shape 93 | == roberta_layer.self_attn.q_proj.weight.data.shape 94 | == roberta_layer.self_attn.v_proj.weight.data.shape 95 | == torch.Size((config.hidden_size, config.hidden_size)) 96 | ) 97 | 98 | self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight 99 | self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias 100 | self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight 101 | self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias 102 | self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight 103 | self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias 104 | 105 | # self-attention output 106 | self_output: BertSelfOutput = layer.attention.output 107 | assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape 108 | self_output.dense.weight = roberta_layer.self_attn.out_proj.weight 109 | self_output.dense.bias = roberta_layer.self_attn.out_proj.bias 110 | self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight 111 | self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias 112 | 113 | # intermediate 114 | intermediate: BertIntermediate = layer.intermediate 115 | assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape 116 | intermediate.dense.weight = roberta_layer.fc1.weight 117 | intermediate.dense.bias = roberta_layer.fc1.bias 118 | 119 | # output 120 | bert_output: BertOutput = layer.output 121 | assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape 122 | bert_output.dense.weight = roberta_layer.fc2.weight 123 | bert_output.dense.bias = 
roberta_layer.fc2.bias 124 | bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight 125 | bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias 126 | # end of layer 127 | 128 | if classification_head: 129 | model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight 130 | model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias 131 | model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight 132 | model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias 133 | else: 134 | # LM Head 135 | model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight 136 | model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias 137 | model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight 138 | model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias 139 | model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight 140 | model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias 141 | 142 | # Let's check that we get the same results. 143 | input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 144 | 145 | our_output = model(input_ids)[0] 146 | if classification_head: 147 | their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids)) 148 | else: 149 | their_output = roberta.model(input_ids)[0] 150 | print(our_output.shape, their_output.shape) 151 | max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() 152 | print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 153 | success = torch.allclose(our_output, their_output, atol=1e-3) 154 | print("Do both models output the same tensors?", "🔥" if success else "💩") 155 | if not success: 156 | raise Exception("Something went wRoNg") 157 | 158 | pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) 159 | print(f"Saving model to {pytorch_dump_folder_path}") 160 | model.save_pretrained(pytorch_dump_folder_path) 161 | 162 | 163 | if __name__ == "__main__": 164 | parser = argparse.ArgumentParser() 165 | # Required parameters 166 | parser.add_argument( 167 | "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 168 | ) 169 | parser.add_argument( 170 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 171 | ) 172 | parser.add_argument( 173 | "--classification_head", action="store_true", help="Whether to convert a final classification head." 174 | ) 175 | args = parser.parse_args() 176 | convert_roberta_checkpoint_to_pytorch( 177 | args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head 178 | ) 179 | -------------------------------------------------------------------------------- /model/roberta/tokenization_roberta_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Fast Tokenization classes for RoBERTa."""
16 |
17 | from typing import List, Optional
18 |
19 | from transformers.tokenization_utils_base import AddedToken
20 | from transformers.utils import logging
21 | from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
22 | from .tokenization_roberta import RobertaTokenizer
23 |
24 |
25 | logger = logging.get_logger(__name__)
26 |
27 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
28 |
29 | PRETRAINED_VOCAB_FILES_MAP = {
30 |     "vocab_file": {
31 |         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
32 |         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
33 |         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
34 |         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
35 |         "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
36 |         "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
37 |     },
38 |     "merges_file": {
39 |         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
40 |         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
41 |         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
42 |         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
43 |         "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
44 |         "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
45 |     },
46 |     "tokenizer_file": {
47 |         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
48 |         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
49 |         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json",
50 |         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json",
51 |         "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/tokenizer.json",
52 |         "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/tokenizer.json",
53 |     },
54 | }
55 |
56 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
57 |     "roberta-base": 512,
58 |     "roberta-large": 512,
59 |     "roberta-large-mnli": 512,
60 |     "distilroberta-base": 512,
61 |     "roberta-base-openai-detector": 512,
62 |     "roberta-large-openai-detector": 512,
63 | }
64 |
65 |
66 | class RobertaTokenizerFast(GPT2TokenizerFast):
67 |     """
68 |     Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
69 |     tokenizer, using byte-level Byte-Pair-Encoding.
70 |
71 |     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
72 |     be encoded differently whether it is at the beginning of the sentence (without space) or not:
73 |
74 |     ::
75 |
76 |         >>> from transformers import RobertaTokenizerFast
77 |         >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
78 |         >>> tokenizer("Hello world")['input_ids']
79 |         [0, 31414, 232, 328, 2]
80 |         >>> tokenizer(" Hello world")['input_ids']
81 |         [0, 20920, 232, 2]
82 |
83 |     You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
84 |     call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
85 |
86 |     .. note::
87 |
88 |         When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
89 |         ``add_prefix_space=True``.
90 |
91 |     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
92 |     methods. Users should refer to this superclass for more information regarding those methods.
93 |
94 |     Args:
95 |         vocab_file (:obj:`str`):
96 |             Path to the vocabulary file.
97 |         merges_file (:obj:`str`):
98 |             Path to the merges file.
99 |         errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
100 |             Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
101 |             <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
102 |         bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
103 |             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
104 |
105 |             .. note::
106 |
107 |                 When building a sequence using special tokens, this is not the token that is used for the beginning of
108 |                 sequence. The token used is the :obj:`cls_token`.
109 |         eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
110 |             The end of sequence token.
111 |
112 |             .. note::
113 |
114 |                 When building a sequence using special tokens, this is not the token that is used for the end of
115 |                 sequence. The token used is the :obj:`sep_token`.
116 |         sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
117 |             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
118 |             sequence classification or for a text and a question for question answering. It is also used as the last
119 |             token of a sequence built with special tokens.
120 |         cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
121 |             The classifier token which is used when doing sequence classification (classification of the whole sequence
122 |             instead of per-token classification). It is the first token of the sequence when built with special tokens.
123 |         unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
124 |             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
125 |             token instead.
126 |         pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
127 |             The token used for padding, for example when batching sequences of different lengths.
128 |         mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
129 |             The token used for masking values. This is the token used when training this model with masked language
130 |             modeling. This is the token which the model will try to predict.
131 |         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
132 |             Whether or not to add an initial space to the input. This makes it possible to treat the leading word
133 |             just like any other word (the RoBERTa tokenizer detects the beginning of a word by the preceding space).
134 |         trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
135 |             Whether the post processing step should trim offsets to avoid including whitespaces.
136 |     """
137 |
138 |     vocab_files_names = VOCAB_FILES_NAMES
139 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
140 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
141 |     model_input_names = ["input_ids", "attention_mask"]
142 |     slow_tokenizer_class = RobertaTokenizer
143 |
144 |     def __init__(
145 |         self,
146 |         vocab_file=None,
147 |         merges_file=None,
148 |         tokenizer_file=None,
149 |         errors="replace",
150 |         bos_token="<s>",
151 |         eos_token="</s>",
152 |         sep_token="</s>",
153 |         cls_token="<s>",
154 |         unk_token="<unk>",
155 |         pad_token="<pad>",
156 |         mask_token="<mask>",
157 |         add_prefix_space=False,
158 |         **kwargs
159 |     ):
160 |         super().__init__(
161 |             vocab_file,
162 |             merges_file,
163 |             tokenizer_file=tokenizer_file,
164 |             errors=errors,
165 |             bos_token=bos_token,
166 |             eos_token=eos_token,
167 |             sep_token=sep_token,
168 |             cls_token=cls_token,
169 |             unk_token=unk_token,
170 |             pad_token=pad_token,
171 |             mask_token=mask_token,
172 |             add_prefix_space=add_prefix_space,
173 |             **kwargs,
174 |         )
175 |
176 |     @property
177 |     def mask_token(self) -> str:
178 |         """
179 |         :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
180 |         not having been set.
181 |
182 |         Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
183 |         comprise the space before the `<mask>`.
184 |         """
185 |         if self._mask_token is None and self.verbose:
186 |             logger.error("Using mask_token, but it is not set yet.")
187 |             return None
188 |         return str(self._mask_token)
189 |
190 |     @mask_token.setter
191 |     def mask_token(self, value):
192 |         """
193 |         Overriding the default behavior of the mask token to have it eat the space before it.
194 |
195 |         This is needed to preserve backward compatibility with all the previously used models based on Roberta.
196 |         """
197 |         # Mask token behave like a normal word, i.e. include the space before it
198 |         # So we set lstrip to True
199 |         value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
200 |         self._mask_token = value
201 |
202 |     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
203 |         output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
204 |         if token_ids_1 is None:
205 |             return output
206 |
207 |         return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
208 |
209 |     def create_token_type_ids_from_sequences(
210 |         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
211 |     ) -> List[int]:
212 |         """
213 |         Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
214 |         make use of token type ids, therefore a list of zeros is returned.
215 |
216 |         Args:
217 |             token_ids_0 (:obj:`List[int]`):
218 |                 List of IDs.
219 |             token_ids_1 (:obj:`List[int]`, `optional`):
220 |                 Optional second list of IDs for sequence pairs.
221 |
222 |         Returns:
223 |             :obj:`List[int]`: List of zeros.
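
        Example (token IDs are illustrative placeholders)::

            >>> # <s> 10 20 </s>  ->  four zeros
            >>> tokenizer.create_token_type_ids_from_sequences([10, 20])
            [0, 0, 0, 0]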
224 | """ 225 | sep = [self.sep_token_id] 226 | cls = [self.cls_token_id] 227 | 228 | if token_ids_1 is None: 229 | return len(cls + token_ids_0 + sep) * [0] 230 | return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] 231 | -------------------------------------------------------------------------------- /model/sequence_classification.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from torch._C import NoopLogger  # unused 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss 7 | 8 | from transformers import BertModel, BertPreTrainedModel 9 | 10 | from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutput, Seq2SeqLMOutput 11 | 12 | from model.prefix_encoder import PrefixEncoder 13 | 14 | from transformers.adapters.models.bert import BertModelHeadsMixin, BertModelAdaptersMixin 15 | from transformers.adapters.model_mixin import ModelWithHeadsAdaptersMixin 16 | # from transformers import RobertaModel, RobertaPreTrainedModel 17 | from model.roberta import RobertaModel, RobertaPreTrainedModel 18 | 19 | import copy 20 | 21 | 22 | 23 | class RobertaPrefixForSequenceClassification(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 24 | def __init__(self, config): 25 | super().__init__(config) 26 | self.num_labels = config.num_labels 27 | self.config = config 28 | self.roberta = RobertaModel(config) 29 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 30 | # self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels) 31 | self.classifier = RobertaClassificationHead(config) 32 | self.init_weights() 33 | 34 | for param in self.roberta.parameters():  # freeze the backbone; only the prefix encoder and head are trained 35 | param.requires_grad = False 36 | 37 | self.pre_seq_len = config.pre_seq_len 38 | self.n_layer = config.num_hidden_layers 39 | self.n_head = config.num_attention_heads 40 | self.n_embd = config.hidden_size // config.num_attention_heads 41 | 42 | self.prefix_tokens = torch.arange(self.pre_seq_len).long() 43 | self.prefix_encoder = PrefixEncoder(config) 44 | 45 | bert_param = 0 46 | for name, param in self.roberta.named_parameters(): 47 | bert_param += param.numel() 48 | all_param = 0 49 | for name, param in self.named_parameters(): 50 | all_param += param.numel() 51 | total_param = all_param - bert_param 52 | print('total param is {}'.format(total_param)) # 9860105 53 | 54 | 55 | def get_prompt(self, batch_size): 56 | prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(self.roberta.device)  # (batch, pre_seq_len) 57 | past_key_values = self.prefix_encoder(prefix_tokens)  # (batch, pre_seq_len, 2 * n_layer * hidden_size) 58 | past_key_values = past_key_values.view( 59 | batch_size, 60 | self.pre_seq_len, 61 | self.n_layer * 2, 62 | self.n_head, 63 | self.n_embd 64 | )  # (batch, pre_seq_len, 2 * n_layer, n_head, head_dim) 65 | past_key_values = self.dropout(past_key_values) 66 | past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2)  # n_layer tensors, each (2, batch, n_head, pre_seq_len, head_dim): one key/value pair per layer 67 | return past_key_values 68 | 69 | def forward( 70 | self, 71 | input_ids=None, 72 | attention_mask=None, 73 | token_type_ids=None, 74 | position_ids=None, 75 | head_mask=None, 76 | inputs_embeds=None, 77 | labels=None, 78 | output_attentions=None, 79 | output_hidden_states=None, 80 | return_dict=None, 81 | adapter_names=None, 82 | head=None, 83 | **kwargs 84 | ): 85 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 86 | 87 | batch_size = input_ids.shape[0] 88 | past_key_values = self.get_prompt(batch_size=batch_size) 89 | prefix_attention_mask =
torch.ones(batch_size, self.pre_seq_len).to(self.roberta.device) 90 | attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) 91 | 92 | outputs = self.roberta( 93 | input_ids, 94 | attention_mask=attention_mask, 95 | token_type_ids=token_type_ids, 96 | position_ids=position_ids, 97 | head_mask=head_mask, 98 | inputs_embeds=inputs_embeds, 99 | output_attentions=output_attentions, 100 | output_hidden_states=output_hidden_states, 101 | return_dict=return_dict, 102 | adapter_names=adapter_names, 103 | past_key_values=past_key_values, 104 | ) 105 | 106 | # pooled_output = outputs[1] 107 | 108 | # pooled_output = self.dropout(pooled_output) 109 | # logits = self.classifier(pooled_output) 110 | 111 | sequence_output = outputs[0] 112 | logits = self.classifier(sequence_output) 113 | 114 | loss = None 115 | if labels is not None: 116 | if self.config.problem_type is None: 117 | if self.num_labels == 1: 118 | self.config.problem_type = "regression" 119 | elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 120 | self.config.problem_type = "single_label_classification" 121 | else: 122 | self.config.problem_type = "multi_label_classification" 123 | 124 | if self.config.problem_type == "regression": 125 | loss_fct = MSELoss() 126 | if self.num_labels == 1: 127 | loss = loss_fct(logits.squeeze(), labels.squeeze()) 128 | else: 129 | loss = loss_fct(logits, labels) 130 | elif self.config.problem_type == "single_label_classification": 131 | loss_fct = CrossEntropyLoss() 132 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 133 | elif self.config.problem_type == "multi_label_classification": 134 | loss_fct = BCEWithLogitsLoss() 135 | loss = loss_fct(logits, labels) 136 | if not return_dict: 137 | output = (logits,) + outputs[2:] 138 | return ((loss,) + output) if loss is not None else output 139 | 140 | return SequenceClassifierOutput( 141 | loss=loss, 142 | logits=logits, 143 | hidden_states=outputs.hidden_states, 144 | attentions=outputs.attentions, 145 | ) 146 | 147 | 148 | 149 | 150 | 151 | 152 | class RobertaClassificationHead(nn.Module): 153 | """Head for sentence-level classification tasks.""" 154 | 155 | def __init__(self, config): 156 | super().__init__() 157 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 158 | classifier_dropout = ( 159 | config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob 160 | ) 161 | self.dropout = nn.Dropout(classifier_dropout) 162 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 163 | 164 | def forward(self, features, **kwargs): 165 | x = features[:, 0, :] # take <s> token (equiv.
to [CLS]) 166 | x = self.dropout(x) 167 | x = self.dense(x) 168 | x = torch.tanh(x) 169 | x = self.dropout(x) 170 | x = self.out_proj(x) 171 | return x 172 | 173 | 174 | class RobertaLoraForSequenceClassification(ModelWithHeadsAdaptersMixin, RobertaPreTrainedModel): 175 | def __init__(self, config): 176 | super().__init__(config) 177 | self.num_labels = config.num_labels 178 | self.config = config 179 | self.roberta = RobertaModel(config) 180 | self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) 181 | # self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels) 182 | self.classifier = RobertaClassificationHead(config) 183 | self.init_weights() 184 | 185 | for name, param in self.roberta.named_parameters(): 186 | if "lora" not in name.lower(): 187 | param.requires_grad = False 188 | 189 | bert_param = 0 190 | for name, param in self.roberta.named_parameters(): 191 | bert_param += param.numel() 192 | all_param = 0 193 | for name, param in self.named_parameters(): 194 | all_param += param.numel() 195 | total_param = all_param - bert_param 196 | print('total param is {}'.format(total_param)) # 9860105 197 | 198 | 199 | 200 | def forward( 201 | self, 202 | input_ids=None, 203 | attention_mask=None, 204 | token_type_ids=None, 205 | position_ids=None, 206 | head_mask=None, 207 | inputs_embeds=None, 208 | labels=None, 209 | output_attentions=None, 210 | output_hidden_states=None, 211 | return_dict=None, 212 | adapter_names=None, 213 | head=None, 214 | **kwargs 215 | ): 216 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 217 | 218 | outputs = self.roberta( 219 | input_ids, 220 | attention_mask=attention_mask, 221 | token_type_ids=token_type_ids, 222 | position_ids=position_ids, 223 | head_mask=head_mask, 224 | inputs_embeds=inputs_embeds, 225 | output_attentions=output_attentions, 226 | output_hidden_states=output_hidden_states, 227 | return_dict=return_dict, 228 | adapter_names=adapter_names 229 | ) 230 | 231 | sequence_output = outputs[0] 232 | logits = self.classifier(sequence_output) 233 | 234 | loss = None 235 | if labels is not None: 236 | if self.config.problem_type is None: 237 | if self.num_labels == 1: 238 | self.config.problem_type = "regression" 239 | elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 240 | self.config.problem_type = "single_label_classification" 241 | else: 242 | self.config.problem_type = "multi_label_classification" 243 | 244 | if self.config.problem_type == "regression": 245 | loss_fct = MSELoss() 246 | if self.num_labels == 1: 247 | loss = loss_fct(logits.squeeze(), labels.squeeze()) 248 | else: 249 | loss = loss_fct(logits, labels) 250 | elif self.config.problem_type == "single_label_classification": 251 | loss_fct = CrossEntropyLoss() 252 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 253 | elif self.config.problem_type == "multi_label_classification": 254 | loss_fct = BCEWithLogitsLoss() 255 | loss = loss_fct(logits, labels) 256 | if not return_dict: 257 | output = (logits,) + outputs[2:] 258 | return ((loss,) + output) if loss is not None else output 259 | 260 | return SequenceClassifierOutput( 261 | loss=loss, 262 | logits=logits, 263 | hidden_states=outputs.hidden_states, 264 | attentions=outputs.attentions, 265 | ) -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | from 
model.sequence_classification import ( 5 | RobertaPrefixForSequenceClassification, 6 | RobertaLoraForSequenceClassification 7 | ) 8 | 9 | 10 | from model.multiple_choice import ( 11 | RobertaPrefixForMultipleChoice, 12 | RobertaLoraForMultipleChoice 13 | ) 14 | 15 | from transformers import ( 16 | AutoConfig, 17 | AutoModelForTokenClassification, 18 | AutoModelForSequenceClassification, 19 | AutoModelForQuestionAnswering, 20 | AutoModelForMultipleChoice 21 | ) 22 | 23 | 24 | 25 | class TaskType(Enum): 26 | TOKEN_CLASSIFICATION = 1 27 | SEQUENCE_CLASSIFICATION = 2 28 | QUESTION_ANSWERING = 3 29 | MULTIPLE_CHOICE = 4 30 | 31 | PREFIX_MODELS = { 32 | 33 | "roberta": { 34 | TaskType.SEQUENCE_CLASSIFICATION: RobertaPrefixForSequenceClassification, 35 | TaskType.MULTIPLE_CHOICE: RobertaPrefixForMultipleChoice, 36 | 37 | }, 38 | } 39 | 40 | 41 | 42 | AUTO_MODELS = { 43 | TaskType.TOKEN_CLASSIFICATION: AutoModelForTokenClassification, 44 | TaskType.SEQUENCE_CLASSIFICATION: AutoModelForSequenceClassification, 45 | TaskType.QUESTION_ANSWERING: AutoModelForQuestionAnswering, 46 | TaskType.MULTIPLE_CHOICE: AutoModelForMultipleChoice, 47 | } 48 | 49 | LORA_MODELS = { 50 | "roberta": { 51 | TaskType.SEQUENCE_CLASSIFICATION: RobertaLoraForSequenceClassification, 52 | TaskType.MULTIPLE_CHOICE: RobertaLoraForMultipleChoice, 53 | }, 54 | 55 | } 56 | 57 | def get_model(model_args, task_type: TaskType, config: AutoConfig, fix_bert: bool = False): 58 | if model_args.prefix: 59 | config.hidden_dropout_prob = model_args.hidden_dropout_prob 60 | config.pre_seq_len = model_args.pre_seq_len 61 | config.prefix_projection = model_args.prefix_projection 62 | config.prefix_hidden_size = model_args.prefix_hidden_size 63 | 64 | model_class = PREFIX_MODELS[config.model_type][task_type] 65 | model = model_class.from_pretrained( 66 | model_args.model_name_or_path, 67 | config=config, 68 | revision=model_args.model_revision, 69 | ) 70 | elif model_args.lora: 71 | config.lora = model_args.lora 72 | config.lora_r = model_args.lora_r 73 | config.lora_alpha = model_args.lora_alpha 74 | model_class = LORA_MODELS[config.model_type][task_type] 75 | model = model_class.from_pretrained( 76 | model_args.model_name_or_path, 77 | config=config, 78 | revision=model_args.model_revision, 79 | ) 80 | else: 81 | model_class = AUTO_MODELS[task_type] 82 | model = model_class.from_pretrained( 83 | model_args.model_name_or_path, 84 | config=config, 85 | revision=model_args.model_revision, 86 | ) 87 | 88 | bert_param = 0 89 | if fix_bert: 90 | if config.model_type == "bert": 91 | for param in model.bert.parameters(): 92 | param.requires_grad = False 93 | for _, param in model.bert.named_parameters(): 94 | bert_param += param.numel() 95 | elif config.model_type == "roberta": 96 | for param in model.roberta.parameters(): 97 | param.requires_grad = False 98 | for _, param in model.roberta.named_parameters(): 99 | bert_param += param.numel() 100 | elif config.model_type == "deberta": 101 | for param in model.deberta.parameters(): 102 | param.requires_grad = False 103 | for _, param in model.deberta.named_parameters(): 104 | bert_param += param.numel() 105 | all_param = 0 106 | for _, param in model.named_parameters(): 107 | all_param += param.numel() 108 | total_param = all_param - bert_param 109 | print('***** total param is {} *****'.format(total_param)) 110 | return model 111 | 112 | 113 | -------------------------------------------------------------------------------- /requirements.txt:
-------------------------------------------------------------------------------- 1 | adapter-transformers==2.2.0 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | albumentations==0.4.3 5 | async-timeout==4.0.1 6 | asynctest==0.13.0 7 | attrs==21.2.0 8 | audioread==2.1.9 9 | backports.shutil-get-terminal-size==1.0.0 10 | blessings==1.7 11 | certifi==2021.10.8 12 | cffi==1.15.0 13 | charset-normalizer==2.0.9 14 | click==8.0.3 15 | configparser==5.2.0 16 | ConfigSpace==0.4.20 17 | cycler==0.11.0 18 | Cython==0.29.26 19 | datasets==1.15.1 20 | decorator==5.1.0 21 | dill==0.3.4 22 | docker-pycreds==0.4.0 23 | docopt==0.6.2 24 | filelock==3.4.0 25 | fonttools==4.28.5 26 | frozenlist==1.2.0 27 | fsspec==2021.11.1 28 | gitdb==4.0.9 29 | GitPython==3.1.24 30 | googledrivedownloader==0.4 31 | gpustat==0.6.0 32 | hpbandster==0.7.4 33 | huggingface-hub==0.2.1 34 | idna==3.3 35 | imageio==2.13.4 36 | imgaug==0.2.6 37 | importlib-metadata==4.8.2 38 | joblib==1.1.0 39 | kiwisolver==1.3.2 40 | librosa==0.7.2 41 | llvmlite==0.37.0 42 | matplotlib==3.5.1 43 | multidict==5.2.0 44 | multiprocess==0.70.12.2 45 | netifaces==0.11.0 46 | networkx==2.6.3 47 | numba==0.49.1 48 | numpy==1.21.5 49 | nvidia-ml-py3==7.352.0 50 | opencv-python==4.5.4.60 51 | packaging==21.3 52 | pandas==1.3.4 53 | pathtools==0.1.2 54 | patsy==0.5.2 55 | Pillow==8.4.0 56 | pipreqs==0.4.11 57 | promise==2.3 58 | protobuf==3.19.1 59 | psutil==5.8.0 60 | pyarrow==6.0.1 61 | pycparser==2.21 62 | pyparsing==3.0.6 63 | Pyro4==4.81 64 | python-dateutil==2.8.2 65 | pytz==2021.3 66 | PyWavelets==1.2.0 67 | PyYAML==6.0 68 | regex==2021.11.10 69 | requests==2.26.0 70 | resampy==0.2.2 71 | sacremoses==0.0.46 72 | scikit-image==0.19.1 73 | scikit-learn==1.0.1 74 | scipy==1.7.3 75 | sentry-sdk==1.5.0 76 | seqeval==1.2.2 77 | serpent==1.40 78 | shortuuid==1.0.8 79 | six==1.16.0 80 | smmap==5.0.0 81 | SoundFile==0.10.3.post1 82 | statsmodels==0.13.1 83 | subprocess32==3.5.4 84 | tabulate==0.8.6 85 | tensorboardX==2.0 86 | termcolor==1.1.0 87 | threadpoolctl==3.0.0 88 | tifffile==2021.11.2 89 | tokenizers==0.10.3 90 | torch==1.8.1 91 | torchvision==0.9.1 92 | tqdm==4.62.3 93 | transformers==4.5.0 94 | typing_extensions==4.0.1 95 | urllib3==1.26.7 96 | wandb==0.12.7 97 | xxhash==2.0.2 98 | yarg==0.1.9 99 | yarl==1.7.2 100 | yaspin==2.1.0 101 | zipp==3.6.0 102 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import numpy as np 5 | from typing import Dict 6 | 7 | import datasets 8 | import transformers 9 | from transformers import set_seed, Trainer 10 | from transformers.trainer_utils import get_last_checkpoint 11 | from transformers import EarlyStoppingCallback 12 | 13 | from arguments import get_args 14 | 15 | from tasks.utils import * 16 | import wandb 17 | os.environ["WANDB_DISABLED"] = "true" 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | def train(trainer, resume_from_checkpoint=None, last_checkpoint=None): 22 | checkpoint = None 23 | if resume_from_checkpoint is not None: 24 | checkpoint = resume_from_checkpoint 25 | elif last_checkpoint is not None: 26 | checkpoint = last_checkpoint 27 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 28 | # trainer.save_model() 29 | 30 | # metrics = train_result.metrics 31 | trainer.save_model() 32 | # trainer.log_metrics("train", metrics) 33 | # trainer.save_metrics("train", metrics) 34 | # trainer.save_state() 35 | 36 | 
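# Note: the launch scripts under scripts/ pass --load_best_model_at_end together with
# --metric_for_best_model loss and --greater_is_better False, so trainer.train() has
# already restored the checkpoint with the lowest eval loss by the time save_model()
# above runs; output_dir therefore ends up holding the best model, not the last epoch.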
# trainer.log_best_metrics() 37 | 38 | def evaluate(trainer): 39 | logger.info("*** Evaluate ***") 40 | metrics = trainer.evaluate() 41 | 42 | trainer.log_metrics("eval", metrics) 43 | trainer.save_metrics("eval", metrics) 44 | 45 | def predict(trainer, predict_dataset=None): 46 | logger.info("*** Predict ***") 47 | predictions = trainer.predict(predict_dataset, metric_key_prefix="predict") 48 | 49 | trainer.log_metrics("predict", predictions.metrics) 50 | trainer.save_metrics("predict", predictions.metrics) 51 | 52 | if __name__ == '__main__': 53 | 54 | args = get_args() 55 | 56 | _, data_args, training_args, _, adapter_args = args 57 | 58 | logging.basicConfig( 59 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 60 | datefmt="%m/%d/%Y %H:%M:%S", 61 | handlers=[logging.StreamHandler(sys.stdout)], 62 | ) 63 | # wandb.init(project=data_args.task_name, name=training_args.run_name) 64 | log_level = training_args.get_process_log_level() 65 | logger.setLevel(log_level) 66 | datasets.utils.logging.set_verbosity(log_level) 67 | transformers.utils.logging.set_verbosity(log_level) 68 | transformers.utils.logging.enable_default_handler() 69 | transformers.utils.logging.enable_explicit_format() 70 | 71 | # Log on each process the small summary: 72 | logger.warning( 73 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 74 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 75 | ) 76 | logger.info(f"Training/evaluation parameters {training_args}") 77 | 78 | # print("***************************", training_args.local_rank) 79 | if not os.path.isdir("checkpoints"): 80 | os.mkdir("checkpoints") 81 | 82 | if data_args.task_name.lower() == "superglue": 83 | assert data_args.dataset_name.lower() in SUPERGLUE_DATASETS 84 | from tasks.superglue.get_trainer import get_trainer 85 | 86 | elif data_args.task_name.lower() == "glue": 87 | assert data_args.dataset_name.lower() in GLUE_DATASETS 88 | from tasks.glue.get_trainer import get_trainer 89 | 90 | else: 91 | raise NotImplementedError('Task {} is not implemented. Please choose a task from: {}'.format(data_args.task_name, ", ".join(TASKS))) 92 | 93 | set_seed(training_args.seed) 94 | 95 | trainer, predict_dataset = get_trainer(args) 96 | 97 | last_checkpoint = None 98 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 99 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 100 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 101 | raise ValueError( 102 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 103 | "Use --overwrite_output_dir to overcome." 104 | ) 105 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 106 | logger.info( 107 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 108 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
109 | ) 110 | 111 | 112 | if training_args.do_train: 113 | train(trainer, training_args.resume_from_checkpoint, last_checkpoint) 114 | 115 | # if training_args.do_eval: 116 | # evaluate(trainer) 117 | 118 | if training_args.do_predict: 119 | predict(trainer, predict_dataset) 120 | 121 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --logging_steps 5 \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --train_adapter\ 34 | --adapter_config pfeiffer \ 35 | --adapter_reduction_factor 64 36 | done 37 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 
| --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_mnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mnli 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 
101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_qnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qnli 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- 
/scripts/mulitruns_scripts/adapter/run_qqp_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qqp 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | 
--load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 16 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 64 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/adapter/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | 
--num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-adapter/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --train_adapter\ 33 | --adapter_config pfeiffer \ 34 | --adapter_reduction_factor 2 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --logging_steps 5 \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --bitfit 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | 
--do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | 
--task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 9 | do 10 | python3 run.py \ 11 | --model_name_or_path roberta-base \ 12 | --task_name $TASK_NAME \ 13 | --dataset_name $DATASET_NAME \ 14 | --do_train \ 15 | --do_eval \ 16 | --do_predict \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size $bs \ 19 | --learning_rate $lr \ 20 | --num_train_epochs $epoch \ 21 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 22 | --overwrite_output_dir \ 23 | --hidden_dropout_prob $dropout \ 24 | --seed $model_seed \ 25 | --model_seed $model_seed\ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --bitfit 32 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | 
--model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/bitfit/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=32 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-bitfit/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --bitfit 33 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 
777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=5e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --logging_steps 5 \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --warmup_ratio $wr 35 | done 36 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=32 5 | lr=5e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | 
epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_mnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mnli 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=16 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_multirc_roberta_both.sh:
-------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=32 5 | lr=5e-6 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_qnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qnli 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_qqp_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qqp 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 
-------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed\ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --warmup_ratio $wr 33 | done 34 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.06 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | 
--load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=32 5 | lr=1e-5 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/finetuning/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | wr=0.0 9 | 10 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 11 | do 12 | python3 run.py \ 13 | --model_name_or_path roberta-base \ 14 | --task_name $TASK_NAME \ 15 | --dataset_name $DATASET_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict \ 19 | --max_seq_length 128 \ 20 | --per_device_train_batch_size $bs \ 21 | --learning_rate $lr \ 22 | --num_train_epochs $epoch \ 23 | --output_dir both_seeds/$DATASET_NAME-roberta-ft/$seed-$model_seed/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob $dropout \ 26 | --seed $model_seed \ 27 | --model_seed $model_seed \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --warmup_ratio $wr 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir test/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 
27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 
19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=16 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for 
model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed\ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=5e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=32 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=16 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- 
/scripts/mulitruns_scripts/lora/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/lora/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=16 5 | lr=1e-4 6 | dropout=0.1 7 | epoch=50 8 | lora_alpha=8 9 | lora_r=8 10 | 11 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir both_seeds/$DATASET_NAME-roberta-lora/$seed-$model_seed/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob $dropout \ 27 | --seed $model_seed \ 28 | --model_seed $model_seed \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --load_best_model_at_end\ 32 | --metric_for_best_model loss\ 33 | --greater_is_better False \ 34 | --lora \ 35 | --lora_alpha $lora_alpha \ 36 | --lora_r $lora_r 37 | done 38 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_boolq_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=boolq 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | 
--evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_cb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=cb 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --logging_steps 5 \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --prefix \ 34 | --pre_seq_len 8 35 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_cola_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=cola 3 | 4 | bs=16 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 8 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_copa_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=copa 3 | 4 | bs=16 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | 
--overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_mnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mnli 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_mrpc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=mrpc 3 | 4 | bs=16 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 16 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_multirc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=multirc 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate 
$lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 8 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_qnli_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qnli 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_qqp_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=qqp 3 | 4 | bs=32 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_rte_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=rte 3 | 4 | bs=16 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 
17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed\ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_sst2_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=sst2 3 | 4 | bs=32 5 | lr=1e-2 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done 35 | -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_stsb_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export DATASET_NAME=stsb 3 | 4 | bs=16 5 | lr=5e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 64 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_wic_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wic 3 | 4 | bs=16 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path 
roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 32 34 | done -------------------------------------------------------------------------------- /scripts/mulitruns_scripts/prefixtuning/run_wsc_roberta_both.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export DATASET_NAME=wsc 3 | 4 | bs=32 5 | lr=1e-3 6 | dropout=0.1 7 | epoch=50 8 | 9 | for model_seed in 1111 2222 3333 4444 5555 6666 7777 8888 9999 101010 111111 222222 333333 444444 555555 666666 777777 888888 999999 10101010 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir both_seeds/$DATASET_NAME-roberta-pt/$seed-$model_seed/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob $dropout \ 25 | --seed $model_seed \ 26 | --model_seed $model_seed \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix \ 33 | --pre_seq_len 8 34 | done -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_adapter.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for rf in 64 16 2 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchadapter/$DATASET_NAME-$bs-$lr-$rf/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --train_adapter \ 32 | --adapter_config pfeiffer \ 33 | --adapter_reduction_factor $rf 34 | done 35 | done 36 | done 37 | 38 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta adapter -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_bitfit.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | 
python3 run.py \ 10 | --model_name_or_path roberta-base \ 11 | --task_name $TASK_NAME \ 12 | --dataset_name $DATASET_NAME \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --max_seq_length 128 \ 17 | --per_device_train_batch_size $bs \ 18 | --learning_rate $lr \ 19 | --num_train_epochs $epoch \ 20 | --output_dir checkpoints/$DATASET_NAME-roberta-searchbitfit/$DATASET_NAME-$bs-$lr/ \ 21 | --overwrite_output_dir \ 22 | --hidden_dropout_prob 0.1 \ 23 | --seed 1111 \ 24 | --save_strategy epoch \ 25 | --evaluation_strategy epoch \ 26 | --load_best_model_at_end\ 27 | --metric_for_best_model loss\ 28 | --greater_is_better False \ 29 | --bitfit 30 | done 31 | done 32 | 33 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta bitfit -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_ft.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for wr in 0.0 0.06 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchft/$DATASET_NAME-$bs-$lr-$wr/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --warmup_ratio $wr 32 | done 33 | done 34 | done 35 | 36 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta ft -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_lora.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for lora_alpha in 8 16 10 | do 11 | for lora_r in 8 16 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir checkpoints/$DATASET_NAME-roberta-searchlora/$DATASET_NAME-$bs-$lr-$lora_alpha/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob 0.1 \ 27 | --seed 1111 \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --lora \ 34 | --lora_alpha $lora_alpha \ 35 | --lora_r $lora_r 36 | done 37 | done 38 | done 39 | done 40 | 41 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta lora -------------------------------------------------------------------------------- /scripts/search_scipts/glue/search_pt.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=glue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4
5e-4 1e-3 5e-3 1e-2 5 | do 6 | for psl in 8 16 32 64 7 | do 8 | for bs in 16 32 9 | do 10 | python3 run.py \ 11 | --model_name_or_path roberta-base \ 12 | --task_name $TASK_NAME \ 13 | --dataset_name $DATASET_NAME \ 14 | --do_train \ 15 | --do_eval \ 16 | --do_predict \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size $bs \ 19 | --learning_rate $lr \ 20 | --num_train_epochs $epoch \ 21 | --pre_seq_len $psl \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchpt/$DATASET_NAME-$bs-$lr-$psl/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --prefix 32 | done 33 | done 34 | done 35 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta pt -------------------------------------------------------------------------------- /scripts/search_scipts/search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | from glob import glob 6 | 7 | from tasks.utils import * 8 | 9 | TASK = sys.argv[1] 10 | MODEL = sys.argv[2] 11 | METHOD = sys.argv[3] 12 | 13 | 14 | SPECIAL_METRICS = { 15 | 'cb' : 'f1', 16 | 'mrpc' : 'f1', 17 | 'cola' : 'matthews_correlation', 18 | 'stsb' : 'combined_score' 19 | } 20 | 21 | METRIC = "accuracy" 22 | if TASK in SPECIAL_METRICS: 23 | METRIC = SPECIAL_METRICS[TASK] 24 | 25 | best_score, best_metrics, best_file_name = 0, None, None  # initialize all three so the prints below cannot raise NameError when no results are found 26 | 27 | files = glob(f"./checkpoints/{TASK}-{MODEL}-search{METHOD}/*/predict_results.json") 28 | 29 | for f in files: 30 | metrics = json.load(open(f, 'r')) 31 | if metrics["predict_"+METRIC] > best_score: 32 | best_score = metrics["predict_"+METRIC] 33 | best_metrics = metrics 34 | best_file_name = f 35 | 36 | print(f"best_{METRIC}: {best_score}") 37 | print(f"best_metrics: {best_metrics}") 38 | print(f"best_file: {best_file_name}") 39 | -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_adapter.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for rf in 64 16 2 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchadapter/$DATASET_NAME-$bs-$lr-$rf/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --train_adapter \ 32 | --adapter_config pfeiffer \ 33 | --adapter_reduction_factor $rf 34 | done 35 | done 36 | done 37 | 38 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta adapter 39 | -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_bitfit.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 |
for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | python3 run.py \ 10 | --model_name_or_path roberta-base \ 11 | --task_name $TASK_NAME \ 12 | --dataset_name $DATASET_NAME \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --max_seq_length 128 \ 17 | --per_device_train_batch_size $bs \ 18 | --learning_rate $lr \ 19 | --num_train_epochs $epoch \ 20 | --output_dir checkpoints/$DATASET_NAME-roberta-searchbitfit/$DATASET_NAME-$bs-$lr/ \ 21 | --overwrite_output_dir \ 22 | --hidden_dropout_prob 0.1 \ 23 | --seed 1111 \ 24 | --save_strategy epoch \ 25 | --evaluation_strategy epoch \ 26 | --load_best_model_at_end\ 27 | --metric_for_best_model loss\ 28 | --greater_is_better False \ 29 | --bitfit 30 | done 31 | done 32 | 33 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta bitfit -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_ft.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for wr in 0.0 0.06 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --output_dir checkpoints/$DATASET_NAME-roberta-searchft/$DATASET_NAME-$bs-$lr-$wr/ \ 23 | --overwrite_output_dir \ 24 | --hidden_dropout_prob 0.1 \ 25 | --seed 1111 \ 26 | --save_strategy epoch \ 27 | --evaluation_strategy epoch \ 28 | --load_best_model_at_end\ 29 | --metric_for_best_model loss\ 30 | --greater_is_better False \ 31 | --warmup_ratio $wr 32 | done 33 | done 34 | done 35 | 36 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta ft -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_lora.sh: -------------------------------------------------------------------------------- 1 | export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for bs in 16 32 8 | do 9 | for lora_alpha in 8 16 10 | do 11 | for lora_r in 8 16 12 | do 13 | python3 run.py \ 14 | --model_name_or_path roberta-base \ 15 | --task_name $TASK_NAME \ 16 | --dataset_name $DATASET_NAME \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_predict \ 20 | --max_seq_length 128 \ 21 | --per_device_train_batch_size $bs \ 22 | --learning_rate $lr \ 23 | --num_train_epochs $epoch \ 24 | --output_dir checkpoints/$DATASET_NAME-roberta-searchlora/$DATASET_NAME-$bs-$lr-$lora_alpha/ \ 25 | --overwrite_output_dir \ 26 | --hidden_dropout_prob 0.1 \ 27 | --seed 1111 \ 28 | --save_strategy epoch \ 29 | --evaluation_strategy epoch \ 30 | --load_best_model_at_end\ 31 | --metric_for_best_model loss\ 32 | --greater_is_better False \ 33 | --lora \ 34 | --lora_alpha $lora_alpha \ 35 | --lora_r $lora_r 36 | done 37 | done 38 | done 39 | done 40 | 41 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta lora -------------------------------------------------------------------------------- /scripts/search_scipts/superglue/search_pt.sh: -------------------------------------------------------------------------------- 1 
| export TASK_NAME=superglue 2 | export epoch=50 3 | export DATASET_NAME=$1 4 | 5 | for lr in 1e-6 5e-6 1e-5 5e-5 1e-4 5e-4 1e-3 5e-3 1e-2 6 | do 7 | for psl in 8 16 32 64 8 | do 9 | for bs in 16 32 10 | do 11 | python3 run.py \ 12 | --model_name_or_path roberta-base \ 13 | --task_name $TASK_NAME \ 14 | --dataset_name $DATASET_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size $bs \ 20 | --learning_rate $lr \ 21 | --num_train_epochs $epoch \ 22 | --pre_seq_len $psl \ 23 | --output_dir checkpoints/$DATASET_NAME-roberta-searchpt/$DATASET_NAME-$bs-$lr-$psl/ \ 24 | --overwrite_output_dir \ 25 | --hidden_dropout_prob 0.1 \ 26 | --seed 1111 \ 27 | --save_strategy epoch \ 28 | --evaluation_strategy epoch \ 29 | --load_best_model_at_end\ 30 | --metric_for_best_model loss\ 31 | --greater_is_better False \ 32 | --prefix 33 | done 34 | done 35 | done 36 | 37 | python3 ./scripts/search_scipts/search.py $DATASET_NAME roberta pt -------------------------------------------------------------------------------- /tasks/glue/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | from torch.utils.data import Dataset 4 | from datasets.arrow_dataset import Dataset as HFDataset 5 | from datasets.load import load_dataset, load_metric 6 | from transformers import ( 7 | AutoTokenizer, 8 | DataCollatorWithPadding, 9 | EvalPrediction, 10 | default_data_collator, 11 | ) 12 | import numpy as np 13 | import logging 14 | 15 | task_to_keys = { 16 | "cola": ("sentence", None), 17 | "mnli": ("premise", "hypothesis"), 18 | "mrpc": ("sentence1", "sentence2"), 19 | "qnli": ("question", "sentence"), 20 | "qqp": ("question1", "question2"), 21 | "rte": ("sentence1", "sentence2"), 22 | "sst2": ("sentence", None), 23 | "stsb": ("sentence1", "sentence2"), 24 | "wnli": ("sentence1", "sentence2"), 25 | } 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class GlueDataset(): 31 | def __init__(self, tokenizer: AutoTokenizer, data_args, training_args) -> None: 32 | super().__init__() 33 | raw_datasets = load_dataset("glue", data_args.dataset_name) 34 | self.tokenizer = tokenizer 35 | self.data_args = data_args 36 | # labels 37 | self.is_regression = data_args.dataset_name == "stsb" 38 | if not self.is_regression: 39 | self.label_list = raw_datasets["train"].features["label"].names 40 | self.num_labels = len(self.label_list) 41 | else: 42 | self.num_labels = 1 43 | 44 | # Preprocessing the raw_datasets 45 | self.sentence1_key, self.sentence2_key = task_to_keys[data_args.dataset_name] 46 | 47 | # Padding strategy 48 | if data_args.pad_to_max_length: 49 | self.padding = "max_length" 50 | else: 51 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 52 | self.padding = False 53 | 54 | # Some models have set the order of the labels to use, so let's make sure we do use it. 55 | if not self.is_regression: 56 | self.label2id = {l: i for i, l in enumerate(self.label_list)} 57 | self.id2label = {id: label for label, id in self.label2id.items()} 58 | 59 | if data_args.max_seq_length > tokenizer.model_max_length: 60 | logger.warning( 61 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 62 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
63 | ) 64 | self.max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 65 | 66 | raw_datasets = raw_datasets.map( 67 | self.preprocess_function, 68 | batched=True, 69 | load_from_cache_file=not data_args.overwrite_cache, 70 | desc="Running tokenizer on dataset", 71 | ) 72 | if training_args.do_train: 73 | train_dataset = raw_datasets["train"].train_test_split(test_size=0.1, shuffle=False) 74 | self.train_dataset, self.eval_dataset = train_dataset['train'], train_dataset['test'] 75 | if data_args.max_train_samples is not None: 76 | self.train_dataset = self.train_dataset.select(range(data_args.max_train_samples)) 77 | 78 | if training_args.do_predict: 79 | self.predict_dataset = raw_datasets["validation_matched" if data_args.dataset_name == "mnli" else "validation"] 80 | if data_args.max_predict_samples is not None: 81 | self.predict_dataset = self.predict_dataset.select(range(data_args.max_predict_samples)) 82 | 83 | # if training_args.do_train: 84 | # self.train_dataset = raw_datasets["train"] 85 | # if data_args.max_train_samples is not None: 86 | # self.train_dataset = self.train_dataset.select(range(data_args.max_train_samples)) 87 | 88 | # if training_args.do_eval: 89 | # self.eval_dataset = raw_datasets["validation_matched" if data_args.dataset_name == "mnli" else "validation"] 90 | # if data_args.max_eval_samples is not None: 91 | # self.eval_dataset = self.eval_dataset.select(range(data_args.max_eval_samples)) 92 | 93 | # if training_args.do_predict or data_args.dataset_name is not None or data_args.test_file is not None: 94 | # self.predict_dataset = raw_datasets["test_matched" if data_args.dataset_name == "mnli" else "test"] 95 | # if data_args.max_predict_samples is not None: 96 | # self.predict_dataset = self.predict_dataset.select(range(data_args.max_predict_samples)) 97 | 98 | self.metric = load_metric("tasks/glue/glue.py", data_args.dataset_name) 99 | 100 | if data_args.pad_to_max_length: 101 | self.data_collator = default_data_collator 102 | elif training_args.fp16: 103 | self.data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 104 | else: 105 | self.data_collator = None  # Trainer falls back to its default collator; without this branch the attribute was never set 106 | def preprocess_function(self, examples): 107 | # Tokenize the texts 108 | args = ( 109 | (examples[self.sentence1_key],) if self.sentence2_key is None else (examples[self.sentence1_key], examples[self.sentence2_key]) 110 | ) 111 | result = self.tokenizer(*args, padding=self.padding, max_length=self.max_seq_length, truncation=True) 112 | 113 | return result 114 | 115 | def compute_metrics(self, p: EvalPrediction): 116 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 117 | preds = np.squeeze(preds) if self.is_regression else np.argmax(preds, axis=1) 118 | if self.data_args.dataset_name is not None: 119 | result = self.metric.compute(predictions=preds, references=p.label_ids) 120 | if len(result) > 1: 121 | result["combined_score"] = np.mean(list(result.values())).item() 122 | return result 123 | elif self.is_regression: 124 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 125 | else: 126 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 127 | 128 | 129 | -------------------------------------------------------------------------------- /tasks/glue/get_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import sys 5 | 6 | from transformers import ( 7 | AutoConfig, 8 | AutoTokenizer, 9 | AdapterConfig 10 | ) 11 |
12 | from model.utils import get_model, TaskType 13 | from tasks.glue.dataset import GlueDataset 14 | from training.trainer_base import BaseTrainer 15 | from transformers import Trainer, AdapterTrainer, EarlyStoppingCallback 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | def get_trainer(args): 20 | model_args, data_args, training_args, _, adapter_args = args 21 | 22 | tokenizer = AutoTokenizer.from_pretrained( 23 | model_args.model_name_or_path, 24 | use_fast=model_args.use_fast_tokenizer, 25 | revision=model_args.model_revision, 26 | ) 27 | dataset = GlueDataset(tokenizer, data_args, training_args) 28 | 29 | if not dataset.is_regression: 30 | config = AutoConfig.from_pretrained( 31 | model_args.model_name_or_path, 32 | num_labels=dataset.num_labels, 33 | label2id=dataset.label2id, 34 | id2label=dataset.id2label, 35 | finetuning_task=data_args.dataset_name, 36 | revision=model_args.model_revision, 37 | ) 38 | else: 39 | config = AutoConfig.from_pretrained( 40 | model_args.model_name_or_path, 41 | num_labels=dataset.num_labels, 42 | finetuning_task=data_args.dataset_name, 43 | revision=model_args.model_revision, 44 | ) 45 | config.lora = False 46 | model = get_model(model_args, TaskType.SEQUENCE_CLASSIFICATION, config) 47 | 48 | if adapter_args.train_adapter: 49 | logger.info(f"Reduction Factor: {adapter_args.adapter_reduction_factor}") 50 | task_name = data_args.task_name or "glue" # fallback adapter name for GLUE runs 51 | # check if adapter already exists, otherwise add it 52 | if task_name not in model.config.adapters: 53 | # resolve the adapter config 54 | adapter_config = AdapterConfig.load( 55 | adapter_args.adapter_config, 56 | non_linearity=adapter_args.adapter_non_linearity, 57 | reduction_factor=adapter_args.adapter_reduction_factor, 58 | ) 59 | # load a pre-trained adapter from the Hub if specified 60 | # if adapter_args.load_adapter: 61 | # model.load_adapter( 62 | # adapter_args.load_adapter, 63 | # config=adapter_config, 64 | # load_as=task_name, 65 | # ) 66 | # # otherwise, add a fresh adapter 67 | # else: 68 | model.add_adapter(task_name, config=adapter_config) 69 | # Freeze all model weights except those of this adapter 70 | model.train_adapter([task_name]) 71 | # Set the adapters to be used in every forward pass 72 | model.set_active_adapters(task_name) 73 | else: 74 | if adapter_args.load_adapter: 75 | raise ValueError( 76 | "Adapters can only be loaded in adapter training mode. "
77 | "Use --train_adapter to enable adapter training" 78 | ) 79 | if model_args.bitfit: 80 | for name, param in model.named_parameters(): 81 | if name.startswith('roberta') and "bias" not in name.lower(): 82 | param.requires_grad = False 83 | param_optimizer = list(model.named_parameters()) 84 | logger.info("Trainable parameters:") 85 | for n, p in param_optimizer: 86 | if p.requires_grad: 87 | logger.info(f"{n}") 88 | # print(n) 89 | 90 | trainer_cls = AdapterTrainer if adapter_args.train_adapter else Trainer 91 | trainer = trainer_cls( 92 | model=model, 93 | args=training_args, 94 | train_dataset=dataset.train_dataset if training_args.do_train else None, 95 | eval_dataset=dataset.eval_dataset if training_args.do_eval else None, 96 | compute_metrics=dataset.compute_metrics, 97 | tokenizer=tokenizer, 98 | data_collator=dataset.data_collator, 99 | callbacks = [EarlyStoppingCallback(early_stopping_patience=10)] 100 | ) 101 | 102 | return trainer, dataset.predict_dataset -------------------------------------------------------------------------------- /tasks/glue/glue.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ GLUE benchmark metric. """ 16 | 17 | from scipy.stats import pearsonr, spearmanr 18 | from sklearn.metrics import f1_score, matthews_corrcoef 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{wang2019glue, 25 | title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, 26 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, 27 | note={In the Proceedings of ICLR.}, 28 | year={2019} 29 | } 30 | """ 31 | 32 | _DESCRIPTION = """\ 33 | GLUE, the General Language Understanding Evaluation benchmark 34 | (https://gluebenchmark.com/) is a collection of resources for training, 35 | evaluating, and analyzing natural language understanding systems. 36 | """ 37 | 38 | _KWARGS_DESCRIPTION = """ 39 | Compute GLUE evaluation metric associated to each GLUE dataset. 40 | Args: 41 | predictions: list of predictions to score. 42 | Each translation should be tokenized into a list of tokens. 43 | references: list of lists of references for each translation. 44 | Each reference should be tokenized into a list of tokens. 
45 | Returns: depending on the GLUE subset, one or several of: 46 | "accuracy": Accuracy 47 | "f1": F1 score 48 | "pearson": Pearson Correlation 49 | "spearmanr": Spearman Correlation 50 | "matthews_correlation": Matthew Correlation 51 | Examples: 52 | 53 | >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"] 54 | >>> references = [0, 1] 55 | >>> predictions = [0, 1] 56 | >>> results = glue_metric.compute(predictions=predictions, references=references) 57 | >>> print(results) 58 | {'accuracy': 1.0} 59 | 60 | >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp' 61 | >>> references = [0, 1] 62 | >>> predictions = [0, 1] 63 | >>> results = glue_metric.compute(predictions=predictions, references=references) 64 | >>> print(results) 65 | {'accuracy': 1.0, 'f1': 1.0} 66 | 67 | >>> glue_metric = datasets.load_metric('glue', 'stsb') 68 | >>> references = [0., 1., 2., 3., 4., 5.] 69 | >>> predictions = [0., 1., 2., 3., 4., 5.] 70 | >>> results = glue_metric.compute(predictions=predictions, references=references) 71 | >>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)}) 72 | {'pearson': 1.0, 'spearmanr': 1.0} 73 | 74 | >>> glue_metric = datasets.load_metric('glue', 'cola') 75 | >>> references = [0, 1] 76 | >>> predictions = [0, 1] 77 | >>> results = glue_metric.compute(predictions=predictions, references=references) 78 | >>> print(results) 79 | {'matthews_correlation': 1.0} 80 | """ 81 | 82 | 83 | def simple_accuracy(preds, labels): 84 | return float((preds == labels).mean()) 85 | 86 | 87 | def acc_and_f1(preds, labels): 88 | acc = simple_accuracy(preds, labels) 89 | f1 = float(f1_score(y_true=labels, y_pred=preds)) 90 | return { 91 | "accuracy": acc, 92 | "f1": f1, 93 | } 94 | 95 | 96 | def pearson_and_spearman(preds, labels): 97 | pearson_corr = float(pearsonr(preds, labels)[0]) 98 | spearman_corr = float(spearmanr(preds, labels)[0]) 99 | return { 100 | "pearson": pearson_corr, 101 | "spearmanr": spearman_corr, 102 | } 103 | 104 | 105 | @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 106 | class Glue(datasets.Metric): 107 | def _info(self): 108 | if self.config_name not in [ 109 | "sst2", 110 | "mnli", 111 | "mnli_mismatched", 112 | "mnli_matched", 113 | "cola", 114 | "stsb", 115 | "mrpc", 116 | "qqp", 117 | "qnli", 118 | "rte", 119 | "wnli", 120 | "hans", 121 | ]: 122 | raise KeyError( 123 | "You should supply a configuration name selected in " 124 | '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' 125 | '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' 126 | ) 127 | return datasets.MetricInfo( 128 | description=_DESCRIPTION, 129 | citation=_CITATION, 130 | inputs_description=_KWARGS_DESCRIPTION, 131 | features=datasets.Features( 132 | { 133 | "predictions": datasets.Value("int64" if self.config_name != "stsb" else "float32"), 134 | "references": datasets.Value("int64" if self.config_name != "stsb" else "float32"), 135 | } 136 | ), 137 | codebase_urls=[], 138 | reference_urls=[], 139 | format="numpy", 140 | ) 141 | 142 | def _compute(self, predictions, references): 143 | if self.config_name == "cola": 144 | return {"matthews_correlation": matthews_corrcoef(references, predictions)} 145 | elif self.config_name == "stsb": 146 | return pearson_and_spearman(predictions, references) 147 | elif self.config_name in ["mrpc", "qqp"]: 148 | return acc_and_f1(predictions, references) 149 | elif 
self.config_name in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]: 150 | return {"accuracy": simple_accuracy(predictions, references)} 151 | else: 152 | raise KeyError( 153 | "You should supply a configuration name selected in " 154 | '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' 155 | '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' 156 | ) -------------------------------------------------------------------------------- /tasks/superglue/get_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import sys 5 | import torch 6 | from transformers import ( 7 | AutoConfig, 8 | AutoTokenizer, 9 | AutoModelWithHeads, 10 | AdapterConfig 11 | ) 12 | 13 | from model.utils import get_model, TaskType 14 | 15 | from training.trainer_base import BaseTrainer, BaseAdapterTrainer 16 | from transformers import Trainer, AdapterTrainer, EarlyStoppingCallback, set_seed 17 | from tasks.superglue.dataset import SuperGlueDataset 18 | # from training.trainer import Trainer 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | def get_trainer(args): 23 | model_args, data_args, training_args, _, adapter_args = args 24 | print("set model randome seed ", model_args.model_seed) 25 | set_seed(model_args.model_seed) 26 | log_level = training_args.get_process_log_level() 27 | logger.setLevel(log_level) 28 | 29 | 30 | tokenizer = AutoTokenizer.from_pretrained( 31 | model_args.model_name_or_path, 32 | use_fast=model_args.use_fast_tokenizer, 33 | revision=model_args.model_revision, 34 | ) 35 | # if data_args.dataset_name == 'record': 36 | # dataset = SuperGlueDatasetForRecord(tokenizer, data_args, training_args) 37 | # else: 38 | dataset = SuperGlueDataset(tokenizer, data_args, training_args) 39 | 40 | # if training_args.do_train: 41 | # for index in random.sample(range(len(dataset.train_dataset)), 3): 42 | # logger.info(f"Sample {index} of the training set: {dataset.train_dataset[index]}.") 43 | if not dataset.multiple_choice: 44 | config = AutoConfig.from_pretrained( 45 | model_args.model_name_or_path, 46 | num_labels=dataset.num_labels, 47 | label2id=dataset.label2id, 48 | id2label=dataset.id2label, 49 | finetuning_task=data_args.dataset_name, 50 | revision=model_args.model_revision, 51 | ) 52 | else: 53 | config = AutoConfig.from_pretrained( 54 | model_args.model_name_or_path, 55 | num_labels=dataset.num_labels, 56 | finetuning_task=data_args.dataset_name, 57 | revision=model_args.model_revision, 58 | ) 59 | config.lora = False 60 | if not dataset.multiple_choice: 61 | model = get_model(model_args, TaskType.SEQUENCE_CLASSIFICATION, config) 62 | else: 63 | model = get_model(model_args, TaskType.MULTIPLE_CHOICE, config, fix_bert=True) 64 | 65 | 66 | if adapter_args.train_adapter: 67 | logger.info(f"Reduction Factor: {adapter_args.adapter_reduction_factor}") 68 | task_name = data_args.task_name or "superglue" 69 | # check if adapter already exists, otherwise add it 70 | if task_name not in model.config.adapters: 71 | # resolve the adapter config 72 | adapter_config = AdapterConfig.load( 73 | adapter_args.adapter_config, 74 | non_linearity=adapter_args.adapter_non_linearity, 75 | reduction_factor=adapter_args.adapter_reduction_factor, 76 | ) 77 | 78 | model.add_adapter(task_name, config=adapter_config) 79 | # Freeze all model weights except of those of this adapter 80 | model.train_adapter([task_name]) 81 | # Set the adapters to be used in every forward pass 82 | 
model.set_active_adapters(task_name) 83 | else: 84 | if adapter_args.load_adapter: 85 | raise ValueError( 86 | "Adapters can only be loaded in adapter training mode. " 87 | "Use --train_adapter to enable adapter training" 88 | ) 89 | if model_args.bitfit: 90 | for name, param in model.named_parameters(): 91 | if name.startswith('roberta') and "bias" not in name.lower(): 92 | param.requires_grad = False 93 | param_optimizer = list(model.named_parameters()) 94 | logger.info("Trainable parameters:") 95 | trained_param = 0 96 | 97 | for n, p in param_optimizer: 98 | if p.requires_grad: 99 | trained_param += p.numel() 100 | logger.info(f"{n}") 101 | logger.info(f"Total trainable parameters: {trained_param}") 102 | set_seed(training_args.seed) 103 | print("Setting data random seed:", training_args.seed) 104 | trainer_cls = AdapterTrainer if adapter_args.train_adapter else Trainer 105 | trainer = trainer_cls( 106 | model=model, 107 | args=training_args, 108 | train_dataset=dataset.train_dataset if training_args.do_train else None, 109 | eval_dataset=dataset.eval_dataset if training_args.do_eval else None, 110 | compute_metrics=dataset.compute_metrics, 111 | tokenizer=tokenizer, 112 | data_collator=dataset.data_collator, 113 | callbacks=[EarlyStoppingCallback(early_stopping_patience=model_args.patient)] 114 | ) 115 | 116 | 117 | return trainer, dataset.predict_dataset 118 | -------------------------------------------------------------------------------- /tasks/superglue/record_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Official evaluation script for ReCoRD v1.0. 3 | (Some functions are adopted from the SQuAD evaluation script.) 4 | """ 5 | 6 | 7 | import argparse 8 | import json 9 | import re 10 | import string 11 | import sys 12 | from collections import Counter 13 | 14 | 15 | def normalize_answer(s): 16 | """Lower text and remove punctuation, articles and extra whitespace.""" 17 | 18 | def remove_articles(text): 19 | return re.sub(r"\b(a|an|the)\b", " ", text) 20 | 21 | def white_space_fix(text): 22 | return " ".join(text.split()) 23 | 24 | def remove_punc(text): 25 | exclude = set(string.punctuation) 26 | return "".join(ch for ch in text if ch not in exclude) 27 | 28 | def lower(text): 29 | return text.lower() 30 | 31 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 32 | 33 | 34 | def f1_score(prediction, ground_truth): 35 | prediction_tokens = normalize_answer(prediction).split() 36 | ground_truth_tokens = normalize_answer(ground_truth).split() 37 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 38 | num_same = sum(common.values()) 39 | if num_same == 0: 40 | return 0 41 | precision = 1.0 * num_same / len(prediction_tokens) 42 | recall = 1.0 * num_same / len(ground_truth_tokens) 43 | f1 = (2 * precision * recall) / (precision + recall) 44 | return f1 45 | 46 | 47 | def exact_match_score(prediction, ground_truth): 48 | return normalize_answer(prediction) == normalize_answer(ground_truth) 49 | 50 | 51 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 52 | scores_for_ground_truths = [] 53 | for ground_truth in ground_truths: 54 | score = metric_fn(prediction, ground_truth) 55 | scores_for_ground_truths.append(score) 56 | return max(scores_for_ground_truths) 57 | 58 | 59 | def evaluate(dataset, predictions): 60 | f1 = exact_match = total = 0 61 | correct_ids = [] 62 | for passage in dataset: 63 | for qa in passage["qas"]: 64 | total += 1 65 | if qa["id"] not in predictions: 66 | message = "Unanswered question {} will receive score 
0.".format(qa["id"]) 67 | print(message, file=sys.stderr) 68 | continue 69 | 70 | ground_truths = list(map(lambda x: x["text"], qa["answers"])) 71 | prediction = predictions[qa["id"]] 72 | 73 | _exact_match = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths) 74 | if int(_exact_match) == 1: 75 | correct_ids.append(qa["id"]) 76 | exact_match += _exact_match 77 | 78 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) 79 | 80 | exact_match = exact_match / total 81 | f1 = f1 / total 82 | 83 | return {"exact_match": exact_match, "f1": f1}, correct_ids 84 | 85 | 86 | if __name__ == "__main__": 87 | expected_version = "1.0" 88 | parser = argparse.ArgumentParser("Official evaluation script for ReCoRD v1.0.") 89 | parser.add_argument("data_file", help="The dataset file in JSON format.") 90 | parser.add_argument("pred_file", help="The model prediction file in JSON format.") 91 | parser.add_argument("--output_correct_ids", action="store_true", help="Output the correctly answered query IDs.") 92 | args = parser.parse_args() 93 | 94 | with open(args.data_file) as data_file: 95 | dataset_json = json.load(data_file) 96 | if dataset_json["version"] != expected_version: 97 | print( 98 | "Evaluation expects v-{}, but got dataset with v-{}".format(expected_version, dataset_json["version"]), 99 | file=sys.stderr, 100 | ) 101 | dataset = dataset_json["data"] 102 | 103 | with open(args.pred_file) as pred_file: 104 | predictions = json.load(pred_file) 105 | 106 | metrics, correct_ids = evaluate(dataset, predictions) 107 | 108 | if args.output_correct_ids: 109 | print("Output {} correctly answered question IDs.".format(len(correct_ids))) 110 | with open("correct_ids.json", "w") as f: 111 | json.dump(correct_ids, f) -------------------------------------------------------------------------------- /tasks/superglue/super_glue_metric.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """The SuperGLUE benchmark metric.""" 16 | 17 | from sklearn.metrics import f1_score, matthews_corrcoef 18 | 19 | import datasets 20 | 21 | from .record_evaluation import evaluate as evaluate_record 22 | 23 | 24 | _CITATION = """\ 25 | @article{wang2019superglue, 26 | title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems}, 27 | author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R}, 28 | journal={arXiv preprint arXiv:1905.00537}, 29 | year={2019} 30 | } 31 | """ 32 | 33 | _DESCRIPTION = """\ 34 | SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after 35 | GLUE with a new set of more difficult language understanding tasks, improved 36 | resources, and a new public leaderboard. 
37 | """ 38 | 39 | _KWARGS_DESCRIPTION = """ 40 | Compute SuperGLUE evaluation metric associated to each SuperGLUE dataset. 41 | Args: 42 | predictions: list of predictions to score. Depending on the SuperGlUE subset: 43 | - for 'record': list of question-answer dictionaries with the following keys: 44 | - 'idx': index of the question as specified by the dataset 45 | - 'prediction_text': the predicted answer text 46 | - for 'multirc': list of question-answer dictionaries with the following keys: 47 | - 'idx': index of the question-answer pair as specified by the dataset 48 | - 'prediction': the predicted answer label 49 | - otherwise: list of predicted labels 50 | references: list of reference labels. Depending on the SuperGLUE subset: 51 | - for 'record': list of question-answers dictionaries with the following keys: 52 | - 'idx': index of the question as specified by the dataset 53 | - 'answers': list of possible answers 54 | - otherwise: list of reference labels 55 | Returns: depending on the SuperGLUE subset: 56 | - for 'record': 57 | - 'exact_match': Exact match between answer and gold answer 58 | - 'f1': F1 score 59 | - for 'multirc': 60 | - 'exact_match': Exact match between answer and gold answer 61 | - 'f1_m': Per-question macro-F1 score 62 | - 'f1_a': Average F1 score over all answers 63 | - for 'axb': 64 | 'matthews_correlation': Matthew Correlation 65 | - for 'cb': 66 | - 'accuracy': Accuracy 67 | - 'f1': F1 score 68 | - for all others: 69 | - 'accuracy': Accuracy 70 | Examples: 71 | 72 | >>> super_glue_metric = datasets.load_metric('super_glue', 'copa') # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"] 73 | >>> predictions = [0, 1] 74 | >>> references = [0, 1] 75 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 76 | >>> print(results) 77 | {'accuracy': 1.0} 78 | 79 | >>> super_glue_metric = datasets.load_metric('super_glue', 'cb') 80 | >>> predictions = [0, 1] 81 | >>> references = [0, 1] 82 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 83 | >>> print(results) 84 | {'accuracy': 1.0, 'f1': 1.0} 85 | 86 | >>> super_glue_metric = datasets.load_metric('super_glue', 'record') 87 | >>> predictions = [{'idx': {'passage': 0, 'query': 0}, 'prediction_text': 'answer'}] 88 | >>> references = [{'idx': {'passage': 0, 'query': 0}, 'answers': ['answer', 'another_answer']}] 89 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 90 | >>> print(results) 91 | {'exact_match': 1.0, 'f1': 1.0} 92 | 93 | >>> super_glue_metric = datasets.load_metric('super_glue', 'multirc') 94 | >>> predictions = [{'idx': {'answer': 0, 'paragraph': 0, 'question': 0}, 'prediction': 0}, {'idx': {'answer': 1, 'paragraph': 2, 'question': 3}, 'prediction': 1}] 95 | >>> references = [0, 1] 96 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 97 | >>> print(results) 98 | {'exact_match': 1.0, 'f1_m': 1.0, 'f1_a': 1.0} 99 | 100 | >>> super_glue_metric = datasets.load_metric('super_glue', 'axb') 101 | >>> references = [0, 1] 102 | >>> predictions = [0, 1] 103 | >>> results = super_glue_metric.compute(predictions=predictions, references=references) 104 | >>> print(results) 105 | {'matthews_correlation': 1.0} 106 | """ 107 | 108 | 109 | def simple_accuracy(preds, labels): 110 | return float((preds == labels).mean()) 111 | 112 | 113 | def acc_and_f1(preds, labels, f1_avg="binary"): 114 | acc = simple_accuracy(preds, labels) 115 | f1 = 
float(f1_score(y_true=labels, y_pred=preds, average=f1_avg)) 116 | return { 117 | "accuracy": acc, 118 | "f1": f1, 119 | } 120 | 121 | 122 | def evaluate_multirc(ids_preds, labels): 123 | """ 124 | Computes F1 score and Exact Match for MultiRC predictions. 125 | """ 126 | question_map = {} 127 | for id_pred, label in zip(ids_preds, labels): 128 | question_id = "{}-{}".format(id_pred["idx"]["paragraph"], id_pred["idx"]["question"]) 129 | pred = id_pred["prediction"] 130 | if question_id in question_map: 131 | question_map[question_id].append((pred, label)) 132 | else: 133 | question_map[question_id] = [(pred, label)] 134 | f1s, ems = [], [] 135 | for question, preds_labels in question_map.items(): 136 | question_preds, question_labels = zip(*preds_labels) 137 | f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro") 138 | f1s.append(f1) 139 | em = int(sum([p == l for p, l in preds_labels]) == len(preds_labels)) 140 | ems.append(em) 141 | f1_m = float((sum(f1s) / len(f1s))) 142 | em = sum(ems) / len(ems) 143 | f1_a = float(f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds])) 144 | return {"exact_match": em, "f1_m": f1_m, "f1_a": f1_a} 145 | 146 | 147 | @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 148 | class SuperGlue(datasets.Metric): 149 | def _info(self): 150 | if self.config_name not in [ 151 | "boolq", 152 | "cb", 153 | "copa", 154 | "multirc", 155 | "record", 156 | "rte", 157 | "wic", 158 | "wsc", 159 | "wsc.fixed", 160 | "axb", 161 | "axg", 162 | ]: 163 | raise KeyError( 164 | "You should supply a configuration name selected in " 165 | '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]' 166 | ) 167 | return datasets.MetricInfo( 168 | description=_DESCRIPTION, 169 | citation=_CITATION, 170 | inputs_description=_KWARGS_DESCRIPTION, 171 | features=datasets.Features(self._get_feature_types()), 172 | codebase_urls=[], 173 | reference_urls=[], 174 | format="numpy" if not self.config_name == "record" and not self.config_name == "multirc" else None, 175 | ) 176 | 177 | def _get_feature_types(self): 178 | if self.config_name == "record": 179 | return { 180 | "predictions": { 181 | "idx": { 182 | "passage": datasets.Value("int64"), 183 | "query": datasets.Value("int64"), 184 | }, 185 | "prediction_text": datasets.Value("string"), 186 | }, 187 | "references": { 188 | "idx": { 189 | "passage": datasets.Value("int64"), 190 | "query": datasets.Value("int64"), 191 | }, 192 | "answers": datasets.Sequence(datasets.Value("string")), 193 | }, 194 | } 195 | elif self.config_name == "multirc": 196 | return { 197 | "predictions": { 198 | "idx": { 199 | "answer": datasets.Value("int64"), 200 | "paragraph": datasets.Value("int64"), 201 | "question": datasets.Value("int64"), 202 | }, 203 | "prediction": datasets.Value("int64"), 204 | }, 205 | "references": datasets.Value("int64"), 206 | } 207 | else: 208 | return { 209 | "predictions": datasets.Value("int64"), 210 | "references": datasets.Value("int64"), 211 | } 212 | 213 | def _compute(self, predictions, references): 214 | if self.config_name == "axb": 215 | return {"matthews_correlation": matthews_corrcoef(references, predictions)} 216 | elif self.config_name == "cb": 217 | return acc_and_f1(predictions, references, f1_avg="macro") 218 | elif self.config_name == "record": 219 | dataset = [ 220 | { 221 | "qas": [ 222 | {"id": ref["idx"]["query"], "answers": [{"text": ans} for ans in ref["answers"]]} 223 | for ref in references 
224 | ] 225 | } 226 | ] 227 | predictions = {pred["idx"]["query"]: pred["prediction_text"] for pred in predictions} 228 | return evaluate_record(dataset, predictions)[0] 229 | elif self.config_name == "multirc": 230 | return evaluate_multirc(predictions, references) 231 | elif self.config_name in ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]: 232 | return {"accuracy": simple_accuracy(predictions, references)} 233 | else: 234 | raise KeyError( 235 | "You should supply a configuration name selected in " 236 | '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]' 237 | ) -------------------------------------------------------------------------------- /tasks/superglue/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | from collections import defaultdict, Counter 4 | 5 | 6 | def normalize_answer(s): 7 | """Lower text and remove punctuation, articles and extra whitespace.""" 8 | 9 | def remove_articles(text): 10 | return re.sub(r'\b(a|an|the)\b', ' ', text) 11 | 12 | def white_space_fix(text): 13 | return ' '.join(text.split()) 14 | 15 | def remove_punc(text): 16 | exclude = set(string.punctuation) 17 | return ''.join(ch for ch in text if ch not in exclude) 18 | 19 | def lower(text): 20 | return text.lower() 21 | 22 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 23 | 24 | def f1_score(prediction, ground_truth): 25 | prediction_tokens = normalize_answer(prediction).split() 26 | ground_truth_tokens = normalize_answer(ground_truth).split() 27 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 28 | num_same = sum(common.values()) 29 | if num_same == 0: 30 | return 0 31 | precision = 1.0 * num_same / len(prediction_tokens) 32 | recall = 1.0 * num_same / len(ground_truth_tokens) 33 | f1 = (2 * precision * recall) / (precision + recall) 34 | return f1 35 | 36 | 37 | def exact_match_score(prediction, ground_truth): 38 | return normalize_answer(prediction) == normalize_answer(ground_truth) 39 | 40 | 41 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 42 | scores_for_ground_truths = [] 43 | for ground_truth in ground_truths: 44 | score = metric_fn(prediction, ground_truth) 45 | scores_for_ground_truths.append(score) 46 | return max(scores_for_ground_truths) -------------------------------------------------------------------------------- /tasks/utils.py: -------------------------------------------------------------------------------- 1 | from tasks.glue.dataset import task_to_keys as glue_tasks 2 | from tasks.superglue.dataset import task_to_keys as superglue_tasks 3 | 4 | GLUE_DATASETS = list(glue_tasks.keys()) 5 | SUPERGLUE_DATASETS = list(superglue_tasks.keys()) 6 | 7 | 8 | TASKS = ["glue", "superglue"] 9 | 10 | DATASETS = GLUE_DATASETS + SUPERGLUE_DATASETS 11 | 12 | ADD_PREFIX_SPACE = { 13 | 'bert': False, 14 | 'roberta': True, 15 | 'deberta': True, 16 | 'gpt2': True, 17 | 'deberta-v2': True, 18 | } 19 | 20 | USE_FAST = { 21 | 'bert': True, 22 | 'roberta': True, 23 | 'deberta': True, 24 | 'gpt2': True, 25 | 'deberta-v2': False, 26 | } -------------------------------------------------------------------------------- /training/trainer_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict, OrderedDict 4 | 5 | from transformers import Trainer, AdapterTrainer 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 
_default_log_level = logging.INFO 10 | logger.setLevel(_default_log_level) 11 | 12 | class BaseTrainer(Trainer): 13 | def __init__(self, *args, predict_dataset = None, test_key = "accuracy", **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.predict_dataset = predict_dataset 16 | self.test_key = test_key 17 | self.best_metrics = OrderedDict({ 18 | "best_epoch": 0, 19 | f"best_eval_{self.test_key}": 0, 20 | }) 21 | 22 | def log_best_metrics(self): 23 | self.log_metrics("best", self.best_metrics) 24 | self.save_metrics("best", self.best_metrics, combined=False) 25 | 26 | 27 | 28 | def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): 29 | if self.control.should_log: 30 | logs: Dict[str, float] = {} 31 | 32 | 33 | tr_loss_scalar = self._nested_gather(tr_loss).mean().item() 34 | 35 | # reset tr_loss to zero 36 | tr_loss -= tr_loss 37 | 38 | logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) 39 | logs["learning_rate"] = self._get_learning_rate() 40 | 41 | self._total_loss_scalar += tr_loss_scalar 42 | self._globalstep_last_logged = self.state.global_step 43 | self.store_flos() 44 | 45 | self.log(logs) 46 | 47 | eval_metrics = None 48 | if self.control.should_evaluate: 49 | eval_metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 50 | self._report_to_hp_search(trial, epoch, eval_metrics) 51 | 52 | if eval_metrics["eval_"+self.test_key] > self.best_metrics["best_eval_"+self.test_key]: 53 | self.best_metrics["best_epoch"] = epoch 54 | self.best_metrics["best_eval_"+self.test_key] = eval_metrics["eval_"+self.test_key] 55 | 56 | if self.predict_dataset is not None: 57 | if isinstance(self.predict_dataset, dict): 58 | for dataset_name, dataset in self.predict_dataset.items(): 59 | _, _, test_metrics = self.predict(dataset, metric_key_prefix="test") 60 | self.best_metrics[f"best_test_{dataset_name}_{self.test_key}"] = test_metrics["test_"+self.test_key] 61 | else: 62 | _, _, test_metrics = self.predict(self.predict_dataset, metric_key_prefix="test") 63 | self.best_metrics["best_test_"+self.test_key] = test_metrics["test_"+self.test_key] 64 | 65 | logger.info(f"***** Epoch {epoch}: Best results *****") 66 | for key, value in self.best_metrics.items(): 67 | logger.info(f"{key} = {value}") 68 | self.log(self.best_metrics) 69 | 70 | if self.control.should_save: 71 | self._save_checkpoint(model, trial, metrics=eval_metrics) 72 | self.control = self.callback_handler.on_save(self.args, self.state, self.control) 73 | 74 | class BaseAdapterTrainer(AdapterTrainer): 75 | def __init__(self, *args, predict_dataset = None, test_key = "accuracy", **kwargs): 76 | super().__init__(*args, **kwargs) 77 | self.predict_dataset = predict_dataset 78 | self.test_key = test_key 79 | self.best_metrics = OrderedDict({ 80 | "best_epoch": 0, 81 | f"best_eval_{self.test_key}": 0, 82 | }) 83 | 84 | def log_best_metrics(self): 85 | self.log_metrics("best", self.best_metrics) 86 | self.save_metrics("best", self.best_metrics, combined=False) 87 | 88 | 89 | 90 | def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): 91 | if self.control.should_log: 92 | logs: Dict[str, float] = {} 93 | 94 | 95 | tr_loss_scalar = self._nested_gather(tr_loss).mean().item() 96 | 97 | # reset tr_loss to zero 98 | tr_loss -= tr_loss 99 | 100 | logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) 101 | logs["learning_rate"] = self._get_learning_rate() 102 | 103 | self._total_loss_scalar 
+= tr_loss_scalar 104 | self._globalstep_last_logged = self.state.global_step 105 | self.store_flos() 106 | 107 | self.log(logs) 108 | 109 | eval_metrics = None 110 | if self.control.should_evaluate: 111 | eval_metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 112 | self._report_to_hp_search(trial, epoch, eval_metrics) 113 | 114 | if eval_metrics["eval_"+self.test_key] > self.best_metrics["best_eval_"+self.test_key]: 115 | self.best_metrics["best_epoch"] = epoch 116 | self.best_metrics["best_eval_"+self.test_key] = eval_metrics["eval_"+self.test_key] 117 | 118 | if self.predict_dataset is not None: 119 | if isinstance(self.predict_dataset, dict): 120 | for dataset_name, dataset in self.predict_dataset.items(): 121 | _, _, test_metrics = self.predict(dataset, metric_key_prefix="test") 122 | self.best_metrics[f"best_test_{dataset_name}_{self.test_key}"] = test_metrics["test_"+self.test_key] 123 | else: 124 | _, _, test_metrics = self.predict(self.predict_dataset, metric_key_prefix="test") 125 | self.best_metrics["best_test_"+self.test_key] = test_metrics["test_"+self.test_key] 126 | 127 | logger.info(f"***** Epoch {epoch}: Best results *****") 128 | for key, value in self.best_metrics.items(): 129 | logger.info(f"{key} = {value}") 130 | self.log(self.best_metrics) 131 | 132 | if self.control.should_save: 133 | self._save_checkpoint(model, trial, metrics=eval_metrics) 134 | self.control = self.callback_handler.on_save(self.args, self.state, self.control) --------------------------------------------------------------------------------
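For orientation, here is a minimal usage sketch of how BaseTrainer above is meant to be wired up. The names model, training_args, and dataset are hypothetical placeholders for the objects assembled by the get_trainer functions earlier in this repo; only BaseTrainer, its predict_dataset/test_key keywords, and log_best_metrics come from training/trainer_base.py.

# Hypothetical usage sketch: `model`, `training_args`, and `dataset` are placeholders
# standing in for the objects built in tasks/*/get_trainer.py, not names defined here.
from training.trainer_base import BaseTrainer

trainer = BaseTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset.train_dataset,
    eval_dataset=dataset.eval_dataset,
    compute_metrics=dataset.compute_metrics,
    tokenizer=dataset.tokenizer,
    data_collator=dataset.data_collator,
    predict_dataset=dataset.predict_dataset,  # re-scored whenever the eval metric improves
    test_key="accuracy",                      # metric name tracked in best_metrics
)
trainer.train()
trainer.log_best_metrics()  # logs and saves best_epoch, best_eval_*, and best_test_*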