├── .dockerignore ├── .gitignore ├── LICENSE ├── README.md ├── figures └── mainfig.png ├── hyperdecoder ├── __init__.py ├── configs │ ├── experiments │ │ └── glue_t5_base.json │ ├── glue_ablations │ │ ├── generate_ablations_script.py │ │ ├── glue_generated_generated.json │ │ ├── glue_generated_manual.json │ │ ├── glue_generated_none.json │ │ ├── glue_generated_task.json │ │ ├── glue_manual_generated.json │ │ ├── glue_manual_generated_no_mlp.json │ │ ├── glue_manual_generated_norm.json │ │ ├── glue_manual_generated_unbalanced.json │ │ ├── glue_manual_manual.json │ │ ├── glue_manual_none.json │ │ ├── glue_manual_task.json │ │ ├── glue_none_generated.json │ │ ├── glue_none_manual.json │ │ ├── glue_none_none.json │ │ ├── glue_none_task.json │ │ ├── glue_task_generated.json │ │ ├── glue_task_manual.json │ │ ├── glue_task_none.json │ │ └── glue_task_task.json │ ├── mrqa_experiments │ │ ├── adamw_adapter_gen.json │ │ ├── adamw_full_finetune.json │ │ ├── adapter_gen_layernorm.json │ │ ├── manual_adapter_control.json │ │ ├── per_dataset_adapter.json │ │ ├── small_adapter_large_hypernetwork.json │ │ ├── task_adamw_hypernet.json │ │ └── unbalanced_manual_generated_mrqa.json │ └── xsum_nli │ │ ├── nli.json │ │ ├── nli_adapter.json │ │ ├── nli_manual.json │ │ ├── nli_task.json │ │ ├── summarise.json │ │ ├── summarise_adapter.json │ │ ├── summarise_manual.json │ │ ├── summarise_nli.json │ │ ├── summarise_nli_gen.json │ │ ├── summarise_nli_manual.json │ │ ├── summarise_nli_task.json │ │ └── summarise_task.json ├── data │ ├── __init__.py │ ├── mrqa_preprocess.py │ ├── multitask_sampler.py │ ├── postprocessors.py │ ├── tasks.py │ └── utils.py ├── finetune_trainer.py ├── metrics │ ├── __init__.py │ ├── metrics.py │ └── squad_scoring.py ├── modeling │ ├── adapter_generators.py │ ├── adapter_layer.py │ └── adapter_t5.py ├── third_party │ ├── LICENSE │ ├── __init__.py │ ├── trainers │ │ ├── __init__.py │ │ └── t5_trainer.py │ └── utils │ │ ├── __init__.py │ │ ├── sentence_splitter.py │ 
│ └── utils.py ├── training_args.py └── utils │ ├── __init__.py │ └── utils.py ├── mrqa_eval ├── construct_eval_folders.sh └── eval.py └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | outputs/ 3 | .lock 4 | .python-version 5 | venv/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | outputs/ 3 | .lock 4 | .python-version 5 | venv/ 6 | output/ 7 | wandb/ 8 | in-domain/ 9 | out-domain/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hyperdecoders 2 | 3 | Instance-specific decoders for efficient multi-task adaptation of pretrained language models. By generating adapter parameters based off encoder representations, we are able to more effectively enhance the multi-tasking abilities of the model. [Check out our paper for details!](https://arxiv.org/abs/2203.08304) Here is an overview of our approach: 4 | 5 | ![figure describing the hyperdecoders model](figures/mainfig.png) 6 | 7 | We evaluate on GLUE, MRQA, and a mixture of summarisation and NLI tasks. 
Our results training and evaluating on GLUE using T5 v1.1 + LM adapt are as follows: 8 | 9 | | Model | CoLA | SST2 | STS-B | MRPC | QQP | MNLI | QNLI | RTE | Avg | 10 | | ----- | ---- | ----- | ----- | ---- | --- | ---- | ---- | --- | --- | 11 | |Full Finetuning | **63.6** | 94.8 | 91.6/92.0 | 88.7/91.8 | **92.2/89.5** | 88.6 | 93.3 | 77.5 | 86.3 | 12 | | Hyperformer | 19.2 | 87.3 | 86.2/85.8 | 73.4/81.3 | 87.0/82.8 | 77.7 | 84.2 | 55.1 | 71.5 | 13 | | Single Adapter | 58.5 | 95.7 | 90.1/90.3 | **89.4/92.2** | 91.4/88.6 | 89.8 | 94.1 | 80.7 | 86.2 | 14 | | **Hyperdecoder (ours)** | 58.7 | **95.9** | **91.8/92.0** | 89.2/92.0 | 91.1/88.3 | **90.0** | **94.2** | **80.8** | **86.5** | 15 | 16 | Our approach only trains roughly 3% of the total parameters within the model. 17 | 18 | [See our paper](https://arxiv.org/abs/2203.08304) for more! This codebase is built off the [hyperformer codebase](https://github.com/rabeehk/hyperformer), with the following major changes: 19 | - Added several tasks and relevant preprocessing, including MRQA (with and without sliding windows), xsum, CNN/Daily Mail, Wiki Lingua, abductive NLI, and adversarial NLI. 20 | - Fixed some minor bugs including the 'split validation test' not being applied to the training set. 21 | - Added new adapter and parameter generation code in `hyperdecoder/modeling`, and removed the old adapter code. Added relevant training arguments for these setups (encoder/decoder adapter sizes, etc). 22 | - Updated the trainer to save copies of generated answers along with likelihood scores for MRQA evaluation. 23 | 24 | ## Installation 25 | 26 | Install pytorch (1.10 recommended). Install required packages, preferably in a virtualenv: `pip install -r requirements.txt`. 27 | 28 | ## Training 29 | 30 | Navigate into the `hyperdecoder` directory, and then you can run any configuration with `python finetune_trainer.py configs/`. Please note training only works for non-distributed setups - sorry! 
31 | 32 | For example, for GLUE, the Hyperdecoder model can be run with `python finetune_trainer.py configs/glue_ablations/glue_manual_generated.json`. The trained model will be placed in `hyperdecoder/output`, and the evaluation logs can be found in `hyperdecoder/output/log.txt`. You can control how often the model is evaluated and saved with `eval_steps` and `save_steps` in the config. 33 | 34 | ## Config 35 | 36 | Some useful config items: 37 | - `{de,en}coder_adapter`: controls how we adapt the encoder/decoder. Can be `none` (no adapters), `manual` (regular adapters), `generated` (generated adapters). Note `generated` in the encoder results in the encoder being run twice: once with adapters to produce an embedding that is then used to adapt the encoder for a second run (the output of which is passed to the decoder as usual). 38 | - `freeze_model/unfreeze_{en,de}coder_adapters/unfreeze_{en,de}coder`: freeze/unfreeze the relevant parts of the model for training. This is accomplished through the `requires_grad` flag. Usually we freeze the whole model and then unfreeze the encoder/decoder adapter bits. 39 | - `max_steps`: controls how many training steps. Note that `num_train_epochs` is ignored when this is set, we just train based on steps and do not distinguish any sort of epoch boundary. 40 | - `{en,de}coder_adapter_dim`: controls the adapter bottleneck size. You can control separately for encoder/decoder. 41 | - `hypernetwork_bottleneck`: controls the hypernetwork bottleneck size (see paper for details on this). 42 | - `split_validation_test`: split the validation sets of datasets into validation and test splits, so we can early-stop based on validation metrics and then eval on the test split. This is what we do for most experiments in our paper. 43 | 44 | Most other config options are hopefully either straightforward or do not need to be changed. Note that the hyperdecoder model is achieved by setting `encoder_adapter: manual, decoder_adapter: generated`. 
45 | 46 | The primary configs to use are: 47 | - `glue_ablations/glue_manual_generated.json`: the main GLUE-trained hyperdecoder model from table 1. 48 | - `mrqa_experiments/unbalanced_manual_generated_mrqa.json`: the MRQA hyperdecoder model from tables 4/5. 49 | 50 | There are many other config files from other runs that correspond either to other models in the paper or stuff we tried during the development of this work. 51 | 52 | ### MRQA Evaluation 53 | 54 | Due to the sliding window nature of MRQA, evaluation is performed separately from running the model. When running evaluation with MRQA, the model will at the end output answer files for the validation and test sets as `predicted_answers.json` and `predicted_answers_test.json`. 55 | 56 | After getting these files, navigate into `mrqa_eval` and run the `construct_eval_folders.sh` script, which will download the MRQA evaluation data for you and place it in useful folders. You can then run evaluation on *in-domain* data as follows (note the in-domain data is treated as validation data and so predictions are output every evaluation phase): 57 | 58 | `for file in in-domain/*.gz; do echo $file; python eval.py $file ; done` 59 | 60 | The *out-domain* data can be evaluated similarly (note the out-domain predictions are only generated during test phases): 61 | 62 | `for file in out-domain/*.gz; do echo $file; python eval.py $file ; done` 63 | 64 | In both cases, you will get terminal output that prints (a) the name of the dataset being evaluated, and then (b) the performance on that particular dataset. Note our evaluation script is the same as the original MRQA evaluation script but with some extra code to handle picking the highest likelihood answer (as the model output saves these scores but does not filter on them). As such, it is fairly simple to convert our `predicted_answers.json` files to the format needed for the original MRQA evaluation script. 
"""Package bootstrap: make modules inside this directory importable top-level.

Inserting the package directory high on ``sys.path`` lets sibling modules be
imported without the ``hyperdecoder.`` prefix.
"""
import os
import sys

# Resolve this package's real (symlink-free) directory and slot it in just
# after the script directory, so it wins over later site-packages entries.
_PACKAGE_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(1, _PACKAGE_DIR)
"split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_norm_input": true, 42 | "encoder_adapter_dim": 64, 43 | "decoder_adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/generate_ablations_script.py: -------------------------------------------------------------------------------- 1 | """small script to just generate the enc/dec variants for ablations""" 2 | import json 3 | 4 | with open("glue_none_none.json") as f: 5 | base_config = json.load(f) 6 | 7 | for enc_setup in ["none", "manual", "task", "generated"]: 8 | for dec_setup in ["none", "manual", "task", "generated"]: 9 | config = base_config.copy() 10 | config["encoder_adapter"] = enc_setup 11 | # if ff, unfreeze all, else, just adapter stuff 12 | if enc_setup == "none": 13 | config["unfreeze_encoder_adapters"] = False 14 | config["unfreeze_encoder"] = True 15 | else: 16 | config["unfreeze_encoder_adapters"] = True 17 | config["unfreeze_encoder"] = False 18 | if enc_setup in ["task", "generated"]: 19 | config["adapter_norm_input"] = True 20 | config["decoder_adapter"] = dec_setup 21 | # if ff, unfreeze all, else, just 
adapter stuff 22 | if dec_setup == "none": 23 | config["unfreeze_decoder_adapters"] = False 24 | config["unfreeze_decoder"] = True 25 | else: 26 | config["unfreeze_decoder_adapters"] = True 27 | config["unfreeze_decoder"] = False 28 | # if both none, then dont freeze at all 29 | if enc_setup == "none" and dec_setup == "none": 30 | config["freeze_model"] = False 31 | else: 32 | config["freeze_model"] = True 33 | config["output_dir"] = f"glue_{enc_setup}_{dec_setup}" 34 | with open(f"glue_{enc_setup}_{dec_setup}.json", "w") as f: 35 | json.dump(config, f, indent=4) 36 | -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 64, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 
| "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 50, 42 | "hypernetwork_bottleneck": 100, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 64, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": 
false, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_generated_none", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "none", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 
44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 
48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 128, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "process_encoder_output": true, 28 | "evaluation_strategy": "steps", 29 | "adafactor": false, 30 | "save_steps": 1000, 31 | "eval_steps": 1000, 32 | "metric_for_best_model": "average_metrics", 33 | "greater_is_better": true, 34 | "max_steps": 65536, 35 | "print_num_parameters": true, 36 | "encoder_adapter": "manual", 37 | "decoder_adapter": "generated", 38 | "freeze_model": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "unfreeze_layer_norms": false, 44 | "adapter_norm_input": false, 45 | "encoder_adapter_dim": 64, 46 | "decoder_adapter_dim": 64, 47 | "hypernetwork_bottleneck": 128, 48 | 
"loss_scaling": false, 49 | "tasks": [ 50 | "rte", 51 | "sst2", 52 | "mrpc", 53 | "stsb", 54 | "qqp", 55 | "mnli", 56 | "qnli", 57 | "cola" 58 | ], 59 | "eval_tasks": [ 60 | "rte", 61 | "sst2", 62 | "mrpc", 63 | "stsb", 64 | "qqp", 65 | "mnli", 66 | "qnli", 67 | "cola" 68 | ] 69 | } 70 | -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated_no_mlp.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 64, 42 | "decoder_adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "process_encoder_output": false, 45 | "loss_scaling": false, 46 | "tasks": [ 
47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated_norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_norm_input": true, 42 | "encoder_adapter_dim": 64, 43 | "decoder_adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 
51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated_unbalanced.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_norm_input": false, 42 | "encoder_adapter_dim": 512, 43 | "decoder_adapter_dim": 36, 44 | "hypernetwork_bottleneck": 72, 45 | "loss_scaling": false, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | 
"cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "decoder_adapter_dim": 230, 42 | "encoder_adapter_dim": 230, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | 
"qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_manual_none", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "none", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 230, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } 
-------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- 
/hyperdecoder/configs/glue_ablations/glue_none_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_none_generated", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": false, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_none_manual.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_none_manual", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 230, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_none_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": 
"google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "none", 36 | "freeze_model": false, 37 | "unfreeze_encoder_adapters": false, 38 | "unfreeze_decoder_adapters": false, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_none_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_none_task", 6 | 
"max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": false, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | 
"test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 64, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 
| "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_task_none", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | 
"label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "none", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | 
"gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/adamw_adapter_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | 
"logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "generated", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 64, 41 | "decoder_adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adafactor": false, 45 | "tasks": ["mrqa"], 46 | "eval_tasks": ["mrqa"] 47 | } 48 | 49 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/adamw_full_finetune.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 64, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | 
"evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "none", 33 | "decoder_adapter": "none", 34 | "adapter_norm_input": true, 35 | "freeze_model": false, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "adapter_dim": 1, 41 | "hypernetwork_bottleneck": 1, 42 | "loss_scaling": false, 43 | "adafactor": false, 44 | "tasks": ["mrqa"], 45 | "eval_tasks": ["mrqa"] 46 | } 47 | 48 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/adapter_gen_layernorm.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.001, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 0, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 64, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "generated", 34 | "adapter_norm_input": true, 35 | "freeze_model": true, 36 | 
"unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "adapter_dim": 64, 41 | "hypernetwork_bottleneck": 128, 42 | "loss_scaling": false, 43 | "unfreeze_layer_norms": true, 44 | "adafactor": true, 45 | "lr_scheduler": "constant", 46 | "tasks": ["mrqa"], 47 | "eval_tasks": ["mrqa"] 48 | } 49 | 50 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/manual_adapter_control.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "epoch", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "manual", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 800, 41 | "decoder_adapter_dim": 2, 42 | "process_encoder_output": false, 43 | 
"loss_scaling": false, 44 | "adafactor": false, 45 | "report_to": "none", 46 | "tasks": ["mrqa"], 47 | "eval_tasks": ["mrqa"] 48 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/per_dataset_adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 64, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual_specific", 33 | "decoder_adapter": "manual_specific", 34 | "adapter_norm_input": true, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "adapter_dim": 64, 41 | "hypernetwork_bottleneck": 128, 42 | "loss_scaling": false, 43 | "adafactor": false, 44 | "tasks": ["mrqa"], 45 | "eval_tasks": ["mrqa"] 46 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/small_adapter_large_hypernetwork.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "epoch", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "generated", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 64, 41 | "decoder_adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "process_encoder_output": false, 44 | "loss_scaling": false, 45 | "adafactor": false, 46 | "report_to": "none", 47 | "tasks": ["mrqa"], 48 | "eval_tasks": ["mrqa"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/task_adamw_hypernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | 
"learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "task", 33 | "decoder_adapter": "task", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 50, 41 | "decoder_adapter_dim": 50, 42 | "hypernetwork_bottleneck": 100, 43 | "loss_scaling": false, 44 | "adafactor": false, 45 | "tasks": ["mrqa"], 46 | "eval_tasks": ["mrqa"] 47 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/unbalanced_manual_generated_mrqa.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | 
"label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "epoch", 28 | "save_strategy": "epoch", 29 | "eval_steps": 1000, 30 | "save_steps": 1000, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "adapter_norm_input": false, 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "encoder_adapter_dim": 512, 43 | "decoder_adapter_dim": 36, 44 | "hypernetwork_bottleneck": 72, 45 | "process_encoder_output": true, 46 | "loss_scaling": false, 47 | "adafactor": false, 48 | "report_to": "none", 49 | "tasks": ["mrqa"], 50 | "eval_tasks": ["mrqa"] 51 | } 52 | 53 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-small-lm-adapt", 3 | "tokenizer_name": "google/t5-small-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "nli", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | 
"logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | "process_encoder_output": false, 46 | 47 | "tasks": ["anli", "art", "mnli"], 48 | "eval_tasks": ["anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli_adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "nli_adapter", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | 
"load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | 46 | "tasks": ["anli", "alphanli", "mnli"], 47 | "eval_tasks": ["anli", "alphanli", "mnli"] 48 | } 49 | 50 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": 
true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "manual", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 512, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["anli", "art", "mnli"], 48 | "eval_tasks": ["anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": false, 23 | "do_eval": false, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "task", 36 | "decoder_adapter": "task", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | 
"unfreeze_decoder": false, 43 | "adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["anli", "art", "mnli"], 48 | "eval_tasks": ["anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "summarise", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "none", 36 | "decoder_adapter": "none", 37 | "freeze_model": false, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": true, 41 | "unfreeze_decoder": true, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | 46 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua"], 47 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua"] 48 | } 49 | 50 | 
-------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "summarise_adapter", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | 46 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua"], 47 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua"] 48 | } 49 | 50 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "manual", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 512, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"], 48 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | 
"max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "none", 36 | "decoder_adapter": "none", 37 | "freeze_model": false, 38 | "adapter_norm_input": false, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 48 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 
12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "adapter_norm_input": false, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "encoder_adapter_dim": 512, 44 | "decoder_adapter_dim": 36, 45 | "hypernetwork_bottleneck": 72, 46 | "loss_scaling": false, 47 | 48 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 49 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 50 | } 51 | 52 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 
32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "manual", 37 | "freeze_model": true, 38 | "adapter_norm_input": false, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "encoder_adapter_dim": 370, 44 | "decoder_adapter_dim": 370, 45 | "hypernetwork_bottleneck": 128, 46 | "loss_scaling": false, 47 | 48 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 49 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 50 | } 51 | 52 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 2, 17 | 
"logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "task", 36 | "decoder_adapter": "task", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "encoder_adapter_dim": 64, 44 | "decoder_adapter_dim": 64, 45 | "hypernetwork_bottleneck": 128, 46 | "loss_scaling": false, 47 | 48 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 49 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 50 | } 51 | 52 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | 
"do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "task", 36 | "decoder_adapter": "task", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"], 48 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .multitask_sampler import MultiTaskBatchSampler 2 | from .postprocessors import string_to_float, get_post_processor 3 | from .tasks import TASK_MAPPING, AutoTask 4 | from .utils import compute_task_max_decoding_length 5 | -------------------------------------------------------------------------------- /hyperdecoder/data/mrqa_preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | A little script to generate a chunked version of mrqa. 3 | For this version, we chunk the dataset into 512-length 4 | chunks, to simulate bert-style preprocessing. 
def chunk_sample(tokenizer, sample, stride=128, max_length=512, filter_nulls=False):
    """Split one MRQA sample into overlapping, BERT-style token chunks.

    Each yielded chunk is ``question: <q> context: <chunk>`` tokenized, with the
    answer string attached when at least one detected answer span lies entirely
    inside the chunk's character range.

    Args:
        tokenizer: a HF fast tokenizer; must support ``return_offsets_mapping``.
        sample: one MRQA example (``question``, ``context``, ``answers``,
            ``detected_answers.char_spans``, ``qid``, ``subset``).
        stride: token overlap between consecutive chunks.
        max_length: maximum total input length (question prefix + chunk + EOS).
        filter_nulls: if True, suppress chunks that contain no answer.

    Yields:
        dicts with ``input_ids``, ``answer``, and bookkeeping fields.

    Raises:
        ValueError: if the question prefix leaves too little room for the
            context (``remaining_length <= stride``), which would otherwise
            cause an infinite loop below.
    """
    initial_sample = f"question: {sample['question']} context: "
    init_input_ids = tokenizer(initial_sample, add_special_tokens=False)["input_ids"]
    start_len = len(init_input_ids)
    context = sample["context"]
    tokenized_output = tokenizer(context, return_offsets_mapping=True)
    # Drop the trailing EOS token and its dummy (0, 0) offset.
    context_tokens = tokenized_output["input_ids"][:-1]
    offsets = tokenized_output["offset_mapping"][:-1]
    remaining_length = max_length - start_len - 1  # reserve one slot for </s>
    if remaining_length <= stride:
        # The stride-overlap slice below would never shrink context_tokens,
        # so the while loop would spin forever. Fail loudly instead.
        raise ValueError(
            f"max_length ({max_length}) leaves no room beyond the question "
            f"prefix ({start_len} tokens) and stride ({stride})"
        )

    def detect_answer(offsets_chunk):
        # Hoisted out of the loop (was re-defined every iteration).
        # Assumes answer strings are listed in the same order as their
        # char spans — TODO confirm against the MRQA schema.
        for i, span in enumerate(sample["detected_answers"]["char_spans"]):
            # An answer may occur at several places in the context.
            for start, end in zip(span["start"], span["end"]):
                if start >= offsets_chunk[0][0] and end <= offsets_chunk[-1][-1]:
                    return sample["answers"][i]
        return ""  # no answer fully inside this chunk

    while len(context_tokens) > 0:
        chunk = context_tokens[:remaining_length] + [1]  # 1 == T5 </s> id
        offsets_chunk = offsets[:remaining_length]
        # Edge case: when the remainder fits entirely, finish up. Otherwise
        # the stride overlap would emit redundant extra chunks.
        if len(context_tokens) <= remaining_length:
            context_tokens = []
            offsets = []
        else:
            # Keep `stride` tokens of overlap with the previous chunk.
            context_tokens = context_tokens[remaining_length - stride :]
            offsets = offsets[remaining_length - stride :]
        chunk_ans = detect_answer(offsets_chunk)
        # Sometimes we want to drop chunks that carry no answer at all.
        if filter_nulls and chunk_ans == "":
            continue
        yield {
            "question": sample["question"],
            "context": sample["context"],
            "input_ids": init_input_ids + chunk,
            "answer": chunk_ans,
            "qid": sample["qid"],
            "subset": sample["subset"],
            "task": "mrqa",
        }


def chunk_dataset(tokenizer, dataset, stride=128, max_length=512, filter_nulls=False):
    """Yield chunked samples (see chunk_sample) for every sample in *dataset*."""
    for sample in dataset:
        yield from chunk_sample(tokenizer, sample, stride, max_length, filter_nulls)


# testing
if __name__ == "__main__":
    from datasets import load_dataset
    from transformers import T5TokenizerFast

    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    mrqa = load_dataset("mrqa", split="validation")
    print(f"MRQA has {len(mrqa)} samples")
    print(f"First sample: {mrqa[0]}")
    chunked_ds = list(
        chunk_dataset(tokenizer, mrqa, stride=128, max_length=512, filter_nulls=True)
    )
    print(f"Chunked MRQA has {len(chunked_ds)} samples")
    print(f"First sample: {chunked_ds[0]}")
fashion.""" 16 | 17 | def __init__( 18 | self, 19 | dataset_sizes: List[int], 20 | batch_size: int, 21 | temperature: float, 22 | num_replicas: Optional[int] = None, 23 | rank: Optional[int] = None, 24 | seed: int = 0, 25 | shuffle: bool = True, 26 | ) -> None: 27 | """Constructor for MultiTaskBatchSampler. 28 | Args: 29 | dataset_sizes: a list of integers, specifies the number of samples in 30 | each dataset. 31 | batch_size: integer, specifies the batch size. 32 | temperature: float, temperature used for temperature sampling. The larger 33 | the value, the datasets are sampled equally, and for value of 0, the datasets 34 | will be sampled according to their number of samples. 35 | num_replicas: integer, specifies the number of processes. 36 | rank: integer, specifies the rank of the current process/ 37 | seed: integer, random seed. 38 | shuffle: bool, if set to true, the datasets will be shuffled in each epoch. 39 | """ 40 | if num_replicas is None: 41 | if not dist.is_available(): 42 | raise RuntimeError("Requires distributed package to be available") 43 | num_replicas = dist.get_world_size() 44 | if rank is None: 45 | if not dist.is_available(): 46 | raise RuntimeError("Requires distributed package to be available") 47 | rank = dist.get_rank() 48 | if rank >= num_replicas or rank < 0: 49 | raise ValueError( 50 | "Invalid rank {}, rank should be in the interval" 51 | " [0, {}]".format(rank, num_replicas - 1) 52 | ) 53 | self.num_replicas = num_replicas 54 | self.rank = rank 55 | self.batch_size = batch_size 56 | self.dataset_sizes = dataset_sizes 57 | # By default we drop the last elements if dataset is not divisible by the number of ranks. 
58 | self.rank_dataset_sizes = [ 59 | dataset_size // self.num_replicas for dataset_size in self.dataset_sizes 60 | ] 61 | self.dataset_offsets = torch.cumsum(torch.LongTensor([0] + dataset_sizes), 0) 62 | self.total_sizes = [ 63 | (dataset_size // self.num_replicas) * self.num_replicas 64 | for dataset_size in self.dataset_sizes 65 | ] 66 | self.temperature = temperature 67 | self.seed = seed 68 | self.epoch = 0 69 | self.num_batches_per_epoch = ( 70 | (np.sum(dataset_sizes) + self.batch_size - 1) 71 | // self.batch_size 72 | // self.num_replicas 73 | ) 74 | self.shuffle = shuffle 75 | 76 | def generate_tasks_distribution(self): 77 | """Given the dataset sizes computes the weights to sample each dataset 78 | according to the temperature sampling.""" 79 | total_size = sum(self.dataset_sizes) 80 | weights = np.array( 81 | [ 82 | (size / total_size) ** (1.0 / self.temperature) 83 | for size in self.dataset_sizes 84 | ] 85 | ) 86 | weights = weights / np.sum(weights) 87 | return torch.as_tensor(weights, dtype=torch.double) 88 | 89 | def __iter__(self): 90 | # Defines torch generator, to make random choices consistent across cores in 91 | # different epochs, the seed needs to be set based on seed and epoch. 92 | generator = torch.Generator() 93 | generator.manual_seed(self.seed + self.epoch) 94 | 95 | # Shuffles the datasets if shuffle is set to true. 96 | indices = [] 97 | for dataset_size in self.dataset_sizes: 98 | if self.shuffle: 99 | indices.append( 100 | torch.randperm(dataset_size, generator=generator).tolist() 101 | ) 102 | else: 103 | indices.append(list(range(dataset_size))) 104 | 105 | # Shards the datasets across the all processes. 
106 | self.rank_indices = [] 107 | for i in range(len(self.dataset_sizes)): 108 | self.rank_indices.append( 109 | indices[i][self.rank : self.total_sizes[i] : self.num_replicas] 110 | ) 111 | 112 | # To make the model consistent across different processes, since the 113 | # model is based on tasks, we need to make sure the same task is selected 114 | # across different processes. 115 | tasks_distribution: torch.Tensor = self.generate_tasks_distribution() 116 | 117 | # Chooses the tasks which will be used in each batch in one epoch. 118 | # With passing generator, we make sure this choice is consistent across 119 | # different processes. 120 | batch_task_assignments = torch.multinomial( 121 | tasks_distribution, 122 | self.num_batches_per_epoch, 123 | replacement=True, 124 | generator=generator, 125 | ) 126 | 127 | for batch_task in batch_task_assignments: 128 | # Gets the number of samples of the selected datasets available for the 129 | # current rank. 130 | num_task_samples = self.rank_dataset_sizes[batch_task] 131 | # Computes the random samples from the chosen dataset. 132 | indices = torch.randint( 133 | low=0, 134 | high=num_task_samples, 135 | size=(self.batch_size,), 136 | generator=generator, 137 | ).tolist() 138 | # Converts the selected indices to the global indices on the given dataset. 
139 | results = ( 140 | self.dataset_offsets[batch_task] 141 | + torch.tensor(self.rank_indices[batch_task])[indices] 142 | ).tolist() 143 | yield results 144 | 145 | def __len__(self): 146 | return self.num_batches_per_epoch 147 | 148 | def set_epoch(self, epoch): 149 | self.epoch = epoch 150 | 151 | 152 | class EvenMultiTaskSampler(MultiTaskBatchSampler[T_co]): 153 | """Sampler with even balance between datasets""" 154 | 155 | def generate_tasks_distribution(self): 156 | total_size = len(self.dataset_sizes) 157 | weights = np.array([(1 / total_size) for _ in self.dataset_sizes]) 158 | return torch.as_tensor(weights, dtype=torch.double) 159 | -------------------------------------------------------------------------------- /hyperdecoder/data/postprocessors.py: -------------------------------------------------------------------------------- 1 | def string_to_float(string, default=-1.0): 2 | """Converts string to float, using default when conversion not possible.""" 3 | try: 4 | return float(string) 5 | except ValueError: 6 | return default 7 | 8 | 9 | def string_to_int(string, default=-1): 10 | """Converts string to int, using default when conversion not possible.""" 11 | try: 12 | return int(string) 13 | except ValueError: 14 | return default 15 | 16 | 17 | def get_post_processor(task): 18 | """Returns post processor required to apply on the predictions/targets 19 | before computing metrics for each task.""" 20 | if task == "stsb": 21 | return string_to_float 22 | elif task in ["qqp", "cola", "mrpc"]: 23 | return string_to_int 24 | else: 25 | return None 26 | -------------------------------------------------------------------------------- /hyperdecoder/data/utils.py: -------------------------------------------------------------------------------- 1 | """Defines utilities for the tasks.""" 2 | 3 | import numpy as np 4 | from transformers import T5Tokenizer 5 | 6 | 7 | def round_stsb_target(label): 8 | """STSB maps two sentences to a floating point number between 1 
and 5 9 | representing their semantic similarity. Since we are treating all tasks as 10 | text-to-text tasks we need to convert this floating point number to a string. 11 | The vast majority of the similarity score labels in STSB are in the set 12 | [0, 0.2, 0.4, ..., 4.8, 5.0]. So, we first round the number to the closest 13 | entry in this set, and then we convert the result to a string (literally e.g. 14 | "3.4"). This converts STSB roughly into a 26-class classification dataset. 15 | Args: 16 | label: original label. 17 | Returns: 18 | A preprocessed label. 19 | """ 20 | return np.round((label * 5) / 5, decimals=1) 21 | 22 | 23 | tokenizer = T5Tokenizer.from_pretrained("t5-base") 24 | 25 | 26 | def compute_task_max_decoding_length(word_list): 27 | """Computes the max decoding length for the given list of words 28 | Args: 29 | word_list: A list of stringss. 30 | Returns: 31 | maximum length after tokenization of the inputs. 32 | """ 33 | max_len = 0 34 | for word in word_list: 35 | ids = tokenizer.encode(word) 36 | max_len = max(max_len, len(ids)) 37 | return max_len 38 | -------------------------------------------------------------------------------- /hyperdecoder/finetune_trainer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import datasets 4 | import json 5 | import logging 6 | import os 7 | from pathlib import Path 8 | import dataclasses 9 | 10 | from transformers import ( 11 | AutoTokenizer, 12 | HfArgumentParser, 13 | set_seed, 14 | ) 15 | from transformers.trainer_utils import EvaluationStrategy 16 | 17 | from modeling.adapter_t5 import ( 18 | T5WithAdapterConfig, 19 | T5ForConditionalGenerationWithAdapter, 20 | ) 21 | from third_party.trainers import T5Trainer 22 | from data import AutoTask 23 | from third_party.utils import TaskCollator, check_output_dir, MrqaTaskCollator 24 | from metrics import build_compute_metrics_fn 25 | from training_args import ( 26 | 
logger = logging.getLogger(__name__)


def remove_rank_info_from_argv(args):
    """Strip a leading ``--local_rank=N`` argument (injected by
    torch.distributed.launch) out of ``args`` in place and return it as a
    dict so it can be merged into a JSON config.

    NOTE(review): assumes the launcher passes "--local_rank=N" with an '=';
    confirm against the launch command if this ever sees space-separated args.
    """
    extra_parameters = {}
    if args[1].startswith("--local_rank"):
        extra_parameters.update({"local_rank": int(args[1].split("=")[-1])})
        del args[1]
    return extra_parameters


def main():
    """Entry point: parse arguments, build the adapter-augmented T5 model and
    the train/eval/test datasets, then run the requested phases."""
    # See all possible arguments in src/transformers/training_args.py or by
    # passing the --help flag to this script. We keep distinct sets of args
    # for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (
            ModelArguments,
            DataTrainingArguments,
            Seq2SeqTrainingArguments,
            AdapterTrainingArguments,
        )
    )

    # torch.distributed.launch adds a local_rank parameter; to still allow a
    # JSON config file we fold the rank back into the parsed dict.
    if (
        len(sys.argv) > 2
        and sys.argv[1].startswith("--local_rank")
        and (sys.argv[2].endswith(".json"))
    ):
        rank_info = remove_rank_info_from_argv(sys.argv)
        args_dict = json.loads(Path(sys.argv[1]).read_text())
        args_dict.update(rank_info)
        # BUG FIX: the parser wraps four dataclasses, so parse_dict returns a
        # 4-tuple; the original unpacked only three values, which raised
        # ValueError and left adapter_args undefined on this code path.
        model_args, data_args, training_args, adapter_args = parser.parse_dict(
            args_dict
        )
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        logger.warning("config path: %s", sys.argv[1])
        # A single .json argument: parse the file to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        (
            model_args,
            data_args,
            training_args,
            adapter_args,
        ) = parser.parse_args_into_dataclasses()
    check_output_dir(training_args)

    # Setup logging. The logfile's output folder must exist before telling the
    # logger to write there.
    os.makedirs(training_args.output_dir, exist_ok=True)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
        filename=os.path.join(training_args.output_dir, "log.txt"),
        filemode="w+",
    )
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    model_class = T5ForConditionalGenerationWithAdapter
    config_class = T5WithAdapterConfig

    # Load pretrained config, tokenizer and model. In distributed training the
    # .from_pretrained methods guarantee that only one local process
    # concurrently downloads model & vocab.
    config = config_class.from_pretrained(
        model_args.config_name
        if model_args.config_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    config.update(dataclasses.asdict(adapter_args))
    all_tasks = list(set(data_args.tasks + data_args.eval_tasks))
    # mrqa is a single 'task' with many sub-tasks
    if (
        "mrqa" in data_args.tasks + data_args.eval_tasks
        or "mrqa_reg" in data_args.tasks + data_args.eval_tasks
    ):
        all_tasks += [
            "HotpotQA",
            "NaturalQuestionsShort",
            "NewsQA",
            "SearchQA",
            "SQuAD",
            "TriviaQA-web",
        ]
    config.update({"tasks": all_tasks})

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    if model_args.not_load_t5_checkpoint:
        model = model_class(config=config)
    else:
        last_checkpoint_path = training_args.output_dir
        model_path = (
            model_args.model_name_or_path
            if (
                (
                    training_args.optimize_from_scratch
                    and not training_args.optimize_from_scratch_with_loading_model
                )
                or not os.path.exists(
                    os.path.join(last_checkpoint_path, "pytorch_model.bin")
                )
            )
            else last_checkpoint_path
        )
        logger.warning("model path loaded from : %s", model_path)
        model = model_class.from_pretrained(
            model_path,
            from_tf=".ckpt" in model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # Freeze/unfreeze parameters per the experiment configuration.
    if model_args.freeze_model:
        freeze_model(model)
    if model_args.unfreeze_encoder_adapters:
        unfreeze_adapter_params_encoder(model)
    if model_args.unfreeze_decoder_adapters:
        unfreeze_adapter_params_decoder(model)
    if model_args.unfreeze_encoder:
        unfreeze_encoder(model)
    if model_args.unfreeze_decoder:
        unfreeze_decoder(model)
    if model_args.unfreeze_layer_norms:
        unfreeze_layer_norms(model)

    if training_args.print_num_parameters:
        for name, param in model.named_parameters():
            if param.requires_grad:
                logger.info("Parameter name %s", name)
        total_trainable_params = sum(
            p.numel() for p in model.parameters() if p.requires_grad
        )
        total_params = sum(p.numel() for p in model.parameters())
        logger.info("Total trainable parameters %s", total_trainable_params)
        logger.info("Total parameters %s", total_params)

    # Gets the training/test/validation datasets.
    dataset_class = AutoTask
    # BUG FIX: initialise these so the trainer-construction kwargs below do
    # not hit a NameError when do_train is false.
    train_dataset = None
    dataset_sizes = None
    if training_args.do_train:
        train_datasets = [
            dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="train",
                n_obs=data_args.n_train,
                add_prefix=True,
                split_validation_test=training_args.split_validation_test,
            )
            for task in data_args.tasks
        ]
        if "mrqa" in data_args.tasks and data_args.filter_nulls:
            mrqa = train_datasets[data_args.tasks.index("mrqa")]
            mrqa.toggle_null_filter()
        dataset_sizes = [len(train_dataset) for train_dataset in train_datasets]
        train_dataset = datasets.concatenate_datasets(train_datasets)
    training_args.remove_unused_columns = False
    eval_datasets = (
        {
            task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="validation",
                n_obs=data_args.n_val,
                add_prefix=True,
                split_validation_test=training_args.split_validation_test,
            )
            for task in data_args.eval_tasks
        }
        if training_args.do_eval
        or training_args.evaluation_strategy != EvaluationStrategy.NO
        else None
    )
    test_dataset = (
        {
            task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="test",
                n_obs=data_args.n_test,
                add_prefix=True,
                split_validation_test=training_args.split_validation_test,
            )
            for task in data_args.eval_tasks
        }
        if training_args.do_test
        else None
    )

    # Defines the metrics for evaluation.
    compute_metrics_fn = (
        build_compute_metrics_fn(data_args.eval_tasks, tokenizer)
        if training_args.predict_with_generate
        else None
    )

    collator_class = TaskCollator
    compute_gen_probs = False
    # BUG FIX: eval_datasets is None when neither do_eval nor an evaluation
    # strategy is configured; the membership tests would raise TypeError.
    if eval_datasets is not None:
        if "mrqa" in eval_datasets:
            collator_class = MrqaTaskCollator
            compute_gen_probs = True
        elif "mrqa_reg" in eval_datasets:
            compute_gen_probs = True

    # Defines the trainer.
    trainer = T5Trainer(
        model=model,
        config=config,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_datasets,
        data_collator=collator_class(
            tokenizer,
            data_args,
            tpu_num_cores=training_args.tpu_num_cores,
        ),
        tokenizer=tokenizer,
        compute_metrics=None,
        multi_task_compute_metrics=compute_metrics_fn,
        data_args=data_args,
        compute_gen_probs=compute_gen_probs,
        dataset_sizes=dataset_sizes if training_args.do_train else None,
    )

    # Trains the model.
    if training_args.do_train:
        # BUG FIX: resolve the resume path on every rank. The original
        # computed model_path under is_world_process_zero(), leaving it
        # undefined on non-zero ranks even though trainer.train runs on all
        # of them. The computation is a pure, cheap path lookup.
        last_checkpoint_path = training_args.output_dir
        model_path = (
            model_args.model_name_or_path
            if (
                training_args.optimize_from_scratch
                or not os.path.exists(
                    os.path.join(last_checkpoint_path, "pytorch_model.bin")
                )
            )
            else last_checkpoint_path
        )
        if training_args.compute_time:
            torch.cuda.synchronize()  # wait for move to complete
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
        trainer.train(
            model_path=model_path
            if (
                os.path.exists(training_args.output_dir)
                and not training_args.optimize_from_scratch
            )
            else None,
        )
        if training_args.compute_time:
            torch.cuda.synchronize()  # wait for all_reduce to complete
            end.record()
            total_time = {"total_time": start.elapsed_time(end)}
            print("###### total_time ", total_time)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json")
            )
            tokenizer.save_pretrained(training_args.output_dir)

    if training_args.do_eval:
        trainer.evaluate()

    if training_args.do_test:
        # to avoid overwriting the eval predictions file
        trainer.answer_output_file = "predicted_answers_test.json"
        trainer.evaluate(test_dataset)


def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()
"""Defines different metrics used for evaluation of tasks."""
from collections import defaultdict
import functools
import math
from logging import getLogger
from statistics import mean
from typing import Callable, Dict, List, Tuple

import numpy as np
# BUG FIX: `import scipy` / `import sklearn` do not guarantee the `.stats` /
# `.metrics` submodules are loaded before attribute access; import them
# explicitly.
import scipy.stats
import sklearn.metrics

from third_party.utils import calculate_rouge, calculate_bleu, lmap
from transformers import EvalPrediction, PreTrainedTokenizer
from metrics.squad_scoring import f1_score, exact_match_score

logger = getLogger(__name__)


def rouge(predictions, targets) -> dict:
    """Computes rouge score."""
    return calculate_rouge(predictions, targets)


def bleu(predictions, targets) -> dict:
    """Computes bleu score."""
    return calculate_bleu(predictions, targets)


def accuracy(predictions, targets) -> dict:
    """Computes the average accuracy (as a percentage)."""
    return {"acc": 100 * ((np.array(predictions) == np.array(targets)).mean())}


def pearson_corrcoef(predictions, targets) -> dict:
    """Computes Pearson correlation coefficient (as a percentage)."""
    pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]

    # If all predictions are identical the correlation is nan; guard against
    # this and return 0 in that case.
    if math.isnan(pearson_corrcoef):
        pearson_corrcoef = 0
    return {"pearson_corrcoef": pearson_corrcoef}


def spearman_corrcoef(predictions, targets) -> dict:
    """Computes Spearman correlation coefficient (as a percentage)."""
    spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]

    # If all predictions are identical the correlation is nan; guard against
    # this and return 0 in that case.
    if math.isnan(spearman_corrcoef):
        spearman_corrcoef = 0
    return {"spearman_corrcoef": spearman_corrcoef}


def f1_score_with_invalid(predictions, targets) -> dict:
    """Computes F1 score, with any prediction != 0 or 1 counted as incorrect.

    Args:
        targets: list of targets, either 0 or 1
        predictions: list of predictions, any integer value
    Returns:
        F1 score, where any prediction != 0 or 1 is counted as wrong.
    """
    targets = np.asarray(targets)
    # BUG FIX: copy so we never mutate a caller-owned ndarray in place.
    predictions = np.array(predictions)
    # Get indices of invalid predictions.
    invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
    # For any prediction != 0 or 1, set it to the opposite of its target so it
    # scores as incorrect.
    predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
    return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}


# TODO: maybe guard against invalid values
# https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
    """Computes the Matthews correlation coefficient (as a percentage)."""
    return {"mcc": 100 * sklearn.metrics.matthews_corrcoef(targets, predictions)}


def squad_metrics(predictions, targets) -> dict:
    """Computes mean SQuAD-style F1 and exact match over (pred, target) pairs.

    NOTE(review): unlike the metrics above these are returned on a 0-1 scale,
    not multiplied by 100 — preserved as-is since downstream aggregation may
    rely on it; confirm before changing.
    """
    d = defaultdict(list)
    for p, t in zip(predictions, targets):
        d["f1"].append(f1_score(p, t))
        d["em"].append(exact_match_score(p, t))
    return {"f1": mean(d["f1"]), "em": mean(d["em"])}


def build_compute_metrics_fn(
    task_names: List[str], tokenizer: PreTrainedTokenizer
) -> Callable[[EvalPrediction], Dict]:
    """Builds a dictionary from each task name to that task's metric fn."""

    def non_pad_len(tokens: np.ndarray) -> int:
        return np.count_nonzero(tokens != tokenizer.pad_token_id)

    def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
        pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
        # Replace ignore-index labels (e.g. -100) with pad so decoding works;
        # this intentionally mutates pred.label_ids in place.
        pred.label_ids[pred.label_ids < 0] = 0
        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
        pred_str = lmap(str.strip, pred_str)
        label_str = lmap(str.strip, label_str)
        return pred_str, label_str

    def compute_metrics(pred: EvalPrediction, metrics, post_processor=None) -> Dict:
        pred_str, label_str = decode_pred(pred)

        # Applies task post-processor (e.g. string -> float for stsb).
        if post_processor is not None:
            pred_str = [post_processor(pred) for pred in pred_str]
            label_str = [post_processor(label) for label in label_str]

        eval_results = {}
        for metric in metrics:
            eval_results.update(metric(pred_str, label_str))
            if metric.__name__ in ["bleu", "rouge"]:
                gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
                eval_results.update({"gen_len": gen_len})
        return eval_results

    def tasks_metrics(task) -> Dict:
        from data.tasks import TASK_MAPPING
        from data.postprocessors import get_post_processor

        return functools.partial(
            compute_metrics,
            metrics=TASK_MAPPING[task].metrics,
            post_processor=get_post_processor(task),
        )

    return {task: tasks_metrics(task) for task in task_names}
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | 14 | def remove_articles(text): 15 | return re.sub(r"\b(a|an|the)\b", " ", text) 16 | 17 | def white_space_fix(text): 18 | return " ".join(text.split()) 19 | 20 | def remove_punc(text): 21 | exclude = set(string.punctuation) 22 | return "".join(ch for ch in text if ch not in exclude) 23 | 24 | def lower(text): 25 | return text.lower() 26 | 27 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 28 | 29 | 30 | def f1_score(prediction, ground_truth): 31 | prediction_tokens = normalize_answer(prediction).split() 32 | ground_truth_tokens = normalize_answer(ground_truth).split() 33 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 34 | num_same = sum(common.values()) 35 | if num_same == 0: 36 | return 0 37 | precision = 1.0 * num_same / len(prediction_tokens) 38 | recall = 1.0 * num_same / len(ground_truth_tokens) 39 | f1 = (2 * precision * recall) / (precision + recall) 40 | return f1 41 | 42 | 43 | def exact_match_score(prediction, ground_truth): 44 | return normalize_answer(prediction) == normalize_answer(ground_truth) 45 | 46 | 47 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 48 | scores_for_ground_truths = [] 49 | for ground_truth in ground_truths: 50 | score = metric_fn(prediction, ground_truth) 51 | scores_for_ground_truths.append(score) 52 | return max(scores_for_ground_truths) 53 | 54 | 55 | def evaluate(dataset, predictions): 56 | f1 = exact_match = total = 0 57 | for article in dataset: 58 | for paragraph in article["paragraphs"]: 59 | for qa in paragraph["qas"]: 60 | total += 1 61 | if qa["id"] not in predictions: 62 | message = ( 63 | "Unanswered question " + qa["id"] + " will receive score 0." 
64 | ) 65 | print(message, file=sys.stderr) 66 | continue 67 | ground_truths = list(map(lambda x: x["text"], qa["answers"])) 68 | prediction = predictions[qa["id"]] 69 | exact_match += metric_max_over_ground_truths( 70 | exact_match_score, prediction, ground_truths 71 | ) 72 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) 73 | 74 | exact_match = 100.0 * exact_match / total 75 | f1 = 100.0 * f1 / total 76 | 77 | return {"exact_match": exact_match, "f1": f1} 78 | 79 | 80 | if __name__ == "__main__": 81 | expected_version = "1.1" 82 | parser = argparse.ArgumentParser( 83 | description="Evaluation for SQuAD " + expected_version 84 | ) 85 | parser.add_argument("dataset_file", help="Dataset file") 86 | parser.add_argument("prediction_file", help="Prediction File") 87 | args = parser.parse_args() 88 | with open(args.dataset_file) as dataset_file: 89 | dataset_json = json.load(dataset_file) 90 | if dataset_json["version"] != expected_version: 91 | print( 92 | "Evaluation expects v-" 93 | + expected_version 94 | + ", but got dataset with v-" 95 | + dataset_json["version"], 96 | file=sys.stderr, 97 | ) 98 | dataset = dataset_json["data"] 99 | with open(args.prediction_file) as prediction_file: 100 | predictions = json.load(prediction_file) 101 | print(json.dumps(evaluate(dataset, predictions))) 102 | -------------------------------------------------------------------------------- /hyperdecoder/modeling/adapter_generators.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def hyperfanin_init_weight(linear_layer, hypernet_in, mainnet_in): 8 | bound = 1e-3 * math.sqrt(3 / (hypernet_in * mainnet_in)) 9 | nn.init.uniform_(linear_layer.weight, -bound, bound) 10 | nn.init.constant_(linear_layer.bias, 0.0) 11 | 12 | 13 | def hyperfanin_init_bias(linear_layer, hypernet_in): 14 | bound = 1e-3 * math.sqrt(3 / (hypernet_in)) 15 | 
nn.init.uniform_(linear_layer.weight, -bound, bound) 16 | nn.init.constant_(linear_layer.bias, 0.0) 17 | 18 | 19 | class SimpleGenerator(nn.Module): 20 | def __init__(self, config, input_dim, hidden_size, is_encoder=False): 21 | super().__init__() 22 | adapter_dim = ( 23 | config.encoder_adapter_dim if is_encoder else config.decoder_adapter_dim 24 | ) 25 | self.input_dim = input_dim 26 | self.hidden_dim = config.hypernetwork_bottleneck 27 | self.linear1 = nn.Linear(self.input_dim, self.hidden_dim) 28 | self.activation_fn = nn.ReLU() 29 | # output weights 30 | self.weight_up = nn.Linear(self.hidden_dim, hidden_size * adapter_dim) 31 | self.weight_down = nn.Linear(self.hidden_dim, hidden_size * adapter_dim) 32 | self.bias_up = nn.Linear(self.hidden_dim, hidden_size) 33 | self.bias_down = nn.Linear(self.hidden_dim, adapter_dim) 34 | # init weights 35 | hyperfanin_init_weight(self.weight_up, self.hidden_dim, adapter_dim) 36 | hyperfanin_init_weight(self.weight_down, self.hidden_dim, hidden_size) 37 | hyperfanin_init_bias(self.bias_up, self.hidden_dim) 38 | hyperfanin_init_bias(self.bias_down, self.hidden_dim) 39 | 40 | def forward(self, x): 41 | x = self.linear1(x) 42 | x = self.activation_fn(x) 43 | return ( 44 | self.weight_up(x), 45 | self.weight_down(x), 46 | self.bias_up(x), 47 | self.bias_down(x), 48 | ) 49 | 50 | 51 | class ParameterGenerator(nn.Module): 52 | def __init__(self, config, hidden_size, is_encoder=False): 53 | super().__init__() 54 | self.config = config 55 | self.layer_embed = nn.Embedding(config.num_hidden_layers, 10) 56 | self.decoder = SimpleGenerator( 57 | config, config.hidden_size + 10, hidden_size, is_encoder=is_encoder 58 | ) 59 | 60 | def forward(self, hidden_inputs): 61 | layers = [] 62 | # setup idxs we need 63 | layers_idxs = torch.arange( 64 | 0, 65 | self.config.num_hidden_layers, 66 | dtype=torch.long, 67 | device=hidden_inputs.device, 68 | ) 69 | layers_idxs = layers_idxs.repeat(hidden_inputs.size(0), 1) 70 | for i in 
class AdapterLayer(nn.Module):
    """Bottleneck adapter whose weights can be supplied per-batch by a hypernetwork.

    When generated parameters have been installed via ``apply_adapter_params``
    the layer uses them; otherwise it falls back to its own learnt ("manual")
    adapter weights.
    """

    def __init__(self, config, is_encoder=False):
        super().__init__()
        # Encoder and decoder stacks may use different adapter bottleneck sizes.
        self.adapter_dim = (
            config.encoder_adapter_dim if is_encoder else config.decoder_adapter_dim
        )
        hidden_size = config.hidden_size
        self.input_dim = config.hidden_size
        self.output_dim = config.hidden_size
        # Per-batch generated parameters; None means "use the manual adapter".
        self.adapter_down_weight = None
        self.adapter_down_bias = None
        self.adapter_up_weight = None
        self.adapter_up_bias = None
        self.hidden_act = nn.ReLU()
        # Learnt fallback adapter; near-zero init so it starts close to a no-op.
        self.adapter_down_manual = nn.Linear(hidden_size, self.adapter_dim)
        self.adapter_up_manual = nn.Linear(self.adapter_dim, hidden_size)
        nn.init.xavier_uniform_(self.adapter_up_manual.weight, gain=1e-4)
        nn.init.xavier_uniform_(self.adapter_down_manual.weight, gain=1e-4)
        nn.init.constant_(self.adapter_up_manual.bias, 0.0)
        nn.init.constant_(self.adapter_down_manual.bias, 0.0)

    def clear_adapter(self):
        """Drop any generated parameters, reverting to the manual adapter."""
        self.adapter_down_weight = None
        self.adapter_down_bias = None
        self.adapter_up_weight = None
        self.adapter_up_bias = None

    def apply_adapter_params(self, bsz, uw, dw, ub, db):
        """Install flattened generated parameters for a batch of size ``bsz``."""
        self.adapter_down_weight = dw.view(bsz, self.input_dim, self.adapter_dim)
        self.adapter_down_bias = db.view(bsz, self.adapter_dim)
        self.adapter_up_weight = uw.view(bsz, self.adapter_dim, self.output_dim)
        self.adapter_up_bias = ub.view(bsz, self.output_dim)
| def forward(self, x): 42 | if self.adapter_down_weight is not None: 43 | x = (x @ self.adapter_down_weight) + self.adapter_down_bias.unsqueeze(1) 44 | x = self.hidden_act(x) 45 | x = (x @ self.adapter_up_weight) + self.adapter_up_bias.unsqueeze(1) 46 | else: 47 | x = self.adapter_down_manual(x) 48 | x = self.hidden_act(x) 49 | x = self.adapter_up_manual(x) 50 | return x # no residual connection - we let the user of this layer decide that 51 | 52 | 53 | class TaskSpecificAdapterLayer(nn.Module): 54 | def __init__(self, config, task_list, is_encoder=False): 55 | super().__init__() 56 | self.adapter_dim = ( 57 | config.encoder_adapter_dim if is_encoder else config.decoder_adapter_dim 58 | ) 59 | hidden_size = config.hidden_size 60 | task_list = config.tasks 61 | self.input_dim = hidden_size 62 | self.output_dim = hidden_size 63 | self.hidden_act = nn.ReLU() 64 | # learnt adapter + inits for it 65 | self.adapter_down_manual_weight = nn.Parameter( 66 | torch.randn(len(task_list), hidden_size, self.adapter_dim) 67 | ) 68 | self.adapter_down_manual_bias = nn.Parameter( 69 | torch.randn(len(task_list), 1, self.adapter_dim) 70 | ) 71 | self.adapter_up_manual_weight = nn.Parameter( 72 | torch.randn(len(task_list), self.adapter_dim, hidden_size) 73 | ) 74 | self.adapter_up_manual_bias = nn.Parameter( 75 | torch.randn(len(task_list), 1, hidden_size) 76 | ) 77 | 78 | nn.init.xavier_uniform_(self.adapter_down_manual_weight, gain=1e-4) 79 | nn.init.constant_(self.adapter_down_manual_bias, 0.0) 80 | nn.init.xavier_uniform_(self.adapter_up_manual_weight, gain=1e-4) 81 | nn.init.constant_(self.adapter_up_manual_bias, 0.0) 82 | # hacky method for setting task specific adapters 83 | self.adapter_down_weight_holder = None 84 | self.adapter_down_bias_holder = None 85 | self.adapter_up_weight_holder = None 86 | self.adapter_up_bias_holder = None 87 | 88 | def clear_adapter(self): 89 | self.adapter_down_weight_holder = None 90 | self.adapter_down_bias_holder = None 91 | 
    def set_indices(self, indices):
        """Select, for each example in the batch, that task's adapter tensors.

        ``indices`` is a tensor of task ids (one per batch element); the
        gathered slices are cached in the ``*_holder`` attributes consumed by
        ``forward``.
        """
        self.adapter_down_weight_holder = self.adapter_down_manual_weight[indices]
        self.adapter_down_bias_holder = self.adapter_down_manual_bias[indices]
        self.adapter_up_weight_holder = self.adapter_up_manual_weight[indices]
        self.adapter_up_bias_holder = self.adapter_up_manual_bias[indices]

    def forward(self, x):
        # Batched matmul applies each example's task-specific adapter.
        # Requires set_indices() to have been called first (holders non-None).
        # assumes x is (batch, seq, hidden) — TODO confirm against callers.
        x = (
            torch.bmm(x, self.adapter_down_weight_holder)
            + self.adapter_down_bias_holder
        )
        x = self.hidden_act(x)
        x = torch.bmm(x, self.adapter_up_weight_holder) + self.adapter_up_bias_holder
        return x
class T5WithAdapterConfig(T5Config):
    """T5 config extended with adapter / hypernetwork settings.

    ``encoder_adapter`` / ``decoder_adapter`` select the adapter strategy for
    each stack (values used elsewhere in this file include "none", "task",
    "generated" and "manual_specific").
    """

    def __init__(
        self,
        encoder_adapter_dim=64,
        decoder_adapter_dim=64,
        hypernetwork_bottleneck=128,
        encoder_adapter="task",
        decoder_adapter="task",
        tasks=None,
        adapter_norm_input=False,
        process_encoder_output=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.encoder_adapter_dim = encoder_adapter_dim
        self.decoder_adapter_dim = decoder_adapter_dim
        self.hypernetwork_bottleneck = hypernetwork_bottleneck
        self.encoder_adapter = encoder_adapter
        self.decoder_adapter = decoder_adapter
        # `tasks=None` avoids the shared-mutable-default-argument pitfall the
        # previous `tasks=[]` default had.
        self.tasks = tasks if tasks is not None else []
        # Read by T5LayerFFWithAdapter / T5StackWithAdapter below; previously
        # they had to be injected externally or an AttributeError was raised.
        self.adapter_norm_input = adapter_norm_input
        self.process_encoder_output = process_encoder_output
def mean_pooling(hidden_state, attention_mask):
    """Masked mean over the sequence dimension.

    Args:
        hidden_state: (batch, seq, hidden) activations.
        attention_mask: (batch, seq) mask — 1 for real tokens, 0 for padding.

    Returns:
        (batch, hidden) tensor: mean of the unmasked positions per example.
    """
    input_masked = hidden_state * attention_mask.unsqueeze(-1)
    # Clamp the token count at 1 so an all-padding row yields zeros instead of
    # NaNs (0/0); for any row with at least one real token this is a no-op.
    denom = attention_mask.sum(1).clamp(min=1).unsqueeze(-1)
    return input_masked.sum(1) / denom
config, config.hidden_size, is_encoder=not self.is_decoder 132 | ) 133 | self.adapter_task_embedding = nn.Embedding( 134 | len(self.config.tasks), self.config.d_model 135 | ) 136 | 137 | def forward( 138 | self, 139 | input_ids=None, 140 | encoder_hidden_states=None, 141 | tasks=None, 142 | **kwargs, 143 | ): 144 | # using input ids to determine whats going 145 | self.clear_adapters() 146 | if self.is_decoder and self.config.decoder_adapter == "generated": 147 | self.apply_params_to_adapters( 148 | encoder_hidden_states.size(0), 149 | self.param_gen( 150 | self.mlp( 151 | mean_pooling( 152 | encoder_hidden_states, kwargs["encoder_attention_mask"] 153 | ) 154 | ) 155 | ), 156 | ) 157 | elif (not self.is_decoder) and self.config.encoder_adapter == "generated": 158 | # for encoder generation, we first pass through the encoder, then set encoder adapters based on this. 159 | # currently using learnt adapters in the first pass, but potentially we could turn those off too? 160 | res = super().forward( 161 | input_ids=input_ids, 162 | encoder_hidden_states=encoder_hidden_states, 163 | **kwargs, 164 | ) 165 | self.apply_params_to_adapters( 166 | input_ids.size(0), 167 | self.param_gen( 168 | self.mlp( 169 | mean_pooling(res.last_hidden_state, kwargs["attention_mask"]) 170 | ) 171 | ), 172 | ) 173 | elif (self.is_decoder and self.config.decoder_adapter == "task") or ( 174 | not self.is_decoder and self.config.encoder_adapter == "task" 175 | ): 176 | # at test time, we only test one task at a time. 
177 | if not self.training: 178 | # simple sanity check 179 | if len(tasks) > 0: 180 | assert(tasks[0] == tasks[1] and tasks[1] == tasks[-1]) 181 | tasks = [tasks[0] for _ in range(input_ids.size(0))] 182 | indices = torch.tensor( 183 | [self.config.tasks.index(task) for task in tasks], 184 | device=input_ids.device, 185 | dtype=torch.long, 186 | ) 187 | task_embed = self.adapter_task_embedding(indices) 188 | self.apply_params_to_adapters(input_ids.size(0), self.param_gen(task_embed)) 189 | elif (self.is_decoder and self.config.decoder_adapter == "manual_specific") or ( 190 | not self.is_decoder and self.config.encoder_adapter == "manual_specific" 191 | ): 192 | indices = torch.tensor( 193 | [self.config.tasks.index(task) for task in tasks], 194 | device=input_ids.device, 195 | dtype=torch.long, 196 | ) 197 | self.apply_indices_to_adapters(indices) 198 | return super().forward( 199 | input_ids=input_ids, encoder_hidden_states=encoder_hidden_states, **kwargs 200 | ) 201 | 202 | def clear_adapters(self): 203 | for block in self.block: 204 | for layer in block.layer: 205 | if isinstance(layer, T5LayerFFWithAdapter): 206 | layer.adapter_layer.clear_adapter() 207 | 208 | def apply_params_to_adapters(self, batch_size, generated_params): 209 | for param, block in zip(generated_params, self.block): 210 | block.layer[-1].adapter_layer.apply_adapter_params(batch_size, *param) 211 | 212 | def apply_indices_to_adapters(self, indices): 213 | for block in self.block: 214 | block.layer[-1].adapter_layer.set_indices(indices) 215 | 216 | 217 | class T5ForConditionalGenerationWithAdapter(T5ForConditionalGeneration): 218 | def __init__(self, config): 219 | super().__init__(config) 220 | encoder_config = copy.deepcopy(config) 221 | encoder_config.is_decoder = False 222 | encoder_config.use_cache = False 223 | encoder_config.is_encoder_decoder = False 224 | self.encoder = T5StackWithAdapter(encoder_config, self.shared) 225 | 226 | decoder_config = copy.deepcopy(config) 227 | 
decoder_config.is_decoder = True 228 | decoder_config.is_encoder_decoder = False 229 | decoder_config.num_layers = config.num_decoder_layers 230 | self.decoder = T5StackWithAdapter(decoder_config, self.shared) 231 | 232 | self.init_weights() 233 | 234 | # required to pass tasks through 235 | def prepare_inputs_for_generation( 236 | self, 237 | input_ids, 238 | past=None, 239 | attention_mask=None, 240 | head_mask=None, 241 | decoder_head_mask=None, 242 | cross_attn_head_mask=None, 243 | use_cache=None, 244 | encoder_outputs=None, 245 | **kwargs, 246 | ): 247 | 248 | # cut decoder_input_ids if past is used 249 | if past is not None: 250 | input_ids = input_ids[:, -1:] 251 | 252 | return { 253 | "decoder_input_ids": input_ids, 254 | "past_key_values": past, 255 | "encoder_outputs": encoder_outputs, 256 | "attention_mask": attention_mask, 257 | "head_mask": head_mask, 258 | "decoder_head_mask": decoder_head_mask, 259 | "cross_attn_head_mask": cross_attn_head_mask, 260 | "use_cache": use_cache, 261 | "tasks": kwargs["tasks"], 262 | } 263 | 264 | def forward( 265 | self, 266 | input_ids=None, 267 | attention_mask=None, 268 | tasks=None, 269 | decoder_input_ids=None, 270 | decoder_attention_mask=None, 271 | head_mask=None, 272 | decoder_head_mask=None, 273 | cross_attn_head_mask=None, 274 | encoder_outputs=None, 275 | past_key_values=None, 276 | inputs_embeds=None, 277 | decoder_inputs_embeds=None, 278 | labels=None, 279 | use_cache=None, 280 | output_attentions=None, 281 | output_hidden_states=None, 282 | return_dict=None, 283 | ): 284 | use_cache = use_cache if use_cache is not None else self.config.use_cache 285 | return_dict = ( 286 | return_dict if return_dict is not None else self.config.use_return_dict 287 | ) 288 | 289 | # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask 290 | if head_mask is not None and decoder_head_mask is None: 291 | if self.config.num_layers == self.config.num_decoder_layers: 292 | 
warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) 293 | decoder_head_mask = head_mask 294 | 295 | # Encode if needed (training, first prediction pass) 296 | if encoder_outputs is None: 297 | # Convert encoder inputs in embeddings if needed 298 | encoder_outputs = self.encoder( 299 | input_ids=input_ids, 300 | attention_mask=attention_mask, 301 | tasks=tasks, 302 | inputs_embeds=inputs_embeds, 303 | head_mask=head_mask, 304 | output_attentions=output_attentions, 305 | output_hidden_states=output_hidden_states, 306 | return_dict=return_dict, 307 | ) 308 | elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): 309 | encoder_outputs = BaseModelOutput( 310 | last_hidden_state=encoder_outputs[0], 311 | hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, 312 | attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, 313 | ) 314 | 315 | hidden_states = encoder_outputs[0] 316 | 317 | if self.model_parallel: 318 | torch.cuda.set_device(self.decoder.first_device) 319 | 320 | if ( 321 | labels is not None 322 | and decoder_input_ids is None 323 | and decoder_inputs_embeds is None 324 | ): 325 | # get decoder inputs from shifting lm labels to the right 326 | decoder_input_ids = self._shift_right(labels) 327 | 328 | # Set device for model parallelism 329 | if self.model_parallel: 330 | torch.cuda.set_device(self.decoder.first_device) 331 | hidden_states = hidden_states.to(self.decoder.first_device) 332 | if decoder_input_ids is not None: 333 | decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) 334 | if attention_mask is not None: 335 | attention_mask = attention_mask.to(self.decoder.first_device) 336 | if decoder_attention_mask is not None: 337 | decoder_attention_mask = decoder_attention_mask.to( 338 | self.decoder.first_device 339 | ) 340 | 341 | # Decode 342 | decoder_outputs = self.decoder( 343 | input_ids=decoder_input_ids, 344 | attention_mask=decoder_attention_mask, 345 | tasks=tasks, 346 | 
inputs_embeds=decoder_inputs_embeds, 347 | past_key_values=past_key_values, 348 | encoder_hidden_states=hidden_states, 349 | encoder_attention_mask=attention_mask, 350 | head_mask=decoder_head_mask, 351 | cross_attn_head_mask=cross_attn_head_mask, 352 | use_cache=use_cache, 353 | output_attentions=output_attentions, 354 | output_hidden_states=output_hidden_states, 355 | return_dict=return_dict, 356 | ) 357 | 358 | sequence_output = decoder_outputs[0] 359 | 360 | # Set device for model parallelism 361 | if self.model_parallel: 362 | torch.cuda.set_device(self.encoder.first_device) 363 | self.lm_head = self.lm_head.to(self.encoder.first_device) 364 | sequence_output = sequence_output.to(self.lm_head.weight.device) 365 | 366 | if self.config.tie_word_embeddings: 367 | # Rescale output before projecting on vocab 368 | # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 369 | sequence_output = sequence_output * (self.model_dim ** -0.5) 370 | 371 | lm_logits = self.lm_head(sequence_output) 372 | 373 | loss = None 374 | if labels is not None: 375 | loss_fct = CrossEntropyLoss(ignore_index=-100) 376 | loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) 377 | # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 378 | 379 | if not return_dict: 380 | output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs 381 | return ((loss,) + output) if loss is not None else output 382 | 383 | return Seq2SeqLMOutput( 384 | loss=loss, 385 | logits=lm_logits, 386 | past_key_values=decoder_outputs.past_key_values, 387 | decoder_hidden_states=decoder_outputs.hidden_states, 388 | decoder_attentions=decoder_outputs.attentions, 389 | cross_attentions=decoder_outputs.cross_attentions, 390 | encoder_last_hidden_state=encoder_outputs.last_hidden_state, 391 | 
encoder_hidden_states=encoder_outputs.hidden_states, 392 | encoder_attentions=encoder_outputs.attentions, 393 | ) 394 | -------------------------------------------------------------------------------- /hyperdecoder/third_party/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018- The Hugging Face team. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 
36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 
124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. 
In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. 
We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /hyperdecoder/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import sys

import os

# Insert this vendored `third_party` directory near the front of sys.path so
# its modules (trainers, utils, ...) can be imported as top-level packages.
sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
def add_newline_to_end_of_each_sentence(x: str) -> str:
    """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
    # BUG FIX: the re.sub return value was previously discarded, making the
    # call a no-op.  NOTE(review): the pattern looks garbled -- upstream
    # versions of this helper strip the PEGASUS newline token "<n>" here;
    # confirm against the original HuggingFace source.
    x = re.sub("", "", x)  # remove pegasus newline char
    assert (
        NLTK_AVAILABLE
    ), "nltk must be installed to separate newlines between sentences. (pip install nltk)"
    return "\n".join(nltk.sent_tokenize(x))
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """
    Training arguments for seq2seq fine-tuning on top of HuggingFace
    ``TrainingArguments``: dropout overrides, label smoothing,
    generation-based evaluation, LR-scheduler selection, etc.
    """

    label_smoothing: Optional[float] = field(
        default=0.0,
        metadata={"help": "The label smoothing epsilon to apply (if not zero)."},
    )
    loss_scaling: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to scale loss by number of tokens."},
    )
    predict_with_generate: bool = field(
        default=False,
        metadata={
            "help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."
        },
    )
    adafactor: bool = field(
        default=False, metadata={"help": "whether to use adafactor"}
    )
    encoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Encoder layer dropout probability. Goes into model.config."},
    )
    decoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Decoder layer dropout probability. Goes into model.config."},
    )
    dropout: Optional[float] = field(
        default=None, metadata={"help": "Dropout probability. Goes into model.config."}
    )
    attention_dropout: Optional[float] = field(
        default=None,
        metadata={"help": "Attention dropout probability. Goes into model.config."},
    )
    lr_scheduler: Optional[str] = field(
        default="linear",
        metadata={
            "help": f"Which lr scheduler to use. Selected in {sorted(arg_to_scheduler.keys())}"
        },
    )
    temperature: Optional[int] = field(
        default=1,
        metadata={
            # BUG FIX: the adjacent string literals were concatenated without
            # a separating space ("temperaturevalue").
            "help": "Defines the temperature "
            "value for sampling across the multiple datasets."
        },
    )
    do_test: bool = field(
        default=False,
        # BUG FIX: "comptue" -> "compute".
        metadata={"help": "Whether to compute evaluation metrics on the test sets."},
    )
    eval_output_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "The output directory where the evaluation of the model and checkpoints during "
            "evaluation will be written. Would use the original output_dir if not specified."
        },
    )
    generate_classifier_weights: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If set, generates the weights of the classifier by using a hyper-network."
        },
    )
    optimize_from_scratch: Optional[bool] = field(
        default=False,
        metadata={
            # BUG FIX: missing space at the literal boundary ("fromthe").
            "help": "If set, this does not load the optimizers from "
            "the given model path."
        },
    )
    optimize_from_scratch_with_loading_model: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If set, it loads the model still but optimize from scratch."
        },
    )
    split_validation_test: Optional[bool] = field(
        default=False,
        metadata={
            # BUG FIX: several literal boundaries were missing spaces
            # ("nothave", "theirtest", ...).
            "help": "If set, for the datasets which do not "
            "have the test set, we use validation set as their "
            "test set and make a validation set from either "
            "splitting the validation set into half (for smaller "
            "than 10K samples datasets), or by using 1K examples "
            "from training set as validation set (for larger"
            " datasets)."
        },
    )
    # BUG FIX: was annotated Optional[str] although the default and the help
    # text show it is a boolean flag.
    print_num_parameters: Optional[bool] = field(
        default=False,
        metadata={"help": "If specified, prints the total number of parameters."},
    )
    compute_memory: Optional[bool] = field(
        default=False, metadata={"help": "If specified, measures the memory needed."}
    )
    compute_time: Optional[bool] = field(
        default=False, metadata={"help": "If specified, measures the time needed."}
    )
    report_to: Optional[List[str]] = field(
        default="none",
        metadata={
            "help": "The list of integrations to report the results and logs to."
        },
    )
144 | """ 145 | 146 | model_name_or_path: str = field( 147 | metadata={ 148 | "help": "Path to pretrained model or model identifier from huggingface.co/models" 149 | } 150 | ) 151 | not_load_t5_checkpoint: bool = field( 152 | default=False, metadata={"help": "whether to load the checkpoint."} 153 | ) 154 | config_name: Optional[str] = field( 155 | default=None, 156 | metadata={ 157 | "help": "Pretrained config name or path if not the same as model_name" 158 | }, 159 | ) 160 | tokenizer_name: Optional[str] = field( 161 | default=None, 162 | metadata={ 163 | "help": "Pretrained tokenizer name or path if not the same as model_name" 164 | }, 165 | ) 166 | cache_dir: Optional[str] = field( 167 | default=None, 168 | metadata={ 169 | "help": "Where do you want to store the pretrained models downloaded from s3" 170 | }, 171 | ) 172 | freeze_model: bool = field( 173 | default=True, metadata={"help": "Whether to freeze the model."} 174 | ) 175 | unfreeze_encoder_adapters: bool = field( 176 | default=True, metadata={"help": "Whether to unfreeze the encoder adapters."} 177 | ) 178 | unfreeze_decoder_adapters: bool = field( 179 | default=True, metadata={"help": "Whether to unfreeze the decoder adapters."} 180 | ) 181 | unfreeze_encoder: bool = field( 182 | default=False, metadata={"help": "Whether to unfreeze the encoder."} 183 | ) 184 | unfreeze_decoder: bool = field( 185 | default=False, metadata={"help": "Whether to unfreeze the decoder."} 186 | ) 187 | unfreeze_layer_norms: bool = field( 188 | default=False, metadata={"help": "Whether to unfreeze the layer norms."} 189 | ) 190 | 191 | 192 | @dataclass 193 | class DataTrainingArguments: 194 | """ 195 | Arguments related to data used for training and evaluation. 
196 | """ 197 | 198 | tasks: Optional[List[str]] = field( 199 | default="MRPC", 200 | metadata={"help": "Task name from the list of registered tasks."}, 201 | ) 202 | eval_tasks: Optional[List[str]] = field( 203 | default="MRPC", 204 | metadata={"help": "Evaluation task name from the list of registered tasks."}, 205 | ) 206 | adapters: Optional[List[str]] = field( 207 | default=None, 208 | metadata={"help": "Defines a dictionary from adapters to the tasks."}, 209 | ) 210 | task_embeddings: Optional[List[str]] = field( 211 | default=None, 212 | metadata={"help": "Defines a dictionary from tasks to the tasks embeddings."}, 213 | ) 214 | max_source_length: Optional[int] = field( 215 | default=128, 216 | metadata={ 217 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 218 | "than this will be truncated, sequences shorter will be padded." 219 | }, 220 | ) 221 | max_target_length: Optional[int] = field( 222 | default=128, 223 | metadata={ 224 | "help": "The maximum total sequence length for target text after tokenization. Sequences longer " 225 | "than this will be truncated, sequences shorter will be padded." 226 | }, 227 | ) 228 | val_max_target_length: Optional[int] = field( 229 | default=128, 230 | metadata={ 231 | "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " 232 | "than this will be truncated, sequences shorter will be padded." 233 | }, 234 | ) 235 | test_max_target_length: Optional[int] = field( 236 | default=128, 237 | metadata={ 238 | "help": "The maximum total sequence length for test target text after tokenization. Sequences longer " 239 | "than this will be truncated, sequences shorter will be padded." 240 | }, 241 | ) 242 | n_train: Optional[int] = field( 243 | default=-1, metadata={"help": "# training examples. -1 means use all."} 244 | ) 245 | n_val: Optional[int] = field( 246 | default=-1, metadata={"help": "# validation examples. 
-1 means use all."} 247 | ) 248 | n_test: Optional[int] = field( 249 | default=-1, metadata={"help": "# test examples. -1 means use all."} 250 | ) 251 | eval_beams: Optional[int] = field( 252 | default=None, metadata={"help": "# num_beams to use for evaluation."} 253 | ) 254 | ignore_pad_token_for_loss: bool = field( 255 | default=True, 256 | metadata={ 257 | "help": "If only pad tokens should be ignored. This assumes that `config.pad_token_id` is defined." 258 | }, 259 | ) 260 | data_seed: Optional[int] = field( 261 | default=42, metadata={"help": "The seed used to subsample the datasets."} 262 | ) 263 | ignore_metric_keys: Optional[Tuple[str]] = field( 264 | default=("xsum_eval_rouge1", "xsum_eval_rougeL", "xsum_eval_rougeLsum"), 265 | metadata={ 266 | "help": "Metric keys to ignore in calculating average for best model" 267 | }, 268 | ) 269 | filter_nulls: bool = field( 270 | default=False, 271 | metadata={ 272 | "help": "Whether to filter out nulls from the dataset. Only valid when using the chunked mrqa dataset" 273 | }, 274 | ) 275 | 276 | 277 | @dataclass 278 | class AdapterTrainingArguments: 279 | """Defines the adapters parameters.""" 280 | 281 | encoder_adapter: Optional[str] = field( 282 | default="manual", metadata={"help": "The encoder adapter to use."} 283 | ) 284 | decoder_adapter: Optional[str] = field( 285 | default="generated", metadata={"help": "The decoder adapter to use."} 286 | ) 287 | encoder_adapter_dim: Optional[int] = field( 288 | default=64, metadata={"help": "size of adapters in encoder."} 289 | ) 290 | decoder_adapter_dim: Optional[int] = field( 291 | default=64, metadata={"help": "size of adapters in decoder."} 292 | ) 293 | hypernetwork_bottleneck: Optional[int] = field( 294 | default=128, metadata={"help": "size of hypernetwork bottleneck dim"} 295 | ) 296 | adapter_norm_input: bool = field( 297 | default=False, 298 | metadata={"help": "Whether to use layer normed input into adapters or not."}, 299 | ) 300 | mean_task_embeddings: 
@dataclass
class AdapterTrainingArguments:
    """
    Configuration of the adapter modules and the hypernetwork that can
    generate them: which adapter variant each stack uses, adapter
    bottleneck sizes, and hypernetwork input/output processing options.
    """

    # Which adapter variant each stack uses (e.g. manually-parameterized vs
    # hypernetwork-generated).
    encoder_adapter: Optional[str] = field(
        default="manual", metadata={"help": "The encoder adapter to use."}
    )
    decoder_adapter: Optional[str] = field(
        default="generated", metadata={"help": "The decoder adapter to use."}
    )
    # Bottleneck dimensions of the adapter modules themselves.
    encoder_adapter_dim: Optional[int] = field(
        default=64, metadata={"help": "size of adapters in encoder."}
    )
    decoder_adapter_dim: Optional[int] = field(
        default=64, metadata={"help": "size of adapters in decoder."}
    )
    hypernetwork_bottleneck: Optional[int] = field(
        default=128, metadata={"help": "size of hypernetwork bottleneck dim"}
    )
    adapter_norm_input: bool = field(
        default=False,
        metadata={"help": "Whether to use layer normed input into adapters or not."},
    )
    mean_task_embeddings: bool = field(
        default=False,
        metadata={
            "help": "Whether to use average task embedding instead of task-specific or not."
        },
    )
    process_encoder_output: bool = field(
        default=True,
        metadata={
            "help": "Whether to pass the encoder output through a MLP before mean-pooling or not."
        },
    )
logger = getLogger(__name__)


def create_dir(output_dir):
    """
    Create output_dir (including missing parents) if it does not exist.

    Args:
        output_dir: path to the output_dir
    """
    # BUG FIX: the old check-then-create pattern raced with concurrent
    # processes (FileExistsError between the exists() check and makedirs());
    # exist_ok makes the call atomic and idempotent.
    os.makedirs(output_dir, exist_ok=True)


def handle_metrics(split, metrics, output_dir):
    """
    Prints and saves metrics or a general dictionary of results.

    Args:
        split: one of train, val, test, or training arguments; used as the
            prefix of the results filename.
        metrics: metrics dict
        output_dir: where to save the metrics.
    """
    logger.info(f"***** {split} metrics *****")
    for key in sorted(metrics.keys()):
        logger.info(f" {key} = {metrics[key]}")
    save_json_file(metrics, f"{split}_results.json", output_dir)


def save_json_file(json_dict, outfile_name, output_dir):
    """
    Saves the given dictionary as a json file named outfile_name inside
    output_dir (delegates to save_json).
    """
    save_json(json_dict, os.path.join(output_dir, outfile_name))
43 | """ 44 | save_json(json_dict, os.path.join(output_dir, outfile_name)) 45 | 46 | 47 | def get_training_args(arguments_list): 48 | """ 49 | Concatenate all training arguments except evaluation strategy which 50 | is not Json serializable. 51 | Args: 52 | arguments_list: list of dataclasses. 53 | Return: 54 | arguments: concatenated arguments. 55 | """ 56 | all_arguments = {} 57 | for arguments in arguments_list: 58 | all_arguments.update(asdict(arguments)) 59 | all_arguments.pop("evaluation_strategy") 60 | return all_arguments 61 | 62 | 63 | def get_last_checkpoint_path(output_dir): 64 | """ 65 | Finds the path for the last checkpoint saved in the output_dir 66 | Args: 67 | output_dir: output_dir 68 | Returns: 69 | path to the last checkpoint saved in the output dir. 70 | """ 71 | paths = glob.glob(os.path.join(output_dir, "checkpoint-*")) 72 | if len(paths) == 0: 73 | return output_dir 74 | else: 75 | checkpoints = [int(checkpoint.split("-")[-1]) for checkpoint in paths] 76 | max_checkpoint = max(checkpoints) 77 | return os.path.join(output_dir, "checkpoint-" + str(max_checkpoint)) 78 | 79 | 80 | def use_task_specific_params(model, task): 81 | """Update config with task specific params during evaluation.""" 82 | task_dataset = TASK_MAPPING[task] 83 | task_specific_config = task_dataset.task_specific_config 84 | if task_specific_config is not None: 85 | logger.info(f"using task specific params for {task}: {task_specific_config}") 86 | model.config.update(task_specific_config) 87 | 88 | 89 | def reset_config(model, config): 90 | """Resets the config file to the one provided.""" 91 | model.config = model.config.from_dict(config) 92 | logger.info(f"config is reset to the initial values.") 93 | 94 | 95 | def freeze_model(model): 96 | """Freezes the model weights.""" 97 | freeze_params(model) 98 | 99 | 100 | def unfreeze_adapter_params_encoder(model): 101 | for name, param in model.named_parameters(): 102 | if ( 103 | "adapter" in name or "mlp" in name or "param_gen" 
def _unfreeze_matching(model, predicate):
    """Set requires_grad=True on every parameter whose name satisfies predicate."""
    for name, param in model.named_parameters():
        if predicate(name):
            param.requires_grad = True


# CONSISTENCY: the five unfreeze_* helpers below were five copies of the same
# name-matching loop; they now share _unfreeze_matching.

def unfreeze_adapter_params_encoder(model):
    """Unfreeze adapter / hypernetwork parameters that live in the encoder."""
    _unfreeze_matching(
        model,
        lambda name: ("adapter" in name or "mlp" in name or "param_gen" in name)
        and "encoder" in name,
    )


def unfreeze_adapter_params_decoder(model):
    """Unfreeze adapter / hypernetwork parameters that live in the decoder."""
    _unfreeze_matching(
        model,
        lambda name: ("adapter" in name or "mlp" in name or "param_gen" in name)
        and "decoder" in name,
    )


def unfreeze_encoder(model):
    """Unfreeze every parameter whose name mentions the encoder."""
    _unfreeze_matching(model, lambda name: "encoder" in name)


def unfreeze_decoder(model):
    """Unfreeze every parameter whose name mentions the decoder."""
    _unfreeze_matching(model, lambda name: "decoder" in name)


def unfreeze_layer_norms(model):
    """Unfreeze every layer-norm parameter."""
    _unfreeze_matching(model, lambda name: "layer_norm" in name)
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    """Token-level F1 between normalized prediction and ground truth."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    # Early return also guards the divisions below against empty token lists.
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    """True iff the normalized strings are identical."""
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """
    Return the best metric_fn score of prediction against any ground truth.

    BUG FIX: returns 0 for an empty ground_truths list instead of letting
    max() raise ValueError on an empty sequence.
    """
    if not ground_truths:
        return 0
    return max(metric_fn(prediction, gt) for gt in ground_truths)


def read_predictions(prediction_file):
    """Load the prediction file (a JSON dict keyed by qid)."""
    with open(prediction_file) as f:
        predictions = json.load(f)
    return predictions


def read_answers(gold_file):
    """
    Load gold answers from a gzipped MRQA jsonl file.

    Returns a dict mapping qid -> list of acceptable answer strings.
    """
    answers = {}
    with gzip.open(gold_file, 'rb') as f:
        for i, line in enumerate(f):
            example = json.loads(line)
            # The first line of MRQA files is a header record, not an example.
            if i == 0 and 'header' in example:
                continue
            for qa in example['qas']:
                answers[qa['qid']] = qa['answers']
    return answers
def evaluate(answers, predictions, skip_no_answer=False):
    """
    Compute exact-match and F1 percentages over all answered questions.

    Args:
        answers: dict mapping qid -> list of gold answer strings.
        predictions: dict mapping qid -> list of (text, score) candidates.
        skip_no_answer: NOTE(review): accepted for CLI compatibility but
            currently unused -- qids missing from predictions are always
            skipped; confirm against the official MRQA script if strict
            scoring of unanswered questions is needed.

    Returns:
        dict with percentage 'exact_match' and 'f1' scores.
    """
    f1 = exact_match = total = 0
    for qid, ground_truths in answers.items():
        if qid not in predictions:
            continue
        total += 1
        # Drop empty-string guesses before choosing a candidate.
        filtered_preds = [pred for pred in predictions[qid] if pred[0] != '']
        if filtered_preds == []:
            prediction = ''
        else:
            # The candidate with the smallest score is selected; presumably
            # the second element is a null/negative-log score where lower is
            # better -- TODO confirm with the prediction-file producer.
            probs = [pred[1] for pred in filtered_preds]
            max_prob_index = probs.index(min(probs))
            prediction = filtered_preds[max_prob_index][0]

        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    # BUG FIX: guard against ZeroDivisionError when no qid in the gold file
    # appears in the prediction file.
    if total == 0:
        print('0.00 / 0.00')
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    print(f'{exact_match:.2f} / {f1:.2f}')
    return {'exact_match': exact_match, 'f1': f1}
transformers==4.14.1 49 | typing_extensions==4.0.1 50 | urllib3==1.26.7 51 | xxhash==2.0.2 52 | yarl==1.7.2 --------------------------------------------------------------------------------