├── .dockerignore ├── .gitignore ├── LICENSE ├── README.md ├── figures └── mainfig.png ├── hyperdecoder ├── __init__.py ├── configs │ ├── experiments │ │ └── glue_t5_base.json │ ├── glue_ablations │ │ ├── generate_ablations_script.py │ │ ├── glue_generated_generated.json │ │ ├── glue_generated_manual.json │ │ ├── glue_generated_none.json │ │ ├── glue_generated_task.json │ │ ├── glue_manual_generated.json │ │ ├── glue_manual_generated_no_mlp.json │ │ ├── glue_manual_generated_norm.json │ │ ├── glue_manual_generated_unbalanced.json │ │ ├── glue_manual_manual.json │ │ ├── glue_manual_none.json │ │ ├── glue_manual_task.json │ │ ├── glue_none_generated.json │ │ ├── glue_none_manual.json │ │ ├── glue_none_none.json │ │ ├── glue_none_task.json │ │ ├── glue_task_generated.json │ │ ├── glue_task_manual.json │ │ ├── glue_task_none.json │ │ └── glue_task_task.json │ ├── mrqa_experiments │ │ ├── adamw_adapter_gen.json │ │ ├── adamw_full_finetune.json │ │ ├── adapter_gen_layernorm.json │ │ ├── manual_adapter_control.json │ │ ├── per_dataset_adapter.json │ │ ├── small_adapter_large_hypernetwork.json │ │ ├── task_adamw_hypernet.json │ │ └── unbalanced_manual_generated_mrqa.json │ └── xsum_nli │ │ ├── nli.json │ │ ├── nli_adapter.json │ │ ├── nli_manual.json │ │ ├── nli_task.json │ │ ├── summarise.json │ │ ├── summarise_adapter.json │ │ ├── summarise_manual.json │ │ ├── summarise_nli.json │ │ ├── summarise_nli_gen.json │ │ ├── summarise_nli_manual.json │ │ ├── summarise_nli_task.json │ │ └── summarise_task.json ├── data │ ├── __init__.py │ ├── mrqa_preprocess.py │ ├── multitask_sampler.py │ ├── postprocessors.py │ ├── tasks.py │ └── utils.py ├── finetune_trainer.py ├── metrics │ ├── __init__.py │ ├── metrics.py │ └── squad_scoring.py ├── modeling │ ├── adapter_generators.py │ ├── adapter_layer.py │ └── adapter_t5.py ├── third_party │ ├── LICENSE │ ├── __init__.py │ ├── trainers │ │ ├── __init__.py │ │ └── t5_trainer.py │ └── utils │ │ ├── __init__.py │ │ ├── sentence_splitter.py │ 
│ └── utils.py ├── training_args.py └── utils │ ├── __init__.py │ └── utils.py ├── mrqa_eval ├── construct_eval_folders.sh └── eval.py └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | outputs/ 3 | .lock 4 | .python-version 5 | venv/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | outputs/ 3 | .lock 4 | .python-version 5 | venv/ 6 | output/ 7 | wandb/ 8 | in-domain/ 9 | out-domain/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hyperdecoders 2 | 3 | Instance-specific decoders for efficient multi-task adaptation of pretrained language models. By generating adapter parameters based off encoder representations, we are able to more effectively enhance the multi-tasking abilities of the model. [Check out our paper for details!](https://arxiv.org/abs/2203.08304) Here is an overview of our approach: 4 | 5 | ![figure describing the hyperdecoders model](figures/mainfig.png) 6 | 7 | We evaluate on GLUE, MRQA, and a mixture of summarisation and NLI tasks. 
Our results training and evaluating on GLUE using T5 v1.1 + LM adapt are as follows: 8 | 9 | | Model | CoLA | SST2 | STS-B | MRPC | QQP | MNLI | QNLI | RTE | Avg | 10 | | ----- | ---- | ----- | ----- | ---- | --- | ---- | ---- | --- | --- | 11 | |Full Finetuning | **63.6** | 94.8 | 91.6/92.0 | 88.7/91.8 | **92.2/89.5** | 88.6 | 93.3 | 77.5 | 86.3 | 12 | | Hyperformer | 19.2 | 87.3 | 86.2/85.8 | 73.4/81.3 | 87.0/82.8 | 77.7 | 84.2 | 55.1 | 71.5 | 13 | | Single Adapter | 58.5 | 95.7 | 90.1/90.3 | **89.4/92.2** | 91.4/88.6 | 89.8 | 94.1 | 80.7 | 86.2 | 14 | | **Hyperdecoder (ours)** | 58.7 | **95.9** | **91.8/92.0** | 89.2/92.0 | 91.1/88.3 | **90.0** | **94.2** | **80.8** | **86.5** | 15 | 16 | Our approach only trains roughly 3% of the total parameters within the model. 17 | 18 | [See our paper](https://arxiv.org/abs/2203.08304) for more! This codebase is built off the [hyperformer codebase](https://github.com/rabeehk/hyperformer), with the following major changes: 19 | - Added several tasks and relevant preprocessing, including MRQA (with and without sliding windows), xsum, CNN/Daily Mail, Wiki Lingua, abductive NLI, and adversarial NLI. 20 | - Fixed some minor bugs including the 'split validation test' not being applied to the training set. 21 | - Added new adapter and parameter generation code in `hyperdecoder/modeling`, and removed the old adapter code. Added relevant training arguments for these setups (encoder/decoder adapter sizes, etc). 22 | - Updated the trainer to save copies of generated answers along with likelihood scores for MRQA evaluation. 23 | 24 | ## Installation 25 | 26 | Install pytorch (1.10 recommended). Install required packages, preferably in a virtualenv: `pip install -r requirements.txt`. 27 | 28 | ## Training 29 | 30 | Navigate into the `hyperdecoder` directory, and then you can run any configuration with `python finetune_trainer.py configs/`. Please note training only works for non-distributed setups - sorry! 
31 | 32 | For example, for GLUE, the Hyperdecoder model can be run with `python finetune_trainer.py configs/glue_ablations/glue_manual_generated.json`. The trained model will be placed in `hyperdecoder/output`, and the evaluation logs can be found in `hyperdecoder/output/log.txt`. You can control how often the model is evaluated and saved with `eval_steps` and `save_steps` in the config. 33 | 34 | ## Config 35 | 36 | Some useful config items: 37 | - `{de,en}coder_adapter`: controls how we adapt the encoder/decoder. Can be `none` (no adapters), `manual` (regular adapters), `generated` (generated adapters). Note `generated` in the encoder results in the encoder being run twice: once with adapters to produce an embedding that is then used to adapt the encoder for a second run (the output of which is passed to the decoder as usual). 38 | - `freeze_model/unfreeze_{en,de}coder_adapters/unfreeze_{en,de}coder`: freeze/unfreeze the relevant parts of the model for training. This is accomplished through the `requires_grad` flag. Usually we freeze the whole model and then unfreeze the encoder/decoder adapter bits. 39 | - `max_steps`: controls how many training steps. Note that `num_train_epochs` is ignored when this is set, we just train based on steps and do not distinguish any sort of epoch boundary. 40 | - `{en,de}coder_adapter_dim`: controls the adapter bottleneck size. You can control separately for encoder/decoder. 41 | - `hypernetwork_bottleneck`: controls the hypernetwork bottleneck size (see paper for details on this). 42 | - `split_validation_test`: split the validation sets of datasets into validation and test splits, so we can early-stop based on validation metrics and then eval on the test split. This is what we do for most experiments in our paper. 43 | 44 | Most other config options are hopefully either straightforward or do not need to be changed. Note that the hyperdecoder model is achieved by setting `encoder_adapter: manual, decoder_adapter: generated`. 
45 | 46 | The primary configs to use are: 47 | - `glue_ablations/glue_manual_generated.json`: the main GLUE-trained hyperdecoder model from table 1. 48 | - `mrqa_experiments/unbalanced_manual_generated_mrqa.json`: the MRQA hyperdecoder model from tables 4/5. 49 | 50 | There are many other config files from other runs that correspond either to other models in the paper or stuff we tried during the development of this work. 51 | 52 | ### MRQA Evaluation 53 | 54 | Due to the sliding window nature of MRQA, evaluation is performed separately from running the model. When running evaluation with MRQA, the model will at the end output answer files for the validation and test sets as `predicted_answers.json` and `predicted_answers_test.json`. 55 | 56 | After getting these files, navigate into `mrqa_eval` and run the `construct_eval_folders.sh` script, which will download the MRQA evaluation data for you and place it in useful folders. You can then run evaluation on *in-domain* data as follows (note the in-domain data is treated as validation data and so predictions are output every evaluation phase): 57 | 58 | `for file in in-domain/*.gz; do echo $file; python eval.py $file ; done` 59 | 60 | The *out-domain* data can be evaluated similarly (note the out-domain predictions are only generated during test phases): 61 | 62 | `for file in out-domain/*.gz; do echo $file; python eval.py $file ; done` 63 | 64 | In both cases, you will get terminal output that prints (a) the name of the dataset being evaluated, and then (b) the performance on that particular dataset. Note our evaluation script is the same as the original MRQA evaluation script but with some extra code to handle picking the highest likelihood answer (as the model output saves these scores but does not filter on them). As such, it is fairly simple to convert our `predicted_answers.json` files to the format needed for the original MRQA evaluation script. 
"""Package bootstrap: make modules inside this directory importable top-level.

Inserting the package directory high on ``sys.path`` lets sibling modules be
imported without the ``hyperdecoder.`` prefix.
"""
import os
import sys

# Resolve this package's real (symlink-free) directory and slot it in just
# after the script directory, so it wins over later site-packages entries.
_PACKAGE_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(1, _PACKAGE_DIR)
"split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_norm_input": true, 42 | "encoder_adapter_dim": 64, 43 | "decoder_adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/generate_ablations_script.py: -------------------------------------------------------------------------------- 1 | """small script to just generate the enc/dec variants for ablations""" 2 | import json 3 | 4 | with open("glue_none_none.json") as f: 5 | base_config = json.load(f) 6 | 7 | for enc_setup in ["none", "manual", "task", "generated"]: 8 | for dec_setup in ["none", "manual", "task", "generated"]: 9 | config = base_config.copy() 10 | config["encoder_adapter"] = enc_setup 11 | # if ff, unfreeze all, else, just adapter stuff 12 | if enc_setup == "none": 13 | config["unfreeze_encoder_adapters"] = False 14 | config["unfreeze_encoder"] = True 15 | else: 16 | config["unfreeze_encoder_adapters"] = True 17 | config["unfreeze_encoder"] = False 18 | if enc_setup in ["task", "generated"]: 19 | config["adapter_norm_input"] = True 20 | config["decoder_adapter"] = dec_setup 21 | # if ff, unfreeze all, else, just 
adapter stuff 22 | if dec_setup == "none": 23 | config["unfreeze_decoder_adapters"] = False 24 | config["unfreeze_decoder"] = True 25 | else: 26 | config["unfreeze_decoder_adapters"] = True 27 | config["unfreeze_decoder"] = False 28 | # if both none, then dont freeze at all 29 | if enc_setup == "none" and dec_setup == "none": 30 | config["freeze_model"] = False 31 | else: 32 | config["freeze_model"] = True 33 | config["output_dir"] = f"glue_{enc_setup}_{dec_setup}" 34 | with open(f"glue_{enc_setup}_{dec_setup}.json", "w") as f: 35 | json.dump(config, f, indent=4) 36 | -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 64, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 
| "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 50, 42 | "hypernetwork_bottleneck": 100, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 64, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": 
false, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_generated_none", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "none", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 
44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_generated_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "generated", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 
48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 128, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "process_encoder_output": true, 28 | "evaluation_strategy": "steps", 29 | "adafactor": false, 30 | "save_steps": 1000, 31 | "eval_steps": 1000, 32 | "metric_for_best_model": "average_metrics", 33 | "greater_is_better": true, 34 | "max_steps": 65536, 35 | "print_num_parameters": true, 36 | "encoder_adapter": "manual", 37 | "decoder_adapter": "generated", 38 | "freeze_model": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "unfreeze_layer_norms": false, 44 | "adapter_norm_input": false, 45 | "encoder_adapter_dim": 64, 46 | "decoder_adapter_dim": 64, 47 | "hypernetwork_bottleneck": 128, 48 | 
"loss_scaling": false, 49 | "tasks": [ 50 | "rte", 51 | "sst2", 52 | "mrpc", 53 | "stsb", 54 | "qqp", 55 | "mnli", 56 | "qnli", 57 | "cola" 58 | ], 59 | "eval_tasks": [ 60 | "rte", 61 | "sst2", 62 | "mrpc", 63 | "stsb", 64 | "qqp", 65 | "mnli", 66 | "qnli", 67 | "cola" 68 | ] 69 | } 70 | -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated_no_mlp.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 64, 42 | "decoder_adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "process_encoder_output": false, 45 | "loss_scaling": false, 46 | "tasks": [ 
47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated_norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_norm_input": true, 42 | "encoder_adapter_dim": 64, 43 | "decoder_adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 
51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_generated_unbalanced.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "adapter_norm_input": false, 42 | "encoder_adapter_dim": 512, 43 | "decoder_adapter_dim": 36, 44 | "hypernetwork_bottleneck": 72, 45 | "loss_scaling": false, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | 
"cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "decoder_adapter_dim": 230, 42 | "encoder_adapter_dim": 230, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | 
"qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_manual_none", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "none", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 230, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } 
-------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_manual_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- 
/hyperdecoder/configs/glue_ablations/glue_none_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_none_generated", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": false, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_none_manual.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_none_manual", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 230, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_none_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": 
"google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "none", 36 | "freeze_model": false, 37 | "unfreeze_encoder_adapters": false, 38 | "unfreeze_decoder_adapters": false, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "tasks": [ 45 | "rte", 46 | "sst2", 47 | "mrpc", 48 | "stsb", 49 | "qqp", 50 | "mnli", 51 | "qnli", 52 | "cola" 53 | ], 54 | "eval_tasks": [ 55 | "rte", 56 | "sst2", 57 | "mrpc", 58 | "stsb", 59 | "qqp", 60 | "mnli", 61 | "qnli", 62 | "cola" 63 | ] 64 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_none_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_none_task", 6 | 
"max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "none", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": false, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": true, 40 | "unfreeze_decoder": false, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_generated.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | 
"test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 64, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "generated", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 
| "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "manual", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_none.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-v1_1-large", 3 | "tokenizer_name": "google/t5-v1_1-large", 4 | "learning_rate": 0.0003, 5 | "output_dir": "glue_task_none", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | 
"label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "none", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": true, 41 | "adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adapter_norm_input": true, 45 | "tasks": [ 46 | "rte", 47 | "sst2", 48 | "mrpc", 49 | "stsb", 50 | "qqp", 51 | "mnli", 52 | "qnli", 53 | "cola" 54 | ], 55 | "eval_tasks": [ 56 | "rte", 57 | "sst2", 58 | "mrpc", 59 | "stsb", 60 | "qqp", 61 | "mnli", 62 | "qnli", 63 | "cola" 64 | ] 65 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/glue_ablations/glue_task_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 128, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 100, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | 
"gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 1000, 29 | "eval_steps": 1000, 30 | "metric_for_best_model": "average_metrics", 31 | "greater_is_better": true, 32 | "max_steps": 65536, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "task", 35 | "decoder_adapter": "task", 36 | "freeze_model": true, 37 | "unfreeze_encoder_adapters": true, 38 | "unfreeze_decoder_adapters": true, 39 | "unfreeze_encoder": false, 40 | "unfreeze_decoder": false, 41 | "encoder_adapter_dim": 50, 42 | "decoder_adapter_dim": 50, 43 | "hypernetwork_bottleneck": 100, 44 | "loss_scaling": false, 45 | "adapter_norm_input": true, 46 | "tasks": [ 47 | "rte", 48 | "sst2", 49 | "mrpc", 50 | "stsb", 51 | "qqp", 52 | "mnli", 53 | "qnli", 54 | "cola" 55 | ], 56 | "eval_tasks": [ 57 | "rte", 58 | "sst2", 59 | "mrpc", 60 | "stsb", 61 | "qqp", 62 | "mnli", 63 | "qnli", 64 | "cola" 65 | ] 66 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/adamw_adapter_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | 
"logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "generated", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 64, 41 | "decoder_adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "loss_scaling": false, 44 | "adafactor": false, 45 | "tasks": ["mrqa"], 46 | "eval_tasks": ["mrqa"] 47 | } 48 | 49 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/adamw_full_finetune.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 64, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | 
"evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "none", 33 | "decoder_adapter": "none", 34 | "adapter_norm_input": true, 35 | "freeze_model": false, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "adapter_dim": 1, 41 | "hypernetwork_bottleneck": 1, 42 | "loss_scaling": false, 43 | "adafactor": false, 44 | "tasks": ["mrqa"], 45 | "eval_tasks": ["mrqa"] 46 | } 47 | 48 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/adapter_gen_layernorm.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.001, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 0, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 64, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "generated", 34 | "adapter_norm_input": true, 35 | "freeze_model": true, 36 | 
"unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "adapter_dim": 64, 41 | "hypernetwork_bottleneck": 128, 42 | "loss_scaling": false, 43 | "unfreeze_layer_norms": true, 44 | "adafactor": true, 45 | "lr_scheduler": "constant", 46 | "tasks": ["mrqa"], 47 | "eval_tasks": ["mrqa"] 48 | } 49 | 50 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/manual_adapter_control.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "epoch", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "manual", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 800, 41 | "decoder_adapter_dim": 2, 42 | "process_encoder_output": false, 43 | 
"loss_scaling": false, 44 | "adafactor": false, 45 | "report_to": "none", 46 | "tasks": ["mrqa"], 47 | "eval_tasks": ["mrqa"] 48 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/per_dataset_adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 64, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual_specific", 33 | "decoder_adapter": "manual_specific", 34 | "adapter_norm_input": true, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "adapter_dim": 64, 41 | "hypernetwork_bottleneck": 128, 42 | "loss_scaling": false, 43 | "adafactor": false, 44 | "tasks": ["mrqa"], 45 | "eval_tasks": ["mrqa"] 46 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/small_adapter_large_hypernetwork.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "epoch", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "manual", 33 | "decoder_adapter": "generated", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 64, 41 | "decoder_adapter_dim": 64, 42 | "hypernetwork_bottleneck": 128, 43 | "process_encoder_output": false, 44 | "loss_scaling": false, 45 | "adafactor": false, 46 | "report_to": "none", 47 | "tasks": ["mrqa"], 48 | "eval_tasks": ["mrqa"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/task_adamw_hypernet.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | 
"learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_strategy": "epoch", 29 | "metric_for_best_model": "average_metrics", 30 | "greater_is_better": true, 31 | "print_num_parameters": true, 32 | "encoder_adapter": "task", 33 | "decoder_adapter": "task", 34 | "adapter_norm_input": false, 35 | "freeze_model": true, 36 | "unfreeze_encoder_adapters": true, 37 | "unfreeze_decoder_adapters": true, 38 | "unfreeze_encoder": false, 39 | "unfreeze_decoder": false, 40 | "encoder_adapter_dim": 50, 41 | "decoder_adapter_dim": 50, 42 | "hypernetwork_bottleneck": 100, 43 | "loss_scaling": false, 44 | "adafactor": false, 45 | "tasks": ["mrqa"], 46 | "eval_tasks": ["mrqa"] 47 | } -------------------------------------------------------------------------------- /hyperdecoder/configs/mrqa_experiments/unbalanced_manual_generated_mrqa.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 64, 8 | "val_max_target_length": 64, 9 | "test_max_target_length": 64, 10 | "num_train_epochs": 4, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | 
"label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 4, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 5, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "epoch", 28 | "save_strategy": "epoch", 29 | "eval_steps": 1000, 30 | "save_steps": 1000, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "print_num_parameters": true, 34 | "encoder_adapter": "manual", 35 | "decoder_adapter": "generated", 36 | "adapter_norm_input": false, 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "encoder_adapter_dim": 512, 43 | "decoder_adapter_dim": 36, 44 | "hypernetwork_bottleneck": 72, 45 | "process_encoder_output": true, 46 | "loss_scaling": false, 47 | "adafactor": false, 48 | "report_to": "none", 49 | "tasks": ["mrqa"], 50 | "eval_tasks": ["mrqa"] 51 | } 52 | 53 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-small-lm-adapt", 3 | "tokenizer_name": "google/t5-small-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "nli", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | 
"logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | "process_encoder_output": false, 46 | 47 | "tasks": ["anli", "art", "mnli"], 48 | "eval_tasks": ["anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli_adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "nli_adapter", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | 
"load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | 46 | "tasks": ["anli", "alphanli", "mnli"], 47 | "eval_tasks": ["anli", "alphanli", "mnli"] 48 | } 49 | 50 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": 
true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "manual", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 512, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["anli", "art", "mnli"], 48 | "eval_tasks": ["anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/nli_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": false, 23 | "do_eval": false, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "task", 36 | "decoder_adapter": "task", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | 
"unfreeze_decoder": false, 43 | "adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["anli", "art", "mnli"], 48 | "eval_tasks": ["anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "summarise", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "none", 36 | "decoder_adapter": "none", 37 | "freeze_model": false, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": true, 41 | "unfreeze_decoder": true, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | 46 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua"], 47 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua"] 48 | } 49 | 50 | 
-------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_adapter.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-large-lm-adapt", 3 | "tokenizer_name": "google/t5-large-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "summarise_adapter", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "unfreeze_encoder_adapters": true, 39 | "unfreeze_decoder_adapters": true, 40 | "unfreeze_encoder": false, 41 | "unfreeze_decoder": false, 42 | "adapter_dim": 64, 43 | "hypernetwork_bottleneck": 128, 44 | "loss_scaling": false, 45 | 46 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua"], 47 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua"] 48 | } 49 | 50 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "manual", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 512, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"], 48 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | 
"max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 32, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "none", 36 | "decoder_adapter": "none", 37 | "freeze_model": false, 38 | "adapter_norm_input": false, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 48 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 
12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "generated", 37 | "freeze_model": true, 38 | "adapter_norm_input": false, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "encoder_adapter_dim": 512, 44 | "decoder_adapter_dim": 36, 45 | "hypernetwork_bottleneck": 72, 46 | "loss_scaling": false, 47 | 48 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 49 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 50 | } 51 | 52 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli_manual.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 
32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 2, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "manual", 36 | "decoder_adapter": "manual", 37 | "freeze_model": true, 38 | "adapter_norm_input": false, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "encoder_adapter_dim": 370, 44 | "decoder_adapter_dim": 370, 45 | "hypernetwork_bottleneck": 128, 46 | "loss_scaling": false, 47 | 48 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 49 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 50 | } 51 | 52 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_nli_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 32, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 2, 17 | 
"logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | "do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": true, 26 | "load_best_model_at_end": true, 27 | "evaluation_strategy": "steps", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "task", 36 | "decoder_adapter": "task", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "encoder_adapter_dim": 64, 44 | "decoder_adapter_dim": 64, 45 | "hypernetwork_bottleneck": 128, 46 | "loss_scaling": false, 47 | 48 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"], 49 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en", "anli", "art", "mnli"] 50 | } 51 | 52 | -------------------------------------------------------------------------------- /hyperdecoder/configs/xsum_nli/summarise_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name_or_path": "google/t5-base-lm-adapt", 3 | "tokenizer_name": "google/t5-base-lm-adapt", 4 | "learning_rate": 0.0003, 5 | "output_dir": "output", 6 | "max_source_length": 512, 7 | "max_target_length": 128, 8 | "val_max_target_length": 128, 9 | "test_max_target_length": 128, 10 | "num_train_epochs": 3, 11 | "warmup_steps": 500, 12 | "overwrite_output_dir": true, 13 | "label_smoothing": 0.1, 14 | "per_device_train_batch_size": 16, 15 | "per_device_eval_batch_size": 16, 16 | "gradient_accumulation_steps": 1, 17 | "logging_first_step": true, 18 | "logging_steps": 200, 19 | "save_total_limit": 1, 20 | "temperature": 10, 21 | 
"do_train": true, 22 | "do_test": true, 23 | "do_eval": true, 24 | "predict_with_generate": true, 25 | "split_validation_test": false, 26 | "load_best_model_at_end": false, 27 | "evaluation_strategy": "no", 28 | "save_steps": 5000, 29 | "eval_steps": 5000, 30 | "n_val": 1600, 31 | "metric_for_best_model": "average_metrics", 32 | "greater_is_better": true, 33 | "max_steps": 100000, 34 | "print_num_parameters": true, 35 | "encoder_adapter": "task", 36 | "decoder_adapter": "task", 37 | "freeze_model": true, 38 | "adapter_norm_input": true, 39 | "unfreeze_encoder_adapters": true, 40 | "unfreeze_decoder_adapters": true, 41 | "unfreeze_encoder": false, 42 | "unfreeze_decoder": false, 43 | "adapter_dim": 64, 44 | "hypernetwork_bottleneck": 128, 45 | "loss_scaling": false, 46 | 47 | "tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"], 48 | "eval_tasks": ["xsum", "cnn_dailymail", "wiki_lingua_english_en"] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /hyperdecoder/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .multitask_sampler import MultiTaskBatchSampler 2 | from .postprocessors import string_to_float, get_post_processor 3 | from .tasks import TASK_MAPPING, AutoTask 4 | from .utils import compute_task_max_decoding_length 5 | -------------------------------------------------------------------------------- /hyperdecoder/data/mrqa_preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | A little script to generate a chunked version of mrqa. 3 | For this version, we chunk the dataset into 512-length 4 | chunks, to simulate bert-style preprocessing. 
def chunk_sample(tokenizer, sample, stride=128, max_length=512, filter_nulls=False):
    """Split one MRQA sample into overlapping, BERT-style token chunks.

    Each yielded chunk is ``question: <q> context: <chunk>`` tokenized, with the
    answer string attached when at least one detected answer span lies entirely
    inside the chunk's character range.

    Args:
        tokenizer: a HF fast tokenizer; must support ``return_offsets_mapping``.
        sample: one MRQA example (``question``, ``context``, ``answers``,
            ``detected_answers.char_spans``, ``qid``, ``subset``).
        stride: token overlap between consecutive chunks.
        max_length: maximum total input length (question prefix + chunk + EOS).
        filter_nulls: if True, suppress chunks that contain no answer.

    Yields:
        dicts with ``input_ids``, ``answer``, and bookkeeping fields.

    Raises:
        ValueError: if the question prefix leaves too little room for the
            context (``remaining_length <= stride``), which would otherwise
            cause an infinite loop below.
    """
    initial_sample = f"question: {sample['question']} context: "
    init_input_ids = tokenizer(initial_sample, add_special_tokens=False)["input_ids"]
    start_len = len(init_input_ids)
    context = sample["context"]
    tokenized_output = tokenizer(context, return_offsets_mapping=True)
    # Drop the trailing EOS token and its dummy (0, 0) offset.
    context_tokens = tokenized_output["input_ids"][:-1]
    offsets = tokenized_output["offset_mapping"][:-1]
    remaining_length = max_length - start_len - 1  # reserve one slot for </s>
    if remaining_length <= stride:
        # The stride-overlap slice below would never shrink context_tokens,
        # so the while loop would spin forever. Fail loudly instead.
        raise ValueError(
            f"max_length ({max_length}) leaves no room beyond the question "
            f"prefix ({start_len} tokens) and stride ({stride})"
        )

    def detect_answer(offsets_chunk):
        # Hoisted out of the loop (was re-defined every iteration).
        # Assumes answer strings are listed in the same order as their
        # char spans — TODO confirm against the MRQA schema.
        for i, span in enumerate(sample["detected_answers"]["char_spans"]):
            # An answer may occur at several places in the context.
            for start, end in zip(span["start"], span["end"]):
                if start >= offsets_chunk[0][0] and end <= offsets_chunk[-1][-1]:
                    return sample["answers"][i]
        return ""  # no answer fully inside this chunk

    while len(context_tokens) > 0:
        chunk = context_tokens[:remaining_length] + [1]  # 1 == T5 </s> id
        offsets_chunk = offsets[:remaining_length]
        # Edge case: when the remainder fits entirely, finish up. Otherwise
        # the stride overlap would emit redundant extra chunks.
        if len(context_tokens) <= remaining_length:
            context_tokens = []
            offsets = []
        else:
            # Keep `stride` tokens of overlap with the previous chunk.
            context_tokens = context_tokens[remaining_length - stride :]
            offsets = offsets[remaining_length - stride :]
        chunk_ans = detect_answer(offsets_chunk)
        # Sometimes we want to drop chunks that carry no answer at all.
        if filter_nulls and chunk_ans == "":
            continue
        yield {
            "question": sample["question"],
            "context": sample["context"],
            "input_ids": init_input_ids + chunk,
            "answer": chunk_ans,
            "qid": sample["qid"],
            "subset": sample["subset"],
            "task": "mrqa",
        }


def chunk_dataset(tokenizer, dataset, stride=128, max_length=512, filter_nulls=False):
    """Yield chunked samples (see chunk_sample) for every sample in *dataset*."""
    for sample in dataset:
        yield from chunk_sample(tokenizer, sample, stride, max_length, filter_nulls)


# testing
if __name__ == "__main__":
    from datasets import load_dataset
    from transformers import T5TokenizerFast

    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    mrqa = load_dataset("mrqa", split="validation")
    print(f"MRQA has {len(mrqa)} samples")
    print(f"First sample: {mrqa[0]}")
    chunked_ds = list(
        chunk_dataset(tokenizer, mrqa, stride=128, max_length=512, filter_nulls=True)
    )
    print(f"Chunked MRQA has {len(chunked_ds)} samples")
    print(f"First sample: {chunked_ds[0]}")
fashion.""" 16 | 17 | def __init__( 18 | self, 19 | dataset_sizes: List[int], 20 | batch_size: int, 21 | temperature: float, 22 | num_replicas: Optional[int] = None, 23 | rank: Optional[int] = None, 24 | seed: int = 0, 25 | shuffle: bool = True, 26 | ) -> None: 27 | """Constructor for MultiTaskBatchSampler. 28 | Args: 29 | dataset_sizes: a list of integers, specifies the number of samples in 30 | each dataset. 31 | batch_size: integer, specifies the batch size. 32 | temperature: float, temperature used for temperature sampling. The larger 33 | the value, the datasets are sampled equally, and for value of 0, the datasets 34 | will be sampled according to their number of samples. 35 | num_replicas: integer, specifies the number of processes. 36 | rank: integer, specifies the rank of the current process/ 37 | seed: integer, random seed. 38 | shuffle: bool, if set to true, the datasets will be shuffled in each epoch. 39 | """ 40 | if num_replicas is None: 41 | if not dist.is_available(): 42 | raise RuntimeError("Requires distributed package to be available") 43 | num_replicas = dist.get_world_size() 44 | if rank is None: 45 | if not dist.is_available(): 46 | raise RuntimeError("Requires distributed package to be available") 47 | rank = dist.get_rank() 48 | if rank >= num_replicas or rank < 0: 49 | raise ValueError( 50 | "Invalid rank {}, rank should be in the interval" 51 | " [0, {}]".format(rank, num_replicas - 1) 52 | ) 53 | self.num_replicas = num_replicas 54 | self.rank = rank 55 | self.batch_size = batch_size 56 | self.dataset_sizes = dataset_sizes 57 | # By default we drop the last elements if dataset is not divisible by the number of ranks. 
58 | self.rank_dataset_sizes = [ 59 | dataset_size // self.num_replicas for dataset_size in self.dataset_sizes 60 | ] 61 | self.dataset_offsets = torch.cumsum(torch.LongTensor([0] + dataset_sizes), 0) 62 | self.total_sizes = [ 63 | (dataset_size // self.num_replicas) * self.num_replicas 64 | for dataset_size in self.dataset_sizes 65 | ] 66 | self.temperature = temperature 67 | self.seed = seed 68 | self.epoch = 0 69 | self.num_batches_per_epoch = ( 70 | (np.sum(dataset_sizes) + self.batch_size - 1) 71 | // self.batch_size 72 | // self.num_replicas 73 | ) 74 | self.shuffle = shuffle 75 | 76 | def generate_tasks_distribution(self): 77 | """Given the dataset sizes computes the weights to sample each dataset 78 | according to the temperature sampling.""" 79 | total_size = sum(self.dataset_sizes) 80 | weights = np.array( 81 | [ 82 | (size / total_size) ** (1.0 / self.temperature) 83 | for size in self.dataset_sizes 84 | ] 85 | ) 86 | weights = weights / np.sum(weights) 87 | return torch.as_tensor(weights, dtype=torch.double) 88 | 89 | def __iter__(self): 90 | # Defines torch generator, to make random choices consistent across cores in 91 | # different epochs, the seed needs to be set based on seed and epoch. 92 | generator = torch.Generator() 93 | generator.manual_seed(self.seed + self.epoch) 94 | 95 | # Shuffles the datasets if shuffle is set to true. 96 | indices = [] 97 | for dataset_size in self.dataset_sizes: 98 | if self.shuffle: 99 | indices.append( 100 | torch.randperm(dataset_size, generator=generator).tolist() 101 | ) 102 | else: 103 | indices.append(list(range(dataset_size))) 104 | 105 | # Shards the datasets across the all processes. 
106 | self.rank_indices = [] 107 | for i in range(len(self.dataset_sizes)): 108 | self.rank_indices.append( 109 | indices[i][self.rank : self.total_sizes[i] : self.num_replicas] 110 | ) 111 | 112 | # To make the model consistent across different processes, since the 113 | # model is based on tasks, we need to make sure the same task is selected 114 | # across different processes. 115 | tasks_distribution: torch.Tensor = self.generate_tasks_distribution() 116 | 117 | # Chooses the tasks which will be used in each batch in one epoch. 118 | # With passing generator, we make sure this choice is consistent across 119 | # different processes. 120 | batch_task_assignments = torch.multinomial( 121 | tasks_distribution, 122 | self.num_batches_per_epoch, 123 | replacement=True, 124 | generator=generator, 125 | ) 126 | 127 | for batch_task in batch_task_assignments: 128 | # Gets the number of samples of the selected datasets available for the 129 | # current rank. 130 | num_task_samples = self.rank_dataset_sizes[batch_task] 131 | # Computes the random samples from the chosen dataset. 132 | indices = torch.randint( 133 | low=0, 134 | high=num_task_samples, 135 | size=(self.batch_size,), 136 | generator=generator, 137 | ).tolist() 138 | # Converts the selected indices to the global indices on the given dataset. 
139 | results = ( 140 | self.dataset_offsets[batch_task] 141 | + torch.tensor(self.rank_indices[batch_task])[indices] 142 | ).tolist() 143 | yield results 144 | 145 | def __len__(self): 146 | return self.num_batches_per_epoch 147 | 148 | def set_epoch(self, epoch): 149 | self.epoch = epoch 150 | 151 | 152 | class EvenMultiTaskSampler(MultiTaskBatchSampler[T_co]): 153 | """Sampler with even balance between datasets""" 154 | 155 | def generate_tasks_distribution(self): 156 | total_size = len(self.dataset_sizes) 157 | weights = np.array([(1 / total_size) for _ in self.dataset_sizes]) 158 | return torch.as_tensor(weights, dtype=torch.double) 159 | -------------------------------------------------------------------------------- /hyperdecoder/data/postprocessors.py: -------------------------------------------------------------------------------- 1 | def string_to_float(string, default=-1.0): 2 | """Converts string to float, using default when conversion not possible.""" 3 | try: 4 | return float(string) 5 | except ValueError: 6 | return default 7 | 8 | 9 | def string_to_int(string, default=-1): 10 | """Converts string to int, using default when conversion not possible.""" 11 | try: 12 | return int(string) 13 | except ValueError: 14 | return default 15 | 16 | 17 | def get_post_processor(task): 18 | """Returns post processor required to apply on the predictions/targets 19 | before computing metrics for each task.""" 20 | if task == "stsb": 21 | return string_to_float 22 | elif task in ["qqp", "cola", "mrpc"]: 23 | return string_to_int 24 | else: 25 | return None 26 | -------------------------------------------------------------------------------- /hyperdecoder/data/utils.py: -------------------------------------------------------------------------------- 1 | """Defines utilities for the tasks.""" 2 | 3 | import numpy as np 4 | from transformers import T5Tokenizer 5 | 6 | 7 | def round_stsb_target(label): 8 | """STSB maps two sentences to a floating point number between 1 
and 5 9 | representing their semantic similarity. Since we are treating all tasks as 10 | text-to-text tasks we need to convert this floating point number to a string. 11 | The vast majority of the similarity score labels in STSB are in the set 12 | [0, 0.2, 0.4, ..., 4.8, 5.0]. So, we first round the number to the closest 13 | entry in this set, and then we convert the result to a string (literally e.g. 14 | "3.4"). This converts STSB roughly into a 26-class classification dataset. 15 | Args: 16 | label: original label. 17 | Returns: 18 | A preprocessed label. 19 | """ 20 | return np.round((label * 5) / 5, decimals=1) 21 | 22 | 23 | tokenizer = T5Tokenizer.from_pretrained("t5-base") 24 | 25 | 26 | def compute_task_max_decoding_length(word_list): 27 | """Computes the max decoding length for the given list of words 28 | Args: 29 | word_list: A list of stringss. 30 | Returns: 31 | maximum length after tokenization of the inputs. 32 | """ 33 | max_len = 0 34 | for word in word_list: 35 | ids = tokenizer.encode(word) 36 | max_len = max(max_len, len(ids)) 37 | return max_len 38 | -------------------------------------------------------------------------------- /hyperdecoder/finetune_trainer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import datasets 4 | import json 5 | import logging 6 | import os 7 | from pathlib import Path 8 | import dataclasses 9 | 10 | from transformers import ( 11 | AutoTokenizer, 12 | HfArgumentParser, 13 | set_seed, 14 | ) 15 | from transformers.trainer_utils import EvaluationStrategy 16 | 17 | from modeling.adapter_t5 import ( 18 | T5WithAdapterConfig, 19 | T5ForConditionalGenerationWithAdapter, 20 | ) 21 | from third_party.trainers import T5Trainer 22 | from data import AutoTask 23 | from third_party.utils import TaskCollator, check_output_dir, MrqaTaskCollator 24 | from metrics import build_compute_metrics_fn 25 | from training_args import ( 26 | 
logger = logging.getLogger(__name__)


def remove_rank_info_from_argv(args):
    """Strip a leading ``--local_rank=N`` argument (injected by
    torch.distributed.launch) out of ``args`` in place and return it as a
    dict so it can be merged into a JSON config.

    NOTE(review): assumes the launcher passes "--local_rank=N" with an '=';
    confirm against the launch command if this ever sees space-separated args.
    """
    extra_parameters = {}
    if args[1].startswith("--local_rank"):
        extra_parameters.update({"local_rank": int(args[1].split("=")[-1])})
        del args[1]
    return extra_parameters


def main():
    """Entry point: parse arguments, build the adapter-augmented T5 model and
    the train/eval/test datasets, then run the requested phases."""
    # See all possible arguments in src/transformers/training_args.py or by
    # passing the --help flag to this script. We keep distinct sets of args
    # for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (
            ModelArguments,
            DataTrainingArguments,
            Seq2SeqTrainingArguments,
            AdapterTrainingArguments,
        )
    )

    # torch.distributed.launch adds a local_rank parameter; to still allow a
    # JSON config file we fold the rank back into the parsed dict.
    if (
        len(sys.argv) > 2
        and sys.argv[1].startswith("--local_rank")
        and (sys.argv[2].endswith(".json"))
    ):
        rank_info = remove_rank_info_from_argv(sys.argv)
        args_dict = json.loads(Path(sys.argv[1]).read_text())
        args_dict.update(rank_info)
        # BUG FIX: the parser wraps four dataclasses, so parse_dict returns a
        # 4-tuple; the original unpacked only three values, which raised
        # ValueError and left adapter_args undefined on this code path.
        model_args, data_args, training_args, adapter_args = parser.parse_dict(
            args_dict
        )
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        logger.warning("config path: %s", sys.argv[1])
        # A single .json argument: parse the file to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        (
            model_args,
            data_args,
            training_args,
            adapter_args,
        ) = parser.parse_args_into_dataclasses()
    check_output_dir(training_args)

    # Setup logging. The logfile's output folder must exist before telling the
    # logger to write there.
    os.makedirs(training_args.output_dir, exist_ok=True)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
        filename=os.path.join(training_args.output_dir, "log.txt"),
        filemode="w+",
    )
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    model_class = T5ForConditionalGenerationWithAdapter
    config_class = T5WithAdapterConfig

    # Load pretrained config, tokenizer and model. In distributed training the
    # .from_pretrained methods guarantee that only one local process
    # concurrently downloads model & vocab.
    config = config_class.from_pretrained(
        model_args.config_name
        if model_args.config_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    config.update(dataclasses.asdict(adapter_args))
    all_tasks = list(set(data_args.tasks + data_args.eval_tasks))
    # mrqa is a single 'task' with many sub-tasks
    if (
        "mrqa" in data_args.tasks + data_args.eval_tasks
        or "mrqa_reg" in data_args.tasks + data_args.eval_tasks
    ):
        all_tasks += [
            "HotpotQA",
            "NaturalQuestionsShort",
            "NewsQA",
            "SearchQA",
            "SQuAD",
            "TriviaQA-web",
        ]
    config.update({"tasks": all_tasks})

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    if model_args.not_load_t5_checkpoint:
        model = model_class(config=config)
    else:
        last_checkpoint_path = training_args.output_dir
        model_path = (
            model_args.model_name_or_path
            if (
                (
                    training_args.optimize_from_scratch
                    and not training_args.optimize_from_scratch_with_loading_model
                )
                or not os.path.exists(
                    os.path.join(last_checkpoint_path, "pytorch_model.bin")
                )
            )
            else last_checkpoint_path
        )
        logger.warning("model path loaded from : %s", model_path)
        model = model_class.from_pretrained(
            model_path,
            from_tf=".ckpt" in model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # Freeze/unfreeze parameters per the experiment configuration.
    if model_args.freeze_model:
        freeze_model(model)
    if model_args.unfreeze_encoder_adapters:
        unfreeze_adapter_params_encoder(model)
    if model_args.unfreeze_decoder_adapters:
        unfreeze_adapter_params_decoder(model)
    if model_args.unfreeze_encoder:
        unfreeze_encoder(model)
    if model_args.unfreeze_decoder:
        unfreeze_decoder(model)
    if model_args.unfreeze_layer_norms:
        unfreeze_layer_norms(model)

    if training_args.print_num_parameters:
        for name, param in model.named_parameters():
            if param.requires_grad:
                logger.info("Parameter name %s", name)
        total_trainable_params = sum(
            p.numel() for p in model.parameters() if p.requires_grad
        )
        total_params = sum(p.numel() for p in model.parameters())
        logger.info("Total trainable parameters %s", total_trainable_params)
        logger.info("Total parameters %s", total_params)

    # Gets the training/test/validation datasets.
    dataset_class = AutoTask
    # BUG FIX: initialise these so the trainer-construction kwargs below do
    # not hit a NameError when do_train is false.
    train_dataset = None
    dataset_sizes = None
    if training_args.do_train:
        train_datasets = [
            dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="train",
                n_obs=data_args.n_train,
                add_prefix=True,
                split_validation_test=training_args.split_validation_test,
            )
            for task in data_args.tasks
        ]
        if "mrqa" in data_args.tasks and data_args.filter_nulls:
            mrqa = train_datasets[data_args.tasks.index("mrqa")]
            mrqa.toggle_null_filter()
        dataset_sizes = [len(train_dataset) for train_dataset in train_datasets]
        train_dataset = datasets.concatenate_datasets(train_datasets)
    training_args.remove_unused_columns = False
    eval_datasets = (
        {
            task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="validation",
                n_obs=data_args.n_val,
                add_prefix=True,
                split_validation_test=training_args.split_validation_test,
            )
            for task in data_args.eval_tasks
        }
        if training_args.do_eval
        or training_args.evaluation_strategy != EvaluationStrategy.NO
        else None
    )
    test_dataset = (
        {
            task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="test",
                n_obs=data_args.n_test,
                add_prefix=True,
                split_validation_test=training_args.split_validation_test,
            )
            for task in data_args.eval_tasks
        }
        if training_args.do_test
        else None
    )

    # Defines the metrics for evaluation.
    compute_metrics_fn = (
        build_compute_metrics_fn(data_args.eval_tasks, tokenizer)
        if training_args.predict_with_generate
        else None
    )

    collator_class = TaskCollator
    compute_gen_probs = False
    # BUG FIX: eval_datasets is None when neither do_eval nor an evaluation
    # strategy is configured; the membership tests would raise TypeError.
    if eval_datasets is not None:
        if "mrqa" in eval_datasets:
            collator_class = MrqaTaskCollator
            compute_gen_probs = True
        elif "mrqa_reg" in eval_datasets:
            compute_gen_probs = True

    # Defines the trainer.
    trainer = T5Trainer(
        model=model,
        config=config,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_datasets,
        data_collator=collator_class(
            tokenizer,
            data_args,
            tpu_num_cores=training_args.tpu_num_cores,
        ),
        tokenizer=tokenizer,
        compute_metrics=None,
        multi_task_compute_metrics=compute_metrics_fn,
        data_args=data_args,
        compute_gen_probs=compute_gen_probs,
        dataset_sizes=dataset_sizes if training_args.do_train else None,
    )

    # Trains the model.
    if training_args.do_train:
        # BUG FIX: resolve the resume path on every rank. The original
        # computed model_path under is_world_process_zero(), leaving it
        # undefined on non-zero ranks even though trainer.train runs on all
        # of them. The computation is a pure, cheap path lookup.
        last_checkpoint_path = training_args.output_dir
        model_path = (
            model_args.model_name_or_path
            if (
                training_args.optimize_from_scratch
                or not os.path.exists(
                    os.path.join(last_checkpoint_path, "pytorch_model.bin")
                )
            )
            else last_checkpoint_path
        )
        if training_args.compute_time:
            torch.cuda.synchronize()  # wait for move to complete
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
        trainer.train(
            model_path=model_path
            if (
                os.path.exists(training_args.output_dir)
                and not training_args.optimize_from_scratch
            )
            else None,
        )
        if training_args.compute_time:
            torch.cuda.synchronize()  # wait for all_reduce to complete
            end.record()
            total_time = {"total_time": start.elapsed_time(end)}
            print("###### total_time ", total_time)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json")
            )
            tokenizer.save_pretrained(training_args.output_dir)

    if training_args.do_eval:
        trainer.evaluate()

    if training_args.do_test:
        # to avoid overwriting the eval predictions file
        trainer.answer_output_file = "predicted_answers_test.json"
        trainer.evaluate(test_dataset)


def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()
"""Defines different metrics used for evaluation of tasks."""
from collections import defaultdict
import functools
import math
from logging import getLogger
from statistics import mean
from typing import Callable, Dict, List, Tuple

import numpy as np
# BUG FIX: `import scipy` / `import sklearn` do not guarantee the `.stats` /
# `.metrics` submodules are loaded before attribute access; import them
# explicitly.
import scipy.stats
import sklearn.metrics

from third_party.utils import calculate_rouge, calculate_bleu, lmap
from transformers import EvalPrediction, PreTrainedTokenizer
from metrics.squad_scoring import f1_score, exact_match_score

logger = getLogger(__name__)


def rouge(predictions, targets) -> dict:
    """Computes rouge score."""
    return calculate_rouge(predictions, targets)


def bleu(predictions, targets) -> dict:
    """Computes bleu score."""
    return calculate_bleu(predictions, targets)


def accuracy(predictions, targets) -> dict:
    """Computes the average accuracy (as a percentage)."""
    return {"acc": 100 * ((np.array(predictions) == np.array(targets)).mean())}


def pearson_corrcoef(predictions, targets) -> dict:
    """Computes Pearson correlation coefficient (as a percentage)."""
    pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]

    # If all predictions are identical the correlation is nan; guard against
    # this and return 0 in that case.
    if math.isnan(pearson_corrcoef):
        pearson_corrcoef = 0
    return {"pearson_corrcoef": pearson_corrcoef}


def spearman_corrcoef(predictions, targets) -> dict:
    """Computes Spearman correlation coefficient (as a percentage)."""
    spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]

    # If all predictions are identical the correlation is nan; guard against
    # this and return 0 in that case.
    if math.isnan(spearman_corrcoef):
        spearman_corrcoef = 0
    return {"spearman_corrcoef": spearman_corrcoef}


def f1_score_with_invalid(predictions, targets) -> dict:
    """Computes F1 score, with any prediction != 0 or 1 counted as incorrect.

    Args:
        targets: list of targets, either 0 or 1
        predictions: list of predictions, any integer value
    Returns:
        F1 score, where any prediction != 0 or 1 is counted as wrong.
    """
    targets = np.asarray(targets)
    # BUG FIX: copy so we never mutate a caller-owned ndarray in place.
    predictions = np.array(predictions)
    # Get indices of invalid predictions.
    invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
    # For any prediction != 0 or 1, set it to the opposite of its target so it
    # scores as incorrect.
    predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
    return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}


# TODO: maybe guard against invalid values
# https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
    """Computes the Matthews correlation coefficient (as a percentage)."""
    return {"mcc": 100 * sklearn.metrics.matthews_corrcoef(targets, predictions)}


def squad_metrics(predictions, targets) -> dict:
    """Computes mean SQuAD-style F1 and exact match over (pred, target) pairs.

    NOTE(review): unlike the metrics above these are returned on a 0-1 scale,
    not multiplied by 100 — preserved as-is since downstream aggregation may
    rely on it; confirm before changing.
    """
    d = defaultdict(list)
    for p, t in zip(predictions, targets):
        d["f1"].append(f1_score(p, t))
        d["em"].append(exact_match_score(p, t))
    return {"f1": mean(d["f1"]), "em": mean(d["em"])}


def build_compute_metrics_fn(
    task_names: List[str], tokenizer: PreTrainedTokenizer
) -> Callable[[EvalPrediction], Dict]:
    """Builds a dictionary from each task name to that task's metric fn."""

    def non_pad_len(tokens: np.ndarray) -> int:
        return np.count_nonzero(tokens != tokenizer.pad_token_id)

    def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
        pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
        # Replace ignore-index labels (e.g. -100) with pad so decoding works;
        # this intentionally mutates pred.label_ids in place.
        pred.label_ids[pred.label_ids < 0] = 0
        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
        pred_str = lmap(str.strip, pred_str)
        label_str = lmap(str.strip, label_str)
        return pred_str, label_str

    def compute_metrics(pred: EvalPrediction, metrics, post_processor=None) -> Dict:
        pred_str, label_str = decode_pred(pred)

        # Applies task post-processor (e.g. string -> float for stsb).
        if post_processor is not None:
            pred_str = [post_processor(pred) for pred in pred_str]
            label_str = [post_processor(label) for label in label_str]

        eval_results = {}
        for metric in metrics:
            eval_results.update(metric(pred_str, label_str))
            if metric.__name__ in ["bleu", "rouge"]:
                gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
                eval_results.update({"gen_len": gen_len})
        return eval_results

    def tasks_metrics(task) -> Dict:
        from data.tasks import TASK_MAPPING
        from data.postprocessors import get_post_processor

        return functools.partial(
            compute_metrics,
            metrics=TASK_MAPPING[task].metrics,
            post_processor=get_post_processor(task),
        )

    return {task: tasks_metrics(task) for task in task_names}
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | 14 | def remove_articles(text): 15 | return re.sub(r"\b(a|an|the)\b", " ", text) 16 | 17 | def white_space_fix(text): 18 | return " ".join(text.split()) 19 | 20 | def remove_punc(text): 21 | exclude = set(string.punctuation) 22 | return "".join(ch for ch in text if ch not in exclude) 23 | 24 | def lower(text): 25 | return text.lower() 26 | 27 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 28 | 29 | 30 | def f1_score(prediction, ground_truth): 31 | prediction_tokens = normalize_answer(prediction).split() 32 | ground_truth_tokens = normalize_answer(ground_truth).split() 33 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 34 | num_same = sum(common.values()) 35 | if num_same == 0: 36 | return 0 37 | precision = 1.0 * num_same / len(prediction_tokens) 38 | recall = 1.0 * num_same / len(ground_truth_tokens) 39 | f1 = (2 * precision * recall) / (precision + recall) 40 | return f1 41 | 42 | 43 | def exact_match_score(prediction, ground_truth): 44 | return normalize_answer(prediction) == normalize_answer(ground_truth) 45 | 46 | 47 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 48 | scores_for_ground_truths = [] 49 | for ground_truth in ground_truths: 50 | score = metric_fn(prediction, ground_truth) 51 | scores_for_ground_truths.append(score) 52 | return max(scores_for_ground_truths) 53 | 54 | 55 | def evaluate(dataset, predictions): 56 | f1 = exact_match = total = 0 57 | for article in dataset: 58 | for paragraph in article["paragraphs"]: 59 | for qa in paragraph["qas"]: 60 | total += 1 61 | if qa["id"] not in predictions: 62 | message = ( 63 | "Unanswered question " + qa["id"] + " will receive score 0." 
64 | ) 65 | print(message, file=sys.stderr) 66 | continue 67 | ground_truths = list(map(lambda x: x["text"], qa["answers"])) 68 | prediction = predictions[qa["id"]] 69 | exact_match += metric_max_over_ground_truths( 70 | exact_match_score, prediction, ground_truths 71 | ) 72 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) 73 | 74 | exact_match = 100.0 * exact_match / total 75 | f1 = 100.0 * f1 / total 76 | 77 | return {"exact_match": exact_match, "f1": f1} 78 | 79 | 80 | if __name__ == "__main__": 81 | expected_version = "1.1" 82 | parser = argparse.ArgumentParser( 83 | description="Evaluation for SQuAD " + expected_version 84 | ) 85 | parser.add_argument("dataset_file", help="Dataset file") 86 | parser.add_argument("prediction_file", help="Prediction File") 87 | args = parser.parse_args() 88 | with open(args.dataset_file) as dataset_file: 89 | dataset_json = json.load(dataset_file) 90 | if dataset_json["version"] != expected_version: 91 | print( 92 | "Evaluation expects v-" 93 | + expected_version 94 | + ", but got dataset with v-" 95 | + dataset_json["version"], 96 | file=sys.stderr, 97 | ) 98 | dataset = dataset_json["data"] 99 | with open(args.prediction_file) as prediction_file: 100 | predictions = json.load(prediction_file) 101 | print(json.dumps(evaluate(dataset, predictions))) 102 | -------------------------------------------------------------------------------- /hyperdecoder/modeling/adapter_generators.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def hyperfanin_init_weight(linear_layer, hypernet_in, mainnet_in): 8 | bound = 1e-3 * math.sqrt(3 / (hypernet_in * mainnet_in)) 9 | nn.init.uniform_(linear_layer.weight, -bound, bound) 10 | nn.init.constant_(linear_layer.bias, 0.0) 11 | 12 | 13 | def hyperfanin_init_bias(linear_layer, hypernet_in): 14 | bound = 1e-3 * math.sqrt(3 / (hypernet_in)) 15 | 
nn.init.uniform_(linear_layer.weight, -bound, bound) 16 | nn.init.constant_(linear_layer.bias, 0.0) 17 | 18 | 19 | class SimpleGenerator(nn.Module): 20 | def __init__(self, config, input_dim, hidden_size, is_encoder=False): 21 | super().__init__() 22 | adapter_dim = ( 23 | config.encoder_adapter_dim if is_encoder else config.decoder_adapter_dim 24 | ) 25 | self.input_dim = input_dim 26 | self.hidden_dim = config.hypernetwork_bottleneck 27 | self.linear1 = nn.Linear(self.input_dim, self.hidden_dim) 28 | self.activation_fn = nn.ReLU() 29 | # output weights 30 | self.weight_up = nn.Linear(self.hidden_dim, hidden_size * adapter_dim) 31 | self.weight_down = nn.Linear(self.hidden_dim, hidden_size * adapter_dim) 32 | self.bias_up = nn.Linear(self.hidden_dim, hidden_size) 33 | self.bias_down = nn.Linear(self.hidden_dim, adapter_dim) 34 | # init weights 35 | hyperfanin_init_weight(self.weight_up, self.hidden_dim, adapter_dim) 36 | hyperfanin_init_weight(self.weight_down, self.hidden_dim, hidden_size) 37 | hyperfanin_init_bias(self.bias_up, self.hidden_dim) 38 | hyperfanin_init_bias(self.bias_down, self.hidden_dim) 39 | 40 | def forward(self, x): 41 | x = self.linear1(x) 42 | x = self.activation_fn(x) 43 | return ( 44 | self.weight_up(x), 45 | self.weight_down(x), 46 | self.bias_up(x), 47 | self.bias_down(x), 48 | ) 49 | 50 | 51 | class ParameterGenerator(nn.Module): 52 | def __init__(self, config, hidden_size, is_encoder=False): 53 | super().__init__() 54 | self.config = config 55 | self.layer_embed = nn.Embedding(config.num_hidden_layers, 10) 56 | self.decoder = SimpleGenerator( 57 | config, config.hidden_size + 10, hidden_size, is_encoder=is_encoder 58 | ) 59 | 60 | def forward(self, hidden_inputs): 61 | layers = [] 62 | # setup idxs we need 63 | layers_idxs = torch.arange( 64 | 0, 65 | self.config.num_hidden_layers, 66 | dtype=torch.long, 67 | device=hidden_inputs.device, 68 | ) 69 | layers_idxs = layers_idxs.repeat(hidden_inputs.size(0), 1) 70 | for i in 
class AdapterLayer(nn.Module):
    """Bottleneck adapter whose weights can be supplied per-batch by a hypernetwork.

    When generated parameters have been installed via ``apply_adapter_params``
    the layer uses them; otherwise it falls back to its own learnt ("manual")
    adapter weights.
    """

    def __init__(self, config, is_encoder=False):
        super().__init__()
        # Encoder and decoder stacks may use different adapter bottleneck sizes.
        self.adapter_dim = (
            config.encoder_adapter_dim if is_encoder else config.decoder_adapter_dim
        )
        hidden_size = config.hidden_size
        self.input_dim = config.hidden_size
        self.output_dim = config.hidden_size
        # Per-batch generated parameters; None means "use the manual adapter".
        self.adapter_down_weight = None
        self.adapter_down_bias = None
        self.adapter_up_weight = None
        self.adapter_up_bias = None
        self.hidden_act = nn.ReLU()
        # Learnt fallback adapter; near-zero init so it starts close to a no-op.
        self.adapter_down_manual = nn.Linear(hidden_size, self.adapter_dim)
        self.adapter_up_manual = nn.Linear(self.adapter_dim, hidden_size)
        nn.init.xavier_uniform_(self.adapter_up_manual.weight, gain=1e-4)
        nn.init.xavier_uniform_(self.adapter_down_manual.weight, gain=1e-4)
        nn.init.constant_(self.adapter_up_manual.bias, 0.0)
        nn.init.constant_(self.adapter_down_manual.bias, 0.0)

    def clear_adapter(self):
        """Drop any generated parameters, reverting to the manual adapter."""
        self.adapter_down_weight = None
        self.adapter_down_bias = None
        self.adapter_up_weight = None
        self.adapter_up_bias = None

    def apply_adapter_params(self, bsz, uw, dw, ub, db):
        """Install flattened generated parameters for a batch of size ``bsz``."""
        self.adapter_down_weight = dw.view(bsz, self.input_dim, self.adapter_dim)
        self.adapter_down_bias = db.view(bsz, self.adapter_dim)
        self.adapter_up_weight = uw.view(bsz, self.adapter_dim, self.output_dim)
        self.adapter_up_bias = ub.view(bsz, self.output_dim)
| def forward(self, x): 42 | if self.adapter_down_weight is not None: 43 | x = (x @ self.adapter_down_weight) + self.adapter_down_bias.unsqueeze(1) 44 | x = self.hidden_act(x) 45 | x = (x @ self.adapter_up_weight) + self.adapter_up_bias.unsqueeze(1) 46 | else: 47 | x = self.adapter_down_manual(x) 48 | x = self.hidden_act(x) 49 | x = self.adapter_up_manual(x) 50 | return x # no residual connection - we let the user of this layer decide that 51 | 52 | 53 | class TaskSpecificAdapterLayer(nn.Module): 54 | def __init__(self, config, task_list, is_encoder=False): 55 | super().__init__() 56 | self.adapter_dim = ( 57 | config.encoder_adapter_dim if is_encoder else config.decoder_adapter_dim 58 | ) 59 | hidden_size = config.hidden_size 60 | task_list = config.tasks 61 | self.input_dim = hidden_size 62 | self.output_dim = hidden_size 63 | self.hidden_act = nn.ReLU() 64 | # learnt adapter + inits for it 65 | self.adapter_down_manual_weight = nn.Parameter( 66 | torch.randn(len(task_list), hidden_size, self.adapter_dim) 67 | ) 68 | self.adapter_down_manual_bias = nn.Parameter( 69 | torch.randn(len(task_list), 1, self.adapter_dim) 70 | ) 71 | self.adapter_up_manual_weight = nn.Parameter( 72 | torch.randn(len(task_list), self.adapter_dim, hidden_size) 73 | ) 74 | self.adapter_up_manual_bias = nn.Parameter( 75 | torch.randn(len(task_list), 1, hidden_size) 76 | ) 77 | 78 | nn.init.xavier_uniform_(self.adapter_down_manual_weight, gain=1e-4) 79 | nn.init.constant_(self.adapter_down_manual_bias, 0.0) 80 | nn.init.xavier_uniform_(self.adapter_up_manual_weight, gain=1e-4) 81 | nn.init.constant_(self.adapter_up_manual_bias, 0.0) 82 | # hacky method for setting task specific adapters 83 | self.adapter_down_weight_holder = None 84 | self.adapter_down_bias_holder = None 85 | self.adapter_up_weight_holder = None 86 | self.adapter_up_bias_holder = None 87 | 88 | def clear_adapter(self): 89 | self.adapter_down_weight_holder = None 90 | self.adapter_down_bias_holder = None 91 | 
    def set_indices(self, indices):
        """Select, for each example in the batch, that task's adapter tensors.

        ``indices`` is a tensor of task ids (one per batch element); the
        gathered slices are cached in the ``*_holder`` attributes consumed by
        ``forward``.
        """
        self.adapter_down_weight_holder = self.adapter_down_manual_weight[indices]
        self.adapter_down_bias_holder = self.adapter_down_manual_bias[indices]
        self.adapter_up_weight_holder = self.adapter_up_manual_weight[indices]
        self.adapter_up_bias_holder = self.adapter_up_manual_bias[indices]

    def forward(self, x):
        # Batched matmul applies each example's task-specific adapter.
        # Requires set_indices() to have been called first (holders non-None).
        # assumes x is (batch, seq, hidden) — TODO confirm against callers.
        x = (
            torch.bmm(x, self.adapter_down_weight_holder)
            + self.adapter_down_bias_holder
        )
        x = self.hidden_act(x)
        x = torch.bmm(x, self.adapter_up_weight_holder) + self.adapter_up_bias_holder
        return x
class T5WithAdapterConfig(T5Config):
    """T5 config extended with adapter / hypernetwork settings.

    ``encoder_adapter`` / ``decoder_adapter`` select the adapter strategy for
    each stack (values used elsewhere in this file include "none", "task",
    "generated" and "manual_specific").
    """

    def __init__(
        self,
        encoder_adapter_dim=64,
        decoder_adapter_dim=64,
        hypernetwork_bottleneck=128,
        encoder_adapter="task",
        decoder_adapter="task",
        tasks=None,
        adapter_norm_input=False,
        process_encoder_output=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.encoder_adapter_dim = encoder_adapter_dim
        self.decoder_adapter_dim = decoder_adapter_dim
        self.hypernetwork_bottleneck = hypernetwork_bottleneck
        self.encoder_adapter = encoder_adapter
        self.decoder_adapter = decoder_adapter
        # `tasks=None` avoids the shared-mutable-default-argument pitfall the
        # previous `tasks=[]` default had.
        self.tasks = tasks if tasks is not None else []
        # Read by T5LayerFFWithAdapter / T5StackWithAdapter below; previously
        # they had to be injected externally or an AttributeError was raised.
        self.adapter_norm_input = adapter_norm_input
        self.process_encoder_output = process_encoder_output
def mean_pooling(hidden_state, attention_mask):
    """Masked mean over the sequence dimension.

    Args:
        hidden_state: (batch, seq, hidden) activations.
        attention_mask: (batch, seq) mask — 1 for real tokens, 0 for padding.

    Returns:
        (batch, hidden) tensor: mean of the unmasked positions per example.
    """
    input_masked = hidden_state * attention_mask.unsqueeze(-1)
    # Clamp the token count at 1 so an all-padding row yields zeros instead of
    # NaNs (0/0); for any row with at least one real token this is a no-op.
    denom = attention_mask.sum(1).clamp(min=1).unsqueeze(-1)
    return input_masked.sum(1) / denom
config, config.hidden_size, is_encoder=not self.is_decoder 132 | ) 133 | self.adapter_task_embedding = nn.Embedding( 134 | len(self.config.tasks), self.config.d_model 135 | ) 136 | 137 | def forward( 138 | self, 139 | input_ids=None, 140 | encoder_hidden_states=None, 141 | tasks=None, 142 | **kwargs, 143 | ): 144 | # using input ids to determine whats going 145 | self.clear_adapters() 146 | if self.is_decoder and self.config.decoder_adapter == "generated": 147 | self.apply_params_to_adapters( 148 | encoder_hidden_states.size(0), 149 | self.param_gen( 150 | self.mlp( 151 | mean_pooling( 152 | encoder_hidden_states, kwargs["encoder_attention_mask"] 153 | ) 154 | ) 155 | ), 156 | ) 157 | elif (not self.is_decoder) and self.config.encoder_adapter == "generated": 158 | # for encoder generation, we first pass through the encoder, then set encoder adapters based on this. 159 | # currently using learnt adapters in the first pass, but potentially we could turn those off too? 160 | res = super().forward( 161 | input_ids=input_ids, 162 | encoder_hidden_states=encoder_hidden_states, 163 | **kwargs, 164 | ) 165 | self.apply_params_to_adapters( 166 | input_ids.size(0), 167 | self.param_gen( 168 | self.mlp( 169 | mean_pooling(res.last_hidden_state, kwargs["attention_mask"]) 170 | ) 171 | ), 172 | ) 173 | elif (self.is_decoder and self.config.decoder_adapter == "task") or ( 174 | not self.is_decoder and self.config.encoder_adapter == "task" 175 | ): 176 | # at test time, we only test one task at a time. 
177 | if not self.training: 178 | # simple sanity check 179 | if len(tasks) > 0: 180 | assert(tasks[0] == tasks[1] and tasks[1] == tasks[-1]) 181 | tasks = [tasks[0] for _ in range(input_ids.size(0))] 182 | indices = torch.tensor( 183 | [self.config.tasks.index(task) for task in tasks], 184 | device=input_ids.device, 185 | dtype=torch.long, 186 | ) 187 | task_embed = self.adapter_task_embedding(indices) 188 | self.apply_params_to_adapters(input_ids.size(0), self.param_gen(task_embed)) 189 | elif (self.is_decoder and self.config.decoder_adapter == "manual_specific") or ( 190 | not self.is_decoder and self.config.encoder_adapter == "manual_specific" 191 | ): 192 | indices = torch.tensor( 193 | [self.config.tasks.index(task) for task in tasks], 194 | device=input_ids.device, 195 | dtype=torch.long, 196 | ) 197 | self.apply_indices_to_adapters(indices) 198 | return super().forward( 199 | input_ids=input_ids, encoder_hidden_states=encoder_hidden_states, **kwargs 200 | ) 201 | 202 | def clear_adapters(self): 203 | for block in self.block: 204 | for layer in block.layer: 205 | if isinstance(layer, T5LayerFFWithAdapter): 206 | layer.adapter_layer.clear_adapter() 207 | 208 | def apply_params_to_adapters(self, batch_size, generated_params): 209 | for param, block in zip(generated_params, self.block): 210 | block.layer[-1].adapter_layer.apply_adapter_params(batch_size, *param) 211 | 212 | def apply_indices_to_adapters(self, indices): 213 | for block in self.block: 214 | block.layer[-1].adapter_layer.set_indices(indices) 215 | 216 | 217 | class T5ForConditionalGenerationWithAdapter(T5ForConditionalGeneration): 218 | def __init__(self, config): 219 | super().__init__(config) 220 | encoder_config = copy.deepcopy(config) 221 | encoder_config.is_decoder = False 222 | encoder_config.use_cache = False 223 | encoder_config.is_encoder_decoder = False 224 | self.encoder = T5StackWithAdapter(encoder_config, self.shared) 225 | 226 | decoder_config = copy.deepcopy(config) 227 | 
decoder_config.is_decoder = True 228 | decoder_config.is_encoder_decoder = False 229 | decoder_config.num_layers = config.num_decoder_layers 230 | self.decoder = T5StackWithAdapter(decoder_config, self.shared) 231 | 232 | self.init_weights() 233 | 234 | # required to pass tasks through 235 | def prepare_inputs_for_generation( 236 | self, 237 | input_ids, 238 | past=None, 239 | attention_mask=None, 240 | head_mask=None, 241 | decoder_head_mask=None, 242 | cross_attn_head_mask=None, 243 | use_cache=None, 244 | encoder_outputs=None, 245 | **kwargs, 246 | ): 247 | 248 | # cut decoder_input_ids if past is used 249 | if past is not None: 250 | input_ids = input_ids[:, -1:] 251 | 252 | return { 253 | "decoder_input_ids": input_ids, 254 | "past_key_values": past, 255 | "encoder_outputs": encoder_outputs, 256 | "attention_mask": attention_mask, 257 | "head_mask": head_mask, 258 | "decoder_head_mask": decoder_head_mask, 259 | "cross_attn_head_mask": cross_attn_head_mask, 260 | "use_cache": use_cache, 261 | "tasks": kwargs["tasks"], 262 | } 263 | 264 | def forward( 265 | self, 266 | input_ids=None, 267 | attention_mask=None, 268 | tasks=None, 269 | decoder_input_ids=None, 270 | decoder_attention_mask=None, 271 | head_mask=None, 272 | decoder_head_mask=None, 273 | cross_attn_head_mask=None, 274 | encoder_outputs=None, 275 | past_key_values=None, 276 | inputs_embeds=None, 277 | decoder_inputs_embeds=None, 278 | labels=None, 279 | use_cache=None, 280 | output_attentions=None, 281 | output_hidden_states=None, 282 | return_dict=None, 283 | ): 284 | use_cache = use_cache if use_cache is not None else self.config.use_cache 285 | return_dict = ( 286 | return_dict if return_dict is not None else self.config.use_return_dict 287 | ) 288 | 289 | # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask 290 | if head_mask is not None and decoder_head_mask is None: 291 | if self.config.num_layers == self.config.num_decoder_layers: 292 | 
warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) 293 | decoder_head_mask = head_mask 294 | 295 | # Encode if needed (training, first prediction pass) 296 | if encoder_outputs is None: 297 | # Convert encoder inputs in embeddings if needed 298 | encoder_outputs = self.encoder( 299 | input_ids=input_ids, 300 | attention_mask=attention_mask, 301 | tasks=tasks, 302 | inputs_embeds=inputs_embeds, 303 | head_mask=head_mask, 304 | output_attentions=output_attentions, 305 | output_hidden_states=output_hidden_states, 306 | return_dict=return_dict, 307 | ) 308 | elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): 309 | encoder_outputs = BaseModelOutput( 310 | last_hidden_state=encoder_outputs[0], 311 | hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, 312 | attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, 313 | ) 314 | 315 | hidden_states = encoder_outputs[0] 316 | 317 | if self.model_parallel: 318 | torch.cuda.set_device(self.decoder.first_device) 319 | 320 | if ( 321 | labels is not None 322 | and decoder_input_ids is None 323 | and decoder_inputs_embeds is None 324 | ): 325 | # get decoder inputs from shifting lm labels to the right 326 | decoder_input_ids = self._shift_right(labels) 327 | 328 | # Set device for model parallelism 329 | if self.model_parallel: 330 | torch.cuda.set_device(self.decoder.first_device) 331 | hidden_states = hidden_states.to(self.decoder.first_device) 332 | if decoder_input_ids is not None: 333 | decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) 334 | if attention_mask is not None: 335 | attention_mask = attention_mask.to(self.decoder.first_device) 336 | if decoder_attention_mask is not None: 337 | decoder_attention_mask = decoder_attention_mask.to( 338 | self.decoder.first_device 339 | ) 340 | 341 | # Decode 342 | decoder_outputs = self.decoder( 343 | input_ids=decoder_input_ids, 344 | attention_mask=decoder_attention_mask, 345 | tasks=tasks, 346 | 
inputs_embeds=decoder_inputs_embeds, 347 | past_key_values=past_key_values, 348 | encoder_hidden_states=hidden_states, 349 | encoder_attention_mask=attention_mask, 350 | head_mask=decoder_head_mask, 351 | cross_attn_head_mask=cross_attn_head_mask, 352 | use_cache=use_cache, 353 | output_attentions=output_attentions, 354 | output_hidden_states=output_hidden_states, 355 | return_dict=return_dict, 356 | ) 357 | 358 | sequence_output = decoder_outputs[0] 359 | 360 | # Set device for model parallelism 361 | if self.model_parallel: 362 | torch.cuda.set_device(self.encoder.first_device) 363 | self.lm_head = self.lm_head.to(self.encoder.first_device) 364 | sequence_output = sequence_output.to(self.lm_head.weight.device) 365 | 366 | if self.config.tie_word_embeddings: 367 | # Rescale output before projecting on vocab 368 | # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 369 | sequence_output = sequence_output * (self.model_dim ** -0.5) 370 | 371 | lm_logits = self.lm_head(sequence_output) 372 | 373 | loss = None 374 | if labels is not None: 375 | loss_fct = CrossEntropyLoss(ignore_index=-100) 376 | loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) 377 | # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 378 | 379 | if not return_dict: 380 | output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs 381 | return ((loss,) + output) if loss is not None else output 382 | 383 | return Seq2SeqLMOutput( 384 | loss=loss, 385 | logits=lm_logits, 386 | past_key_values=decoder_outputs.past_key_values, 387 | decoder_hidden_states=decoder_outputs.hidden_states, 388 | decoder_attentions=decoder_outputs.attentions, 389 | cross_attentions=decoder_outputs.cross_attentions, 390 | encoder_last_hidden_state=encoder_outputs.last_hidden_state, 391 | 
encoder_hidden_states=encoder_outputs.hidden_states, 392 | encoder_attentions=encoder_outputs.attentions, 393 | ) 394 | -------------------------------------------------------------------------------- /hyperdecoder/third_party/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018- The Hugging Face team. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 
36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 
124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. 
In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. 
We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /hyperdecoder/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import sys

import os

# Insert this vendored `third_party` directory near the front of sys.path so
# its modules (trainers, utils, ...) can be imported as top-level packages.
sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
def add_newline_to_end_of_each_sentence(x: str) -> str:
    """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
    # BUG FIX: the re.sub return value was previously discarded, making the
    # call a no-op.  NOTE(review): the pattern looks garbled -- upstream
    # versions of this helper strip the PEGASUS newline token "<n>" here;
    # confirm against the original HuggingFace source.
    x = re.sub("", "", x)  # remove pegasus newline char
    assert (
        NLTK_AVAILABLE
    ), "nltk must be installed to separate newlines between sentences. (pip install nltk)"
    return "\n".join(nltk.sent_tokenize(x))
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """
    Training arguments for seq2seq fine-tuning on top of HuggingFace
    ``TrainingArguments``: dropout overrides, label smoothing,
    generation-based evaluation, LR-scheduler selection, etc.
    """

    label_smoothing: Optional[float] = field(
        default=0.0,
        metadata={"help": "The label smoothing epsilon to apply (if not zero)."},
    )
    loss_scaling: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to scale loss by number of tokens."},
    )
    predict_with_generate: bool = field(
        default=False,
        metadata={
            "help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."
        },
    )
    adafactor: bool = field(
        default=False, metadata={"help": "whether to use adafactor"}
    )
    encoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Encoder layer dropout probability. Goes into model.config."},
    )
    decoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Decoder layer dropout probability. Goes into model.config."},
    )
    dropout: Optional[float] = field(
        default=None, metadata={"help": "Dropout probability. Goes into model.config."}
    )
    attention_dropout: Optional[float] = field(
        default=None,
        metadata={"help": "Attention dropout probability. Goes into model.config."},
    )
    lr_scheduler: Optional[str] = field(
        default="linear",
        metadata={
            "help": f"Which lr scheduler to use. Selected in {sorted(arg_to_scheduler.keys())}"
        },
    )
    temperature: Optional[int] = field(
        default=1,
        metadata={
            # BUG FIX: the adjacent string literals were concatenated without
            # a separating space ("temperaturevalue").
            "help": "Defines the temperature "
            "value for sampling across the multiple datasets."
        },
    )
    do_test: bool = field(
        default=False,
        # BUG FIX: "comptue" -> "compute".
        metadata={"help": "Whether to compute evaluation metrics on the test sets."},
    )
    eval_output_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "The output directory where the evaluation of the model and checkpoints during "
            "evaluation will be written. Would use the original output_dir if not specified."
        },
    )
    generate_classifier_weights: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If set, generates the weights of the classifier by using a hyper-network."
        },
    )
    optimize_from_scratch: Optional[bool] = field(
        default=False,
        metadata={
            # BUG FIX: missing space at the literal boundary ("fromthe").
            "help": "If set, this does not load the optimizers from "
            "the given model path."
        },
    )
    optimize_from_scratch_with_loading_model: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If set, it loads the model still but optimize from scratch."
        },
    )
    split_validation_test: Optional[bool] = field(
        default=False,
        metadata={
            # BUG FIX: several literal boundaries were missing spaces
            # ("nothave", "theirtest", ...).
            "help": "If set, for the datasets which do not "
            "have the test set, we use validation set as their "
            "test set and make a validation set from either "
            "splitting the validation set into half (for smaller "
            "than 10K samples datasets), or by using 1K examples "
            "from training set as validation set (for larger"
            " datasets)."
        },
    )
    # BUG FIX: was annotated Optional[str] although the default and the help
    # text show it is a boolean flag.
    print_num_parameters: Optional[bool] = field(
        default=False,
        metadata={"help": "If specified, prints the total number of parameters."},
    )
    compute_memory: Optional[bool] = field(
        default=False, metadata={"help": "If specified, measures the memory needed."}
    )
    compute_time: Optional[bool] = field(
        default=False, metadata={"help": "If specified, measures the time needed."}
    )
    report_to: Optional[List[str]] = field(
        default="none",
        metadata={
            "help": "The list of integrations to report the results and logs to."
        },
    )
144 | """ 145 | 146 | model_name_or_path: str = field( 147 | metadata={ 148 | "help": "Path to pretrained model or model identifier from huggingface.co/models" 149 | } 150 | ) 151 | not_load_t5_checkpoint: bool = field( 152 | default=False, metadata={"help": "whether to load the checkpoint."} 153 | ) 154 | config_name: Optional[str] = field( 155 | default=None, 156 | metadata={ 157 | "help": "Pretrained config name or path if not the same as model_name" 158 | }, 159 | ) 160 | tokenizer_name: Optional[str] = field( 161 | default=None, 162 | metadata={ 163 | "help": "Pretrained tokenizer name or path if not the same as model_name" 164 | }, 165 | ) 166 | cache_dir: Optional[str] = field( 167 | default=None, 168 | metadata={ 169 | "help": "Where do you want to store the pretrained models downloaded from s3" 170 | }, 171 | ) 172 | freeze_model: bool = field( 173 | default=True, metadata={"help": "Whether to freeze the model."} 174 | ) 175 | unfreeze_encoder_adapters: bool = field( 176 | default=True, metadata={"help": "Whether to unfreeze the encoder adapters."} 177 | ) 178 | unfreeze_decoder_adapters: bool = field( 179 | default=True, metadata={"help": "Whether to unfreeze the decoder adapters."} 180 | ) 181 | unfreeze_encoder: bool = field( 182 | default=False, metadata={"help": "Whether to unfreeze the encoder."} 183 | ) 184 | unfreeze_decoder: bool = field( 185 | default=False, metadata={"help": "Whether to unfreeze the decoder."} 186 | ) 187 | unfreeze_layer_norms: bool = field( 188 | default=False, metadata={"help": "Whether to unfreeze the layer norms."} 189 | ) 190 | 191 | 192 | @dataclass 193 | class DataTrainingArguments: 194 | """ 195 | Arguments related to data used for training and evaluation. 
196 | """ 197 | 198 | tasks: Optional[List[str]] = field( 199 | default="MRPC", 200 | metadata={"help": "Task name from the list of registered tasks."}, 201 | ) 202 | eval_tasks: Optional[List[str]] = field( 203 | default="MRPC", 204 | metadata={"help": "Evaluation task name from the list of registered tasks."}, 205 | ) 206 | adapters: Optional[List[str]] = field( 207 | default=None, 208 | metadata={"help": "Defines a dictionary from adapters to the tasks."}, 209 | ) 210 | task_embeddings: Optional[List[str]] = field( 211 | default=None, 212 | metadata={"help": "Defines a dictionary from tasks to the tasks embeddings."}, 213 | ) 214 | max_source_length: Optional[int] = field( 215 | default=128, 216 | metadata={ 217 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 218 | "than this will be truncated, sequences shorter will be padded." 219 | }, 220 | ) 221 | max_target_length: Optional[int] = field( 222 | default=128, 223 | metadata={ 224 | "help": "The maximum total sequence length for target text after tokenization. Sequences longer " 225 | "than this will be truncated, sequences shorter will be padded." 226 | }, 227 | ) 228 | val_max_target_length: Optional[int] = field( 229 | default=128, 230 | metadata={ 231 | "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " 232 | "than this will be truncated, sequences shorter will be padded." 233 | }, 234 | ) 235 | test_max_target_length: Optional[int] = field( 236 | default=128, 237 | metadata={ 238 | "help": "The maximum total sequence length for test target text after tokenization. Sequences longer " 239 | "than this will be truncated, sequences shorter will be padded." 240 | }, 241 | ) 242 | n_train: Optional[int] = field( 243 | default=-1, metadata={"help": "# training examples. -1 means use all."} 244 | ) 245 | n_val: Optional[int] = field( 246 | default=-1, metadata={"help": "# validation examples. 
-1 means use all."} 247 | ) 248 | n_test: Optional[int] = field( 249 | default=-1, metadata={"help": "# test examples. -1 means use all."} 250 | ) 251 | eval_beams: Optional[int] = field( 252 | default=None, metadata={"help": "# num_beams to use for evaluation."} 253 | ) 254 | ignore_pad_token_for_loss: bool = field( 255 | default=True, 256 | metadata={ 257 | "help": "If only pad tokens should be ignored. This assumes that `config.pad_token_id` is defined." 258 | }, 259 | ) 260 | data_seed: Optional[int] = field( 261 | default=42, metadata={"help": "The seed used to subsample the datasets."} 262 | ) 263 | ignore_metric_keys: Optional[Tuple[str]] = field( 264 | default=("xsum_eval_rouge1", "xsum_eval_rougeL", "xsum_eval_rougeLsum"), 265 | metadata={ 266 | "help": "Metric keys to ignore in calculating average for best model" 267 | }, 268 | ) 269 | filter_nulls: bool = field( 270 | default=False, 271 | metadata={ 272 | "help": "Whether to filter out nulls from the dataset. Only valid when using the chunked mrqa dataset" 273 | }, 274 | ) 275 | 276 | 277 | @dataclass 278 | class AdapterTrainingArguments: 279 | """Defines the adapters parameters.""" 280 | 281 | encoder_adapter: Optional[str] = field( 282 | default="manual", metadata={"help": "The encoder adapter to use."} 283 | ) 284 | decoder_adapter: Optional[str] = field( 285 | default="generated", metadata={"help": "The decoder adapter to use."} 286 | ) 287 | encoder_adapter_dim: Optional[int] = field( 288 | default=64, metadata={"help": "size of adapters in encoder."} 289 | ) 290 | decoder_adapter_dim: Optional[int] = field( 291 | default=64, metadata={"help": "size of adapters in decoder."} 292 | ) 293 | hypernetwork_bottleneck: Optional[int] = field( 294 | default=128, metadata={"help": "size of hypernetwork bottleneck dim"} 295 | ) 296 | adapter_norm_input: bool = field( 297 | default=False, 298 | metadata={"help": "Whether to use layer normed input into adapters or not."}, 299 | ) 300 | mean_task_embeddings: 
@dataclass
class AdapterTrainingArguments:
    """
    Configuration of the adapter modules and the hypernetwork that can
    generate them: which adapter variant each stack uses, adapter
    bottleneck sizes, and hypernetwork input/output processing options.
    """

    # Which adapter variant each stack uses (e.g. manually-parameterized vs
    # hypernetwork-generated).
    encoder_adapter: Optional[str] = field(
        default="manual", metadata={"help": "The encoder adapter to use."}
    )
    decoder_adapter: Optional[str] = field(
        default="generated", metadata={"help": "The decoder adapter to use."}
    )
    # Bottleneck dimensions of the adapter modules themselves.
    encoder_adapter_dim: Optional[int] = field(
        default=64, metadata={"help": "size of adapters in encoder."}
    )
    decoder_adapter_dim: Optional[int] = field(
        default=64, metadata={"help": "size of adapters in decoder."}
    )
    hypernetwork_bottleneck: Optional[int] = field(
        default=128, metadata={"help": "size of hypernetwork bottleneck dim"}
    )
    adapter_norm_input: bool = field(
        default=False,
        metadata={"help": "Whether to use layer normed input into adapters or not."},
    )
    mean_task_embeddings: bool = field(
        default=False,
        metadata={
            "help": "Whether to use average task embedding instead of task-specific or not."
        },
    )
    process_encoder_output: bool = field(
        default=True,
        metadata={
            "help": "Whether to pass the encoder output through a MLP before mean-pooling or not."
        },
    )
logger = getLogger(__name__)


def create_dir(output_dir):
    """
    Create output_dir (including missing parents) if it does not exist.

    Args:
        output_dir: path to the output_dir
    """
    # BUG FIX: the old check-then-create pattern raced with concurrent
    # processes (FileExistsError between the exists() check and makedirs());
    # exist_ok makes the call atomic and idempotent.
    os.makedirs(output_dir, exist_ok=True)


def handle_metrics(split, metrics, output_dir):
    """
    Prints and saves metrics or a general dictionary of results.

    Args:
        split: one of train, val, test, or training arguments; used as the
            prefix of the results filename.
        metrics: metrics dict
        output_dir: where to save the metrics.
    """
    logger.info(f"***** {split} metrics *****")
    for key in sorted(metrics.keys()):
        logger.info(f" {key} = {metrics[key]}")
    save_json_file(metrics, f"{split}_results.json", output_dir)


def save_json_file(json_dict, outfile_name, output_dir):
    """
    Saves the given dictionary as a json file named outfile_name inside
    output_dir (delegates to save_json).
    """
    save_json(json_dict, os.path.join(output_dir, outfile_name))
43 | """ 44 | save_json(json_dict, os.path.join(output_dir, outfile_name)) 45 | 46 | 47 | def get_training_args(arguments_list): 48 | """ 49 | Concatenate all training arguments except evaluation strategy which 50 | is not Json serializable. 51 | Args: 52 | arguments_list: list of dataclasses. 53 | Return: 54 | arguments: concatenated arguments. 55 | """ 56 | all_arguments = {} 57 | for arguments in arguments_list: 58 | all_arguments.update(asdict(arguments)) 59 | all_arguments.pop("evaluation_strategy") 60 | return all_arguments 61 | 62 | 63 | def get_last_checkpoint_path(output_dir): 64 | """ 65 | Finds the path for the last checkpoint saved in the output_dir 66 | Args: 67 | output_dir: output_dir 68 | Returns: 69 | path to the last checkpoint saved in the output dir. 70 | """ 71 | paths = glob.glob(os.path.join(output_dir, "checkpoint-*")) 72 | if len(paths) == 0: 73 | return output_dir 74 | else: 75 | checkpoints = [int(checkpoint.split("-")[-1]) for checkpoint in paths] 76 | max_checkpoint = max(checkpoints) 77 | return os.path.join(output_dir, "checkpoint-" + str(max_checkpoint)) 78 | 79 | 80 | def use_task_specific_params(model, task): 81 | """Update config with task specific params during evaluation.""" 82 | task_dataset = TASK_MAPPING[task] 83 | task_specific_config = task_dataset.task_specific_config 84 | if task_specific_config is not None: 85 | logger.info(f"using task specific params for {task}: {task_specific_config}") 86 | model.config.update(task_specific_config) 87 | 88 | 89 | def reset_config(model, config): 90 | """Resets the config file to the one provided.""" 91 | model.config = model.config.from_dict(config) 92 | logger.info(f"config is reset to the initial values.") 93 | 94 | 95 | def freeze_model(model): 96 | """Freezes the model weights.""" 97 | freeze_params(model) 98 | 99 | 100 | def unfreeze_adapter_params_encoder(model): 101 | for name, param in model.named_parameters(): 102 | if ( 103 | "adapter" in name or "mlp" in name or "param_gen" 
def _unfreeze_matching(model, predicate):
    """Set requires_grad=True on every parameter whose name satisfies predicate."""
    for name, param in model.named_parameters():
        if predicate(name):
            param.requires_grad = True


# CONSISTENCY: the five unfreeze_* helpers below were five copies of the same
# name-matching loop; they now share _unfreeze_matching.

def unfreeze_adapter_params_encoder(model):
    """Unfreeze adapter / hypernetwork parameters that live in the encoder."""
    _unfreeze_matching(
        model,
        lambda name: ("adapter" in name or "mlp" in name or "param_gen" in name)
        and "encoder" in name,
    )


def unfreeze_adapter_params_decoder(model):
    """Unfreeze adapter / hypernetwork parameters that live in the decoder."""
    _unfreeze_matching(
        model,
        lambda name: ("adapter" in name or "mlp" in name or "param_gen" in name)
        and "decoder" in name,
    )


def unfreeze_encoder(model):
    """Unfreeze every parameter whose name mentions the encoder."""
    _unfreeze_matching(model, lambda name: "encoder" in name)


def unfreeze_decoder(model):
    """Unfreeze every parameter whose name mentions the decoder."""
    _unfreeze_matching(model, lambda name: "decoder" in name)


def unfreeze_layer_norms(model):
    """Unfreeze every layer-norm parameter."""
    _unfreeze_matching(model, lambda name: "layer_norm" in name)
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    """Token-level F1 between normalized prediction and ground truth."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    # Early return also guards the divisions below against empty token lists.
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    """True iff the normalized strings are identical."""
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """
    Return the best metric_fn score of prediction against any ground truth.

    BUG FIX: returns 0 for an empty ground_truths list instead of letting
    max() raise ValueError on an empty sequence.
    """
    if not ground_truths:
        return 0
    return max(metric_fn(prediction, gt) for gt in ground_truths)


def read_predictions(prediction_file):
    """Load the prediction file (a JSON dict keyed by qid)."""
    with open(prediction_file) as f:
        predictions = json.load(f)
    return predictions


def read_answers(gold_file):
    """
    Load gold answers from a gzipped MRQA jsonl file.

    Returns a dict mapping qid -> list of acceptable answer strings.
    """
    answers = {}
    with gzip.open(gold_file, 'rb') as f:
        for i, line in enumerate(f):
            example = json.loads(line)
            # The first line of MRQA files is a header record, not an example.
            if i == 0 and 'header' in example:
                continue
            for qa in example['qas']:
                answers[qa['qid']] = qa['answers']
    return answers
def evaluate(answers, predictions, skip_no_answer=False):
    """
    Compute exact-match and F1 percentages over all answered questions.

    Args:
        answers: dict mapping qid -> list of gold answer strings.
        predictions: dict mapping qid -> list of (text, score) candidates.
        skip_no_answer: NOTE(review): accepted for CLI compatibility but
            currently unused -- qids missing from predictions are always
            skipped; confirm against the official MRQA script if strict
            scoring of unanswered questions is needed.

    Returns:
        dict with percentage 'exact_match' and 'f1' scores.
    """
    f1 = exact_match = total = 0
    for qid, ground_truths in answers.items():
        if qid not in predictions:
            continue
        total += 1
        # Drop empty-string guesses before choosing a candidate.
        filtered_preds = [pred for pred in predictions[qid] if pred[0] != '']
        if filtered_preds == []:
            prediction = ''
        else:
            # The candidate with the smallest score is selected; presumably
            # the second element is a null/negative-log score where lower is
            # better -- TODO confirm with the prediction-file producer.
            probs = [pred[1] for pred in filtered_preds]
            max_prob_index = probs.index(min(probs))
            prediction = filtered_preds[max_prob_index][0]

        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    # BUG FIX: guard against ZeroDivisionError when no qid in the gold file
    # appears in the prediction file.
    if total == 0:
        print('0.00 / 0.00')
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    print(f'{exact_match:.2f} / {f1:.2f}')
    return {'exact_match': exact_match, 'f1': f1}
transformers==4.14.1 49 | typing_extensions==4.0.1 50 | urllib3==1.26.7 51 | xxhash==2.0.2 52 | yarl==1.7.2 --------------------------------------------------------------------------------