├── .gitignore
├── LICENSE
├── README.md
├── configs
    ├── bert
    │   ├── houlsby_config.json
    │   ├── houlsby_plus_pals_config.json
    │   ├── low_rank_config.json
    │   ├── pals_config.json
    │   ├── private
    │   │   ├── lr_1024.json
    │   │   ├── lr_128.json
    │   │   ├── lr_16.json
    │   │   ├── lr_256.json
    │   │   ├── lr_32.json
    │   │   ├── lr_512.json
    │   │   ├── lr_64.json
    │   │   └── lr_8.json
    │   ├── raw_config.json
    │   ├── top_attn_config.json
    │   ├── top_low_rank_config.json
    │   ├── top_pals_config.json
    │   ├── vertical_config.json
    │   ├── vertical_low_rank.json
    │   ├── vertical_pals.json
    │   └── vertical_plus_low_rank_config.json
    ├── data
    │   └── private
    │   │   ├── lr_1024.json5
    │   │   ├── lr_128.json5
    │   │   ├── lr_16.json5
    │   │   ├── lr_256.json5
    │   │   ├── lr_32.json5
    │   │   ├── lr_512.json5
    │   │   ├── lr_64.json5
    │   │   └── lr_8.json5
    ├── default.json5
    └── main.json5
├── evaluate.py
├── figures
    └── overview.png
├── requirements.txt
├── src
    ├── __init__.py
    ├── bert.py
    ├── evaluator.py
    ├── interface.py
    ├── model.py
    ├── modeling.py
    ├── modules
    │   ├── __init__.py
    │   ├── alignment.py
    │   ├── connection.py
    │   ├── embedding.py
    │   ├── encoder.py
    │   ├── fusion.py
    │   ├── pooling.py
    │   └── prediction.py
    ├── trainer.py
    └── utils
    │   ├── __init__.py
    │   ├── loader.py
    │   ├── logger.py
    │   ├── metrics.py
    │   ├── params.py
    │   ├── registry.py
    │   └── vocab.py
└── train.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | /data/*
 2 | !/data/*.py
 3 | /models/
 4 | /resources/
 5 | /pred/
 6 | 
 7 | /.idea
 8 | __pycache__/
 9 | .DS_Store
10 | checkpoint
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # FedMatch
 2 | Repo for our paper, [FedMatch: Federated Learning Over Heterogeneous Question Answering Data](https://arxiv.org/abs/2108.05069), by Jiangui Chen, Ruqing Zhang, Jiafeng Guo, Yixing Fan, Xueqi Cheng.
 3 | 
 4 | ## Overview
 5 | 
 6 | ![Overview](./figures/overview.png)
 7 | 
 8 | We are excited to present our novel Federated Matching framework for QA, named FedMatch, with a backbone-patch architecture. It can leverage all the available QA data to boost model training while removing the need to directly exchange privacy-sensitive QA data among different participants. By decomposing the QA model in each participant into a shared module and a private module, it is able to leverage the common knowledge across different participants and capture the information of the local data in each participant. Empirical results show that our method can effectively improve performance by exploiting the useful information of multiple participants in a privacy-preserving way.
 9 | 
10 | ## Setup
11 | 
12 | - Install Python >= 3.6 and pip
13 | - `pip install -r requirements.txt`
14 | - Install [PyTorch](https://pytorch.org)
15 | 
16 | ## Download
17 | 
18 | - [FedQA](https://drive.google.com/file/d/1gwJwRW4PFPufht3ZYk0bUOfblId1eg_m/view?usp=sharing)
19 | 
20 | ## Acknowledgements
21 | - [transformers](https://github.com/huggingface/transformers)
22 | - [simple-effective-text-matching-pytorch](https://github.com/alibaba-edu/simple-effective-text-matching-pytorch)
23 | 
24 | ## Citation
25 | If you find our work useful, please consider citing our paper:
26 | 
27 | ```
28 | @inproceedings{chen2021fedmatch,
29 |   title={FedMatch: Federated Learning Over Heterogeneous Question Answering Data},
30 |   author={Chen, Jiangui and Zhang, Ruqing and Guo, Jiafeng and Fan, Yixing and Cheng, Xueqi},
31 |   booktitle={Proceedings of the 30th ACM International Conference on Information \& Knowledge Management},
32 |   pages={181--190},
33 |   year={2021}
34 | }
35 | ```
36 | 
37 | ## License
38 | This project is licensed under the Apache License 2.0.
39 | 


--------------------------------------------------------------------------------
/configs/bert/houlsby_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 50,
 3 |     "houlsby": true,
 4 |     "attention_probs_dropout_prob": 0.1,
 5 |     "hidden_act": "gelu",
 6 |     "hidden_dropout_prob": 0.1,
 7 |     "hidden_size": 768,
 8 |     "initializer_range": 0.02,
 9 |     "intermediate_size": 3072,
10 |     "max_position_embeddings": 512,
11 |     "num_attention_heads": 12,
12 |     "num_hidden_layers": 12,
13 |     "type_vocab_size": 2,
14 |     "vocab_size": 30522
15 |   }
16 | 


--------------------------------------------------------------------------------
/configs/bert/houlsby_plus_pals_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 156,
 3 |     "extra_dim": 50,
 4 |     "houlsby": true,
 5 |     "pals": true,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "max_position_embeddings": 512,
13 |     "num_attention_heads": 12,
14 |     "num_hidden_layers": 12,
15 |     "type_vocab_size": 2,
16 |     "vocab_size": 30522
17 |   }
18 | 


--------------------------------------------------------------------------------
/configs/bert/low_rank_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 100,
 3 |     "mult": true,
 4 |     "attention_probs_dropout_prob": 0.1,
 5 |     "hidden_act": "gelu",
 6 |     "hidden_dropout_prob": 0.1,
 7 |     "hidden_size": 768,
 8 |     "initializer_range": 0.02,
 9 |     "intermediate_size": 3072,
10 |     "max_position_embeddings": 512,
11 |     "num_attention_heads": 12,
12 |     "num_hidden_layers": 12,
13 |     "type_vocab_size": 2,
14 |     "vocab_size": 30522
15 | }
16 | 


--------------------------------------------------------------------------------
/configs/bert/pals_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 204,
 3 |     "mult": true,
 4 |     "pals": true,
 5 |     "attention_probs_dropout_prob": 0.1,
 6 |     "hidden_act": "gelu",
 7 |     "hidden_dropout_prob": 0.1,
 8 |     "hidden_size": 768,
 9 |     "initializer_range": 0.02,
10 |     "intermediate_size": 3072,
11 |     "max_position_embeddings": 512,
12 |     "num_attention_heads": 12,
13 |     "num_hidden_layers": 12,
14 |     "type_vocab_size": 2,
15 |     "vocab_size": 30522
16 |   }
17 |   


--------------------------------------------------------------------------------
/configs/bert/private/lr_1024.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 1024,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/private/lr_128.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 128,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/private/lr_16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 16,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/private/lr_256.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 256,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/private/lr_32.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 32,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/private/lr_512.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 512,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/private/lr_64.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 64,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/private/lr_8.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 8,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/raw_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "attention_probs_dropout_prob": 0.1,
 3 |     "hidden_act": "gelu",
 4 |     "hidden_dropout_prob": 0.1,
 5 |     "hidden_size": 768,
 6 |     "initializer_range": 0.02,
 7 |     "intermediate_size": 3072,
 8 |     "layer_norm_eps": 1e-12,
 9 |     "max_position_embeddings": 512,
10 |     "model_type": "bert",
11 |     "num_attention_heads": 12,
12 |     "num_hidden_layers": 12,
13 |     "pad_token_id": 0,
14 |     "type_vocab_size": 2,
15 |     "vocab_size": 30522
16 | }
17 | 


--------------------------------------------------------------------------------
/configs/bert/top_attn_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 204,
 3 |     "top": true,
 4 |     "attention_probs_dropout_prob": 0.1,
 5 |     "hidden_act": "gelu",
 6 |     "hidden_dropout_prob": 0.1,
 7 |     "hidden_size": 768,
 8 |     "initializer_range": 0.02,
 9 |     "intermediate_size": 3072,
10 |     "max_position_embeddings": 512,
11 |     "num_attention_heads": 12,
12 |     "num_hidden_layers": 12,
13 |     "type_vocab_size": 2,
14 |     "vocab_size": 30522
15 |   }
16 | 


--------------------------------------------------------------------------------
/configs/bert/top_low_rank_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 100,
 3 |     "top": true,
 4 |     "top_low_rank": true,
 5 |     "attention_probs_dropout_prob": 0.1,
 6 |     "hidden_act": "gelu",
 7 |     "hidden_dropout_prob": 0.1,
 8 |     "hidden_size": 768,
 9 |     "initializer_range": 0.02,
10 |     "intermediate_size": 3072,
11 |     "max_position_embeddings": 512,
12 |     "num_attention_heads": 12,
13 |     "num_hidden_layers": 12,
14 |     "type_vocab_size": 2,
15 |     "vocab_size": 30522
16 |   }
17 | 


--------------------------------------------------------------------------------
/configs/bert/top_pals_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 204,
 3 |     "top": true,
 4 |     "top_pals": true,
 5 |     "attention_probs_dropout_prob": 0.1,
 6 |     "hidden_act": "gelu",
 7 |     "hidden_dropout_prob": 0.1,
 8 |     "hidden_size": 768,
 9 |     "initializer_range": 0.02,
10 |     "intermediate_size": 3072,
11 |     "max_position_embeddings": 512,
12 |     "num_attention_heads": 12,
13 |     "num_hidden_layers": 12,
14 |     "type_vocab_size": 2,
15 |     "vocab_size": 30522
16 |   }
17 | 


--------------------------------------------------------------------------------
/configs/bert/vertical_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "vertical": true,
 3 |     "vertical_num_hidden_layers": 1,
 4 |     "attention_probs_dropout_prob": 0.1,
 5 |     "hidden_act": "gelu",
 6 |     "hidden_dropout_prob": 0.1,
 7 |     "hidden_size": 768,
 8 |     "initializer_range": 0.02,
 9 |     "intermediate_size": 3072,
10 |     "layer_norm_eps": 1e-12,
11 |     "max_position_embeddings": 512,
12 |     "model_type": "bert",
13 |     "num_attention_heads": 12,
14 |     "num_hidden_layers": 12,
15 |     "pad_token_id": 0,
16 |     "type_vocab_size": 2,
17 |     "vocab_size": 30522
18 | }
19 | 


--------------------------------------------------------------------------------
/configs/bert/vertical_low_rank.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 100,
 3 |     "vertical": true,
 4 |     "vertical_low_rank": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/vertical_pals.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 204,
 3 |     "vertical": true,
 4 |     "vertical_pals": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "layer_norm_eps": 1e-12,
13 |     "max_position_embeddings": 512,
14 |     "model_type": "bert",
15 |     "num_attention_heads": 12,
16 |     "num_hidden_layers": 12,
17 |     "pad_token_id": 0,
18 |     "type_vocab_size": 2,
19 |     "vocab_size": 30522
20 | }
21 | 


--------------------------------------------------------------------------------
/configs/bert/vertical_plus_low_rank_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "hidden_size_aug": 100,
 3 |     "mult": true,
 4 |     "vertical": true,
 5 |     "vertical_num_hidden_layers": 1,
 6 |     "attention_probs_dropout_prob": 0.1,
 7 |     "hidden_act": "gelu",
 8 |     "hidden_dropout_prob": 0.1,
 9 |     "hidden_size": 768,
10 |     "initializer_range": 0.02,
11 |     "intermediate_size": 3072,
12 |     "max_position_embeddings": 512,
13 |     "num_attention_heads": 12,
14 |     "num_hidden_layers": 12,
15 |     "type_vocab_size": 2,
16 |     "vocab_size": 30522
17 | }
18 | 


--------------------------------------------------------------------------------
/configs/data/private/lr_1024.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_1024',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_1024.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/data/private/lr_128.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_128',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_128.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/data/private/lr_16.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_16',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_16.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/data/private/lr_256.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_256',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_256.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/data/private/lr_32.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_32',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_32.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/data/private/lr_512.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_512',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_512.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/data/private/lr_64.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_64',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_64.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/data/private/lr_8.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     data_dir: [
 3 |         'data/processed/medquad',
 4 |         'data/processed/privacy',
 5 |         'data/processed/bioasq',
 6 |         'data/processed/fiqa',
 7 |         'data/processed/inqa_small',
 8 |     ],
 9 |     output_dir: 'private/lr_8',
10 |     metric: 'mrr',
11 |     watch_metrics: ['map'],
12 | 
13 |     bert: {
14 |         bert_config: 'configs/bert/private/lr_8.json',
15 |         init_checkpoint: 'checkpoint/bert-base-uncased-official/pytorch_model.bin',
16 |         droput: 0.2,
17 |     },
18 | 
19 |     routine: {
20 |         log_per_samples: [64, 128, 256, 512, 256],
21 |         eval_per_samples: [640, 1280, 2560, 5120, 2560],
22 |         eval_per_samples_warmup: [40000, 40000, 40000, 40000, 40000],
23 |         eval_warmup_samples: [0, 0, 0, 0, 0], // after this many steps warmup mode for eval ends
24 |         tolerance_samples: [25600000, 25600000, 25600000, 25600000, 25600000],
25 |         eval_epoch: true,
26 |         min_samples: [0, 0, 0, 0, 0], // train at least these many steps, not affected by early stopping
27 |     },
28 | 
29 |     optim: {
30 |         lr: 2e-5,
31 |         min_lr: 0,
32 |         lr_decay_rate: 1.0,
33 |         warmup_steps: [142, 180, 454, 1338, 360],
34 |         t_total: [1420, 1800, 4540, 13380, 3600],
35 |         batch_size: [12, 32, 32, 32, 32]
36 |     },
37 | 
38 |     fed: {
39 |         round: 11,
40 |         fed_type: 'fed_vertical',
41 |         sample: 'all'
42 |     },
43 | 
44 |     epochs: [1, 1, 1, 1, 1],
45 | 
46 |     max_len1: [25, 10, 15, 10, 10],
47 |     max_len2: [480, 40, 50, 100, 100],
48 | }


--------------------------------------------------------------------------------
/configs/default.json5:
--------------------------------------------------------------------------------
 1 | {
 2 |     basic: {
 3 |         output_dir: 'default',
 4 |         seed: null,
 5 |         cuda: true,
 6 |         multi_gpu: false,
 7 |         deterministic: true, // GPU deterministic mode, will slow down training
 8 |     },
 9 | 
10 |     data: {
11 |         data_dir: null,
12 |         min_df: 5,
13 |         max_vocab: 999999, // capacity for words including out of embedding words
14 |         max_len: 999, // large enough number, treated as unlimited
15 |         min_len: 1,
16 |         lower_case: true, // whether to treat the data and embedding as lowercase.
17 |         sort_by_len: false,
18 |         pretrained_embeddings: 'resources/glove.840B.300d.txt',
19 |         embedding_dim: 300,
20 |         embedding_mode: 'freq', // (options: 'freq', 'last', 'avg', 'strict') what to do when duplicated embedding tokens (after normalization) are found.
21 |     },
22 | 
23 |     model: {
24 |         hidden_size: 150,
25 |         dropout: 0.2,
26 |         blocks: 2,
27 |         fix_embeddings: true,
28 |         encoder: {
29 |             encoder: 'cnn', // cnn, lstm
30 |             enc_layers: 2,
31 |             kernel_sizes: [3],
32 |         },
33 |         alignment: 'linear', // linear, identity
34 |         fusion: 'full', // full, simple
35 |         connection: 'aug', // aug, residual
36 |         prediction: 'full', // full, symmetric, simple
37 | 
38 |     },
39 | 
40 |     logging: {
41 |         log_file: 'log.txt',
42 |         summary_per_logs: 20,
43 |         tensorboard: true,
44 |     },
45 | 
46 |     training: {
47 |         epochs: [10],
48 |         batch_size: [128],
49 |         grad_clipping: 5,
50 |         weight_decay: 0,
51 |         lr: 1e-3,
52 |         beta1: 0.9,
53 |         beta2: 0.999,
54 |         max_loss: 999., // tolerance for unstable training
55 |         lr_decay_rate: 0.95, // exp decay rate for lr
56 |         lr_decay_samples: 128000,
57 |         min_lr: 6e-5,
58 |         lr_warmup_samples: 0, // linear warmup steps for lr
59 |     },
60 | 
61 |     evaluation: {
62 |         // available metrics: acc, auc, f1, map, mrr
63 |         metric: 'acc', // for early stopping
64 |         watch_metrics: ['auc', 'f1'], // shown in logs
65 |         eval_file: 'dev',
66 |         eval_per_samples: [6400],
67 |         eval_per_samples_warmup: [40000],
68 |         eval_warmup_samples: [0], // after this many steps warmup mode for eval ends
69 |         min_samples: [0], // train at least these many steps, not affected by early stopping
70 |         tolerance_samples: [400000], // early stopping
71 |         eval_epoch: true, // eval after epoch
72 |         eval_subset: null,
73 |     },
74 | 
75 |     persistence: {
76 |         resume: null,
77 |         save: true,
78 |         save_all: false,
79 |     },
80 | }


--------------------------------------------------------------------------------
/configs/main.json5:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         name: 'benchmark',
 4 |         __parents__: [
 5 |             'default',
 6 |             'data/private/lr_32',
 7 |         ],
 8 |         __repeat__: 2,
 9 |         eval_file: 'test',
10 |     },
11 |     {
12 |         name: 'benchmark',
13 |         __parents__: [
14 |             'default',
15 |             'data/private/lr_64',
16 |         ],
17 |         __repeat__: 2,
18 |         eval_file: 'test',
19 |     },
20 |     {
21 |         name: 'benchmark',
22 |         __parents__: [
23 |             'default',
24 |             'data/private/lr_256',
25 |         ],
26 |         __repeat__: 2,
27 |         eval_file: 'test',
28 |     },
29 |     {
30 |         name: 'benchmark',
31 |         __parents__: [
32 |             'default',
33 |             'data/private/lr_512',
34 |         ],
35 |         __repeat__: 2,
36 |         eval_file: 'test',
37 |     },
38 |     {
39 |         name: 'benchmark',
40 |         __parents__: [
41 |             'default',
42 |             'data/private/lr_128',
43 |         ],
44 |         __repeat__: 2,
45 |         eval_file: 'test',
46 |     },
47 |     {
48 |         name: 'benchmark',
49 |         __parents__: [
50 |             'default',
51 |             'data/private/lr_16',
52 |         ],
53 |         __repeat__: 2,
54 |         eval_file: 'test',
55 |     },
56 |     {
57 |         name: 'benchmark',
58 |         __parents__: [
59 |             'default',
60 |             'data/private/lr_8',
61 |         ],
62 |         __repeat__: 2,
63 |         eval_file: 'test',
64 |     },
65 |     {
66 |         name: 'benchmark',
67 |         __parents__: [
68 |             'default',
69 |             'data/private/lr_1024',
70 |         ],
71 |         __repeat__: 2,
72 |         eval_file: 'test',
73 |     },
74 | ]


--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import sys
18 | from src.evaluator import Evaluator
19 | 
20 | 
def main():
    """Evaluate a saved model on a data file named on the command line.

    Exactly two command-line arguments are expected: the model path and the
    data file. With any other argument count a usage line is printed and
    nothing else happens.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print('Usage: "python evaluate.py $model_path $data_file"')
        return
    model_path, data_file = cli_args
    Evaluator(model_path, data_file).evaluate()


if __name__ == '__main__':
    main()
33 | 


--------------------------------------------------------------------------------
/figures/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chriskuei/FedMatch/305e8c4bbb398712b00c883a986dfec17b500f76/figures/overview.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | tqdm
 2 | nltk
 3 | numpy
 4 | scikit-learn
 5 | msgpack-python
 6 | tensorboardX
 7 | json5
 8 | torch
 9 | transformers
10 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chriskuei/FedMatch/305e8c4bbb398712b00c883a986dfec17b500f76/src/__init__.py


--------------------------------------------------------------------------------
/src/bert.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | from .modules.embedding import Embedding
 6 | from .modules import Module, ModuleList, ModuleDict
 7 | from .modeling import BertConfig, BertModel, BERTLayerNorm
 8 | 
 9 | 
class BERT(Module):
    """BERT encoder topped with a two-output linear head for text pairs."""

    def __init__(self, args):
        """Build encoder and head, initialise weights, optionally load weights.

        Reads ``args.bert_config`` (path of a JSON config file),
        ``args.dropout`` and ``args.init_checkpoint``; the checkpoint is
        loaded only when that last value is truthy.
        """
        super().__init__()
        self.config = BertConfig.from_json_file(args.bert_config)
        self.bert = BertModel(self.config)
        self.dropout = nn.Dropout(p=args.dropout)
        self.prediction = nn.Sequential(
            nn.Linear(self.config.hidden_size, 2),
        )

        std = self.config.initializer_range

        def init_weights(module):
            # Linear/embedding weights and both layer-norm parameters are
            # drawn from a zero-mean normal with the configured std; linear
            # biases are reset to zero.
            is_linear = isinstance(module, nn.Linear)
            if is_linear or isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=std)
            elif isinstance(module, BERTLayerNorm):
                for param in (module.beta, module.gamma):
                    param.data.normal_(mean=0.0, std=std)
            if is_linear and module.bias is not None:
                module.bias.data.zero_()

        self.apply(init_weights)

        if args.init_checkpoint:
            self.load_pretrained(args.init_checkpoint)

    def load_pretrained(self, init_checkpoint, patch=False, transfer=True):
        """Load ``init_checkpoint`` non-strictly and print the key mismatches.

        With ``transfer`` true the state dict is loaded into this whole
        module, otherwise only into ``self.bert``. ``patch`` is accepted but
        not used.
        """
        if transfer:
            print('Load all parameters')
            target = self
        else:
            print('Load Bert parameters')
            target = self.bert
        state_dict = torch.load(init_checkpoint, map_location='cpu')
        missing_keys, unexpected_keys = target.load_state_dict(
            state_dict, strict=False)
        print("missing keys: {}".format(missing_keys))
        print('unexpected keys: {}'.format(unexpected_keys))

    def register_parameters(self, init_checkpoint):
        """Register checkpoint entries as buffers named ``<key>__mean``.

        Dots in the key are replaced by underscores; entries whose key
        already contains ``'mean'`` are skipped.
        """
        checkpoint = torch.load(init_checkpoint, map_location='cpu')
        for key, tensor in checkpoint.items():
            if 'mean' in key:
                continue
            self.register_buffer(f"{key.replace('.', '_')}__mean", tensor)

    def forward(self, inputs):
        """Score a batch of text pairs.

        ``inputs['text1']`` and ``inputs['text2']`` are concatenated along
        the last dimension, with token type 0 for the first part and 1 for
        the second; the attention mask is true wherever the id is non-zero.
        Returns the head applied (after dropout) to element 1 of the encoder
        output.
        """
        first, second = inputs['text1'], inputs['text2']

        input_ids = torch.cat((first, second), dim=-1)
        segments = (torch.zeros_like(first), torch.ones_like(second))
        token_type_ids = torch.cat(segments, dim=-1).long()
        attention_mask = (input_ids != 0)

        encoded = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )

        return self.prediction(self.dropout(encoded[1]))
74 | 


--------------------------------------------------------------------------------
/src/evaluator.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import os
18 | import json
19 | from pprint import pprint
20 | from .model import Model
21 | from .interface import Interface
22 | from .utils.loader import load_data
23 | 
24 | 
class Evaluator:
    """Run a saved model over a data file and write its predictions to disk."""

    def __init__(self, model_path, data_file):
        """Remember the model checkpoint path and the data file to evaluate."""
        self.model_path = model_path
        self.data_file = data_file

    def evaluate(self):
        """Load data and model, predict, and save to ``pred/predictions.json``."""
        data = load_data(*os.path.split(self.data_file))
        model, checkpoint = Model.load(self.model_path)
        args = checkpoint['args']
        interface = Interface(args)
        # NOTE(review): Interface.pre_process in src/interface.py declares a
        # required positional dataset index ``i`` that is not passed here, so
        # this call looks like it raises TypeError -- confirm which index is
        # intended before relying on this entry point.
        batches = interface.pre_process(data, training=False, with_target=False)
        predictions = model.evaluate(batches, return_predict=True)
        self.save(data, predictions, 'pred/predictions.json')

    def save(self, data, predictions, path):
        """Write one JSON object per line: each datum plus its prediction.

        Args:
            data: iterable of dicts; each gets a ``'pred'`` key set in place.
            predictions: iterable paired with ``data`` by position (extra
                items on either side are ignored, as with ``zip``).
            path: output file; missing parent directories are created.
        """
        parent = os.path.dirname(path)
        if parent:
            # The default target lives under pred/, which may not exist yet.
            os.makedirs(parent, exist_ok=True)
        # Explicit UTF-8 because ensure_ascii=False emits raw non-ASCII text;
        # the context manager guarantees the file is flushed and closed.
        with open(path, 'w', encoding='utf-8') as out_file:
            for datum, pred in zip(data, predictions):
                datum['pred'] = pred
                out_file.write(f'{json.dumps(datum, ensure_ascii=False)}\n')
44 | 


--------------------------------------------------------------------------------
/src/interface.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (C) 2019 Alibaba Group Holding Limited
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | import os
 18 | import random
 19 | import msgpack
 20 | from transformers import AutoTokenizer
 21 | 
 22 | from .utils.vocab import Vocab, Indexer
 23 | from .utils.loader import load_data, load_embeddings
 24 | 
 25 | 
# Module-level tokenizer shared by Interface.process_sample; it is created
# once, when this module is imported.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 27 | 
 28 | 
 29 | class Interface:
 30 |     def __init__(self, args, log=None):
 31 |         self.args = args
 32 |         # build/load vocab and target map
 33 |         vocab_file = os.path.join(args.output_dir, 'vocab.txt')
 34 |         target_map_file = os.path.join(args.output_dir, 'target_map.txt')
 35 |         if not os.path.exists(vocab_file):
 36 |             data = []
 37 |             for dataset in self.args.data_dir:
 38 |                 data.extend(load_data(dataset))
 39 |             self.target_map = Indexer.build(
 40 |                 (sample['target'] for sample in data), log=log)
 41 |             self.target_map.save(target_map_file)
 42 |             self.vocab = Vocab.build(
 43 |                 (word for sample in data
 44 |                  for text in (sample['text1'], sample['text2'])
 45 |                  for word in text.split()[:self.args.max_len]),
 46 |                 lower=args.lower_case, min_df=self.args.min_df, log=log,
 47 |                 pretrained_embeddings=args.pretrained_embeddings,
 48 |                 dump_filtered=os.path.join(args.output_dir, 'filtered_words.txt'))
 49 |             self.vocab.save(vocab_file)
 50 | 
 51 |         else:
 52 |             self.target_map = Indexer.load(target_map_file)
 53 |             self.vocab = Vocab.load(vocab_file)
 54 |         args.num_classes = len(self.target_map)
 55 |         args.num_vocab = len(self.vocab)
 56 |         args.padding = Vocab.pad()
 57 | 
 58 |     def load_embeddings(self):
 59 |         """generate embeddings suited for the current vocab or load previously cached ones."""
 60 |         assert self.args.pretrained_embeddings
 61 |         embedding_file = os.path.join(
 62 |             self.args.output_dir, 'embedding.msgpack')
 63 |         if not os.path.exists(embedding_file):
 64 |             embeddings = load_embeddings(
 65 |                 self.args.pretrained_embeddings, self.vocab,
 66 |                 self.args.embedding_dim, mode=self.args.embedding_mode,
 67 |                 lower=self.args.lower_case)
 68 |             with open(embedding_file, 'wb') as f:
 69 |                 msgpack.dump(embeddings, f)
 70 |         else:
 71 |             with open(embedding_file, 'rb') as f:
 72 |                 embeddings = msgpack.load(f)
 73 |         return embeddings
 74 | 
 75 |     def pre_process(self, data, i, training=True, with_target=True):
 76 |         result = [
 77 |             self.process_sample(sample, i, with_target) for sample in data
 78 |         ]
 79 |         if training:
 80 |             result = list(
 81 |                 filter(
 82 |                     lambda x: len(x['text1']) < self.args.max_len and len(
 83 |                         x['text2']) < self.args.max_len, result
 84 |                 )
 85 |             )
 86 |             if not self.args.sort_by_len:
 87 |                 return result
 88 |             result = sorted(result, key=lambda x: (
 89 |                 len(x['text1']), len(x['text2']), x['text1']))
 90 |         batch_size = self.args.batch_size[i]
 91 |         return [self.make_batch(result[i:i + batch_size]) for i in range(0, len(data), batch_size)]
 92 | 
 93 |     def process_sample(self, sample, i, with_target=True):
 94 |         text1 = sample['text1']
 95 |         text2 = sample['text2']
 96 | 
 97 |         sen1 = tokenizer.encode(text1)[:self.args.max_len1[i]]
 98 |         sen2 = tokenizer.encode(text2)[:self.args.max_len2[i]]
 99 | 
100 |         sen1.insert(0, 101)
101 |         sen1.insert(self.args.max_len1[i]+1, 102)
102 |         sen2.insert(self.args.max_len2[i]+1, 102)
103 | 
104 |         processed = {
105 |             'text1': sen1,
106 |             'text2': sen2
107 |         }
108 | 
109 |         if 'target' in sample and with_target:
110 |             target = sample['target']
111 |             assert target in self.target_map
112 |             processed['target'] = self.target_map.index(target)
113 |         return processed
114 | 
115 |     def shuffle_batch(self, data, i):
116 |         data = random.sample(data, len(data))
117 |         if self.args.sort_by_len:
118 |             return data
119 |         batch_size = self.args.batch_size[i]
120 |         batches = [
121 |             data[i:i + batch_size]
122 |             for i in range(0, len(data), batch_size)]
123 |         return list(map(self.make_batch, batches))
124 | 
125 |     def make_batch(self, batch, with_target=True):
126 |         batch = {
127 |             key: [sample[key] for sample in batch]
128 |             for key in batch[0].keys()
129 |         }
130 |         if 'target' in batch and not with_target:
131 |             del batch['target']
132 |         batch = {
133 |             key: self.padding(value, min_len=self.args.min_len) if key.startswith(
134 |                 'text') else value
135 |             for key, value in batch.items()}
136 |         return batch
137 | 
138 |     @staticmethod
139 |     def padding(samples, min_len=1):
140 |         max_len = max(max(map(len, samples)), min_len)
141 |         batch = [
142 |             sample + [Vocab.pad()] * (max_len - len(sample))
143 |             for sample in samples
144 |         ]
145 |         return batch
146 | 
147 |     def post_process(self, output):
148 |         final_prediction = []
149 |         for prob in output:
150 |             idx = max(range(len(prob)), key=prob.__getitem__)
151 |             target = self.target_map[idx]
152 |             final_prediction.append(target)
153 |         return final_prediction
154 | 


--------------------------------------------------------------------------------
/src/model.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (C) 2019 Alibaba Group Holding Limited
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | import os
 18 | import math
 19 | import random
 20 | import torch
 21 | import torch.nn.functional as f
 22 | from tqdm import tqdm
 23 | from transformers import AdamW, get_linear_schedule_with_warmup
 24 | 
 25 | from .bert import BERT
 26 | from .utils.metrics import registry as metrics
 27 | 
 28 | 
 29 | class Model:
 30 |     prefix = 'checkpoint'
 31 |     best_model_name = 'best.pt'
 32 | 
 33 |     def __init__(self, dataset, args, state_dict=None, number=None):
 34 |         self.args = args
 35 |         self.dataset = dataset
 36 |         self.best_model_name = dataset.replace('data/processed/', '')
 37 |         self.number = number or 0
 38 |         # network
 39 |         self.network = BERT(args)
 40 |         self.device = torch.cuda.current_device() if args.cuda else torch.device('cpu')
 41 |         self.network.to(self.device)
 42 |         # optimizer
 43 |         self.params = list(
 44 |             filter(lambda x: x.requires_grad, self.network.parameters()))
 45 | 
 46 |         no_decay = ['bias', 'LayerNorm.weight']
 47 |         optimizer_grouped_parameters = [
 48 |             {'params': [p for n, p in self.network.named_parameters() if not any(
 49 |                 nd in n for nd in no_decay)], 'weight_decay': 5e-5},
 50 |             {'params': [p for n, p in self.network.named_parameters() if any(
 51 |                 nd in n for nd in no_decay)], 'weight_decay': 0.0}
 52 |         ]
 53 |         self.opt = AdamW(
 54 |             optimizer_grouped_parameters,
 55 |             lr=args.lr, betas=(0.9, 0.98), eps=1e-8
 56 |         )
 57 |         self.scheduler = get_linear_schedule_with_warmup(
 58 |             self.opt, args.warmup_steps[self.number], args.t_total[self.number])
 59 |         # updates
 60 |         self.updates = state_dict['updates'] if state_dict else 0
 61 | 
 62 |         if state_dict:
 63 |             new_state = set(self.network.state_dict().keys())
 64 |             for k in list(state_dict['model'].keys()):
 65 |                 if k not in new_state:
 66 |                     del state_dict['model'][k]
 67 |             self.network.load_state_dict(state_dict['model'])
 68 |             self.opt.load_state_dict(state_dict['opt'])
 69 | 
 70 |     def update(self, batch, origin={}):
 71 |         self.network.train()
 72 |         self.network.to(self.device)
 73 |         self.opt.zero_grad()
 74 |         inputs, target = self.process_data(batch)
 75 |         output = self.network(inputs)
 76 |         summary = self.network.get_summary()
 77 |         loss = self.get_loss(output, target)
 78 |         loss.backward()
 79 |         grad_norm = torch.nn.utils.clip_grad_norm_(
 80 |             self.params, self.args.grad_clipping)
 81 |         assert grad_norm >= 0, 'encounter nan in gradients.'
 82 |         self.opt.step()
 83 |         self.scheduler.step()
 84 | 
 85 |         self.network.eval()
 86 | 
 87 |         def model_dist_norm(model, target_params):
 88 |             squared_sum = 0
 89 |             for name, layer in model.named_parameters():
 90 |                 squared_sum += torch.sum(
 91 |                     torch.pow(layer.data - target_params[name].data, 2))
 92 |             return math.sqrt(squared_sum)
 93 |         if self.args.fed_type == 'diff_privacy':
 94 |             model_norm = model_dist_norm(self.network, origin)
 95 |             if model_norm > self.args.s_norm:
 96 |                 norm_scale = self.args.s_norm / (model_norm)
 97 |                 for name, layer in self.network.named_parameters():
 98 |                     clipped_difference = norm_scale * (
 99 |                         layer.data - origin[name])
100 |                     layer.data.copy_(
101 |                         origin[name] + clipped_difference)
102 |         self.updates += 1
103 |         stats = {
104 |             'dataset': self.dataset,
105 |             'updates': self.updates,
106 |             'loss': loss.item(),
107 |             'lr': self.opt.param_groups[0]['lr'],
108 |             'gnorm': grad_norm,
109 |             'summary': summary,
110 |         }
111 |         return stats
112 | 
113 |     def evaluate(self, data, return_predict=False):
114 |         self.network.eval()
115 |         targets = []
116 |         probabilities = []
117 |         predictions = []
118 |         losses = []
119 |         for batch in tqdm(data[:self.args.eval_subset], desc='evaluating', leave=False):
120 |             inputs, target = self.process_data(batch)
121 |             with torch.no_grad():
122 |                 output = self.network(inputs)
123 |                 if not return_predict:
124 |                     loss = self.get_loss(output, target)
125 |                 pred = torch.argmax(output, dim=1)
126 |                 prob = torch.nn.functional.softmax(output, dim=1)
127 |                 if not return_predict:
128 |                     losses.append(loss.item())
129 |                     targets.extend(target.tolist())
130 |                 probabilities.extend(prob.tolist())
131 |                 predictions.extend(pred.tolist())
132 |         outputs = {
133 |             'target': targets,
134 |             'prob': probabilities,
135 |             'pred': predictions,
136 |             'args': self.args,
137 |             'dataset': self.dataset,
138 |         }
139 |         self.outputs = outputs
140 |         if return_predict:
141 |             return predictions
142 |         stats = {
143 |             'updates': self.updates,
144 |             'loss': sum(losses[:-1]) / (len(losses) - 1) if len(losses) > 1 else sum(losses),
145 |         }
146 |         for metric in self.args.watch_metrics:
147 |             if metric not in stats:  # multiple metrics could be computed by the same function
148 |                 stats.update(metrics[metric](outputs))
149 |         assert 'score' not in stats, 'metric name collides with "score"'
150 |         eval_score = stats[self.args.metric]
151 |         stats['score'] = eval_score
152 |         return eval_score, stats  # first value is for early stopping
153 | 
154 |     def predict(self, batch):
155 |         self.network.eval()
156 |         inputs, _ = self.process_data(batch)
157 |         with torch.no_grad():
158 |             output = self.network(inputs)
159 |             output = torch.nn.functional.softmax(output, dim=1)
160 |         return output.tolist()
161 | 
162 |     def process_data(self, batch):
163 |         text1 = torch.LongTensor(batch['text1']).to(self.device)
164 |         text2 = torch.LongTensor(batch['text2']).to(self.device)
165 |         mask1 = torch.ne(text1, self.args.padding).unsqueeze(2)
166 |         mask2 = torch.ne(text2, self.args.padding).unsqueeze(2)
167 |         inputs = {
168 |             'text1': text1,
169 |             'text2': text2,
170 |             'mask1': mask1,
171 |             'mask2': mask2,
172 |         }
173 |         if 'target' in batch:
174 |             target = torch.LongTensor(batch['target']).to(self.device)
175 |             return inputs, target
176 |         return inputs, None
177 | 
178 |     @staticmethod
179 |     def get_loss(logits, target):
180 |         return f.cross_entropy(logits, target)
181 | 
182 |     def save(self, states, name=None):
183 |         if name:
184 |             filename = os.path.join(self.args.summary_dir, name)
185 |         else:
186 |             filename = os.path.join(
187 |                 self.args.summary_dir, f'{self.prefix}_{self.updates}.pt')
188 |         params = {
189 |             'state_dict': {
190 |                 'model': self.network.state_dict(),
191 |                 'opt': self.opt.state_dict(),
192 |                 'updates': self.updates,
193 |                 'outputs': self.outputs
194 |             },
195 |             'args': self.args,
196 |             'random_state': random.getstate(),
197 |             'torch_state': torch.random.get_rng_state()
198 |         }
199 |         params.update(states)
200 |         if self.args.cuda:
201 |             params['torch_cuda_state'] = torch.cuda.get_rng_state()
202 |         torch.save(params, filename)
203 | 
204 |     @classmethod
205 |     def load(cls, file):
206 |         checkpoint = torch.load(file, map_location=(
207 |             lambda s, _: torch.serialization.default_restore_location(s, 'cpu')
208 |         ))
209 |         prev_args = checkpoint['args']
210 |         # update args
211 |         prev_args.output_dir = os.path.dirname(os.path.dirname(file))
212 |         prev_args.summary_dir = os.path.join(
213 |             prev_args.output_dir, prev_args.name)
214 |         prev_args.cuda = prev_args.cuda and torch.cuda.is_available()
215 |         return cls(prev_args, state_dict=checkpoint['state_dict']), checkpoint
216 | 
217 |     def num_parameters(self, exclude_embed=False):
218 |         num_params = sum(
219 |             p.numel() for p in self.network.parameters() if p.requires_grad)
220 |         if exclude_embed:
221 |             num_params -= 0 if self.args.fix_embeddings else next(
222 |                 self.network.embedding.parameters()).numel()
223 |         return num_params
224 | 
225 |     def set_embeddings(self, embeddings):
226 |         self.network.embedding.set_(embeddings)
227 | 


--------------------------------------------------------------------------------
/src/modeling.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """PyTorch BERT model."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import copy
 22 | import json
 23 | import math
 24 | import six
 25 | import torch
 26 | import torch.nn.functional as F
 27 | import torch.nn as nn
 28 | from torch.nn import CrossEntropyLoss, MSELoss
 29 | from torch.autograd import Variable
 30 | from torch.nn.parameter import Parameter
 31 | 
 32 | 
 33 | def gelu(x):
 34 |     """Implementation of the gelu activation function.
 35 |         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
 36 |         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 37 |     """
 38 |     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 39 | 
 40 | 
 41 | class BertConfig(object):
 42 |     """Configuration class to store the configuration of a `BertModel`.
 43 |     """
 44 |     def __init__(self,
 45 |                 vocab_size,
 46 |                 top_pals=False,
 47 |                 top_low_rank=False,
 48 |                 vertical_pals=False,
 49 |                 vertical_low_rank=False,
 50 |                 vertical=False,
 51 |                 hidden_size=768,
 52 |                 num_hidden_layers=12,
 53 |                 num_attention_heads=12,
 54 |                 intermediate_size=3072,
 55 |                 hidden_act="gelu",
 56 |                 hidden_dropout_prob=0.1,
 57 |                 attention_probs_dropout_prob=0.1,
 58 |                 max_position_embeddings=512,
 59 |                 type_vocab_size=16,
 60 |                 initializer_range=0.02,
 61 |                 pals=False,
 62 |                 mult=False,
 63 |                 top=False,
 64 |                 lhuc=False,
 65 |                 houlsby=False,
 66 |                 bert_lay_top=False,
 67 |                 num_tasks=1,
 68 |                 extra_dim=None,
 69 |                 adapter=None,
 70 |                 adapter_size=0,
 71 |                 hidden_size_aug=204):
 72 |         """Constructs BertConfig.
 73 | 
 74 |         Args:
 75 |             vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
 76 |             hidden_size: Size of the encoder layers and the pooler layer.
 77 |             num_hidden_layers: Number of hidden layers in the Transformer encoder.
 78 |             num_attention_heads: Number of attention heads for each attention layer in
 79 |                 the Transformer encoder.
 80 |             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
 81 |                 layer in the Transformer encoder.
 82 |             hidden_act: The non-linear activation function (function or string) in the
 83 |                 encoder and pooler.
 84 |             hidden_dropout_prob: The dropout probabilitiy for all fully connected
 85 |                 layers in the embeddings, encoder, and pooler.
 86 |             attention_probs_dropout_prob: The dropout ratio for the attention
 87 |                 probabilities.
 88 |             max_position_embeddings: The maximum sequence length that this model might
 89 |                 ever be used with. Typically set this to something large just in case
 90 |                 (e.g., 512 or 1024 or 2048).
 91 |             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
 92 |                 `BertModel`.
 93 |             initializer_range: The sttdev of the truncated_normal_initializer for
 94 |                 initializing all weight matrices.
 95 |         """
 96 |         self.vocab_size = vocab_size
 97 |         self.top_pals = top_pals
 98 |         self.top_low_rank = top_low_rank
 99 |         self.vertical = vertical
100 |         self.vertical_pals = vertical_pals
101 |         self.vertical_low_rank = vertical_low_rank
102 |         self.hidden_size = hidden_size
103 |         self.num_hidden_layers = num_hidden_layers
104 |         self.num_attention_heads = num_attention_heads
105 |         self.hidden_act = hidden_act
106 |         self.intermediate_size = intermediate_size
107 |         self.hidden_dropout_prob = hidden_dropout_prob
108 |         self.attention_probs_dropout_prob = attention_probs_dropout_prob
109 |         self.max_position_embeddings = max_position_embeddings
110 |         self.type_vocab_size = type_vocab_size
111 |         self.initializer_range = initializer_range
112 |         self.hidden_size_aug = hidden_size_aug
113 |         self.pals = pals
114 |         self.extra_dim = extra_dim
115 |         self.houlsby = houlsby
116 |         self.mult = mult
117 |         self.top = top
118 |         self.bert_lay_top = bert_lay_top
119 |         self.lhuc = lhuc
120 |         self.num_tasks=num_tasks
121 |         self.adapter = adapter
122 | 
123 |     @classmethod
124 |     def from_dict(cls, json_object):
125 |         """Constructs a `BertConfig` from a Python dictionary of parameters."""
126 |         config = BertConfig(vocab_size=None)
127 |         for (key, value) in six.iteritems(json_object):
128 |             config.__dict__[key] = value
129 |         return config
130 | 
131 |     @classmethod
132 |     def from_json_file(cls, json_file):
133 |         """Constructs a `BertConfig` from a json file of parameters."""
134 |         with open(json_file, "r") as reader:
135 |             text = reader.read()
136 |         return cls.from_dict(json.loads(text))
137 | 
138 |     def to_json_file(self, json_file_path):
139 |         """ Save this instance to a json file."""
140 |         with open(json_file_path, "w", encoding='utf-8') as writer:
141 |             writer.write(self.to_json_string())
142 | 
143 |     def to_dict(self):
144 |         """Serializes this instance to a Python dictionary."""
145 |         output = copy.deepcopy(self.__dict__)
146 |         return output
147 | 
148 |     def to_json_string(self):
149 |         """Serializes this instance to a JSON string."""
150 |         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
151 | 
152 | 
class BERTLayerNorm(nn.Module):
    """Layer normalisation over the last dimension, TF style.

    The epsilon is added to the variance inside the square root. The learned
    scale and shift are named ``gamma`` and ``beta``.
    """

    def __init__(self, config, multi_params=None, variance_epsilon=1e-12):
        """Create scale/shift parameters.

        Their size is ``config.hidden_size_aug`` when ``multi_params`` is
        given, otherwise ``config.hidden_size``.
        """
        super().__init__()
        if multi_params is None:
            width = config.hidden_size
        else:
            width = config.hidden_size_aug
        self.gamma = nn.Parameter(torch.ones(width))
        self.beta = nn.Parameter(torch.zeros(width))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(-1, keepdim=True)
        variance = (x - mean).pow(2).mean(-1, keepdim=True)
        normalised = (x - mean) / torch.sqrt(variance + self.variance_epsilon)
        return self.gamma * normalised + self.beta
171 | 
172 | 
class BERTEmbeddings(nn.Module):
    """Sum of word, position and token-type embeddings, then LayerNorm and dropout."""

    def __init__(self, config):
        """Create the three embedding tables, the layer norm and the dropout."""
        super().__init__()
        width = config.hidden_size
        self.word_embeddings = nn.Embedding(config.vocab_size, width)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, width)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, width)

        # The attribute keeps its TensorFlow-style name (not snake_case) so
        # TensorFlow checkpoint files can be loaded.
        self.LayerNorm = BERTLayerNorm(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Embed ``input_ids``; ``token_type_ids`` defaults to all zeros.

        Position ids are ``0 .. input_ids.size(1) - 1`` for every row.
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        positions = torch.arange(
            input_ids.size(1), dtype=torch.long, device=input_ids.device)
        positions = positions.unsqueeze(0).expand_as(input_ids)

        summed = (
            self.word_embeddings(input_ids)
            + self.position_embeddings(positions)
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.LayerNorm(summed))
202 | 
203 | 
class BERTSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention, without an output projection."""

    def __init__(self, config, multi_params=None):
        """Create the query/key/value projections.

        Args:
            config: provides ``hidden_size``, ``num_attention_heads``,
                ``hidden_size_aug`` and ``attention_probs_dropout_prob``.
            multi_params: when given, it is used as the number of heads and
                the layer works on ``config.hidden_size_aug`` features instead
                of ``config.hidden_size``.

        Raises:
            ValueError: if ``config.hidden_size`` is not a multiple of
                ``config.num_attention_heads`` (checked in both modes).
        """
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))

        if multi_params is None:
            heads, width = config.num_attention_heads, config.hidden_size
        else:
            heads, width = multi_params, config.hidden_size_aug

        self.num_attention_heads = heads
        self.attention_head_size = int(width / heads)
        self.all_head_size = heads * self.attention_head_size

        self.query = nn.Linear(width, self.all_head_size)
        self.key = nn.Linear(width, self.all_head_size)
        self.value = nn.Linear(width, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and move the head axis to position 1."""
        head_shape = (self.num_attention_heads, self.attention_head_size)
        split = x.view(*(x.size()[:-1] + head_shape))
        return split.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        """Return the attention context for ``hidden_states``.

        ``attention_mask`` is added to the raw scores before the softmax.
        """
        queries = self.transpose_for_scores(self.query(hidden_states))
        keys = self.transpose_for_scores(self.key(hidden_states))
        values = self.transpose_for_scores(self.value(hidden_states))

        # Raw scores: query/key dot products scaled by sqrt(head size).
        scores = torch.matmul(queries, keys.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)
        scores = scores + attention_mask

        weights = nn.functional.softmax(scores, dim=-1)
        # This drops entire tokens to attend to, as in the original
        # Transformer paper.
        weights = self.dropout(weights)

        context = torch.matmul(weights, values)
        context = context.permute(0, 2, 1, 3).contiguous()
        # Merge the head axes back into a single feature dimension.
        merged_shape = context.size()[:-2] + (self.all_head_size,)
        return context.view(*merged_shape)
260 | 
261 | 
class BERTMultSelfOutput(nn.Module):
    """Dropout, residual add and LayerNorm, with no dense projection."""

    def __init__(self, config, multi_params=None):
        """Create the layer norm (sized via ``multi_params``) and the dropout."""
        super().__init__()
        self.LayerNorm = BERTLayerNorm(config, multi_params)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(hidden_states) + input_tensor)``."""
        dropped = self.dropout(hidden_states)
        return self.LayerNorm(dropped + input_tensor)
272 | 
273 | 
class AdapterLayer(nn.Module):
    """Bottleneck adapter: linear -> gelu -> linear, plus a residual connection."""

    def __init__(self, config):
        """Create the down- and up-projections via ``config.adapter_size``."""
        super(AdapterLayer, self).__init__()
        self.adapter_linear1 = nn.Linear(config.hidden_size, config.adapter_size)
        self.gelu = gelu
        self.adapter_linear2 = nn.Linear(config.adapter_size, config.hidden_size)

    def forward(self, input_tensor):
        """Return the adapter output added to ``input_tensor``."""
        bottleneck = self.gelu(self.adapter_linear1(input_tensor))
        projected = self.adapter_linear2(bottleneck)
        return projected + input_tensor
285 | 
286 | 
class BERTSelfOutput(nn.Module):
    """Dense projection after self-attention, with optional per-task modules.

    Optional parts, each replicated ``config.num_tasks`` times:
      * ``multi_layers`` (``BERTLowRank``) when ``houlsby`` is true;
      * ``adapters`` (``AdapterLayer``) when ``config.adapter`` is
        ``'adapter_google'``.
    """

    def __init__(self, config, multi_params=None, houlsby=False):
        """Create the sub-modules.

        With ``multi_params`` set, the dense layer and the layer norm are
        sized with ``config.hidden_size_aug`` instead of ``config.hidden_size``.
        """
        super().__init__()
        if houlsby:
            low_rank = BERTLowRank(config)
            self.multi_layers = nn.ModuleList(
                [copy.deepcopy(low_rank) for _ in range(config.num_tasks)])

        width = config.hidden_size if multi_params is None else config.hidden_size_aug
        self.dense = nn.Linear(width, width)

        if config.adapter == 'adapter_google':
            prototype = AdapterLayer(config)
            self.adapters = nn.ModuleList(
                [copy.deepcopy(prototype) for _ in range(config.num_tasks)])

        self.LayerNorm = BERTLayerNorm(config, multi_params)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.houlsby = houlsby
        self.adapter = config.adapter

    def forward(self, hidden_states, input_tensor, attention_mask=None, i=0):
        """Project, apply the task-``i`` modules, add the residual, normalise."""
        projected = self.dropout(self.dense(hidden_states))
        if self.houlsby:
            # The task module runs on the projected states and is added back.
            projected = projected + self.multi_layers[i](projected, attention_mask)
        if self.adapter == 'adapter_google':
            projected = self.adapters[i](projected)
        return self.LayerNorm(projected + input_tensor)
314 | 
315 | 
class BERTAttention(nn.Module):
    """Self-attention (``self``) followed by its output block (``output``)."""

    def __init__(self, config, multi_params=None, houlsby=False):
        """Create the attention and output sub-modules from the same settings."""
        super().__init__()
        self.self = BERTSelfAttention(config, multi_params)
        self.output = BERTSelfOutput(config, multi_params, houlsby)

    def forward(self, input_tensor, attention_mask, i=0):
        """Attend over ``input_tensor`` and post-process with task index ``i``."""
        context = self.self(input_tensor, attention_mask)
        return self.output(context, input_tensor, attention_mask, i=i)
326 | 
327 | 
class BERTPals(nn.Module):
    """Project down, self-attend in the smaller space, project up, apply gelu."""

    def __init__(self, config, extra_dim=None):
        """Create the two projections and the attention module.

        ``extra_dim`` is accepted but not used by this constructor.
        """
        super().__init__()
        small = config.hidden_size_aug
        # Encoder and decoder matrices project down to the smaller dimension
        # and back up.
        self.aug_dense = nn.Linear(config.hidden_size, small)
        self.aug_dense2 = nn.Linear(small, config.hidden_size)
        # Attention without the final matrix multiply; the head count is
        # fixed at 6 here.
        self.attn = BERTSelfAttention(config, 6)
        self.config = config
        self.hidden_act_fn = gelu

    def forward(self, hidden_states, attention_mask=None):
        """Return the activated up-projection of the attended states."""
        reduced = self.aug_dense(hidden_states)
        attended = self.attn(reduced, attention_mask)
        return self.hidden_act_fn(self.aug_dense2(attended))
345 | 
346 | 
class BERTLowRank(nn.Module):
    """Low-rank feed-forward block: down-projection, gelu, up-projection."""

    def __init__(self, config, extra_dim=None):
        """Create the two projections.

        The bottleneck width is ``config.extra_dim`` when that is truthy and
        ``config.hidden_size_aug`` otherwise. The ``extra_dim`` argument
        itself is not read.
        """
        super().__init__()
        # Encoder and decoder matrices project down to the smaller dimension
        # and back up.
        bottleneck = config.extra_dim if config.extra_dim else config.hidden_size_aug
        self.aug_dense = nn.Linear(config.hidden_size, bottleneck)
        self.aug_dense2 = nn.Linear(bottleneck, config.hidden_size)
        self.config = config
        self.hidden_act_fn = gelu

    def forward(self, hidden_states, attention_mask=None):
        """Apply the block; ``attention_mask`` is accepted but ignored."""
        activated = self.hidden_act_fn(self.aug_dense(hidden_states))
        return self.aug_dense2(activated)
365 | 
366 | 
class BERTIntermediate(nn.Module):
    """Feed-forward expansion: linear to ``intermediate_size`` followed by gelu."""

    def __init__(self, config):
        """Create the expanding dense layer."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.config = config
        self.intermediate_act_fn = gelu

    def forward(self, hidden_states):
        """Return ``gelu(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))
378 | 
379 | 
class BERTLhuc(nn.Module):
    """Learned per-feature rescaling of hidden states (LHUC).

    Each feature is multiplied by ``2 * sigmoid(lhuc)``. ``lhuc`` starts at
    zero, so a freshly created module is the identity.
    """

    def __init__(self, config):
        """Create the ``lhuc`` parameter with ``config.hidden_size`` entries."""
        super(BERTLhuc, self).__init__()
        self.lhuc = Parameter(torch.zeros(config.hidden_size))

    def forward(self, hidden_states):
        """Scale ``hidden_states`` along its last dimension."""
        # torch.sigmoid replaces the deprecated nn.functional.sigmoid and
        # returns the same values.
        hidden_states = hidden_states * 2. * torch.sigmoid(self.lhuc)
        return hidden_states
388 | 
389 | 
class BERTOutput(nn.Module):
    """Feed-forward output block with optional per-task modules.

    Optional parts, each replicated ``config.num_tasks`` times:
      * ``multi_layers`` when ``houlsby`` is true (``BERTPals`` if
        ``config.pals`` else ``BERTLowRank``);
      * ``adapters`` (``AdapterLayer``) when ``config.adapter`` is
        ``'adapter_google'``.
    """

    def __init__(self, config, houlsby=False):
        """Create the dense layer, layer norm, dropout and optional task modules."""
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BERTLayerNorm(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if houlsby:
            task_module = BERTPals(config) if config.pals else BERTLowRank(config)
            self.multi_layers = nn.ModuleList(
                [copy.deepcopy(task_module) for _ in range(config.num_tasks)])
        if config.adapter == 'adapter_google':
            prototype = AdapterLayer(config)
            self.adapters = nn.ModuleList(
                [copy.deepcopy(prototype) for _ in range(config.num_tasks)])
        self.houlsby = houlsby
        self.adapter = config.adapter

    def forward(self, hidden_states, input_tensor, attention_mask=None, i=0):
        """Project, apply the task-``i`` modules, add the residual, normalise."""
        projected = self.dropout(self.dense(hidden_states))
        if self.houlsby:
            # The task module is fed ``input_tensor`` (the block input), not
            # the projected states.
            projected = projected + self.multi_layers[i](input_tensor, attention_mask)
        if self.adapter == 'adapter_google':
            projected = self.adapters[i](projected)
        return self.LayerNorm(projected + input_tensor)
417 | 
418 | 
class BERTLayer(nn.Module):
    """One transformer layer (attention, intermediate, output) with optional per-task parts.

    Args:
        config: configuration object; `lhuc`, `pals` and `num_tasks` are read here.
        mult: when true, one extra module per task (`BERTPals` if `config.pals`,
            otherwise `BERTLowRank`) runs on the layer input and its result is
            added to the attention output before the output block.
        houlsby: forwarded to `BERTAttention` and `BERTOutput`.
    """

    def __init__(self, config, mult=False, houlsby=False):
        super(BERTLayer, self).__init__()
        self.attention = BERTAttention(config, houlsby=houlsby)
        self.intermediate = BERTIntermediate(config)
        self.output = BERTOutput(config, houlsby=houlsby)
        if config.lhuc:
            lhuc = BERTLhuc(config)
            self.multi_lhuc = nn.ModuleList([copy.deepcopy(lhuc) for _ in range(config.num_tasks)])
        if mult:
            if config.pals:
                multi = BERTPals(config)
            else:
                multi = BERTLowRank(config)
            self.multi_layers = nn.ModuleList([copy.deepcopy(multi) for _ in range(config.num_tasks)])
        self.mult = mult
        self.lhuc = config.lhuc
        self.houlsby = houlsby

    def forward(self, hidden_states, attention_mask, i=0):
        """Run the layer for task index `i` and return the new hidden states."""
        attention_output = self.attention(hidden_states, attention_mask, i)
        intermediate_output = self.intermediate(attention_output)

        # The output block is always given the mask and the task index.
        # Previously only the houlsby branch passed them, so per-task modules
        # inside BERTOutput (adapters, houlsby modules combined with lhuc)
        # silently fell back to task 0 in every other branch.
        if self.mult:
            extra = self.multi_layers[i](hidden_states, attention_mask)
            if self.lhuc:
                extra = self.multi_lhuc[i](extra)
            return self.output(intermediate_output, attention_output + extra, attention_mask, i)

        layer_output = self.output(intermediate_output, attention_output, attention_mask, i)
        if self.lhuc:
            # Without `mult`, the LHUC scaling is applied to the layer output itself.
            layer_output = self.multi_lhuc[i](layer_output)
        return layer_output
454 | 
455 | 
class BERTEncoder(nn.Module):
    """Stack of `BERTLayer` modules with optional task-specific additions.

    Flags read from `config` decide what is built:

    * `houlsby` / `mult`: each layer is created with its per-task modules
      (otherwise one plain `BERTLayer` is deep-copied `num_hidden_layers` times).
    * `top`: one extra module per task is created and applied after the last
      layer in `forward`.
    * `pals` together with `mult` or `houlsby`: the `aug_dense` / `aug_dense2`
      attributes of every per-layer task module are replaced by one pair of
      linear layers per task, so those projections are shared across layers.
    """
    def __init__(self, config):
        super(BERTEncoder, self).__init__()
        self.config = config
        if config.houlsby:
            # Adjust line below to add PALs etc. to different layers. True means add a PAL.
            # As written, `i < 999` is true for every layer index below 999.
            self.multis = [True if i < 999 else False for i in range(config.num_hidden_layers)]
            self.layer = nn.ModuleList([BERTLayer(config, houlsby=mult) for mult in self.multis])
        elif config.mult:
            # Adjust line below to add PALs etc. to different layers. True means add a PAL.
            self.multis = [True if i < 999 else False for i in range(config.num_hidden_layers)]
            self.layer = nn.ModuleList([BERTLayer(config, mult=mult) for mult in self.multis])
        else:
            # No per-layer task modules: all layers start as copies of one template.
            # `self.multis` is not defined on this path.
            layer = BERTLayer(config)
            self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

        if config.top:
            # Pick the template for the task-specific module applied on top of the stack.
            if config.bert_lay_top:
                multi = BERTLayer(config)
            elif config.top_pals:
                multi = BERTPals(config)
            elif config.top_low_rank:
                multi = BERTLowRank(config)
            else:
                # Projection matrices and attention for adding to the top.
                mult_dense = nn.Linear(config.hidden_size, config.hidden_size_aug)
                self.mult_dense = nn.ModuleList([copy.deepcopy(mult_dense) for _ in range(config.num_tasks)])
                mult_dense2 = nn.Linear(config.hidden_size_aug, config.hidden_size)
                self.mult_dense2 = nn.ModuleList([copy.deepcopy(mult_dense2) for _ in range(config.num_tasks)])
                # NOTE(review): BERTLayer calls BERTAttention(config, houlsby=...) by keyword;
                # confirm what the positional 12 binds to here. The counts 12 and 6 are hard-coded.
                multi = nn.ModuleList([copy.deepcopy(BERTAttention(config, 12)) for _ in range(6)])

            # One deep copy of the chosen template per task.
            self.multi_layers = nn.ModuleList([copy.deepcopy(multi) for _ in range(config.num_tasks)])
            self.gelu = gelu

        if config.mult and config.pals:
            dense = nn.Linear(config.hidden_size, config.hidden_size_aug)
            # Shared encoder and decoder across layers
            self.mult_aug_dense = nn.ModuleList([copy.deepcopy(dense) for _ in range(config.num_tasks)])
            dense2 = nn.Linear(config.hidden_size_aug, config.hidden_size)
            self.mult_aug_dense2 = nn.ModuleList([copy.deepcopy(dense2) for _ in range(config.num_tasks)])
            # Re-point each layer's task module i at the shared pair for task i.
            # NOTE(review): when `config.houlsby` is also set, the layers above were built
            # with `mult=False`, and BERTLayer only defines `multi_layers` when `mult` is true.
            for l, layer in enumerate(self.layer):
                if self.multis[l]:
                    for i, lay in enumerate(layer.multi_layers):
                        lay.aug_dense = self.mult_aug_dense[i]
                        lay.aug_dense2 = self.mult_aug_dense2[i]
        if config.houlsby and config.pals:
            dense = nn.Linear(config.hidden_size, config.hidden_size_aug)
            # Shared encoder and decoder across layers
            self.mult_aug_dense = nn.ModuleList([copy.deepcopy(dense) for _ in range(config.num_tasks)])
            dense2 = nn.Linear(config.hidden_size_aug, config.hidden_size)
            self.mult_aug_dense2 = nn.ModuleList([copy.deepcopy(dense2) for _ in range(config.num_tasks)])
            # NOTE(review): `dense3` is never used below; constructing it still consumes
            # random numbers, so removing it could change seeded initialisation.
            dense3 = nn.Linear(config.hidden_size, config.hidden_size_aug)
            # Here the task modules live inside each layer's output block.
            for l, layer in enumerate(self.layer):
                if self.multis[l]:
                    for i, lay in enumerate(layer.output.multi_layers):
                        lay.aug_dense = self.mult_aug_dense[i]
                        lay.aug_dense2 = self.mult_aug_dense2[i]


    def forward(self, hidden_states, attention_mask, i=0):
        """Run every layer for task index `i` and return the list of per-layer outputs.

        When `config.top` is set, the last list entry is replaced by the output
        of the task-specific top module applied to the final hidden states.
        """
        all_encoder_layers = []
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask, i)
            all_encoder_layers.append(hidden_states)
        if self.config.top:
            if self.config.bert_lay_top:
                # The top BERTLayer is called without a task index, so it uses its default i=0.
                all_encoder_layers[-1] = self.multi_layers[i](hidden_states, attention_mask)
            elif self.config.top_pals or self.config.top_low_rank:
                all_encoder_layers[-1] = self.multi_layers[i](hidden_states, attention_mask)
            else:
                # Project with mult_dense, run the task's attention modules in sequence,
                # then project back with mult_dense2.
                hidden_states = self.mult_dense[i](hidden_states)
                for lay in self.multi_layers[i]:
                    hidden_states = lay(hidden_states, attention_mask)
                all_encoder_layers[-1] = self.mult_dense2[i](hidden_states)
        return all_encoder_layers
531 | 
532 | 
class BERTPooler(nn.Module):
    """Pool a sequence by passing its first position through a dense layer and tanh.

    `self.pool` is fixed to False in the constructor, so a single shared
    `dense` layer is created; the per-task `mult_dense_layers` path only
    exists if that flag is changed in code.
    """

    def __init__(self, config):
        super().__init__()

        shared_dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()
        self.pool = False
        if not self.pool:
            self.dense = shared_dense
        else:
            self.mult_dense_layers = nn.ModuleList([copy.deepcopy(shared_dense) for _ in range(config.num_tasks)])
        self.mult = config.mult
        self.top = config.top

    def forward(self, hidden_states, i=0):
        """Return `tanh(dense(hidden_states[:, 0]))`; `i` picks the per-task head when enabled."""
        # "Pooling" is simply taking the hidden state at the first position.
        first_token_tensor = hidden_states[:, 0]
        use_task_head = self.pool and (self.mult or self.top)
        projection = self.mult_dense_layers[i] if use_task_head else self.dense
        return self.activation(projection(first_token_tensor))
557 | 
558 | 
class BertModel(nn.Module):
    """BERT model ("Bidirectional Encoder Representations from Transformers").

    Combines the embeddings, the encoder stack, an optional "vertical" branch
    that runs on the embeddings in parallel to the encoder, and the pooler.

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

    config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

    model = modeling.BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config: BertConfig):
        """Build the sub-modules described by `config`.

        Args:
            config: `BertConfig` instance.
        """
        super().__init__()
        self.config = config
        self.embeddings = BERTEmbeddings(config)
        self.encoder = BERTEncoder(config)
        if config.vertical:
            if config.vertical_pals:
                self.vertical = BERTPals(config)
            elif config.vertical_low_rank:
                self.vertical = BERTLowRank(config)
            else:
                # A second encoder whose depth comes from `vertical_num_hidden_layers`.
                side_config = copy.deepcopy(config)
                side_config.num_hidden_layers = side_config.vertical_num_hidden_layers
                self.vertical = BERTEncoder(side_config)
        self.pooler = BERTPooler(config)
        self.LayerNorm = BERTLayerNorm(config)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, i=0):
        """Encode `input_ids` for task index `i`.

        Returns:
            `(all_encoder_layers, pooled_output)`. The list holds the encoder's
            per-layer outputs; the vertical branch, when enabled, only affects
            the pooled output.
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Two singleton dims are inserted after the batch dim so the mask can
        # broadcast over heads and query positions. Kept positions (1) become
        # 0.0 and masked positions (0) become -10000.0; the result is meant to
        # be added to the raw attention scores before the softmax.
        broadcast_mask = attention_mask.unsqueeze(1).unsqueeze(2).float()
        additive_mask = (1.0 - broadcast_mask) * -10000.0

        embedded = self.embeddings(input_ids, token_type_ids)
        all_encoder_layers = self.encoder(embedded, additive_mask, i)
        sequence_output = all_encoder_layers[-1]
        if self.config.vertical:
            if self.config.vertical_pals or self.config.vertical_low_rank:
                vertical_output = self.vertical(embedded, additive_mask)
            else:
                vertical_output = self.vertical(embedded, additive_mask, i)[-1]
            sequence_output = self.LayerNorm(sequence_output + vertical_output)
        return all_encoder_layers, self.pooler(sequence_output, i)
632 | 
633 | 
class BertForMultiTask(nn.Module):
    """BERT with one linear head per task, for classification or regression.

    `tasks` is an iterable of output sizes; head `k` maps the pooled output to
    `tasks[k]` values. The task named 'sts' is trained as regression with
    mean-squared error; every other task uses cross-entropy.
    """
    def __init__(self, config, tasks):
        super().__init__()
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.ModuleList([nn.Linear(config.hidden_size, num_labels)
                                         for num_labels in tasks])

        def init_weights(module):
            std = config.initializer_range
            if isinstance(module, (nn.Linear, nn.Embedding)):
                # Slightly different from the TF version which uses truncated_normal for initialization
                # cf https://github.com/pytorch/pytorch/pull/5617
                module.weight.data.normal_(mean=0.0, std=std)
            elif isinstance(module, BERTLayerNorm):
                module.beta.data.normal_(mean=0.0, std=std)
                module.gamma.data.normal_(mean=0.0, std=std)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        self.apply(init_weights)

    def forward(self, input_ids, token_type_ids, attention_mask, task_id, name='cola', labels=None):
        """Return `logits`, or `(loss, logits)` when `labels` is given."""
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, task_id)
        logits = self.classifier[task_id](self.dropout(pooled_output))

        if labels is None:
            return logits
        if name == 'sts':
            # STS is a regression task.
            loss = MSELoss()(logits, labels.unsqueeze(1))
        else:
            loss = CrossEntropyLoss()(logits, labels)
        return loss, logits
676 | 
class BertForMultiNLI(nn.Module):
    """BERT with one linear classification head per task, for sentence-pair classification.

    `task_num_labels` is an iterable of label counts; head `k` maps the pooled
    output to `task_num_labels[k]` logits.
    """
    def __init__(self, config, task_num_labels):
        super().__init__()
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.ModuleList([nn.Linear(config.hidden_size, num_labels)
                                         for num_labels in task_num_labels])

        def init_weights(module):
            std = config.initializer_range
            if isinstance(module, (nn.Linear, nn.Embedding)):
                # Slightly different from the TF version which uses truncated_normal for initialization
                # cf https://github.com/pytorch/pytorch/pull/5617
                module.weight.data.normal_(mean=0.0, std=std)
            elif isinstance(module, BERTLayerNorm):
                module.beta.data.normal_(mean=0.0, std=std)
                module.gamma.data.normal_(mean=0.0, std=std)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        self.apply(init_weights)

    def forward(self, input_ids, token_type_ids, attention_mask, task_id, labels=None):
        """Return `logits`, or `(cross_entropy_loss, logits)` when `labels` is given."""
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, task_id)
        logits = self.classifier[task_id](self.dropout(pooled_output))

        if labels is None:
            return logits
        return CrossEntropyLoss()(logits, labels), logits
713 | 
class BertForQuestionAnswering(nn.Module):
    """BERT model for Question Answering (span extraction).
    This module is composed of the BERT model with a linear layer on top of
    the sequence output that computes start_logits and end_logits

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

    config = BertConfig(vocab_size=32000, hidden_size=512,
        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

    model = BertForQuestionAnswering(config)
    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config):
        super(BertForQuestionAnswering, self).__init__()
        self.bert = BertModel(config)
        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        def init_weights(module):
            if isinstance(module, (nn.Linear, nn.Embedding)):
                # Slightly different from the TF version which uses truncated_normal for initialization
                # cf https://github.com/pytorch/pytorch/pull/5617
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            elif isinstance(module, BERTLayerNorm):
                module.beta.data.normal_(mean=0.0, std=config.initializer_range)
                module.gamma.data.normal_(mean=0.0, std=config.initializer_range)
            # Guard against bias-free linear layers, matching the other task heads in this file.
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        self.apply(init_weights)

    def forward(self, input_ids, token_type_ids, attention_mask, task_id=None, start_positions=None, end_positions=None, weight=20, reg=False):
        """Compute span logits, or the training loss when both position tensors are given.

        Args:
            task_id: task index forwarded to the BERT model; `None` is treated as 0.
            start_positions, end_positions: gold span indices, one per example,
                optionally with a trailing singleton dimension.
            weight: multiplier applied to the L2 penalty when `reg` is true.
            reg: when true, add `weight * self.l2_loss()` to the cross-entropy loss.

        Returns:
            The scalar loss when positions are given, otherwise
            `(start_logits, end_logits)`.
        """
        # The index is used to select per-task modules, so None cannot be passed through.
        bert_task = 0 if task_id is None else task_id
        all_encoder_layers, _ = self.bert(input_ids, token_type_ids, attention_mask, bert_task)
        sequence_output = all_encoder_layers[-1]
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if start_positions is None or end_positions is None:
            return start_logits, end_logits

        # Multi-GPU splitting can add a trailing dimension. Only squeeze in that
        # case: squeezing a 1-D batch of size one would produce a 0-d target.
        if start_positions.dim() > 1:
            start_positions = start_positions.squeeze(-1)
        if end_positions.dim() > 1:
            end_positions = end_positions.squeeze(-1)
        # Positions outside the model input are mapped to `ignored_index` and
        # skipped by the loss. Out-of-place clamp leaves the caller's tensors intact.
        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        ce_loss = (start_loss + end_loss) / 2
        if not reg:
            return ce_loss
        # L2 penalty towards the reference values stored by `register_parameters`.
        return self.l2_loss() * weight + ce_loss

    def l2_loss(self):
        """Sum of squared distances between each parameter and its registered `__mean` buffer.

        Requires `register_parameters` to have stored a buffer for every
        parameter name; a missing buffer raises `AttributeError`.
        """
        losses = []
        for n, p in self.named_parameters():
            mean = getattr(self, f"{n.replace('.', '_')}__mean")
            losses.append(((p - mean) ** 2).sum())
        return sum(losses)

    def load_pretrained(self, init_checkpoint, patch=False, transfer=False):
        """Load weights from `init_checkpoint` non-strictly and print the key mismatches.

        Args:
            init_checkpoint: path of a state-dict file readable by `torch.load`.
            patch: unused; kept so existing callers keep working.
            transfer: when true, load into the whole model; otherwise only into `self.bert`.
        """
        if transfer:
            print('Load all parameters')
            target = self
        else:
            print('Load Bert parameters')
            target = self.bert
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        state_dict = torch.load(init_checkpoint, map_location='cpu')
        missing_keys, unexpected_keys = target.load_state_dict(state_dict, strict=False)
        print("missing keys: {}".format(missing_keys))
        print('unexpected keys: {}'.format(unexpected_keys))

    def register_parameters(self, init_checkpoint):
        """Store every checkpoint tensor as a buffer named `<key with '.'->'_'>__mean`.

        Keys that already contain 'mean' are skipped. These buffers are the
        reference values read by `l2_loss`.
        """
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        params = torch.load(init_checkpoint, map_location='cpu')
        for k, v in params.items():
            if 'mean' not in k:
                self.register_buffer(f"{k.replace('.', '_')}__mean", v)


--------------------------------------------------------------------------------
/src/modules/__init__.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (C) 2019 Alibaba Group Holding Limited
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | from typing import Collection
 18 | import math
 19 | import torch
 20 | import torch.nn as nn
 21 | 
 22 | 
 23 | class Module(nn.Module):
 24 |     def __init__(self):
 25 |         super().__init__()
 26 |         self.summary = {}
 27 | 
 28 |     def add_summary(self, name, val):
 29 |         if self.training:
 30 |             self.summary[name] = val.clone().detach().cpu().numpy()
 31 | 
 32 |     def get_summary(self, base_name=''):
 33 |         summary = {}
 34 |         if base_name:
 35 |             base_name += '/'
 36 |         if self.summary:
 37 |             summary.update({base_name + name: val for name, val in self.summary.items()})
 38 |         for name, child in self.named_children():
 39 |             if hasattr(child, 'get_summary'):
 40 |                 name = base_name + name
 41 |                 summary.update(child.get_summary(name))
 42 |         return summary
 43 | 
 44 | 
 45 | class ModuleList(nn.ModuleList):
 46 |     def get_summary(self, base_name=''):
 47 |         summary = {}
 48 |         if base_name:
 49 |             base_name += '/'
 50 |         for i, module in enumerate(self):
 51 |             if hasattr(module, 'get_summary'):
 52 |                 name = base_name + str(i)
 53 |                 summary.update(module.get_summary(name))
 54 |         return summary
 55 | 
 56 | 
 57 | class ModuleDict(nn.ModuleDict):
 58 |     def get_summary(self, base_name=''):
 59 |         summary = {}
 60 |         if base_name:
 61 |             base_name += '/'
 62 |         for key, module in self.items():
 63 |             if hasattr(module, 'get_summary'):
 64 |                 name = base_name + key
 65 |                 summary.update(module.get_summary(name))
 66 |         return summary
 67 | 
 68 | 
 69 | class GeLU(nn.Module):
 70 |     def forward(self, x):
 71 |         return 0.5 * x * (1. + torch.tanh(x * 0.7978845608 * (1. + 0.044715 * x * x)))
 72 | 
 73 | 
 74 | class Linear(nn.Module):
 75 |     def __init__(self, in_features, out_features, activations=False):
 76 |         super().__init__()
 77 |         linear = nn.Linear(in_features, out_features)
 78 |         nn.init.normal_(linear.weight, std=math.sqrt((2. if activations else 1.) / in_features))
 79 |         nn.init.zeros_(linear.bias)
 80 |         modules = [nn.utils.weight_norm(linear)]
 81 |         if activations:
 82 |             modules.append(GeLU())
 83 |         self.model = nn.Sequential(*modules)
 84 | 
 85 |     def forward(self, x):
 86 |         return self.model(x)
 87 | 
 88 | 
 89 | class Conv1d(Module):
 90 |     def __init__(self, in_channels, out_channels, kernel_sizes: Collection[int]):
 91 |         super().__init__()
 92 |         assert all(k % 2 == 1 for k in kernel_sizes), 'only support odd kernel sizes'
 93 |         assert out_channels % len(kernel_sizes) == 0, 'out channels must be dividable by kernels'
 94 |         out_channels = out_channels // len(kernel_sizes)
 95 |         convs = []
 96 |         for kernel_size in kernel_sizes:
 97 |             conv = nn.Conv1d(in_channels, out_channels, kernel_size,
 98 |                              padding=(kernel_size - 1) // 2)
 99 |             nn.init.normal_(conv.weight, std=math.sqrt(2. / (in_channels * kernel_size)))
100 |             nn.init.zeros_(conv.bias)
101 |             convs.append(nn.Sequential(nn.utils.weight_norm(conv), GeLU()))
102 |         self.model = nn.ModuleList(convs)
103 | 
104 |     def forward(self, x):
105 |         return torch.cat([encoder(x) for encoder in self.model], dim=-1)
106 | 


--------------------------------------------------------------------------------
/src/modules/alignment.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import math
18 | import torch
19 | import torch.nn as nn
20 | import torch.nn.functional as f
21 | from functools import partial
22 | from src.utils.registry import register
23 | from . import Linear, Module
24 | 
# Module-local registry dict. `register` is rebound so that every use below
# passes this dict as the `registry` keyword argument of the imported helper.
registry = {}
register = partial(register, registry=registry)
27 | 
28 | 
@register('identity')
class Alignment(Module):
    """Soft alignment between two sequences using scaled dot-product scores.

    The scale is a learnable scalar `temperature`, initialised to
    `1 / sqrt(args.hidden_size)`. The second constructor argument is unused.
    """

    def __init__(self, args, __):
        super().__init__()
        initial_scale = 1 / math.sqrt(args.hidden_size)
        self.temperature = nn.Parameter(torch.tensor(initial_scale))

    def _attention(self, a, b):
        """Return the pairwise scores `a @ b^T` multiplied by the temperature."""
        scores = torch.matmul(a, b.transpose(1, 2))
        return scores * self.temperature

    def forward(self, a, b, mask_a, mask_b):
        """Return `(feature_a, feature_b)`: for each side, the attention-weighted sum of the other side.

        Score entries whose pair mask (the product of `mask_a` and the
        transposed `mask_b`) is zero are filled with -1e7 before the softmax.
        """
        scores = self._attention(a, b)
        pair_mask = torch.matmul(mask_a.float(), mask_b.transpose(1, 2).float()).bool()
        scores.masked_fill_(~pair_mask, -1e7)
        # Normalising over dim 1 gives the weights used for b, over dim 2 those for a.
        attn_a = f.softmax(scores, dim=1)
        attn_b = f.softmax(scores, dim=2)
        feature_b = torch.matmul(attn_a.transpose(1, 2), a)
        feature_a = torch.matmul(attn_b, b)
        for key, value in (('temperature', self.temperature),
                           ('attention_a', attn_a),
                           ('attention_b', attn_b)):
            self.add_summary(key, value)
        return feature_a, feature_b
50 | 
51 | 
@register('linear')
class MappedAlignment(Alignment):
    """`Alignment` whose scores are computed on projected inputs.

    Both sequences go through the same dropout + `Linear` (with activation)
    projection before the scaled dot product.
    """

    def __init__(self, args, input_size):
        super().__init__(args, input_size)
        dropout = nn.Dropout(args.dropout)
        mapper = Linear(input_size, args.hidden_size, activations=True)
        self.projection = nn.Sequential(dropout, mapper)

    def _attention(self, a, b):
        """Project `a`, then `b`, with the shared projection and score them as the parent does."""
        projected_a = self.projection(a)
        projected_b = self.projection(b)
        return super()._attention(projected_a, projected_b)
65 | 


--------------------------------------------------------------------------------
/src/modules/connection.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import math
18 | import torch
19 | import torch.nn as nn
20 | from . import Linear
21 | from functools import partial
22 | from src.utils.registry import register
# Module-local registry dict. `register` is rebound so that every use below
# passes this dict as the `registry` keyword argument of the imported helper.
registry = {}
register = partial(register, registry=registry)
25 | 
26 | 
@register('none')
class NullConnection(nn.Module):
    """Connection that adds nothing: the block output is passed through unchanged."""

    def forward(self, x, _, __):
        """Return `x`; the second and third arguments are ignored."""
        return x
31 | 
32 | 
@register('residual')
class Residual(nn.Module):
    """Residual connection scaled by `sqrt(0.5)`.

    When `i == 1` the residual is first mapped from `args.embedding_dim` to
    `args.hidden_size` by a linear layer; for any other `i` it is added as is.
    """

    def __init__(self, args):
        super().__init__()
        self.linear = Linear(args.embedding_dim, args.hidden_size)

    def forward(self, x, res, i):
        """Return `(x + residual) * sqrt(0.5)`."""
        shortcut = self.linear(res) if i == 1 else res
        return (x + shortcut) * math.sqrt(0.5)
43 | 
44 | 
@register('aug')
class AugmentedResidual(nn.Module):
    """Residual connection that keeps the trailing part of ``res`` concatenated to the output."""

    def forward(self, x, res, i):
        """Combine ``x`` with ``res`` along the last dimension.

        When ``i == 1`` the two tensors are simply concatenated (``res`` is the
        embedding). Otherwise the first ``x.size(-1)`` features of ``res`` are
        added to ``x`` and scaled by sqrt(0.5), and the remaining features of
        ``res`` (the embedding part) are appended unchanged.
        """
        if i == 1:
            return torch.cat([x, res], dim=-1)
        width = x.size(-1)
        leading = res[:, :, :width]
        trailing = res[:, :, width:]
        merged = (leading + x) * math.sqrt(0.5)
        return torch.cat([merged, trailing], dim=-1)
53 | 


--------------------------------------------------------------------------------
/src/modules/embedding.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import torch
18 | import torch.nn as nn
19 | import torch.nn.functional as f
20 | 
21 | 
class Embedding(nn.Module):
    """Token embedding table (index 0 is the padding index) followed by dropout."""

    def __init__(self, args):
        """Build the table from ``args.num_vocab`` x ``args.embedding_dim``.

        ``args.fix_embeddings`` and ``args.dropout`` are stored for later use
        by :meth:`set_` and :meth:`forward`.
        """
        super().__init__()
        self.fix_embeddings = args.fix_embeddings
        self.embedding = nn.Embedding(
            args.num_vocab,
            args.embedding_dim,
            padding_idx=0,
        )
        self.dropout = args.dropout

    def set_(self, value):
        """Load ``value`` as the embedding matrix; the weight is trainable unless ``fix_embeddings`` is set."""
        trainable = not self.fix_embeddings
        self.embedding.weight.requires_grad = trainable
        state = {'weight': torch.tensor(value)}
        self.embedding.load_state_dict(state)

    def forward(self, x):
        """Look up ``x`` in the table and apply dropout (active only in training mode)."""
        embedded = self.embedding(x)
        return f.dropout(embedded, self.dropout, self.training)
37 | 


--------------------------------------------------------------------------------
/src/modules/encoder.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import torch.nn as nn
18 | import torch.nn.functional as f
19 | from . import Conv1d
20 | 
21 | 
class Encoder(nn.Module):
    """Stack of ``Conv1d`` layers applied along the sequence, with masking and dropout."""

    def __init__(self, args, input_size):
        """Create ``args.enc_layers`` convolution layers.

        The first layer takes ``input_size`` channels; every later layer takes
        ``args.hidden_size``. All layers emit ``args.hidden_size`` channels and
        use ``args.kernel_sizes``.
        """
        super().__init__()
        self.dropout = args.dropout
        self.encoders = nn.ModuleList([Conv1d(
                in_channels=input_size if i == 0 else args.hidden_size,
                out_channels=args.hidden_size,
                kernel_sizes=args.kernel_sizes) for i in range(args.enc_layers)])

    def forward(self, x, mask):
        """Encode ``x`` and return a tensor laid out like the input (B x L x C).

        Positions where ``mask`` is False are zeroed before every layer.
        Dropout is applied before every layer except the first, and once more
        on the final output.
        """
        x = x.transpose(1, 2)  # B x C x L
        mask = mask.transpose(1, 2)
        padding = ~mask
        for i, encoder in enumerate(self.encoders):
            # Out-of-place fill: ``x`` is initially a view of the caller's
            # tensor, so an in-place fill would overwrite the caller's data.
            x = x.masked_fill(padding, 0.)
            if i > 0:
                x = f.dropout(x, self.dropout, self.training)
            x = encoder(x)
        x = f.dropout(x, self.dropout, self.training)
        return x.transpose(1, 2)  # B x L x C
41 | 


--------------------------------------------------------------------------------
/src/modules/fusion.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import torch
18 | import torch.nn as nn
19 | import torch.nn.functional as f
20 | from functools import partial
21 | from src.utils.registry import register
22 | from . import Linear
23 | 
# Module-local table of fusion classes (presumably filled by the
# @register(...) decorators below; the helper lives in src.utils.registry).
registry = {}
# Pre-bind this module's table so each decorator only has to pass a name.
register = partial(register, registry=registry)
26 | 
27 | 
@register('simple')
class Fusion(nn.Module):
    """Fuse a representation with its aligned counterpart through one activated linear layer."""

    def __init__(self, args, input_size):
        super().__init__()
        # Input is the concatenation of two input_size-wide tensors.
        self.fusion = Linear(input_size * 2, args.hidden_size, activations=True)

    def forward(self, x, align):
        """Concatenate ``x`` and ``align`` on the last dimension and project to ``hidden_size``."""
        joined = torch.cat([x, align], dim=-1)
        return self.fusion(joined)
36 | 
37 | 
@register('full')
class FullFusion(nn.Module):
    """Fuse ``x`` with ``align`` through three comparison views that are merged by a final layer."""

    def __init__(self, args, input_size):
        super().__init__()
        self.dropout = args.dropout
        pair_width = input_size * 2
        hidden = args.hidden_size
        # One projection per comparison view: concat, difference, product.
        self.fusion1 = Linear(pair_width, hidden, activations=True)
        self.fusion2 = Linear(pair_width, hidden, activations=True)
        self.fusion3 = Linear(pair_width, hidden, activations=True)
        # Merges the three hidden-size views back into one.
        self.fusion = Linear(hidden * 3, hidden, activations=True)

    def forward(self, x, align):
        """Return the fused representation of ``x`` and ``align``.

        Each view pairs ``x`` with ``align``, ``x - align`` or ``x * align``;
        the projected views are concatenated, passed through dropout and
        projected once more.
        """
        views = [
            self.fusion1(torch.cat([x, align], dim=-1)),
            self.fusion2(torch.cat([x, x - align], dim=-1)),
            self.fusion3(torch.cat([x, x * align], dim=-1)),
        ]
        merged = torch.cat(views, dim=-1)
        merged = f.dropout(merged, self.dropout, self.training)
        return self.fusion(merged)
55 | 


--------------------------------------------------------------------------------
/src/modules/pooling.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import torch.nn as nn
18 | 
19 | 
class Pooling(nn.Module):
    """Masked max-pooling over dimension 1."""

    def forward(self, x, mask):
        """Return the maximum of ``x`` along dim 1, ignoring positions where ``mask`` is False.

        Masked-out positions are replaced by ``-inf`` before the reduction.
        The fill is done out-of-place so the caller's ``x`` is left intact.
        """
        masked = x.masked_fill(~mask, -float('inf'))
        return masked.max(dim=1)[0]
23 | 


--------------------------------------------------------------------------------
/src/modules/prediction.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import torch
18 | import torch.nn as nn
19 | from functools import partial
20 | from src.utils.registry import register
21 | from . import Linear
22 | 
# Module-local table of prediction heads (presumably filled by the
# @register(...) decorators below; the helper lives in src.utils.registry).
registry = {}
# Pre-bind this module's table so each decorator only has to pass a name.
register = partial(register, registry=registry)
25 | 
26 | 
@register('simple')
class Prediction(nn.Module):
    """Two-layer classification head over a pair of representations."""

    def __init__(self, args, inp_features=2):
        """Build the head.

        ``inp_features`` is the number of ``hidden_size``-wide pieces that
        :meth:`forward` concatenates before the first layer.
        """
        super().__init__()
        layers = [
            nn.Dropout(args.dropout),
            Linear(args.hidden_size * inp_features, args.hidden_size, activations=True),
            nn.Dropout(args.dropout),
            Linear(args.hidden_size, args.num_classes),
        ]
        self.dense = nn.Sequential(*layers)

    def forward(self, a, b):
        """Concatenate ``a`` and ``b`` on the last dimension and return the head's output."""
        features = torch.cat([a, b], dim=-1)
        return self.dense(features)
40 | 
41 | 
@register('full')
class AdvancedPrediction(Prediction):
    """Prediction head fed with ``a``, ``b``, their difference and their product."""

    def __init__(self, args):
        # Four hidden-size pieces are concatenated in forward().
        super().__init__(args, inp_features=4)

    def forward(self, a, b):
        """Return the head's output for ``[a, b, a - b, a * b]``."""
        difference = a - b
        product = a * b
        features = torch.cat([a, b, difference, product], dim=-1)
        return self.dense(features)
49 | 
50 | 
@register('symmetric')
class SymmetricPrediction(AdvancedPrediction):
    """Variant of ``AdvancedPrediction`` that uses the absolute difference ``|a - b|``."""

    def forward(self, a, b):
        """Return the head's output for ``[a, b, |a - b|, a * b]``."""
        distance = torch.abs(a - b)
        product = a * b
        features = torch.cat([a, b, distance, product], dim=-1)
        return self.dense(features)
55 | 


--------------------------------------------------------------------------------
/src/trainer.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (C) 2019 Alibaba Group Holding Limited
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | import os
 18 | import copy
 19 | import random
 20 | import json5
 21 | import torch
 22 | from datetime import datetime
 23 | from pprint import pformat
 24 | from collections import OrderedDict
 25 | 
 26 | from .utils.loader import load_data
 27 | from .utils.logger import Logger
 28 | from .utils.params import validate_params
 29 | from .model import Model
 30 | from .interface import Interface
 31 | 
 32 | 
 33 | class Trainer:
 34 |     def __init__(self, args):
 35 |         self.args = args
 36 |         self.log = Logger(self.args, 0)
 37 | 
 38 |     def train(self):
 39 |         start_time = datetime.now()
 40 |         server, interface, states = self.build_model()
 41 | 
 42 |         clients = []
 43 |         train_datasets = []
 44 |         dev_datasets = []
 45 |         logs = []
 46 |         for i, dataset in enumerate(self.args.data_dir):
 47 |             clients.append(Model(dataset=dataset, args=self.args, number=i))
 48 |             train = load_data(dataset, 'train')
 49 |             dev = load_data(dataset, self.args.eval_file)
 50 |             train_datasets.append(train)
 51 |             dev_datasets.append(dev)
 52 |             logs.append(Logger(self.args, i))
 53 |             self.log(f'{dataset} train ({len(train)}) | {self.args.eval_file} ({len(dev)})')
 54 | 
 55 |         train_batches = []
 56 |         for i, dataset in enumerate(train_datasets):
 57 |             train_batches.append(interface.pre_process(dataset, i))
 58 |         dev_batches = []
 59 |         for i, dataset in enumerate(dev_datasets):
 60 |             dev_batches.append(interface.pre_process(dataset, i, training=False))
 61 |         self.log('setup complete: {}s.'.format(str(datetime.now() - start_time).split(".")[0]))
 62 | 
 63 |         if self.args.sample == 'sqrt' or self.args.sample == 'prop':
 64 |             probs = [len(batches) for batches in train_batches]
 65 |             alpha = 0
 66 |             if self.args.sample == 'prop':
 67 |                 alpha = 1.
 68 |             if self.args.sample == 'sqrt':
 69 |                 alpha = 0.5
 70 |             probs = [p ** alpha for p in probs]
 71 |             tot = sum(probs)
 72 |             probs = [p/tot for p in probs]
 73 |         
 74 |         aggregation_weights = [1. / len(clients) for client in clients]
 75 | 
 76 |         try:
 77 |             for fed_round in range(self.args.round):
 78 |                 self.log(f'training in round {fed_round}.')
 79 |                 for client, train, dev, log, i in zip(clients, train_batches, dev_batches, logs, range(len(train_datasets))):
 80 |                     server_params = server.network.state_dict()
 81 |                     client_params = client.network.state_dict()
 82 |                     share_params = {}
 83 | 
 84 |                     if self.args.fed_type == 'fedpatch':
 85 |                         for key, data in server_params.items():
 86 |                             if 'prediction' in key:
 87 |                                 share_params[key] = copy.deepcopy(data)
 88 |                             else:
 89 |                                 share_params[key] = copy.deepcopy(client_params[key])
 90 |                     elif self.args.fed_type == 'dual_bert':
 91 |                         for key, data in server_params.items():
 92 |                             if 'bert_keep' not in key and 'prediction' not in key:
 93 |                                 share_params[key] = copy.deepcopy(data)
 94 |                             else:
 95 |                                 share_params[key] = copy.deepcopy(client_params[key])
 96 |                     elif self.args.fed_type == 'LayerNorm':
 97 |                         for key, data in server_params.items():
 98 |                             if 'LayerNorm' not in key and 'prediction' not in key:
 99 |                                 share_params[key] = copy.deepcopy(data)
100 |                             else:
101 |                                 share_params[key] = copy.deepcopy(client_params[key])
102 |                     elif self.args.fed_type == 'diff_privacy' or self.args.fed_type == 'median':
103 |                         def dp_noise(param, sigma):
104 |                             noised_layer = torch.cuda.FloatTensor(param.shape).normal_(mean=0, std=sigma)
105 |                             return noised_layer
106 |                         for key, data in server_params.items():
107 |                             data.add_(dp_noise(data, self.args.sigma))
108 |                             share_params[key] = copy.deepcopy(data)
109 |                     elif self.args.fed_type == 'fedavg':
110 |                         for key, data in server_params.items():
111 |                             share_params[key] = copy.deepcopy(data)
112 |                     # elif self.args.fed_type == 'median':
113 |                     #     for key, data in server_params.items():
114 |                     #         share_params[key] = copy.deepcopy(data)
115 |                     elif self.args.fed_type == 'self-center':
116 |                         for key, data in server_params.items():
117 |                             share_params[key] = (copy.deepcopy(data).cpu() * len(clients) - copy.deepcopy(client_params[key]).cpu()) / (len(clients)-1) * 0.5 + copy.deepcopy(client_params[key]).cpu() * 0.5
118 |                     elif self.args.fed_type == 'pals':
119 |                         for key, data in server_params.items():
120 |                             if 'aug' in key or 'predictions' in key or 'mult' in key or 'gamma' in key or 'beta' in key:
121 |                                 share_params[key] = copy.deepcopy(client_params[key])
122 |                             else:
123 |                                 share_params[key] = copy.deepcopy(data)
124 |                     elif self.args.fed_type == 'fed_vertical':
125 |                         for key, data in server_params.items():
126 |                             if 'aug' in key or 'predictions' in key or 'mult' in key or 'vertical' in key or 'gamma' in key or 'beta' in key:
127 |                                 share_params[key] = copy.deepcopy(client_params[key])
128 |                             else:
129 |                                 share_params[key] = copy.deepcopy(data)
130 |                     client.network.load_state_dict(share_params)
131 |                     for epoch in range(states['start_epoch'], self.args.epochs[i] + 1):
132 |                         epoch = fed_round * self.args.epochs[i] + epoch
133 |                         states['epoch'] = epoch
134 |                         log.set_epoch(epoch)
135 |                         if self.args.sample == 'anneal':
136 |                             probs = [len(batches) for batches in train_batches]
137 |                             alpha = 1. - 0.8 * epoch / (self.args.epochs[i] * self.args.round - 1)
138 |                             print(alpha)
139 |                             probs = [p**alpha for p in probs]
140 |                             tot = sum(probs)
141 |                             probs = [p/tot for p in probs]
142 |                         if self.args.sample != 'all':
143 |                             train = random.sample(train, k=int(len(train) * probs[i]))
144 |                         batches = interface.shuffle_batch(train, i)
145 |                         for batch_id, batch in enumerate(batches):
146 |                             stats = client.update(batch, origin=share_params)
147 |                             log.update(stats)
148 |                             eval_per_updates = self.args.eval_per_updates[i] \
149 |                                 if client.updates > self.args.eval_warmup_steps[i] else self.args.eval_per_updates_warmup[i]
150 |                             if client.updates % eval_per_updates == 0 or (self.args.eval_epoch and batch_id + 1 == len(batches)):
151 |                                 log.newline()
152 |                                 score, dev_stats = client.evaluate(dev)
153 |                                 if score > states['best_eval']:
154 |                                     states['best_eval'], states['best_epoch'], states['best_step'], states['best_stats'] = score, epoch, client.updates, dev_stats
155 |                                     if self.args.save:
156 |                                         client.save(states, name=client.best_model_name)
157 |                                 log.log_eval(dev_stats)
158 |                                 if self.args.save_all:
159 |                                     client.save(states)
160 |                                     client.save(states, name='last')
161 |                                 if client.updates - states['best_step'] > self.args.early_stopping[i] \
162 |                                         and client.updates > self.args.min_steps[i]:
163 |                                     log('[Tolerance reached. Training is stopped early.]')
164 |                                     raise EarlyStop('[Tolerance reached. Training is stopped early.]')
165 |                             if stats['loss'] > self.args.max_loss:
166 |                                 raise EarlyStop('[Loss exceeds tolerance. Unstable training is stopped early.]')
167 |                             if stats['lr'] < self.args.min_lr - 1e-6:
168 |                                 raise EarlyStop('[Learning rate has decayed below min_lr. Training is stopped early.]')
169 |                         log.newline()
170 |                     client.network.to('cpu')
171 |                 
172 |                 # Aggregation
173 |                 update_state = OrderedDict()
174 |                 if self.args.fed_type == 'median':
175 |                     for k, client in enumerate(clients):
176 |                         local_state = client.network.state_dict()
177 |                         for key in server.network.state_dict().keys():
178 |                             if k == 0:
179 |                                 update_state[
180 |                                     key] = copy.deepcopy(local_state[key].unsqueeze(dim=-1))
181 |                             else:
182 |                                 update_state[
183 |                                     key] = torch.cat(
184 |                                         (update_state[key], local_state[key].unsqueeze(dim=-1)),
185 |                                         dim=-1
186 |                                     )
187 |                     for key in update_state.keys():
188 |                         update_state[key] = update_state[key].median(dim=-1).values
189 |                 else:
190 |                     for k, client in enumerate(clients):
191 |                         local_state = client.network.state_dict()
192 |                         for key in server.network.state_dict().keys():
193 |                             if k == 0:
194 |                                 update_state[
195 |                                     key] = local_state[key] * aggregation_weights[k]
196 |                             else:
197 |                                 update_state[
198 |                                     key] += local_state[key] * aggregation_weights[k]
199 |                 server.network.load_state_dict(update_state)
200 | 
201 |             self.log('Training complete.')
202 |         except KeyboardInterrupt:
203 |             self.log.newline()
204 |             self.log(f'Training interrupted. Stopped early.')
205 |         except EarlyStop as e:
206 |             self.log.newline()
207 |             self.log(str(e))
208 |         self.log(f'best dev score {states["best_eval"]} at step {states["best_step"]} '
209 |                  f'(epoch {states["best_epoch"]}).')
210 |         self.log(f'best eval stats [{self.log.best_eval_str}]')
211 |         training_time = str(datetime.now() - start_time).split('.')[0]
212 |         self.log(f'Training time: {training_time}.')
213 |         states['start_time'] = str(start_time).split('.')[0]
214 |         states['training_time'] = training_time
215 |         return states
216 | 
217 |     def build_model(self):
218 |         states = {}
219 |         interface = Interface(self.args, self.log)
220 |         self.log(f'#classes: {self.args.num_classes}; #vocab: {self.args.num_vocab}')
221 |         if self.args.seed:
222 |             random.seed(self.args.seed)
223 |             torch.manual_seed(self.args.seed)
224 |             if self.args.cuda:
225 |                 torch.cuda.manual_seed(self.args.seed)
226 |             if self.args.deterministic:
227 |                 torch.backends.cudnn.deterministic = True
228 | 
229 |         model = Model(dataset='server', args=self.args)
230 | 
231 |         # set initial states
232 |         states['start_epoch'] = 1
233 |         states['best_eval'] = 0.
234 |         states['best_epoch'] = 0
235 |         states['best_step'] = 0
236 | 
237 |         self.log(f'trainable params: {model.num_parameters():,d}')
238 |         self.log(f'trainable params (exclude embeddings): {model.num_parameters(exclude_embed=True):,d}')
239 |         validate_params(self.args)
240 |         with open(os.path.join(self.args.summary_dir, 'args.json5'), 'w') as f:
241 |             json5.dump(self.args.__dict__, f, indent=2)
242 |         self.log(pformat(vars(self.args), indent=2, width=120))
243 |         return model, interface, states
244 | 
245 | 
class EarlyStop(Exception):
    """Signal that training should stop before all scheduled rounds are finished."""
248 | 


--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/src/utils/loader.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import os
18 | import json
19 | import numpy as np
20 | 
21 | 
def load_data(data_dir, split=None):
    """Load JSON-lines samples from ``data_dir``.

    Args:
        data_dir: directory holding the ``.json`` files.
        split: file name (with or without the ``.json`` suffix) to load. When
            None, every ``.json`` file in ``data_dir`` is loaded in sorted
            name order.

    Returns:
        A list of dicts with string values under ``text1`` (from
        ``question``), ``text2`` (from ``answer``) and ``target`` (from
        ``label``).
    """
    if split is None:
        # Sort so the sample order does not depend on the file system.
        files = [os.path.join(data_dir, name)
                 for name in sorted(os.listdir(data_dir)) if name.endswith('.json')]
    else:
        if not split.endswith('.json'):
            split += '.json'
        files = [os.path.join(data_dir, split)]

    data = []
    for file in files:
        print(file)
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(file, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Blank lines carry no sample and would fail to parse.
                    continue
                sample = json.loads(line)
                data.append({
                    'text1': str(sample['question']),
                    'text2': str(sample['answer']),
                    'target': str(sample['label']),
                })
    return data
41 | 
42 | 
def load_embeddings(file, vocab, dim, lower, mode='freq'):
    """Read pretrained word vectors for the tokens in ``vocab``.

    Args:
        file: text file with one ``token v1 ... v_dim`` entry per line; lines
            with a different number of fields are skipped.
        vocab: object supporting ``len()``, ``in`` and ``.index(token)``.
        dim: expected vector size.
        lower: lowercase file tokens before lookup (ignored in ``'strict'``
            mode).
        mode: how repeated hits on the same vocab entry are combined:
            ``'freq'``/``'strict'`` keep the first vector seen, ``'last'``
            keeps the last one, ``'avg'`` averages all of them.

    Returns:
        A ``len(vocab)`` x ``dim`` nested list; rows without a match stay zero.

    Raises:
        NotImplementedError: when a token matches and ``mode`` is unknown.
    """
    embedding = np.zeros((len(vocab), dim))
    count = np.zeros((len(vocab), 1))
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file, encoding='utf-8') as f:
        for line in f:
            elems = line.rstrip().split()
            if len(elems) != dim + 1:
                continue
            token = elems[0]
            if lower and mode != 'strict':
                token = token.lower()
            if token in vocab:
                index = vocab.index(token)
                vector = [float(x) for x in elems[1:]]
                if mode == 'freq' or mode == 'strict':
                    # Only the first occurrence is kept.
                    if not count[index]:
                        embedding[index] = vector
                        count[index] = 1.
                elif mode == 'last':
                    embedding[index] = vector
                    count[index] = 1.
                elif mode == 'avg':
                    embedding[index] += vector
                    count[index] += 1.
                else:
                    raise NotImplementedError('Unknown embedding loading mode: ' + mode)
    if mode == 'avg':
        # Rows that never matched have count 0; divide those by 1 instead.
        inverse_mask = np.where(count == 0, 1., 0.)
        embedding /= count + inverse_mask
    return embedding.tolist()
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/src/utils/logger.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (C) 2019 Alibaba Group Holding Limited
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | import os
 18 | import sys
 19 | import logging
 20 | 
 21 | 
 22 | class Logger:
 23 |     def __init__(self, args, i):
 24 |         log = logging.getLogger(args.summary_dir)
 25 |         if not log.handlers:
 26 |             log.setLevel(logging.DEBUG)
 27 |             fh = logging.FileHandler(os.path.join(args.summary_dir, args.log_file))
 28 |             fh.setLevel(logging.INFO)
 29 |             ch = ProgressHandler()
 30 |             ch.setLevel(logging.DEBUG)
 31 |             formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
 32 |             fh.setFormatter(formatter)
 33 |             ch.setFormatter(formatter)
 34 |             log.addHandler(fh)
 35 |             log.addHandler(ch)
 36 |         self.log = log
 37 |         # setup TensorBoard
 38 |         if args.tensorboard:
 39 |             from tensorboardX import SummaryWriter
 40 |             self.writer = SummaryWriter(os.path.join(args.summary_dir, 'viz'))
 41 |             self.log.info(f'TensorBoard activated.')
 42 |         else:
 43 |             self.writer = None
 44 |         self.log_per_updates = args.log_per_updates[i]
 45 |         self.summary_per_updates = args.summary_per_updates[i]
 46 |         self.grad_clipping = args.grad_clipping
 47 |         self.clips = 0
 48 |         self.train_meters = {}
 49 |         self.epoch = None
 50 |         self.best_eval = 0.
 51 |         self.best_eval_str = ''
 52 | 
 53 |     def set_epoch(self, epoch):
 54 |         self(f'Epoch: {epoch}')
 55 |         self.epoch = epoch
 56 | 
 57 |     @staticmethod
 58 |     def _format_number(x):
 59 |         return f'{x:.4f}' if float(x) > 1e-3 else f'{x:.4e}'
 60 | 
 61 |     def update(self, stats):
 62 |         dataset = stats.pop('dataset')
 63 |         updates = stats.pop('updates')
 64 |         summary = stats.pop('summary')
 65 |         if updates % self.log_per_updates == 0:
 66 |             self.clips += int(stats['gnorm'] > self.grad_clipping)
 67 |             stats_str = ' '.join(f'{key}: ' + self._format_number(val) for key, val in stats.items())
 68 |             for key, val in stats.items():
 69 |                 if key not in self.train_meters:
 70 |                     self.train_meters[key] = AverageMeter()
 71 |                 self.train_meters[key].update(val)
 72 |             msg = f'[{dataset}] epoch {self.epoch} updates {updates} {stats_str}'
 73 |             if self.log_per_updates != 1:
 74 |                 msg = '> ' + msg
 75 |             self.log.info(msg)
 76 |             if self.writer and updates % self.summary_per_updates == 0:
 77 |                 for key, val in stats.items():
 78 |                     self.writer.add_scalar(f'train/{key}', val, updates)
 79 |                 for key, val in summary.items():
 80 |                     self.writer.add_histogram(key, val, updates)
 81 | 
 82 |     def newline(self):
 83 |         self.log.debug('')
 84 | 
 85 |     def log_eval(self, valid_stats):
 86 |         self.newline()
 87 |         updates = valid_stats.pop('updates')
 88 |         eval_score = valid_stats.pop('score')
 89 |         # report the exponential averaged training stats, while reporting the full dev set stats
 90 |         if self.train_meters:
 91 |             train_stats_str = ' '.join(f'{key}: ' + self._format_number(val) for key, val in self.train_meters.items())
 92 |             train_stats_str += ' ' + f'clip: {self.clips}'
 93 |             self.log.info(f'train {train_stats_str}')
 94 |         valid_stats_str = ' '.join(f'{key}: ' + self._format_number(val) for key, val in valid_stats.items())
 95 |         if eval_score > self.best_eval:
 96 |             self.best_eval_str = valid_stats_str
 97 |             self.best_eval = eval_score
 98 |             valid_stats_str += ' [NEW BEST]'
 99 |         else:
100 |             valid_stats_str += f' [BEST: {self.best_eval_str}]'
101 |         self.log.info(f'valid {valid_stats_str}')
102 |         if self.writer:
103 |             for key in valid_stats.keys():
104 |                 group = {'valid': valid_stats[key]}
105 |                 if self.train_meters and key in self.train_meters:
106 |                     group['train'] = float(self.train_meters[key])
107 |                 self.writer.add_scalars(f'valid/{key}', group, updates)
108 |         self.train_meters = {}
109 |         self.clips = 0
110 | 
111 |     def __call__(self, msg):
112 |         self.log.info(msg)
113 | 
114 | 
class ProgressHandler(logging.Handler):
    """Logging handler that writes formatted records to ``sys.stdout``.

    Records whose message starts with ``'> '`` are written with a trailing
    carriage return (and flushed) instead of a newline; all other records
    are written followed by a newline.
    """

    def __init__(self, level=logging.NOTSET):
        super().__init__(level)

    def emit(self, record):
        """Write ``record`` to stdout, ending with ``\\r`` for '> ' messages."""
        text = self.format(record)
        # record.message is read after self.format(), which is expected to have set it
        if not record.message.startswith('> '):
            sys.stdout.write(f'{text}\n')
            return
        sys.stdout.write(f'{text.rstrip()}\r')
        sys.stdout.flush()
126 | 
127 | 
class AverageMeter(object):
    """Keep exponential weighted averages.

    ``moment`` holds the raw exponentially decayed sum and ``value`` the
    bias-corrected average; ``t`` counts the number of updates so far.
    Instances can be passed to ``format()`` / f-strings and ``float()``,
    both of which operate on ``value``.
    """

    def __init__(self, beta=0.99):
        self.beta = beta
        self.moment = self.value = self.t = 0.

    def update(self, val):
        """Fold ``val`` into the running average."""
        decay = self.beta
        self.t += 1
        self.moment = decay * self.moment + (1 - decay) * val
        # bias correction
        self.value = self.moment / (1 - decay ** self.t)

    def __format__(self, spec):
        return format(self.value, spec)

    def __float__(self):
        return float(self.value)
147 | 


--------------------------------------------------------------------------------
/src/utils/metrics.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import os
18 | import subprocess
19 | from functools import partial
20 | import numpy as np
21 | from sklearn import metrics
22 | 
23 | from .registry import register
24 | 
# Metric functions declared in this module, keyed by the name they are registered under.
registry = {}
# Pre-bind this module's registry so metrics can be declared with ``@register('name')``.
register = partial(register, registry=registry)
27 | 
28 | 
@register('acc')
def acc(outputs):
    """Compute classification accuracy.

    :param outputs: mapping providing ``'target'`` and ``'pred'``, which are
        passed to ``sklearn.metrics.accuracy_score`` as ground truth and
        predictions respectively.
    :return: ``{'acc': <accuracy as a Python float>}``.
    """
    target = outputs['target']
    pred = outputs['pred']
    return {
        # float() accepts both NumPy scalars and plain Python floats, whereas
        # ``.item()`` only exists on the former.
        'acc': float(metrics.accuracy_score(target, pred)),
    }
36 | 
37 | 
@register('f1')
def f1(outputs):
    """Compute the weighted-average F1 score.

    :param outputs: mapping providing ``'target'`` and ``'pred'``, which are
        passed to ``sklearn.metrics.f1_score`` with ``average='weighted'``.
    :return: ``{'f1': <score as a Python float>}``.
    """
    target = outputs['target']
    pred = outputs['pred']
    return {
        # float() accepts both NumPy scalars and plain Python floats, whereas
        # ``.item()`` only exists on the former.
        'f1': float(metrics.f1_score(target, pred, average='weighted')),
    }
45 | 
46 | 
@register('auc')
def auc(outputs):
    """Compute ROC AUC from the second column of the predicted probabilities.

    :param outputs: mapping providing ``'target'`` and ``'prob'``; ``'prob'``
        is converted to an array and its column 1 is used as the score.
    :return: ``{'auc': <score as a Python float>}``.
    """
    target = outputs['target']
    prob = np.array(outputs['prob'])
    return {
        # float() accepts both NumPy scalars and plain Python floats, whereas
        # ``.item()`` only exists on the former.
        'auc': float(metrics.roc_auc_score(target, prob[:, 1])),
    }
54 | 
55 | 
@register('map')
@register('mrr')
def ranking(outputs):
    """Compute MAP and MRR by running the external ``trec_eval`` binary.

    The rows of ``<dataset>/<eval_file>.rank`` are copied to a temporary file
    in ``args.summary_dir`` with their second-to-last field replaced by the
    predicted score (``prob[i][1]``), and ``./resources/trec_eval`` is run on
    ``<dataset>/<eval_file>.ref`` and that temporary file.

    :param outputs: mapping providing ``'args'`` (with ``eval_file`` and
        ``summary_dir``), ``'dataset'`` (directory of the .ref/.rank files)
        and ``'prob'`` (one sequence of scores per example).
    :return: ``{'map': float, 'mrr': float}``; ``'mrr'`` is read from the
        ``recip_rank`` line of the ``trec_eval`` output.
    :raises ValueError: if the ``map`` / ``recip_rank`` lines are missing from
        the ``trec_eval`` output.
    """
    args = outputs['args']
    dataset = outputs['dataset']
    prediction = [o[1] for o in outputs['prob']]
    ref_file = os.path.join(dataset, '{}.ref'.format(args.eval_file))
    rank_file = os.path.join(dataset, '{}.rank'.format(args.eval_file))
    tmp_file = os.path.join(args.summary_dir, 'tmp-pred.txt')
    with open(rank_file) as f:
        rows = [line.strip().split() for line in f]
    assert len(rows) == len(prediction), \
        'prefix {}, while prediction {}'.format(len(rows), len(prediction))
    try:
        with open(tmp_file, 'w') as f:
            for row, pred in zip(rows, prediction):
                row[-2] = str(pred)
                f.write(' '.join(row) + '\n')
        # Argument list instead of a shell string: file names are never
        # interpreted by a shell, and no external grep is needed.
        sp = subprocess.Popen(['./resources/trec_eval', ref_file, tmp_file],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = sp.communicate()
    finally:
        # always clean up the temporary prediction file, even on failure
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    stdout, stderr = stdout.decode(), stderr.decode()
    scores = {}
    for line in stdout.splitlines():
        fields = line.split()
        # each metric line starts with the metric name and ends with its value
        if len(fields) >= 2 and fields[0] in ('map', 'recip_rank'):
            scores[fields[0]] = float(fields[-1])
    if 'map' not in scores or 'recip_rank' not in scores:
        raise ValueError('could not read map/recip_rank from trec_eval output '
                         '(exit code {}): {}'.format(sp.returncode, stderr.strip() or stdout.strip()))
    return {
        'map': scores['map'],
        'mrr': scores['recip_rank'],
    }
86 | 


--------------------------------------------------------------------------------
/src/utils/params.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (C) 2019 Alibaba Group Holding Limited
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | import os
 18 | import math
 19 | import shutil
 20 | from datetime import datetime
 21 | import torch
 22 | import json5
 23 | 
 24 | 
class Object:
    """
    Plain attribute container; parsed parameters are attached to an instance
    with ``setattr``.

    The tag below is presumably an IDE hint that attributes are created
    dynamically (TODO confirm).

    @DynamicAttrs
    """
    pass
 30 | 
 31 | 
 32 | def parse(config_file):
 33 |     root = os.path.dirname(config_file)  # __parent__ in config is a relative path
 34 |     config_group = _load_param('', config_file)
 35 |     if type(config_group) is dict:
 36 |         config_group = [config_group]
 37 |     configs = []
 38 |     for config in config_group:
 39 |         try:
 40 |             choice = config.pop('__iter__')
 41 |             assert len(choice) == 1, 'only support iterating over 1 variable'
 42 |             key, values = next(iter(choice.items()))
 43 |         except KeyError:
 44 |             key, value = config.popitem()
 45 |             values = [value]
 46 |         for value in values:
 47 |             config[key] = value
 48 |             repeat = config.get('__repeat__', 1)
 49 |             for index in range(repeat):
 50 |                 config_ = config.copy()
 51 |                 config_['__index__'] = index
 52 |                 if repeat > 1:
 53 |                     config_['name'] += '-' + str(index)
 54 |                 args = _parse_args(root, config_)
 55 |                 configs.append((args, config_))
 56 |     return configs
 57 | 
 58 | 
 59 | def _parse_args(root, config):
 60 |     args = Object()
 61 |     assert type(config) is dict
 62 |     parents = config.get('__parents__', [])
 63 |     for parent in parents:
 64 |         parent = _load_param(root, parent)
 65 |         assert type(parent) is dict, 'only top-level configs can be a sequence'
 66 |         _add_param(args, parent)
 67 |     _add_param(args, config)
 68 |     _post_process(args)
 69 |     return args
 70 | 
 71 | 
 72 | def _add_param(args, x: dict):
 73 |     for k, v in x.items():
 74 |         if type(v) is dict:
 75 |             _add_param(args, v)
 76 |         else:
 77 |             k = _validate_param(k)
 78 |             if hasattr(args, k):
 79 |                 previous_type = type(getattr(args, k))
 80 |                 current_type = type(v)
 81 |                 assert previous_type is current_type or \
 82 |                     isinstance(None, previous_type) or \
 83 |                     isinstance(None, current_type) or \
 84 |                     (previous_type is float and current_type is int), \
 85 |                     f'param "{k}" of type {previous_type} can not be overwritten by type {current_type}'
 86 |             setattr(args, k, v)
 87 | 
 88 | 
 89 | def _load_param(root, file: str):
 90 |     file = os.path.join(root, file)
 91 |     if not file.endswith('.json5'):
 92 |         file += '.json5'
 93 |     with open(file) as f:
 94 |         config = json5.load(f)
 95 |         return config
 96 | 
 97 | 
 98 | def _post_process(args: Object):
 99 |     if not args.output_dir.startswith('models'):
100 |         args.output_dir = os.path.join('models', args.output_dir)
101 |     os.makedirs(args.output_dir, exist_ok=True)
102 |     if not args.name:
103 |         args.name = str(datetime.now())
104 |     args.summary_dir = os.path.join(args.output_dir, args.name)
105 |     if os.path.exists(args.summary_dir):
106 |         shutil.rmtree(args.summary_dir)
107 |     os.makedirs(args.summary_dir)
108 |     data_config_file = os.path.join(args.output_dir, 'data_config.json5')
109 |     if os.path.exists(data_config_file):
110 |         with open(data_config_file) as f:
111 |             config = json5.load(f)
112 |             for k, v in config.items():
113 |                 if not hasattr(args, k) or getattr(args, k) != v:
114 |                     print('ERROR: Data configurations are different. Please use another output_dir or '
115 |                           'remove the older one manually.')
116 |                     exit()
117 |     else:
118 |         with open(data_config_file, 'w') as f:
119 |             keys = ['data_dir', 'min_df', 'max_vocab', 'max_len', 'min_len', 'lower_case',
120 |                     'pretrained_embeddings', 'embedding_mode']
121 |             json5.dump({k: getattr(args, k) for k in keys}, f)
122 |     args.metric = args.metric.lower()
123 |     args.watch_metrics = [m.lower() for m in args.watch_metrics]
124 |     if args.metric not in args.watch_metrics:
125 |         args.watch_metrics.append(args.metric)
126 |     args.cuda = args.cuda and torch.cuda.is_available()
127 |     args.fix_embeddings = args.pretrained_embeddings and args.fix_embeddings
128 | 
129 |     def samples2steps(n, i):
130 |         return int(math.ceil(n / args.batch_size[i]))
131 | 
132 |     totals = len(args.data_dir)
133 |     if not hasattr(args, 'log_per_updates'):
134 |         args.log_per_updates = []
135 |         for i in range(totals):
136 |             args.log_per_updates.append(samples2steps(args.log_per_samples[i], i))
137 |     if not hasattr(args, 'eval_per_updates'):
138 |         args.eval_per_updates = []
139 |         for i in range(totals):
140 |             args.eval_per_updates.append(samples2steps(args.eval_per_samples[i], i))
141 |     if not hasattr(args, 'eval_per_updates_warmup'):
142 |         args.eval_per_updates_warmup = []
143 |         for i in range(totals):
144 |             args.eval_per_updates_warmup.append(samples2steps(args.eval_per_samples_warmup[i], i))
145 |     if not hasattr(args, 'eval_warmup_steps'):
146 |         args.eval_warmup_steps = []
147 |         for i in range(totals):
148 |             args.eval_warmup_steps.append(samples2steps(args.eval_warmup_samples[i], i))
149 |     if not hasattr(args, 'min_steps'):
150 |         args.min_steps = []
151 |         for i in range(totals):
152 |             args.min_steps.append(samples2steps(args.min_samples[i], i))
153 |     if not hasattr(args, 'early_stopping'):
154 |         args.early_stopping = []
155 |         for i in range(totals):
156 |             args.early_stopping.append(samples2steps(args.tolerance_samples[i], i))
157 |     if not hasattr(args, 'summary_per_updates'):
158 |         args.summary_per_updates = [args.summary_per_logs * log_per_updates for log_per_updates in args.log_per_updates]
159 |     assert args.lr >= args.min_lr, 'initial learning rate must be larger than min_lr'
160 | 
161 | 
def validate_params(args):
    """validate params after interface initialization

    :raises AssertionError: if a ranking metric is watched while
        ``args.num_classes`` is not 2, or if ``args.num_vocab`` is not positive.
    """
    # assert args.num_classes == 2 or ('f1' not in args.watch_metrics and 'auc' not in args.watch_metrics), \
    #     f'F1 and AUC are only valid for 2 classes.'
    # The ranking metric function is registered under the names 'map' and
    # 'mrr', so those are the names that can appear in watch_metrics;
    # 'ranking' is kept for backward compatibility.
    ranking_metrics = {'map', 'mrr', 'ranking'}
    assert args.num_classes == 2 or ranking_metrics.isdisjoint(args.watch_metrics), \
        'ranking metrics are only valid for 2 classes.'
    assert args.num_vocab > 0
169 | 
170 | 
def _validate_param(name):
    """Return ``name`` with dashes turned into underscores.

    :raises ValueError: if the result is not a valid Python identifier.
    """
    normalized = name.replace('-', '_')
    if normalized.isidentifier():
        return normalized
    raise ValueError(f'Invalid param name: {normalized}')
176 | 


--------------------------------------------------------------------------------
/src/utils/registry.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
def register(name=None, registry=None):
    """Return a decorator that stores the decorated object in ``registry``.

    The object is stored under ``name`` or, when ``name`` is falsy, under
    its own ``__name__``, and is returned unchanged.

    :raises LookupError: (from the decorator) if the key is already present.
    """
    def bind(fn):
        key = name or _default_name(fn)
        if key in registry:
            raise LookupError(f"module {key} already registered.")
        registry[key] = fn
        return fn
    return bind
25 | 
26 | 
def _default_name(obj_class):
    """Return the ``__name__`` of ``obj_class``, used as the default registry key."""
    return getattr(obj_class, '__name__')
29 | 


--------------------------------------------------------------------------------
/src/utils/vocab.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (C) 2019 Alibaba Group Holding Limited
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | from collections import Counter
 18 | 
 19 | 
 20 | class Indexer:
 21 |     def __init__(self):
 22 |         self.w2id = {}
 23 |         self.id2w = {}
 24 | 
 25 |     @property
 26 |     def n_spec(self):
 27 |         return 0
 28 | 
 29 |     def __len__(self):
 30 |         return len(self.w2id)
 31 | 
 32 |     def __getitem__(self, index):
 33 |         if index not in self.id2w:
 34 |             raise IndexError(f'invalid index {index} in indices.')
 35 |         return self.id2w[index]
 36 | 
 37 |     def __contains__(self, item):
 38 |         return item in self.w2id
 39 | 
 40 |     def index(self, symbol):
 41 |         if symbol in self.w2id:
 42 |             return self.w2id[symbol]
 43 |         raise IndexError(f'Unknown symbol {symbol}')
 44 | 
 45 |     def keys(self):
 46 |         return self.w2id.keys()
 47 | 
 48 |     def indices(self):
 49 |         return self.id2w.keys()
 50 | 
 51 |     def add_symbol(self, symbol):
 52 |         if symbol not in self.w2id:
 53 |             self.id2w[len(self.id2w)] = symbol
 54 |             self.w2id[symbol] = len(self.w2id)
 55 | 
 56 |     @classmethod
 57 |     def build(cls, symbols, min_counts=1, dump_filtered=None, log=print):
 58 |         counter = Counter(symbols)
 59 |         symbols = sorted([t for t, c in counter.items() if c >= min_counts],
 60 |                          key=counter.get, reverse=True)
 61 |         log(f'''{len(symbols)} symbols found: {' '.join(symbols[:15]) + ('...' if len(symbols) > 15 else '')}''')
 62 |         filtered = sorted(list(counter.keys() - set(symbols)), key=counter.get, reverse=True)
 63 |         if filtered:
 64 |             log('filtered classes:')
 65 |             if len(filtered) > 20:
 66 |                 log('{} ... {}'.format(' '.join(filtered[:10]), ' '.join(filtered[-10:])))
 67 |             else:
 68 |                 log(' '.join(filtered))
 69 |             if dump_filtered:
 70 |                 with open(dump_filtered, 'w') as f:
 71 |                     for name in filtered:
 72 |                         f.write(f'{name} {counter.get(name)}\n')
 73 |         indexer = cls()
 74 |         try:  # restore numeric order if labels are represented by integers already
 75 |             symbols = list(map(int, symbols))
 76 |             symbols.sort()
 77 |             symbols = list(map(str, symbols))
 78 |         except ValueError:
 79 |             pass
 80 |         for symbol in symbols:
 81 |             if symbol:
 82 |                 indexer.add_symbol(symbol)
 83 |         return indexer
 84 | 
 85 |     def save(self, file):
 86 |         with open(file, 'w') as f:
 87 |             for symbol, index in self.w2id.items():
 88 |                 if index < self.n_spec:
 89 |                     continue
 90 |                 f.write('{}\n'.format(symbol))
 91 | 
 92 |     @classmethod
 93 |     def load(cls, file):
 94 |         indexer = cls()
 95 |         with open(file) as f:
 96 |             for line in f:
 97 |                 symbol = line.rstrip()
 98 |                 assert len(symbol) > 0, 'Empty symbol encountered.'
 99 |                 indexer.add_symbol(symbol)
100 |         return indexer
101 | 
102 | 
class RobustIndexer(Indexer):
    """Indexer that maps unknown symbols to a reserved ``<UNK>`` entry.

    The unknown symbol is registered at construction time under the id
    returned by ``unk()``; ``index`` falls back to that id instead of raising.
    """

    def __init__(self, validate=True):
        super().__init__()
        self.w2id[self.unk_symbol()] = self.unk()
        self.id2w = {idx: sym for sym, idx in self.w2id.items()}
        if validate:
            self.validate_spec()

    @property
    def n_spec(self):
        """Number of leading ids reserved for special symbols."""
        return 1

    def index(self, symbol):
        """Return the id of ``symbol``, or the unknown id if it is not indexed."""
        return self.w2id.get(symbol, self.unk())

    @staticmethod
    def unk():
        return 0

    @staticmethod
    def unk_symbol():
        return '<UNK>'

    def validate_spec(self):
        """Check that the special symbols occupy exactly ids ``0..n_spec-1``."""
        n_symbols = len(self.w2id)
        assert self.n_spec == n_symbols, f'{self.n_spec}, {len(self.w2id)}'
        assert n_symbols == max(self.id2w.keys()) + 1, "empty indices found in special tokens"
        assert n_symbols == len(self.id2w), "index conflict in special tokens"
130 | 
131 | 
class Vocab(RobustIndexer):
    """Word vocabulary with two reserved entries: ``<PAD>`` (id 0) and ``<UNK>`` (id 1)."""

    def __init__(self):
        super().__init__(validate=False)
        self.w2id.update({
            self.pad_symbol(): self.pad(),
        })
        self.id2w = {i: w for w, i in self.w2id.items()}
        self.validate_spec()

    @classmethod
    def build(cls, words, lower=False, min_df=1, max_tokens=float('inf'), pretrained_embeddings=None,
              dump_filtered=None, log=print):
        """Build a vocabulary from an iterable of words.

        A word is kept when it occurs at least ``min_df`` times or appears in
        the pretrained embedding file. Candidates are ordered by descending
        frequency; beyond the first ``max_tokens`` of them only words found in
        the pretrained embeddings are kept. Coverage statistics are reported
        through ``log`` and dropped words are optionally written with their
        counts to ``dump_filtered``.

        :param words: iterable of word strings.
        :param lower: lower-case both the words and the embedding vocabulary.
        :param pretrained_embeddings: optional path of an embedding text file
            whose first space-separated field on each line is the token.
        :return: the populated vocabulary.
        """
        if pretrained_embeddings:
            wv_vocab = cls.load_embedding_vocab(pretrained_embeddings, lower)
        else:
            wv_vocab = set()
        if lower:
            words = (word.lower() for word in words)
        counter = Counter(words)
        candidate_tokens = sorted([t for t, c in counter.items() if t in wv_vocab or c >= min_df],
                                  key=counter.get, reverse=True)
        if len(candidate_tokens) > max_tokens:
            tokens = [token for i, token in enumerate(candidate_tokens)
                      if i < max_tokens or token in wv_vocab]
        else:
            tokens = candidate_tokens
        total = sum(counter.values())
        matched = sum(counter[t] for t in tokens)
        # guard against empty input, where there are no occurrences at all
        oov_percent = (total - matched) / total * 100 if total else 0.0
        stats = (len(tokens), len(counter), total - matched, total, oov_percent)
        log('vocab coverage {}/{} | OOV occurrences {}/{} ({:.4f}%)'.format(*stats))
        tokens_set = set(tokens)
        if pretrained_embeddings:
            oop_samples = sorted(tokens_set - wv_vocab, key=counter.get, reverse=True)
            covered_percent = len(tokens_set & wv_vocab) / len(tokens) * 100 if tokens else 0.0
            # Always report the coverage; list up to 10 uncovered tokens when
            # there are any, marking truncation with ' ...'.
            message = 'Covered by pretrained vectors {:.4f}%. '.format(covered_percent)
            if oop_samples:
                message += 'outside pretrained: ' + ' '.join(oop_samples[:10])
                if len(oop_samples) > 10:
                    message += ' ...'
            log(message)
        log('top words:\n{}'.format(' '.join(tokens[:10])))
        filtered = sorted(counter.keys() - tokens_set, key=counter.get, reverse=True)
        if filtered:
            if len(filtered) > 20:
                log('filtered words:\n{} ... {}'.format(' '.join(filtered[:10]), ' '.join(filtered[-10:])))
            else:
                log('filtered words:\n' + ' '.join(filtered))
            if dump_filtered:
                with open(dump_filtered, 'w') as f:
                    for name in filtered:
                        f.write(f'{name} {counter.get(name)}\n')

        vocab = cls()
        for token in tokens:
            vocab.add_symbol(token)
        return vocab

    @staticmethod
    def load_embedding_vocab(file, lower):
        """Return the set of tokens (first field of each line) in an embedding file."""
        wv_vocab = set()
        with open(file) as f:
            for line in f:
                token = line.rstrip().split(' ')[0]
                if lower:
                    token = token.lower()
                wv_vocab.add(token)
        return wv_vocab

    @staticmethod
    def pad():
        return 0

    @staticmethod
    def unk():
        return 1

    @property
    def n_spec(self):
        """Number of leading ids reserved for special symbols (pad and unk)."""
        return 2

    @staticmethod
    def pad_symbol():
        return '<PAD>'

    char_map = {  # escape special characters for safe serialization
        '\n': '<NEWLINE>',
    }

    def save(self, file):
        """Write one non-special symbol per line, escaping symbols listed in ``char_map``."""
        with open(file, 'w') as f:
            for symbol, index in self.w2id.items():
                if index < self.n_spec:
                    continue
                symbol = self.char_map.get(symbol, symbol)
                f.write(f'{symbol}\n')

    @classmethod
    def load(cls, file):
        """Rebuild a vocabulary from a file written by ``save``."""
        vocab = cls()
        reverse_char_map = {v: k for k, v in cls.char_map.items()}
        with open(file) as f:
            for line in f:
                symbol = line.rstrip('\n')
                symbol = reverse_char_map.get(symbol, symbol)
                vocab.add_symbol(symbol)
        return vocab
238 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (C) 2019 Alibaba Group Holding Limited
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import os
18 | import sys
19 | import json5
20 | import requests
21 | from pprint import pprint
22 | 
23 | from src.utils import params
24 | from src.trainer import Trainer
25 | 
26 | 
def main():
    """Command-line entry point.

    ``train.py <config>`` trains once per parsed configuration and appends one
    JSON5 line (params and final state) per run to ``models/log.jsonl``.
    ``train.py <config> --dry`` (flag in either position) only pretty-prints
    the parsed arguments. Any other invocation prints the usage string.
    """
    argv = sys.argv
    if len(argv) == 2:
        arg_groups = params.parse(sys.argv[1])
        for args, config in arg_groups:
            states = Trainer(args).train()
            with open('models/log.jsonl', 'a') as f:
                record = json5.dumps({
                    'params': config,
                    'state': states,
                })
                f.write(record)
                f.write('\n')
        return
    if len(argv) == 3 and '--dry' in argv:
        # argv is sys.argv itself, so after removing the flag the config path is at index 1
        argv.remove('--dry')
        arg_groups = params.parse(sys.argv[1])
        pprint([vars(args) for args, _ in arg_groups])
        return
    print('Usage: "python train.py configs/xxx.json5"')
47 | 
# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()
50 | 


--------------------------------------------------------------------------------