├── .editor-settings
├── .gitignore
├── 0_download_bert.sh
├── 1_download_relations.sh
├── 2_create_random_walks.sh
├── 3_generate_corpus.sh
├── 4_pretrain_adapter.sh
├── 9_download_pretrained_adapters_omcs.sh
├── 9_download_pretrained_adapters_rw30.sh
├── LICENSE
├── README.md
├── archive
│   ├── create_pretraining_data.py
│   ├── poc_bash_test.sh
│   ├── poc_create_pretraining_data.sh
│   ├── poc_create_pretraining_data_rw.sh
│   ├── poc_create_pretraining_data_wo_nsp.sh
│   ├── poc_finetuning.sh
│   ├── poc_finetuning_adapter.sh
│   ├── poc_finetuning_adapter_longer.sh
│   ├── poc_finetuning_adapter_longer_2.sh
│   ├── poc_finetuning_adapter_quick_insight.sh
│   ├── poc_finetuning_adapter_sst2.sh
│   ├── poc_finetuning_dws.sh
│   ├── poc_finetuning_rw.sh
│   ├── poc_pretraining.sh
│   ├── poc_pretraining_dws.sh
│   ├── poc_pretraining_rw.sh
│   ├── prediction_diagnostic.sh
│   ├── predictions_rw_100000.sh
│   ├── predictions_rw_100000_all.sh
│   ├── predictions_rw_25000_all.sh
│   ├── run_classifier_adapter.py
│   ├── run_pretraining_adapter.py
│   └── run_regression_adapter.py
├── copa_1_download_copa.sh
├── copa_2_finetune_adapter.sh
├── copa_2_finetune_bert.sh
├── csqa_1_download_commonsenseqa.sh
├── csqa_2_finetune_adapter.sh
├── csqa_3_eval_adapter.sh
├── data_utility
│   ├── create_pretraining_data.py
│   └── create_pretraining_data_wo_nsp.py
├── download_utility
│   ├── download_bert.py
│   ├── download_commonsenseqa.py
│   ├── download_glue.py
│   └── download_relations.py
├── glue_1_download_glue.sh
├── glue_2_finetune_adapter.sh
├── images
│   └── Retrograph.png
├── randomwalks_utility
│   ├── create_corpora_from_random_walks.py
│   ├── preprocess_cn.py
│   └── random_walks.py
├── results_utility
│   ├── fetcher.py
│   └── parse_predictions.py
├── retrograph
│   ├── __init__.py
│   ├── modeling
│   │   ├── __init__.py
│   │   ├── metrics_extension.py
│   │   ├── modeling.py
│   │   ├── modeling_adapter.py
│   │   ├── optimization.py
│   │   ├── optimization_adapter.py
│   │   └── tokenization.py
│   └── training
│       ├── __init__.py
│       └── preprocessors.py
├── setup.py
├── siqa_1_download_siqa.sh
├── siqa_2_finetune_adapters.sh
├── siqa_2_finetune_bert.sh
├── siqa_calc_acc_testset.py
├── training_utility
│   ├── copa_preprocessor.py
│   ├── run_classifier.py
│   ├── run_classifier_adapter_tune_all.py
│   ├── run_commonsenseqa.py
│   ├── run_commonsenseqa_adapter.py
│   ├── run_copa.py
│   ├── run_copa_adapter.py
│   ├── run_pretraining.py
│   ├── run_pretraining_adapter.py
│   ├── run_pretraining_wo_nsp.py
│   ├── run_pretraining_wo_nsp_adapter.py
│   ├── run_regression.py
│   ├── run_regression_adapter_tune_all.py
│   ├── run_siqa.py
│   ├── run_siqa_adapters.py
│   └── siqa_preprocessor.py
└── utility
    ├── ec2.py
    ├── ec2_download.py
    ├── s3_download.py
    ├── s3_upload.py
    └── upload_s3.sh
/.editor-settings:
--------------------------------------------------------------------------------
1 | tabLength: 2
2 | softTabs: true
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | __pycache__/
3 | data/
4 | models/
5 | relations/
6 | randomwalks/
7 |
--------------------------------------------------------------------------------
/0_download_bert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DOWNLOAD_UTILITY_SCRIPTS=download_utility
4 |
5 | mkdir -p 'models/BERT_BASE_UNCASED'
6 |
7 | # DOWNLOAD BERT
8 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_bert.py
9 |
--------------------------------------------------------------------------------
/1_download_relations.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DOWNLOAD_UTILITY_SCRIPTS=download_utility
4 |
5 |
6 | DIR_SAVE_RELATIONS='relations/'
7 | mkdir -p $DIR_SAVE_RELATIONS
8 |
9 | # DOWNLOAD RELATIONS
10 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_relations.py --data_dir $DIR_SAVE_RELATIONS --relations all
11 |
--------------------------------------------------------------------------------
/2_create_random_walks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | RANDOM_WALKS_SCRIPTS=randomwalks_utility
4 |
5 | mkdir -p 'randomwalks'
6 |
7 | # Preprocess the relations
8 | python3.6 $RANDOM_WALKS_SCRIPTS/preprocess_cn.py
9 |
10 | # Create the randomwalks using node2vec
11 | python3.6 $RANDOM_WALKS_SCRIPTS/random_walks.py
12 |
--------------------------------------------------------------------------------
/3_generate_corpus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Create natural language text from the RWs
4 | # create_corpora_from_random_walks.py -> takes as input the pickle file and generates the corpus
5 | # -> output corpus "rw_.txt"
 6 | # || could change how sentences are generated; at the moment sentences are always 3-word sentences
 7 | # -> if you want extra vocab in BERT, change the function "create_realtionship_token"
8 |
9 | RANDOM_WALKS_SCRIPTS=randomwalks_utility
10 | DATA_SCRIPTS=data_utility
11 |
12 | python3.6 $RANDOM_WALKS_SCRIPTS/create_corpora_from_random_walks.py
13 |
14 | # COMMENTS - NIKOLAI
15 | #create_pretraining_data.py OR
16 | #create_pretraining_data_wo_nsp.py (without Next Sentence Prediction)
17 | #
18 | #For OMCS you only need to create the pretraining data
19 | ## 4 - Pretraining BERT using RW Corpus
20 |
21 | ## 1.1 - OMCS Pretraining Data
22 | #Step1: (create pretraining out of corpus)
23 | #create_pretraining_data.py OR
24 | #create_pretraining_data_wo_nsp.py (without Next Sentence Prediction)
25 |
26 | VOCAB_FILE=models/BERT_BASE_UNCASED/vocab.txt
27 |
28 | # TODO: change this to create different pre-training data
29 | INPUT_FILE=randomwalks/rw_corpus_1.0_1.0_2_15_nl.txt
30 | OUTPUT_FILE=randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf
31 |
32 |
33 | python3.6 $DATA_SCRIPTS/create_pretraining_data_wo_nsp.py --input_file $INPUT_FILE --output_file $OUTPUT_FILE --vocab_file $VOCAB_FILE
34 |
35 | # python3.6 $DATA_SCRIPTS/create_pretraining_data.py --input_file $INPUT_FILE --output_file $OUTPUT_FILE --vocab_file $VOCAB_FILE
36 |
--------------------------------------------------------------------------------
/4_pretrain_adapter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #Step1: (run the pretraining)
4 | #run_pretraining_adapter.py OR
 5 | #run_pretraining_wo_nsp_adapter.py (without Next Sentence Prediction)
6 | #
7 | #
8 | #Need to load the Adapter Model
9 | #And need to load the Adapter Optimiser for that.
10 |
11 | TRAINING_UTILITY=training_utility
12 |
13 | export CUDA_VISIBLE_DEVICES=8
14 |
15 | BERT_CONFIG_FILE=models/BERT_BASE_UNCASED/bert_config.json
16 | INPUT_FILE=randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf
17 | OUTPUT_DIR=models/output_pretrain_adapter
18 |
19 | mkdir -p $OUTPUT_DIR
20 |
21 | python3.6 $TRAINING_UTILITY/run_pretraining_wo_nsp_adapter.py --input_file $INPUT_FILE --output_dir $OUTPUT_DIR \
22 | --bert_config_file $BERT_CONFIG_FILE \
23 | --do_train True
24 |
--------------------------------------------------------------------------------
/9_download_pretrained_adapters_omcs.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | waws --downloadS3 -f omcs_pretraining_free_wo_nsp_adapter.zip -b wluper-retrograph
3 | unzip omcs_pretraining_free_wo_nsp_adapter.zip
4 | mv omcs_pretraining_free_wo_nsp_adapter.zip models
5 | mv omcs_pretraining_free_wo_nsp_adapter models
6 |
--------------------------------------------------------------------------------
/9_download_pretrained_adapters_rw30.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | waws --downloadS3 -f 1.0_1.0_5_30_full_assertions_nl.zip -b wluper-retrograph
3 | unzip 1.0_1.0_5_30_full_assertions_nl.zip
4 | mv 1.0_1.0_5_30_full_assertions_nl.zip models
5 | mv 1.0_1.0_5_30_full_assertions_nl models
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019-Present Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas. All rights reserved.
2 |
3 | Apache License
4 | Version 2.0, January 2004
5 | http://www.apache.org/licenses/
6 |
7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8 |
9 | 1. Definitions.
10 |
11 | "License" shall mean the terms and conditions for use, reproduction,
12 | and distribution as defined by Sections 1 through 9 of this document.
13 |
14 | "Licensor" shall mean the copyright owner or entity authorized by
15 | the copyright owner that is granting the License.
16 |
17 | "Legal Entity" shall mean the union of the acting entity and all
18 | other entities that control, are controlled by, or are under common
19 | control with that entity. For the purposes of this definition,
20 | "control" means (i) the power, direct or indirect, to cause the
21 | direction or management of such entity, whether by contract or
22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
23 | outstanding shares, or (iii) beneficial ownership of such entity.
24 |
25 | "You" (or "Your") shall mean an individual or Legal Entity
26 | exercising permissions granted by this License.
27 |
28 | "Source" form shall mean the preferred form for making modifications,
29 | including but not limited to software source code, documentation
30 | source, and configuration files.
31 |
32 | "Object" form shall mean any form resulting from mechanical
33 | transformation or translation of a Source form, including but
34 | not limited to compiled object code, generated documentation,
35 | and conversions to other media types.
36 |
37 | "Work" shall mean the work of authorship, whether in Source or
38 | Object form, made available under the License, as indicated by a
39 | copyright notice that is included in or attached to the work
40 | (an example is provided in the Appendix below).
41 |
42 | "Derivative Works" shall mean any work, whether in Source or Object
43 | form, that is based on (or derived from) the Work and for which the
44 | editorial revisions, annotations, elaborations, or other modifications
45 | represent, as a whole, an original work of authorship. For the purposes
46 | of this License, Derivative Works shall not include works that remain
47 | separable from, or merely link (or bind by name) to the interfaces of,
48 | the Work and Derivative Works thereof.
49 |
50 | "Contribution" shall mean any work of authorship, including
51 | the original version of the Work and any modifications or additions
52 | to that Work or Derivative Works thereof, that is intentionally
53 | submitted to Licensor for inclusion in the Work by the copyright owner
54 | or by an individual or Legal Entity authorized to submit on behalf of
55 | the copyright owner. For the purposes of this definition, "submitted"
56 | means any form of electronic, verbal, or written communication sent
57 | to the Licensor or its representatives, including but not limited to
58 | communication on electronic mailing lists, source code control systems,
59 | and issue tracking systems that are managed by, or on behalf of, the
60 | Licensor for the purpose of discussing and improving the Work, but
61 | excluding communication that is conspicuously marked or otherwise
62 | designated in writing by the copyright owner as "Not a Contribution."
63 |
64 | "Contributor" shall mean Licensor and any individual or Legal Entity
65 | on behalf of whom a Contribution has been received by Licensor and
66 | subsequently incorporated within the Work.
67 |
68 | 2. Grant of Copyright License. Subject to the terms and conditions of
69 | this License, each Contributor hereby grants to You a perpetual,
70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71 | copyright license to reproduce, prepare Derivative Works of,
72 | publicly display, publicly perform, sublicense, and distribute the
73 | Work and such Derivative Works in Source or Object form.
74 |
75 | 3. Grant of Patent License. Subject to the terms and conditions of
76 | this License, each Contributor hereby grants to You a perpetual,
77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78 | (except as stated in this section) patent license to make, have made,
79 | use, offer to sell, sell, import, and otherwise transfer the Work,
80 | where such license applies only to those patent claims licensable
81 | by such Contributor that are necessarily infringed by their
82 | Contribution(s) alone or by combination of their Contribution(s)
83 | with the Work to which such Contribution(s) was submitted. If You
84 | institute patent litigation against any entity (including a
85 | cross-claim or counterclaim in a lawsuit) alleging that the Work
86 | or a Contribution incorporated within the Work constitutes direct
87 | or contributory patent infringement, then any patent licenses
88 | granted to You under this License for that Work shall terminate
89 | as of the date such litigation is filed.
90 |
91 | 4. Redistribution. You may reproduce and distribute copies of the
92 | Work or Derivative Works thereof in any medium, with or without
93 | modifications, and in Source or Object form, provided that You
94 | meet the following conditions:
95 |
96 | (a) You must give any other recipients of the Work or
97 | Derivative Works a copy of this License; and
98 |
99 | (b) You must cause any modified files to carry prominent notices
100 | stating that You changed the files; and
101 |
102 | (c) You must retain, in the Source form of any Derivative Works
103 | that You distribute, all copyright, patent, trademark, and
104 | attribution notices from the Source form of the Work,
105 | excluding those notices that do not pertain to any part of
106 | the Derivative Works; and
107 |
108 | (d) If the Work includes a "NOTICE" text file as part of its
109 | distribution, then any Derivative Works that You distribute must
110 | include a readable copy of the attribution notices contained
111 | within such NOTICE file, excluding those notices that do not
112 | pertain to any part of the Derivative Works, in at least one
113 | of the following places: within a NOTICE text file distributed
114 | as part of the Derivative Works; within the Source form or
115 | documentation, if provided along with the Derivative Works; or,
116 | within a display generated by the Derivative Works, if and
117 | wherever such third-party notices normally appear. The contents
118 | of the NOTICE file are for informational purposes only and
119 | do not modify the License. You may add Your own attribution
120 | notices within Derivative Works that You distribute, alongside
121 | or as an addendum to the NOTICE text from the Work, provided
122 | that such additional attribution notices cannot be construed
123 | as modifying the License.
124 |
125 | You may add Your own copyright statement to Your modifications and
126 | may provide additional or different license terms and conditions
127 | for use, reproduction, or distribution of Your modifications, or
128 | for any such Derivative Works as a whole, provided Your use,
129 | reproduction, and distribution of the Work otherwise complies with
130 | the conditions stated in this License.
131 |
132 | 5. Submission of Contributions. Unless You explicitly state otherwise,
133 | any Contribution intentionally submitted for inclusion in the Work
134 | by You to the Licensor shall be under the terms and conditions of
135 | this License, without any additional terms or conditions.
136 | Notwithstanding the above, nothing herein shall supersede or modify
137 | the terms of any separate license agreement you may have executed
138 | with Licensor regarding such Contributions.
139 |
140 | 6. Trademarks. This License does not grant permission to use the trade
141 | names, trademarks, service marks, or product names of the Licensor,
142 | except as required for reasonable and customary use in describing the
143 | origin of the Work and reproducing the content of the NOTICE file.
144 |
145 | 7. Disclaimer of Warranty. Unless required by applicable law or
146 | agreed to in writing, Licensor provides the Work (and each
147 | Contributor provides its Contributions) on an "AS IS" BASIS,
148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149 | implied, including, without limitation, any warranties or conditions
150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151 | PARTICULAR PURPOSE. You are solely responsible for determining the
152 | appropriateness of using or redistributing the Work and assume any
153 | risks associated with Your exercise of permissions under this License.
154 |
155 | 8. Limitation of Liability. In no event and under no legal theory,
156 | whether in tort (including negligence), contract, or otherwise,
157 | unless required by applicable law (such as deliberate and grossly
158 | negligent acts) or agreed to in writing, shall any Contributor be
159 | liable to You for damages, including any direct, indirect, special,
160 | incidental, or consequential damages of any character arising as a
161 | result of this License or out of the use or inability to use the
162 | Work (including but not limited to damages for loss of goodwill,
163 | work stoppage, computer failure or malfunction, or any and all
164 | other commercial damages or losses), even if such Contributor
165 | has been advised of the possibility of such damages.
166 |
167 | 9. Accepting Warranty or Additional Liability. While redistributing
168 | the Work or Derivative Works thereof, You may choose to offer,
169 | and charge a fee for, acceptance of support, warranty, indemnity,
170 | or other liability obligations and/or rights consistent with this
171 | License. However, in accepting such obligations, You may act only
172 | on Your own behalf and on Your sole responsibility, not on behalf
173 | of any other Contributor, and only if You agree to indemnify,
174 | defend, and hold each Contributor harmless for any liability
175 | incurred by, or claims asserted against, such Contributor by reason
176 | of your accepting any such warranty or additional liability.
177 |
178 | END OF TERMS AND CONDITIONS
179 |
180 | APPENDIX: How to apply the Apache License to your work.
181 |
182 | To apply the Apache License to your work, attach the following
183 | boilerplate notice, with the fields enclosed by brackets "[]"
184 | replaced with your own identifying information. (Don't include
185 | the brackets!) The text should be enclosed in the appropriate
186 | comment syntax for the file format. We also recommend that a
187 | file or class name and description of purpose be included on the
188 | same "printed page" as the copyright notice for easier
189 | identification within third-party archives.
190 |
191 | Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas
192 |
193 | Licensed under the Apache License, Version 2.0 (the "License");
194 | you may not use this file except in compliance with the License.
195 | You may obtain a copy of the License at
196 |
197 | http://www.apache.org/licenses/LICENSE-2.0
198 |
199 | Unless required by applicable law or agreed to in writing, software
200 | distributed under the License is distributed on an "AS IS" BASIS,
201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202 | See the License for the specific language governing permissions and
203 | limitations under the License.
204 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Retrograph
  2 | # (aka Common Sense or World Knowledge? Investigating Adapter-Based Knowledge Injection into Pretrained Transformers)
  3 | # Accepted at EMNLP/DeeLIO 2020.
4 | 
5 |
6 | ## Abstract
7 | Following the major success of neural language models (LMs) such as BERT or GPT-2 on a variety of language understanding tasks, recent work focused on injecting (structured) knowledge from external resources into these models. While on the one hand, joint pretraining (i.e., training from scratch, adding objectives based on external knowledge to the primary LM objective) may be prohibitively computationally expensive, post-hoc fine-tuning on external knowledge, on the other hand, may lead to the catastrophic forgetting of distributional knowledge. In this work, we investigate models for complementing the distributional knowledge of BERT with conceptual knowledge from ConceptNet and its corresponding Open Mind Common Sense (OMCS) corpus, respectively, using adapter training. While overall results on the GLUE benchmark paint an inconclusive picture, a deeper analysis reveals that our adapter-based models substantially outperform BERT (up to 15-20 performance points) on inference tasks that require the type of conceptual knowledge explicitly present in ConceptNet and OMCS.
8 |
9 | ## Paper (EMNLP/DeeLIO 2020 Proceedings to follow)
10 | [Link To Paper](https://arxiv.org/abs/2005.11787)
11 |
12 | ## Key people
13 | [Anne Lauscher](https://www.uni-mannheim.de/dws/people/researchers/phd-students/anne-lauscher/)
14 |
15 | [Olga Majewska](https://om304.github.io/)
16 |
17 | [Leonardo Ribeiro](https://github.com/leoribeiro)
18 |
19 | [Goran Glavaš](https://www.uni-mannheim.de/dws/people/professors/prof-dr-goran-glavas/)
20 |
21 | [Nikolai Rozanov](https://github.com/ai-nikolai)
22 |
23 | [Iryna Gurevych](https://www.informatik.tu-darmstadt.de/ukp/ukp_home/staff_ukp/prof_dr_iryna_gurevych/index.en.jsp)
24 |
25 | ## Description
 26 | Retrograph is the official repository behind the commonsense adapter paper by the University of Mannheim, TU Darmstadt, and Wluper.
27 |
28 | The key idea is that one can inject knowledge into pretrained language models using Adapters.
29 |
30 | We try two methods to generate training data for the adapters:
 31 | 1. The Open Mind Common Sense (OMCS) corpus
 32 | 2. Random walks over ConceptNet
33 |
34 | We evaluate on:
 35 | 1. GLUE
 36 | 2. CommonsenseQA (CSQA)
 37 | 3. COPA
 38 | 4. SIQA
39 |
 40 | The key results can be found in the paper:
41 | [Link To Paper](https://arxiv.org/abs/2005.11787)
42 |
43 |
44 |
45 |
46 | ## A - Getting it running:
47 |
 48 | Environment: Python 3.6
 49 |
 50 | Please follow these instructions to run the experiments.
51 |
52 | ### 0 - Download BERT (This needs to be done for all experiments)
53 | Step 0: Download BERT
54 | ```
55 | bash ./0_download_bert.sh
56 | ```
57 | It creates:
58 | 1. models/BERT_BASE_UNCASED
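If you want to sanity-check the download before moving on, a minimal check could look like this (a sketch, not part of the repo; it only looks for the two files the later scripts reference, `vocab.txt` and `bert_config.json`):
```
import os

# Hypothetical sanity check: verify the files the later scripts rely on
# are in place after running 0_download_bert.sh.
BERT_DIR = "models/BERT_BASE_UNCASED"
for name in ["vocab.txt", "bert_config.json"]:
    path = os.path.join(BERT_DIR, name)
    print(path, "OK" if os.path.exists(path) else "MISSING")
```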
59 |
60 |
61 |
62 | ### Next Steps:
63 | 1. Generate Random Walks and Pretrain Adapter -> Go to [B - Random Walks and Pretraining](#random_walk)
64 |
65 | 2. Finetune on existing Adapters -> Go to [C - Finetuning on Pretrained Adapters](#finetuning):
66 | - [GLUE](#glue)
67 | - [CSQA](#csqa)
68 | - [COPA](#copa)
69 | - [SIQA](#siqa)
70 |
71 |
72 |
73 | ## B - Random Walks and Pretraining
 74 | Follow these steps to pretrain the adapter.
75 |
76 |
77 | ### 1 - Download Relations
78 | Step 1: Download Relations
79 | ```
80 | bash ./1_download_relations.sh
81 | ```
82 | It creates:
83 | 1. relations/cn_relationType*.txt
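A quick way to confirm the relation files arrived (a sketch, not part of the repo):
```
import glob

# List the ConceptNet relation files downloaded by 1_download_relations.sh.
for path in sorted(glob.glob("relations/cn_relationType*.txt")):
    print(path)
```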
84 |
85 |
86 | ### 2 - Creating Random Walks
87 |
88 | Step 2: Create the sequences of tokens using random walks generated by node2vec:
89 | ```
90 | bash ./2_create_random_walks.sh
91 | ```
92 |
 93 | It creates the main file `randomwalks/random_walk_1.0_1.0_2_15.p`, as well as other files such as `randomwalks/cn_assertions_filtered.tsv`.
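To take a quick look at the walks, you can load the pickle directly (a sketch; it assumes the pickle written by `randomwalks_utility/random_walks.py` holds an iterable of walks, i.e. node sequences):
```
import pickle

# Load the random-walk pickle created by 2_create_random_walks.sh and
# print a small sample; the exact walk structure depends on random_walks.py.
with open("randomwalks/random_walk_1.0_1.0_2_15.p", "rb") as f:
    walks = pickle.load(f)

print("number of walks:", len(walks))
for walk in list(walks)[:3]:
    print(walk)
```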
94 |
95 |
96 |
 97 | ### 3 - Generating the Corpus (This takes quite a while)
98 | Step 3: Create natural language text from the random walks:
99 | ```
100 | bash ./3_generate_corpus.sh
101 | ```
102 | The generated corpus will be used as input for BERT + Adapters. It creates a file in TF format, `randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf`, and also generates the intermediate text corpus `randomwalks/rw_corpus_1.0_1.0_2_15_nl.txt`.
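To inspect what was generated (a sketch; the `.txt` file is the plain-text corpus with one sentence per line, and the `.tf` file is a TFRecord file written with the TF 1.x API used throughout this repo):
```
import tensorflow as tf

# Preview the natural-language corpus and count the pre-training examples.
txt_path = "randomwalks/rw_corpus_1.0_1.0_2_15_nl.txt"
tf_path = "randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf"

with open(txt_path, "r", encoding="utf-8") as f:
    for _, line in zip(range(5), f):
        print(line.rstrip())

num_records = sum(1 for _ in tf.python_io.tf_record_iterator(tf_path))
print(num_records, "pre-training examples in", tf_path)
```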
103 |
104 |
105 | ### 4 - Pretraining Adapter
106 |
107 | Step 4: Pretrain the adapter using the RW corpus:
108 | ```
109 | bash ./4_pretrain_adapter.sh
110 | ```
111 | Creates a model in: `models/output_pretrain_adapter`
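To check which checkpoint was written (a sketch, TF 1.x; the finetuning scripts later point `--init_checkpoint` at checkpoints of the form `model.ckpt-<step>`):
```
import tensorflow as tf

# Print the newest adapter checkpoint in the pretraining output directory.
ckpt = tf.train.latest_checkpoint("models/output_pretrain_adapter")
print(ckpt)  # e.g. models/output_pretrain_adapter/model.ckpt-<step>
```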
112 |
113 |
114 |
115 |
116 |
117 | ## C - Finetuning on Pretrained Adapters
118 |
119 |
120 | ### 9 - Download Pretrained Adapters (needed if you do not already have pretrained adapters)
121 |
122 | ```
123 | bash ./9_download_pretrained_adapters_rw30.sh
124 | bash ./9_download_pretrained_adapters_omcs.sh
125 | ```
126 |
127 | **All models will be saved in `models/output_model_finetunning`.**
128 | **Modify the respective `<task>_2_....sh` files if you want to change hyperparameters.**
129 |
130 |
131 |
132 | ### GLUE
133 |
134 | **Run the `glue_*.sh` scripts in numerical order.**
135 |
136 |
137 |
138 |
139 | ### CommonsenseQA
140 |
141 | **Run the `csqa_*.sh` scripts in numerical order.**
142 |
143 |
144 |
145 | ### COPA
146 |
147 | **Run the `copa_*.sh` scripts in numerical order.**
148 |
149 |
150 |
151 | ### SIQA
152 | **Run the `siqa_*.sh` scripts in numerical order.**
153 |
154 |
155 |
156 |
157 |
--------------------------------------------------------------------------------
/archive/create_pretraining_data.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Create masked LM/next sentence masked_lm TF examples for BERT."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import random
23 |
24 | import tokenization
25 | import tensorflow as tf
26 |
27 | flags = tf.flags
28 |
29 | FLAGS = flags.FLAGS
30 |
31 | flags.DEFINE_string("input_file", None,
32 | "Input raw text file (or comma-separated list of files).")
33 |
34 | flags.DEFINE_string(
35 | "output_file", None,
36 | "Output TF example file (or comma-separated list of files).")
37 |
38 | flags.DEFINE_string("vocab_file", None,
39 | "The vocabulary file that the BERT model was trained on.")
40 |
41 | flags.DEFINE_bool(
42 | "do_lower_case", True,
43 | "Whether to lower case the input text. Should be True for uncased "
44 | "models and False for cased models.")
45 |
46 | flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
47 |
48 | flags.DEFINE_integer("max_predictions_per_seq", 20,
49 | "Maximum number of masked LM predictions per sequence.")
50 |
51 | flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
52 |
53 | flags.DEFINE_integer(
54 | "dupe_factor", 10,
55 | "Number of times to duplicate the input data (with different masks).")
56 |
57 | flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
58 |
59 | flags.DEFINE_float(
60 | "short_seq_prob", 0.1,
61 | "Probability of creating sequences which are shorter than the "
62 | "maximum length.")
63 |
64 |
65 | class TrainingInstance(object):
66 | """A single training instance (sentence pair)."""
67 |
68 | def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
69 | is_random_next):
70 | self.tokens = tokens
71 | self.segment_ids = segment_ids
72 | self.is_random_next = is_random_next
73 | self.masked_lm_positions = masked_lm_positions
74 | self.masked_lm_labels = masked_lm_labels
75 |
76 | def __str__(self):
77 | s = ""
78 | s += "tokens: %s\n" % (" ".join(
79 | [tokenization.printable_text(x) for x in self.tokens]))
80 | s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
81 | s += "is_random_next: %s\n" % self.is_random_next
82 | s += "masked_lm_positions: %s\n" % (" ".join(
83 | [str(x) for x in self.masked_lm_positions]))
84 | s += "masked_lm_labels: %s\n" % (" ".join(
85 | [tokenization.printable_text(x) for x in self.masked_lm_labels]))
86 | s += "\n"
87 | return s
88 |
89 | def __repr__(self):
90 | return self.__str__()
91 |
92 |
93 | def write_instance_to_example_files(instances, tokenizer, max_seq_length,
94 | max_predictions_per_seq, output_files):
95 | """Create TF example files from `TrainingInstance`s."""
96 | writers = []
97 | for output_file in output_files:
98 | writers.append(tf.python_io.TFRecordWriter(output_file))
99 |
100 | writer_index = 0
101 |
102 | total_written = 0
103 | for (inst_index, instance) in enumerate(instances):
104 | input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
105 | input_mask = [1] * len(input_ids)
106 | segment_ids = list(instance.segment_ids)
107 | assert len(input_ids) <= max_seq_length
108 |
109 | while len(input_ids) < max_seq_length:
110 | input_ids.append(0)
111 | input_mask.append(0)
112 | segment_ids.append(0)
113 |
114 | assert len(input_ids) == max_seq_length
115 | assert len(input_mask) == max_seq_length
116 | assert len(segment_ids) == max_seq_length
117 |
118 | masked_lm_positions = list(instance.masked_lm_positions)
119 | masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
120 | masked_lm_weights = [1.0] * len(masked_lm_ids)
121 |
122 | while len(masked_lm_positions) < max_predictions_per_seq:
123 | masked_lm_positions.append(0)
124 | masked_lm_ids.append(0)
125 | masked_lm_weights.append(0.0)
126 |
127 | next_sentence_label = 1 if instance.is_random_next else 0
128 |
129 | features = collections.OrderedDict()
130 | features["input_ids"] = create_int_feature(input_ids)
131 | features["input_mask"] = create_int_feature(input_mask)
132 | features["segment_ids"] = create_int_feature(segment_ids)
133 | features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
134 | features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
135 | features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
136 | features["next_sentence_labels"] = create_int_feature([next_sentence_label])
137 |
138 | tf_example = tf.train.Example(features=tf.train.Features(feature=features))
139 |
140 | writers[writer_index].write(tf_example.SerializeToString())
141 | writer_index = (writer_index + 1) % len(writers)
142 |
143 | total_written += 1
144 |
145 | if inst_index < 20:
146 | tf.logging.info("*** Example ***")
147 | tf.logging.info("tokens: %s" % " ".join(
148 | [tokenization.printable_text(x) for x in instance.tokens]))
149 |
150 | for feature_name in features.keys():
151 | feature = features[feature_name]
152 | values = []
153 | if feature.int64_list.value:
154 | values = feature.int64_list.value
155 | elif feature.float_list.value:
156 | values = feature.float_list.value
157 | tf.logging.info(
158 | "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
159 |
160 | for writer in writers:
161 | writer.close()
162 |
163 | tf.logging.info("Wrote %d total instances", total_written)
164 |
165 |
166 | def create_int_feature(values):
167 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
168 | return feature
169 |
170 |
171 | def create_float_feature(values):
172 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
173 | return feature
174 |
175 |
176 | def create_training_instances(input_files, tokenizer, max_seq_length,
177 | dupe_factor, short_seq_prob, masked_lm_prob,
178 | max_predictions_per_seq, rng):
179 | """Create `TrainingInstance`s from raw text."""
180 | all_documents = [[]]
181 |
182 | # Input file format:
183 | # (1) One sentence per line. These should ideally be actual sentences, not
184 | # entire paragraphs or arbitrary spans of text. (Because we use the
185 | # sentence boundaries for the "next sentence prediction" task).
186 | # (2) Blank lines between documents. Document boundaries are needed so
187 | # that the "next sentence prediction" task doesn't span between documents.
188 | for input_file in input_files:
189 | with tf.gfile.GFile(input_file, "r") as reader:
190 | while True:
191 | line = tokenization.convert_to_unicode(reader.readline())
192 | if not line:
193 | break
194 | line = line.strip()
195 |
196 | # Empty lines are used as document delimiters
197 | if not line:
198 | all_documents.append([])
199 | tokens = tokenizer.tokenize(line)
200 | if tokens:
201 | all_documents[-1].append(tokens)
202 |
203 | # Remove empty documents
204 | all_documents = [x for x in all_documents if x]
205 | rng.shuffle(all_documents)
206 |
207 | vocab_words = list(tokenizer.vocab.keys())
208 | instances = []
209 | for _ in range(dupe_factor):
210 | for document_index in range(len(all_documents)):
211 | instances.extend(
212 | create_instances_from_document(
213 | all_documents, document_index, max_seq_length, short_seq_prob,
214 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
215 |
216 | rng.shuffle(instances)
217 | return instances
218 |
219 |
220 | def create_instances_from_document(
221 | all_documents, document_index, max_seq_length, short_seq_prob,
222 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
223 | """Creates `TrainingInstance`s for a single document."""
224 | document = all_documents[document_index]
225 |
226 | # Account for [CLS], [SEP], [SEP]
227 | max_num_tokens = max_seq_length - 3
228 |
229 | # We *usually* want to fill up the entire sequence since we are padding
230 | # to `max_seq_length` anyways, so short sequences are generally wasted
231 | # computation. However, we *sometimes*
232 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
233 | # sequences to minimize the mismatch between pre-training and fine-tuning.
234 | # The `target_seq_length` is just a rough target however, whereas
235 | # `max_seq_length` is a hard limit.
236 | target_seq_length = max_num_tokens
237 | if rng.random() < short_seq_prob:
238 | target_seq_length = rng.randint(2, max_num_tokens)
239 |
240 | # We DON'T just concatenate all of the tokens from a document into a long
241 | # sequence and choose an arbitrary split point because this would make the
242 | # next sentence prediction task too easy. Instead, we split the input into
243 | # segments "A" and "B" based on the actual "sentences" provided by the user
244 | # input.
245 | instances = []
246 | current_chunk = []
247 | current_length = 0
248 | i = 0
249 | while i < len(document):
250 | segment = document[i]
251 | current_chunk.append(segment)
252 | current_length += len(segment)
253 | if i == len(document) - 1 or current_length >= target_seq_length:
254 | if current_chunk:
255 | # `a_end` is how many segments from `current_chunk` go into the `A`
256 | # (first) sentence.
257 | a_end = 1
258 | if len(current_chunk) >= 2:
259 | a_end = rng.randint(1, len(current_chunk) - 1)
260 |
261 | tokens_a = []
262 | for j in range(a_end):
263 | tokens_a.extend(current_chunk[j])
264 |
265 | tokens_b = []
266 | # Random next
267 | is_random_next = False
268 | if len(current_chunk) == 1 or rng.random() < 0.5:
269 | is_random_next = True
270 | target_b_length = target_seq_length - len(tokens_a)
271 |
272 | # This should rarely go for more than one iteration for large
273 | # corpora. However, just to be careful, we try to make sure that
274 | # the random document is not the same as the document
275 | # we're processing.
276 | for _ in range(10):
277 | random_document_index = rng.randint(0, len(all_documents) - 1)
278 | if random_document_index != document_index:
279 | break
280 |
281 | random_document = all_documents[random_document_index]
282 | random_start = rng.randint(0, len(random_document) - 1)
283 | for j in range(random_start, len(random_document)):
284 | tokens_b.extend(random_document[j])
285 | if len(tokens_b) >= target_b_length:
286 | break
287 | # We didn't actually use these segments so we "put them back" so
288 | # they don't go to waste.
289 | num_unused_segments = len(current_chunk) - a_end
290 | i -= num_unused_segments
291 | # Actual next
292 | else:
293 | is_random_next = False
294 | for j in range(a_end, len(current_chunk)):
295 | tokens_b.extend(current_chunk[j])
296 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
297 |
298 | assert len(tokens_a) >= 1
299 | assert len(tokens_b) >= 1
300 |
301 | tokens = []
302 | segment_ids = []
303 | tokens.append("[CLS]")
304 | segment_ids.append(0)
305 | for token in tokens_a:
306 | tokens.append(token)
307 | segment_ids.append(0)
308 |
309 | tokens.append("[SEP]")
310 | segment_ids.append(0)
311 |
312 | for token in tokens_b:
313 | tokens.append(token)
314 | segment_ids.append(1)
315 | tokens.append("[SEP]")
316 | segment_ids.append(1)
317 |
318 | (tokens, masked_lm_positions,
319 | masked_lm_labels) = create_masked_lm_predictions(
320 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
321 | instance = TrainingInstance(
322 | tokens=tokens,
323 | segment_ids=segment_ids,
324 | is_random_next=is_random_next,
325 | masked_lm_positions=masked_lm_positions,
326 | masked_lm_labels=masked_lm_labels)
327 | instances.append(instance)
328 | current_chunk = []
329 | current_length = 0
330 | i += 1
331 |
332 | return instances
333 |
334 |
335 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
336 | ["index", "label"])
337 |
338 |
339 | def create_masked_lm_predictions(tokens, masked_lm_prob,
340 | max_predictions_per_seq, vocab_words, rng):
341 | """Creates the predictions for the masked LM objective."""
342 |
343 | cand_indexes = []
344 | for (i, token) in enumerate(tokens):
345 | if token == "[CLS]" or token == "[SEP]":
346 | continue
347 | cand_indexes.append(i)
348 |
349 | rng.shuffle(cand_indexes)
350 |
351 | output_tokens = list(tokens)
352 |
353 | num_to_predict = min(max_predictions_per_seq,
354 | max(1, int(round(len(tokens) * masked_lm_prob))))
355 |
356 | masked_lms = []
357 | covered_indexes = set()
358 | for index in cand_indexes:
359 | if len(masked_lms) >= num_to_predict:
360 | break
361 | if index in covered_indexes:
362 | continue
363 | covered_indexes.add(index)
364 |
365 | masked_token = None
366 | # 80% of the time, replace with [MASK]
367 | if rng.random() < 0.8:
368 | masked_token = "[MASK]"
369 | else:
370 | # 10% of the time, keep original
371 | if rng.random() < 0.5:
372 | masked_token = tokens[index]
373 | # 10% of the time, replace with random word
374 | else:
375 | masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
376 |
377 | output_tokens[index] = masked_token
378 |
379 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
380 |
381 | masked_lms = sorted(masked_lms, key=lambda x: x.index)
382 |
383 | masked_lm_positions = []
384 | masked_lm_labels = []
385 | for p in masked_lms:
386 | masked_lm_positions.append(p.index)
387 | masked_lm_labels.append(p.label)
388 |
389 | return (output_tokens, masked_lm_positions, masked_lm_labels)
390 |
391 |
392 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
393 | """Truncates a pair of sequences to a maximum sequence length."""
394 | while True:
395 | total_length = len(tokens_a) + len(tokens_b)
396 | if total_length <= max_num_tokens:
397 | break
398 |
399 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
400 | assert len(trunc_tokens) >= 1
401 |
402 | # We want to sometimes truncate from the front and sometimes from the
403 | # back to add more randomness and avoid biases.
404 | if rng.random() < 0.5:
405 | del trunc_tokens[0]
406 | else:
407 | trunc_tokens.pop()
408 |
409 |
410 | def main(_):
411 | tf.logging.set_verbosity(tf.logging.INFO)
412 |
413 | tokenizer = tokenization.FullTokenizer(
414 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
415 |
416 | input_files = []
417 | for input_pattern in FLAGS.input_file.split(","):
418 | input_files.extend(tf.gfile.Glob(input_pattern))
419 |
420 | tf.logging.info("*** Reading from input files ***")
421 | for input_file in input_files:
422 | tf.logging.info(" %s", input_file)
423 |
424 | rng = random.Random(FLAGS.random_seed)
425 | instances = create_training_instances(
426 | input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
427 | FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
428 | rng)
429 |
430 | output_files = FLAGS.output_file.split(",")
431 | tf.logging.info("*** Writing to output files ***")
432 | for output_file in output_files:
433 | tf.logging.info(" %s", output_file)
434 |
435 | write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
436 | FLAGS.max_predictions_per_seq, output_files)
437 |
438 |
439 | if __name__ == "__main__":
440 | flags.mark_flag_as_required("input_file")
441 | flags.mark_flag_as_required("output_file")
442 | flags.mark_flag_as_required("vocab_file")
443 | tf.app.run()
444 |
--------------------------------------------------------------------------------
/archive/poc_bash_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #export CUDA_VISIBLE_DEVICES=1
3 | export BERT_DIR="/home/Anne/uncased_L-12_H-768_A-12"
4 | export BERT_CONFIG=$BERT_DIR/bert_config.json
5 | export VOCAB_DIR=$BERT_DIR/vocab.txt
6 | export PATH_SUFFIX="/sentences/free-wo-nsp"
7 | export BERT_EXTENDED_DIR="/home/Anne/ConceptBERT/output/pretraining${PATH_SUFFIX}"
8 | export OUTPUT_DIR="/home/Anne/ConceptBERT/output/finetuning${PATH_SUFFIX}"
9 | export GLUE_DIR="/home/Anne/ConceptBERT/data/glue_data"
10 | export S3_PATH="~/test/output/finetuning${PATH_SUFFIX}"
11 |
12 | for STEP in "25000"; do
13 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
14 | for task_name in "SST2"; do
15 |
16 | # Copy the data to s3
17 | for dir in ${OUTPUT_DIR}/${STEP}/*/; do
18 | #for dir in /home/Anne/ConceptBERT/output/finetuning/sentences/free-wo-nsp/25000/CoLA*; do
19 | echo "DIR ${dir}"
20 | for filename in ${dir}*; do
21 | echo "FILENAME ${filename}"
22 |
23 | #IFS='/' # hyphen (-) is set as delimiter
24 | #declare -a PARTS
25 | #read -ra PARTS <<< ${FILE} # str is read into an array as tokens separated by IFS
26 | #echo "PARTS ${PARTS}"
27 | FILE=${filename##*/}
28 | echo ${FILE}
29 | temp=${filename%/*}
30 | SUBDIR=${temp##*/}
31 | echo ${SUBDIR}
32 |
33 | #S3="${S3_PATH}/${STEP}/${PARTS[${#PARTS[@]}-2]}/${PARTS[${#PARTS[@]}-1]}"
34 | S3=${S3_PATH}/${STEP}/${SUBDIR}/${FILE}
35 | #S3="${S3_PATH}/${STEP}/${filename}"
36 | echo "S3 ${S3}"
37 | waws --uploadS3 -b wluper-retrograph -f "${filename}" -l "${S3}"
38 | done
39 | done
40 | #waws --uploadS3 -b wluper-retrograph -f $OUTPUT_DIR/${STEP}/${task_name}/ -l $S3_PATH/${STEP}/${task_name}/
41 | #rm -r $OUTPUT_DIR/${STEP}/${task_name}*
42 | done
43 | done
--------------------------------------------------------------------------------
/archive/poc_create_pretraining_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #--input_file=./data/omcs-sentences-more-filtered.txt \
3 | #--output_file=./data/omcs-sentences-more-filtered.tfrecord \
4 |
5 | python create_pretraining_data.py \
6 | --input_file=./data/omcs-sentences-free-filtered-3.txt \
7 | --output_file=./data/omcs-sentences-free-filtered.tfrecord \
8 | --vocab_file=/c/Users/anlausch/Downloads/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/vocab.txt \
9 | --do_lower_case=True \
10 | --max_seq_length=128 \
11 | --max_predictions_per_seq=20 \
12 | --masked_lm_prob=0.15 \
13 | --random_seed=12345 \
14 | --dupe_factor=5
15 |
--------------------------------------------------------------------------------
/archive/poc_create_pretraining_data_rw.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | python create_pretraining_data.py \
4 | --input_file=./data/rw_corpus_1.0_1.0_2_10_2.txt \
5 | --output_file=./data/rw_corpus_1.0_1.0_2_10_cn_relations_2.tfrecord \
6 | --vocab_file=/work/anlausch/uncased_L-12_H-768_A-12/vocab_cn_relations.txt \
7 | --do_lower_case=True \
8 | --max_seq_length=128 \
9 | --max_predictions_per_seq=20 \
10 | --masked_lm_prob=0.15 \
11 | --random_seed=12345 \
12 | --dupe_factor=5 |& tee ./data/cn_relations_2.out
13 |
14 | python create_pretraining_data.py \
15 | --input_file=./data/rw_corpus_1.0_1.0_2_10_3.txt \
16 | --output_file=./data/rw_corpus_1.0_1.0_2_10_cn_relations_3.tfrecord \
17 | --vocab_file=/work/anlausch/uncased_L-12_H-768_A-12/vocab_cn_relations_2.txt \
18 | --do_lower_case=True \
19 | --max_seq_length=128 \
20 | --max_predictions_per_seq=20 \
21 | --masked_lm_prob=0.15 \
22 | --random_seed=12345 \
23 | --dupe_factor=5 |& tee ./data/cn_relations_3.out
24 |
25 | python create_pretraining_data.py \
26 | --input_file=./data/rw_corpus_1.0_1.0_2_10_nl.txt \
27 | --output_file=./data/rw_corpus_1.0_1.0_2_10_cn_relations_nl.tfrecord \
28 | --vocab_file=/work/anlausch/uncased_L-12_H-768_A-12/vocab.txt \
29 | --do_lower_case=True \
30 | --max_seq_length=128 \
31 | --max_predictions_per_seq=20 \
32 | --masked_lm_prob=0.15 \
33 | --random_seed=12345 \
34 | --dupe_factor=5 |& tee ./data/cn_relations_nl.out
35 |
--------------------------------------------------------------------------------
/archive/poc_create_pretraining_data_wo_nsp.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #--input_file=./data/omcs-sentences-more-filtered.txt \
3 | #--output_file=./data/omcs-sentences-more-filtered.tfrecord \
4 |
5 | python create_pretraining_data_wo_nsp.py \
6 | --input_file=./data/omcs-sentences-free-filtered-3.txt \
7 | --output_file=./data/omcs-sentences-free-filtered-wo-nsp.tfrecord \
8 | --vocab_file=/c/Users/anlausch/Downloads/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/vocab.txt \
9 | --do_lower_case=True \
10 | --max_seq_length=128 \
11 | --max_predictions_per_seq=20 \
12 | --masked_lm_prob=0.15 \
13 | --random_seed=12345 \
14 | --dupe_factor=5
15 |
--------------------------------------------------------------------------------
/archive/poc_finetuning.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #export CUDA_VISIBLE_DEVICES=1
3 | BERT_DIR="/home/Anne/uncased_L-12_H-768_A-12"
4 | BERT_CONFIG=$BERT_DIR/bert_config.json
5 | VOCAB_DIR=$BERT_DIR/vocab.txt
6 | PATH_SUFFIX="/sentences/free-wo-nsp"
7 | BERT_EXTENDED_DIR="/home/Anne/ConceptBERT/output/pretraining${PATH_SUFFIX}"
8 | OUTPUT_DIR="/home/Anne/ConceptBERT/output/finetuning${PATH_SUFFIX}"
9 | GLUE_DIR="/home/Anne/ConceptBERT/data/glue_data"
10 | S3_PATH="~/anne/output/finetuning${PATH_SUFFIX}"
11 |
12 | for STEP in "25000" "50000" "75000" "100000"; do
13 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
14 | for task_name in "CoLA" "MRPC" "RTE" "SST2" "QNLIV2" ; do
15 | echo $task_name
16 | echo $CHECKPOINT
17 |
18 | GLUE_DATA="$GLUE_DIR/$task_name"
19 |
20 | python run_classifier.py \
21 | --task_name=$task_name \
22 | --do_train=true \
23 | --do_eval=true \
24 | --do_early_stopping=false \
25 | --data_dir=$GLUE_DATA \
26 | --vocab_file=$VOCAB_DIR \
27 | --bert_config_file=$BERT_CONFIG \
28 | --init_checkpoint=$CHECKPOINT\
29 | --max_seq_length=128 \
30 | --train_batch_size="[16]" \
31 | --learning_rate="[2e-5, 3e-5]" \
32 | --num_train_epochs="[3,4]" \
33 | --original_model=True \
34 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
35 |
36 | # Copy the data to s3
37 | for dir in ${OUTPUT_DIR}/${STEP}/*/; do
38 | #for dir in /home/Anne/ConceptBERT/output/finetuning/sentences/free-wo-nsp/25000/CoLA*; do
39 | echo "DIR ${dir}"
40 | for filename in ${dir}*; do
41 | echo "FILENAME ${filename}"
42 |
43 | #IFS='/' # hyphen (-) is set as delimiter
44 | #declare -a PARTS
45 | #read -ra PARTS <<< ${FILE} # str is read into an array as tokens separated by IFS
46 | #echo "PARTS ${PARTS}"
47 | FILE=${filename##*/}
48 | echo ${FILE}
49 | temp=${filename%/*}
50 | SUBDIR=${temp##*/}
51 | echo ${SUBDIR}
52 |
53 | #S3="${S3_PATH}/${STEP}/${PARTS[${#PARTS[@]}-2]}/${PARTS[${#PARTS[@]}-1]}"
54 | S3=${S3_PATH}/${STEP}/${SUBDIR}/${FILE}
55 | #S3="${S3_PATH}/${STEP}/${filename}"
56 | echo "S3 ${S3}"
57 | waws --uploadS3 -b wluper-retrograph -f "${filename}" -l "${S3}"
58 | done
59 | done
60 | #waws --uploadS3 -b wluper-retrograph -f $OUTPUT_DIR/${STEP}/${task_name}/ -l $S3_PATH/${STEP}/${task_name}/
61 | rm -r $OUTPUT_DIR/${STEP}/${task_name}*
62 | done
63 |
64 |
65 |
66 | for task_name in "STSB" ; do
67 | echo $task_name
68 | export GLUE_DATA="$GLUE_DIR/$task_name"
69 |
70 | python run_regression.py \
71 | --task_name=$task_name \
72 | --do_train=true \
73 | --do_eval=true \
74 | --do_early_stopping=false \
75 | --data_dir=$GLUE_DATA \
76 | --vocab_file=$VOCAB_DIR \
77 | --bert_config_file=$BERT_CONFIG \
78 | --init_checkpoint=$CHECKPOINT\
79 | --max_seq_length=128 \
80 | --train_batch_size="[16]" \
81 | --learning_rate="[2e-5, 3e-5]" \
82 | --num_train_epochs="[3,4]" \
83 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
84 |
85 | # Copy the data to s3
86 | for dir in ${OUTPUT_DIR}/${STEP}/*/; do
87 | #for dir in /home/Anne/ConceptBERT/output/finetuning/sentences/free-wo-nsp/25000/CoLA*; do
88 | echo "DIR ${dir}"
89 | for filename in ${dir}*; do
90 | echo "FILENAME ${filename}"
91 |
92 | #IFS='/' # hyphen (-) is set as delimiter
93 | #declare -a PARTS
94 | #read -ra PARTS <<< ${FILE} # str is read into an array as tokens separated by IFS
95 | #echo "PARTS ${PARTS}"
96 | FILE=${filename##*/}
97 | echo ${FILE}
98 | temp=${filename%/*}
99 | SUBDIR=${temp##*/}
100 | echo ${SUBDIR}
101 |
102 | #S3="${S3_PATH}/${STEP}/${PARTS[${#PARTS[@]}-2]}/${PARTS[${#PARTS[@]}-1]}"
103 | S3=${S3_PATH}/${STEP}/${SUBDIR}/${FILE}
104 | #S3="${S3_PATH}/${STEP}/${filename}"
105 | echo "S3 ${S3}"
106 | waws --uploadS3 -b wluper-retrograph -f "${filename}" -l "${S3}"
107 | done
108 | done
109 | #waws --uploadS3 -b wluper-retrograph -f $OUTPUT_DIR/${STEP}/${task_name}/ -l $S3_PATH/${STEP}/${task_name}/
110 | rm -r $OUTPUT_DIR/${STEP}/${task_name}*
111 | done
112 | done
--------------------------------------------------------------------------------
/archive/poc_finetuning_adapter.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export CUDA_VISIBLE_DEVICES=0
4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | BERT_CONFIG=$BERT_DIR/bert_config.json
6 | VOCAB_DIR=$BERT_DIR/vocab.txt
7 |
8 | PATH_SUFFIX="/omcs/free-wo-nsp-adapter"
9 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}"
10 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}"
11 |
12 | for STEP in "25000" "50000" "75000" "100000"; do
13 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
14 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2"
15 | echo $task_name
16 | echo $CHECKPOINT
17 |
18 | GLUE_DATA="$GLUE_DIR/$task_name"
19 |
20 | python run_classifier_adapter.py \
21 | --task_name=$task_name \
22 | --do_train=true \
23 | --do_eval=true \
24 | --do_early_stopping=false \
25 | --data_dir=$GLUE_DATA \
26 | --vocab_file=$VOCAB_DIR \
27 | --bert_config_file=$BERT_CONFIG \
28 | --init_checkpoint=$CHECKPOINT\
29 | --max_seq_length=128 \
30 | --train_batch_size="[16]" \
31 | --learning_rate="[2e-5, 3e-5]" \
32 | --num_train_epochs="[3,4]" \
33 | --original_model=True \
34 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
35 | done
36 |
37 | for task_name in "STSB" ; do
38 | echo $task_name
39 | export GLUE_DATA="$GLUE_DIR/$task_name"
40 |
41 | python run_regression_adapter.py \
42 | --task_name=$task_name \
43 | --do_train=true \
44 | --do_eval=true \
45 | --do_early_stopping=false \
46 | --data_dir=$GLUE_DATA \
47 | --vocab_file=$VOCAB_DIR \
48 | --bert_config_file=$BERT_CONFIG \
49 | --init_checkpoint=$CHECKPOINT\
50 | --max_seq_length=128 \
51 | --train_batch_size="[16]" \
52 | --learning_rate="[2e-5, 3e-5]" \
53 | --num_train_epochs="[3,4]" \
54 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
55 | done
56 | done
57 |
58 | OUTPUT_SUFFIX=_tune_all
59 | ### the second finetuning variant
60 | for STEP in "25000" "50000" "75000" "100000"; do
61 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
62 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2"
63 | echo $task_name
64 | echo $CHECKPOINT
65 |
66 | GLUE_DATA="$GLUE_DIR/$task_name"
67 |
68 | python run_classifier_adapter_tune_all.py \
69 | --task_name=$task_name \
70 | --do_train=true \
71 | --do_eval=true \
72 | --do_early_stopping=false \
73 | --data_dir=$GLUE_DATA \
74 | --vocab_file=$VOCAB_DIR \
75 | --bert_config_file=$BERT_CONFIG \
76 | --init_checkpoint=$CHECKPOINT\
77 | --max_seq_length=128 \
78 | --train_batch_size="[16]" \
79 | --learning_rate="[2e-5, 3e-5]" \
80 | --num_train_epochs="[3,4]" \
81 | --original_model=True \
82 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
83 | done
84 |
85 | for task_name in "STSB" ; do
86 | echo $task_name
87 | export GLUE_DATA="$GLUE_DIR/$task_name"
88 |
89 | python run_regression_adapter_tune_all.py \
90 | --task_name=$task_name \
91 | --do_train=true \
92 | --do_eval=true \
93 | --do_early_stopping=false \
94 | --data_dir=$GLUE_DATA \
95 | --vocab_file=$VOCAB_DIR \
96 | --bert_config_file=$BERT_CONFIG \
97 | --init_checkpoint=$CHECKPOINT\
98 | --max_seq_length=128 \
99 | --train_batch_size="[16]" \
100 | --learning_rate="[2e-5, 3e-5]" \
101 | --num_train_epochs="[3,4]" \
102 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
103 | done
104 | done
105 |
106 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter"
107 | for STEP in "25000" "50000" "75000" "100000"; do
108 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
109 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2"
110 | echo $task_name
111 | echo $CHECKPOINT
112 |
113 | GLUE_DATA="$GLUE_DIR/$task_name"
114 |
115 | python run_classifier_adapter.py \
116 | --task_name=$task_name \
117 | --do_train=true \
118 | --do_eval=true \
119 | --do_early_stopping=false \
120 | --data_dir=$GLUE_DATA \
121 | --vocab_file=$VOCAB_DIR \
122 | --bert_config_file=$BERT_CONFIG \
123 | --init_checkpoint=$CHECKPOINT\
124 | --max_seq_length=128 \
125 | --train_batch_size="[16]" \
126 | --learning_rate="[2e-5, 3e-5]" \
127 | --num_train_epochs="[3,4]" \
128 | --original_model=True \
129 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
130 | done
131 |
132 | for task_name in "STSB" ; do
133 | echo $task_name
134 | export GLUE_DATA="$GLUE_DIR/$task_name"
135 |
136 | python run_regression_adapter.py \
137 | --task_name=$task_name \
138 | --do_train=true \
139 | --do_eval=true \
140 | --do_early_stopping=false \
141 | --data_dir=$GLUE_DATA \
142 | --vocab_file=$VOCAB_DIR \
143 | --bert_config_file=$BERT_CONFIG \
144 | --init_checkpoint=$CHECKPOINT\
145 | --max_seq_length=128 \
146 | --train_batch_size="[16]" \
147 | --learning_rate="[2e-5, 3e-5]" \
148 | --num_train_epochs="[3,4]" \
149 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
150 | done
151 | done
152 |
153 | OUTPUT_SUFFIX=_tune_all
154 | ### the second finetuning variant
155 | for STEP in "25000" "50000" "75000" "100000"; do
156 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
157 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2"
158 | echo $task_name
159 | echo $CHECKPOINT
160 |
161 | GLUE_DATA="$GLUE_DIR/$task_name"
162 |
163 | python run_classifier_adapter_tune_all.py \
164 | --task_name=$task_name \
165 | --do_train=true \
166 | --do_eval=true \
167 | --do_early_stopping=false \
168 | --data_dir=$GLUE_DATA \
169 | --vocab_file=$VOCAB_DIR \
170 | --bert_config_file=$BERT_CONFIG \
171 | --init_checkpoint=$CHECKPOINT\
172 | --max_seq_length=128 \
173 | --train_batch_size="[16]" \
174 | --learning_rate="[2e-5, 3e-5]" \
175 | --num_train_epochs="[3,4]" \
176 | --original_model=True \
177 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
178 | done
179 |
180 | for task_name in "STSB" ; do
181 | echo $task_name
182 | export GLUE_DATA="$GLUE_DIR/$task_name"
183 |
184 | python run_regression_adapter_tune_all.py \
185 | --task_name=$task_name \
186 | --do_train=true \
187 | --do_eval=true \
188 | --do_early_stopping=false \
189 | --data_dir=$GLUE_DATA \
190 | --vocab_file=$VOCAB_DIR \
191 | --bert_config_file=$BERT_CONFIG \
192 | --init_checkpoint=$CHECKPOINT\
193 | --max_seq_length=128 \
194 | --train_batch_size="[16]" \
195 | --learning_rate="[2e-5, 3e-5]" \
196 | --num_train_epochs="[3,4]" \
197 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
198 | done
199 | done
200 |
--------------------------------------------------------------------------------
/archive/poc_finetuning_adapter_longer.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser, as it optimises BERT
9 |
10 |
11 | export CUDA_VISIBLE_DEVICES=8
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | VOCAB_DIR=$BERT_DIR/vocab.txt
16 |
17 | BERT_EXTENDED_DIR="data/output_pretrain_adapter"
18 | OUTPUT_DIR="data/output_model_finetunning"
19 |
20 | GLUE_DIR='data/GLUE'
21 |
22 | OUTPUT_SUFFIX=_tune_all
23 | ### the second finetuning variant
24 | for STEP in "98000" "99000"; do
25 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
26 | for task_name in "QNLI" "QQP" "MNLI"; do
27 | echo $task_name
28 | echo $CHECKPOINT
29 |
30 | GLUE_DATA="$GLUE_DIR/$task_name"
31 |
32 | python run_classifier_adapter_tune_all.py \
33 | --task_name=$task_name \
34 | --do_train=true \
35 | --do_eval=true \
36 | --do_early_stopping=false \
37 | --data_dir=$GLUE_DATA \
38 | --vocab_file=$VOCAB_DIR \
39 | --bert_config_file=$BERT_CONFIG \
40 | --init_checkpoint=$CHECKPOINT\
41 | --max_seq_length=128 \
42 | --train_batch_size="[16]" \
43 | --learning_rate="[2e-5, 3e-5]" \
44 | --num_train_epochs="[3,4]" \
45 | --original_model=True \
46 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
47 | done
48 | done
49 |
--------------------------------------------------------------------------------
/archive/poc_finetuning_adapter_longer_2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export CUDA_VISIBLE_DEVICES=0
4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | BERT_CONFIG=$BERT_DIR/bert_config.json
6 | VOCAB_DIR=$BERT_DIR/vocab.txt
7 |
8 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter"
9 | OUTPUT_SUFFIX=_tune_all
10 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}"
11 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}"
12 |
13 | ### the second finetuning variant
14 | for STEP in "25000" "100000"; do
15 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
16 | for task_name in "QNLIV2" "QQP" "MNLI"; do
17 | echo $task_name
18 | echo $CHECKPOINT
19 |
20 | GLUE_DATA="$GLUE_DIR/$task_name"
21 |
22 | python run_classifier_adapter_tune_all.py \
23 | --task_name=$task_name \
24 | --do_train=true \
25 | --do_eval=true \
26 | --do_early_stopping=false \
27 | --data_dir=$GLUE_DATA \
28 | --vocab_file=$VOCAB_DIR \
29 | --bert_config_file=$BERT_CONFIG \
30 | --init_checkpoint=$CHECKPOINT\
31 | --max_seq_length=128 \
32 | --train_batch_size="[16]" \
33 | --learning_rate="[2e-5, 3e-5]" \
34 | --num_train_epochs="[3,4]" \
35 | --original_model=True \
36 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
37 | done
38 | done
39 |
--------------------------------------------------------------------------------
/archive/poc_finetuning_adapter_quick_insight.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export CUDA_VISIBLE_DEVICES=3
4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | BERT_CONFIG=$BERT_DIR/bert_config.json
6 | VOCAB_DIR=$BERT_DIR/vocab.txt
7 |
8 |
9 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter"
10 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}"
11 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}"
12 |
13 | OUTPUT_SUFFIX=_tune_all_quick_insight
14 | ### the second finetuning variant
15 | for STEP in "25000" "50000" "75000" "100000"; do
16 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
17 | for task_name in "CoLA" "MRPC" "RTE"; do #"QNLIV2"
18 | echo $task_name
19 | echo $CHECKPOINT
20 |
21 | GLUE_DATA="$GLUE_DIR/$task_name"
22 |
23 | python run_classifier_adapter_tune_all.py \
24 | --task_name=$task_name \
25 | --do_train=true \
26 | --do_eval=true \
27 | --do_early_stopping=false \
28 | --data_dir=$GLUE_DATA \
29 | --vocab_file=$VOCAB_DIR \
30 | --bert_config_file=$BERT_CONFIG \
31 | --init_checkpoint=$CHECKPOINT\
32 | --max_seq_length=128 \
33 | --train_batch_size="[16]" \
34 | --learning_rate="[2e-5, 3e-5]" \
35 | --num_train_epochs="[3,4]" \
36 | --original_model=True \
37 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
38 | done
39 |
40 | for task_name in "STSB" ; do
41 | echo $task_name
42 | export GLUE_DATA="$GLUE_DIR/$task_name"
43 |
44 | python run_regression_adapter_tune_all.py \
45 | --task_name=$task_name \
46 | --do_train=true \
47 | --do_eval=true \
48 | --do_early_stopping=false \
49 | --data_dir=$GLUE_DATA \
50 | --vocab_file=$VOCAB_DIR \
51 | --bert_config_file=$BERT_CONFIG \
52 | --init_checkpoint=$CHECKPOINT\
53 | --max_seq_length=128 \
54 | --train_batch_size="[16]" \
55 | --learning_rate="[2e-5, 3e-5]" \
56 | --num_train_epochs="[3,4]" \
57 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
58 | done
59 | done
60 |
--------------------------------------------------------------------------------
/archive/poc_finetuning_adapter_sst2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export CUDA_VISIBLE_DEVICES=1
4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | BERT_CONFIG=$BERT_DIR/bert_config.json
6 | VOCAB_DIR=$BERT_DIR/vocab.txt
7 |
8 |
9 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter"
10 | OUTPUT_SUFFIX=_tune_all_quick_insight
11 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}"
12 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}"
13 |
14 | ### the second finetuning variant
15 | for STEP in "25000" "100000"; do
16 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
17 | for task_name in "SST2"; do
18 | echo $task_name
19 | echo $CHECKPOINT
20 |
21 | GLUE_DATA="$GLUE_DIR/$task_name"
22 |
23 | python run_classifier_adapter_tune_all.py \
24 | --task_name=$task_name \
25 | --do_train=true \
26 | --do_eval=true \
27 | --do_early_stopping=false \
28 | --data_dir=$GLUE_DATA \
29 | --vocab_file=$VOCAB_DIR \
30 | --bert_config_file=$BERT_CONFIG \
31 | --init_checkpoint=$CHECKPOINT\
32 | --max_seq_length=128 \
33 | --train_batch_size="[16]" \
34 | --learning_rate="[2e-5, 3e-5]" \
35 | --num_train_epochs="[3,4]" \
36 | --original_model=True \
37 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
38 | done
39 |
40 | done
41 |
--------------------------------------------------------------------------------
/archive/poc_finetuning_dws.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export CUDA_VISIBLE_DEVICES=0
3 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
4 | BERT_CONFIG=$BERT_DIR/bert_config.json
5 | VOCAB_DIR=$BERT_DIR/vocab.txt
6 | #PATH_SUFFIX="/omcs/free-wo-nsp"
7 | PATH_SUFFIX="/omcs/free-wo-nsp-no-warmup"
8 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}"
9 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}"
10 |
11 | for STEP in "25000" "50000" "75000" "100000"; do
12 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
13 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2"
14 | echo $task_name
15 | echo $CHECKPOINT
16 |
17 | GLUE_DATA="$GLUE_DIR/$task_name"
18 |
19 | python run_classifier.py \
20 | --task_name=$task_name \
21 | --do_train=true \
22 | --do_eval=true \
23 | --do_early_stopping=false \
24 | --data_dir=$GLUE_DATA \
25 | --vocab_file=$VOCAB_DIR \
26 | --bert_config_file=$BERT_CONFIG \
27 | --init_checkpoint=$CHECKPOINT\
28 | --max_seq_length=128 \
29 | --train_batch_size="[16]" \
30 | --learning_rate="[2e-5, 3e-5]" \
31 | --num_train_epochs="[3,4]" \
32 | --original_model=True \
33 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
34 | done
35 |
36 | for task_name in "STSB" ; do
37 | echo $task_name
38 | export GLUE_DATA="$GLUE_DIR/$task_name"
39 |
40 | python run_regression.py \
41 | --task_name=$task_name \
42 | --do_train=true \
43 | --do_eval=true \
44 | --do_early_stopping=false \
45 | --data_dir=$GLUE_DATA \
46 | --vocab_file=$VOCAB_DIR \
47 | --bert_config_file=$BERT_CONFIG \
48 | --init_checkpoint=$CHECKPOINT\
49 | --max_seq_length=128 \
50 | --train_batch_size="[16]" \
51 | --learning_rate="[2e-5, 3e-5]" \
52 | --num_train_epochs="[3,4]" \
53 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
54 | done
55 | done
--------------------------------------------------------------------------------
/archive/poc_finetuning_rw.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export CUDA_VISIBLE_DEVICES=3
3 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
4 | BERT_CONFIG=$BERT_DIR/bert_config.json
5 | VOCAB_DIR=$BERT_DIR/vocab.txt
6 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl"
7 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}"
8 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}"
9 |
10 | for STEP in "25000" "50000" "75000" "100000"; do
11 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
12 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2"
13 | echo $task_name
14 | echo $CHECKPOINT
15 |
16 | GLUE_DATA="$GLUE_DIR/$task_name"
17 |
18 | python run_classifier.py \
19 | --task_name=$task_name \
20 | --do_train=true \
21 | --do_eval=true \
22 | --do_early_stopping=false \
23 | --data_dir=$GLUE_DATA \
24 | --vocab_file=$VOCAB_DIR \
25 | --bert_config_file=$BERT_CONFIG \
26 | --init_checkpoint=$CHECKPOINT\
27 | --max_seq_length=128 \
28 | --train_batch_size="[16]" \
29 | --learning_rate="[2e-5, 3e-5]" \
30 | --num_train_epochs="[3,4]" \
31 | --original_model=True \
32 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
33 | done
34 |
35 | for task_name in "STSB" ; do
36 | echo $task_name
37 | export GLUE_DATA="$GLUE_DIR/$task_name"
38 |
39 | python run_regression.py \
40 | --task_name=$task_name \
41 | --do_train=true \
42 | --do_eval=true \
43 | --do_early_stopping=false \
44 | --data_dir=$GLUE_DATA \
45 | --vocab_file=$VOCAB_DIR \
46 | --bert_config_file=$BERT_CONFIG \
47 | --init_checkpoint=$CHECKPOINT\
48 | --max_seq_length=128 \
49 | --train_batch_size="[16]" \
50 | --learning_rate="[2e-5, 3e-5]" \
51 | --num_train_epochs="[3,4]" \
52 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out
53 | done
54 | done
--------------------------------------------------------------------------------
/archive/poc_pretraining.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | echo "script started"
3 |
4 | INPUT_FILE="/home/Anne/ConceptBERT/data/omcs-sentences-free-filtered-wo-nsp.tfrecord"
5 | OUTPUT_DIR="/home/Anne/ConceptBERT/output/pretraining/sentences/free-wo-nsp/"
6 | NUM_TRAIN_STEPS=100000
7 | BERT_DIR="/home/Anne/uncased_L-12_H-768_A-12"
8 | BERT_CONFIG=$BERT_DIR/bert_config.json
9 | # TODO: Here is an error!!! We should run this again and change run_pretraining to run_pretraining_wo_nsp
10 | #
11 | python run_pretraining.py \
12 | --input_file=$INPUT_FILE \
13 | --output_dir=$OUTPUT_DIR \
14 | --do_train=True \
15 | --do_eval=True \
16 | --bert_config_file=$BERT_CONFIG \
17 | --train_batch_size=16 \
18 | --eval_batch_size=8 \
19 | --max_seq_length=128 \
20 | --max_predictions_per_seq=20 \
21 | --num_train_steps=$NUM_TRAIN_STEPS \
22 | --num_warmup_steps=1000 \
23 | --learning_rate=1e-4 \
24 | --max_eval_steps=1000 \
25 | --save_checkpoints_steps=25000 \
26 | --init_checkpoint=$BERT_DIR/bert_model.ckpt
--------------------------------------------------------------------------------
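Note on the TODO in poc_pretraining.sh above: the input tfrecord was built without next-sentence-prediction examples (the -wo-nsp suffix), so the matching entry point is presumably run_pretraining_wo_nsp.py rather than run_pretraining.py, as the comment itself says. A minimal sketch of the corrected call, assuming run_pretraining_wo_nsp.py accepts the same flags and reusing the variables defined in the script above:

    # sketch only -- assumes run_pretraining_wo_nsp.py takes the same flags as run_pretraining.py
    python run_pretraining_wo_nsp.py \
      --input_file=$INPUT_FILE \
      --output_dir=$OUTPUT_DIR \
      --do_train=True \
      --do_eval=True \
      --bert_config_file=$BERT_CONFIG \
      --train_batch_size=16 \
      --eval_batch_size=8 \
      --max_seq_length=128 \
      --max_predictions_per_seq=20 \
      --num_train_steps=$NUM_TRAIN_STEPS \
      --num_warmup_steps=1000 \
      --learning_rate=1e-4 \
      --max_eval_steps=1000 \
      --save_checkpoints_steps=25000 \
      --init_checkpoint=$BERT_DIR/bert_model.ckpt
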
/archive/poc_pretraining_dws.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | echo "script started"
3 | echo "No warmup here"
4 | export CUDA_VISIBLE_DEVICES=3
5 |
6 | INPUT_FILE="/work/anlausch/ConceptBERT/data/omcs-sentences-free-filtered-wo-nsp.tfrecord"
7 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/pretraining/omcs/free-wo-nsp-no-warmup/"
8 | NUM_TRAIN_STEPS=100000
9 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
10 | BERT_CONFIG=$BERT_DIR/bert_config.json
11 |
12 | # TODO: Here is an error!!! We should run this again and change run_pretraining to run_pretraining_wo_nsp
13 | #
14 | python run_pretraining.py \
15 | --input_file=$INPUT_FILE \
16 | --output_dir=$OUTPUT_DIR \
17 | --do_train=True \
18 | --do_eval=True \
19 | --bert_config_file=$BERT_CONFIG \
20 | --train_batch_size=16 \
21 | --eval_batch_size=8 \
22 | --max_seq_length=128 \
23 | --max_predictions_per_seq=20 \
24 | --num_train_steps=$NUM_TRAIN_STEPS \
25 | --num_warmup_steps=0 \
26 | --learning_rate=1e-4 \
27 | --max_eval_steps=1000 \
28 | --save_checkpoints_steps=25000 \
29 | --init_checkpoint=$BERT_DIR/bert_model.ckpt
--------------------------------------------------------------------------------
/archive/poc_pretraining_rw.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | echo "script started"
3 | export CUDA_VISIBLE_DEVICES=0
4 |
5 | INPUT_FILE="/work/anlausch/ConceptBERT/data/rw_corpus_1.0_1.0_2_10_cn_relations_nl.tfrecord"
6 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/pretraining/rw/1.0_1.0_2_10/nl/"
7 | NUM_TRAIN_STEPS=100000
8 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
9 | BERT_CONFIG=$BERT_DIR/bert_config.json
10 |
11 | python run_pretraining.py \
12 | --input_file=$INPUT_FILE \
13 | --output_dir=$OUTPUT_DIR \
14 | --do_train=True \
15 | --do_eval=True \
16 | --bert_config_file=$BERT_CONFIG \
17 | --train_batch_size=16 \
18 | --eval_batch_size=8 \
19 | --max_seq_length=128 \
20 | --max_predictions_per_seq=20 \
21 | --num_train_steps=$NUM_TRAIN_STEPS \
22 | --num_warmup_steps=10000 \
23 | --learning_rate=1e-4 \
24 | --max_eval_steps=1000 \
25 | --save_checkpoints_steps=25000 \
26 | --init_checkpoint=$BERT_DIR/bert_model.ckpt
27 |
28 |
29 | INPUT_FILE="/work/anlausch/ConceptBERT/data/rw_corpus_1.0_1.0_2_10_cn_relations_2.tfrecord"
30 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/pretraining/rw/1.0_1.0_2_10/cn_relations/"
31 | NUM_TRAIN_STEPS=100000
32 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
33 | BERT_CONFIG=$BERT_DIR/bert_config_cn_relations.json
34 |
35 | python run_pretraining.py \
36 | --input_file=$INPUT_FILE \
37 | --output_dir=$OUTPUT_DIR \
38 | --do_train=True \
39 | --do_eval=True \
40 | --bert_config_file=$BERT_CONFIG \
41 | --train_batch_size=16 \
42 | --eval_batch_size=8 \
43 | --max_seq_length=128 \
44 | --max_predictions_per_seq=20 \
45 | --num_train_steps=$NUM_TRAIN_STEPS \
46 | --num_warmup_steps=10000 \
47 | --learning_rate=1e-4 \
48 | --max_eval_steps=1000 \
49 | --save_checkpoints_steps=25000 \
50 | --init_checkpoint=$BERT_DIR/bert_model.ckpt
--------------------------------------------------------------------------------
/archive/prediction_diagnostic.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # general config
3 | export CUDA_VISIBLE_DEVICES=;
4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt
6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json
7 | GLUE_DATA="$GLUE_DIR"
8 | STEP_NUMBER=25000
9 |
10 | # root dir of your checkpoints
11 | # ROOT="/work/anlausch/ConceptBERT/output/finetuning/omcs/free-wo-nsp-adapter_tune_all/${STEP_NUMBER}/"
12 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/"
13 |
14 | # this is the mnli model which was best on the matched dataset
15 | for config in "MNLI_16_2e-05_3/model.ckpt-73631","diagnostic"; do
16 | IFS=","
17 | set -- $config
18 | echo $1 and $2
19 | TASK=$2
20 |
21 | # location of the checkpoint which was best on dev
22 | TRAINED_CLASSIFIER=${ROOT}${1}
23 | OUTPUT_DIR=${ROOT}predictions/${TASK}
24 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
25 | python run_classifier_adapter_tune_all.py \
26 | --task_name=${TASK} \
27 | --do_predict=true \
28 | --do_train=false \
29 | --do_eval=false \
30 | --data_dir=$GLUE_DIR/${TASK} \
31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
32 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
33 | --init_checkpoint=$TRAINED_CLASSIFIER \
34 | --do_early_stopping=false \
35 | --max_seq_length=128 \
36 | --original_model=True \
37 | --matched=False \
38 | --output_dir=${OUTPUT_DIR}
39 |
40 | # this is a parser I wrote which should output the predictions in the glue platform format
41 | python parse_predictions.py \
42 | --task=${TASK} \
43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
45 | done
--------------------------------------------------------------------------------
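A note on the loop pattern used in this and the following prediction scripts: each config entry is a comma-separated checkpoint/task pair, and setting IFS="," before set -- $config splits it into the positional parameters $1 (the checkpoint sub-path) and $2 (the task name). The TASK_16_2e-05_3-style folder names appear to encode the hyperparameters of the fine-tuning run that was best on dev (batch size, learning rate, epochs), and the hard-coded _32_5e-05_3.0 suffix handed to parse_predictions.py looks like the analogous suffix appended to --output_dir at prediction time. A minimal, self-contained sketch of the splitting mechanism, using the pair from the script above:

    #!/usr/bin/env bash
    # split a "checkpoint,task" pair into $1 and $2
    for config in "MNLI_16_2e-05_3/model.ckpt-73631,diagnostic"; do
      IFS=","          # word-split on commas instead of whitespace
      set -- $config   # $1 = checkpoint sub-path, $2 = task name
      echo "checkpoint: $1   task: $2"
    done
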
/archive/predictions_rw_100000.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # general config
3 | export CUDA_VISIBLE_DEVICES=;
4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt
6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json
7 | GLUE_DATA="$GLUE_DIR"
8 | STEP_NUMBER=100000
9 |
10 | # root dir of your checkpoints
11 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/${STEP_NUMBER}/"
12 |
13 | # this is a tuple of trained model and task, you can add more tuples
14 | # todo: sst2 missing here
15 | for config in "CoLA_16_2e-05_4/model.ckpt-2137","CoLA" "MRPC_16_2e-05_3/model.ckpt-687","MRPC" "RTE_16_3e-05_4/model.ckpt-622","RTE"; do
16 | IFS=","
17 | set -- $config
18 | echo $1 and $2
19 | TASK=$2
20 |
21 | # location of the checkpoint which was best on dev
22 | TRAINED_CLASSIFIER=${ROOT}${1}
23 | OUTPUT_DIR=${ROOT}predictions/${TASK}
24 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
25 | python run_classifier.py \
26 | --task_name=${TASK} \
27 | --do_predict=true \
28 | --do_train=false \
29 | --do_eval=false \
30 | --data_dir=$GLUE_DIR/${TASK} \
31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
32 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
33 | --init_checkpoint=$TRAINED_CLASSIFIER \
34 | --do_early_stopping=false \
35 | --max_seq_length=128 \
36 | --original_model=True \
37 | --matched=False \
38 | --output_dir=${OUTPUT_DIR}
39 |
40 | # this is a parser I wrote which should output the predictions in the glue platform format
41 | python parse_predictions.py \
42 | --task=${TASK} \
43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
45 | done
46 |
47 | for config in "STSB_16_2e-05_4/model.ckpt-1437","STSB"; do
48 | IFS=","
49 | set -- $config
50 | echo $1 and $2
51 | TASK=$2
52 |
53 | # location of the checkpoint which was best on dev
54 | TRAINED_CLASSIFIER=${ROOT}${1}
55 | OUTPUT_DIR=${ROOT}predictions/${TASK}
56 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
57 | python run_regression.py \
58 | --task_name=${TASK} \
59 | --do_predict=true \
60 | --do_train=false \
61 | --do_eval=false \
62 | --data_dir=$GLUE_DIR/${TASK} \
63 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
64 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
65 | --init_checkpoint=$TRAINED_CLASSIFIER \
66 | --do_early_stopping=false \
67 | --max_seq_length=128 \
68 | --original_model=True \
69 | --matched=False \
70 | --output_dir=${OUTPUT_DIR}
71 |
72 | # this is a parser I wrote which should output the predictions in the glue platform format
73 | python parse_predictions.py \
74 | --task=${TASK} \
75 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
76 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
77 | done
78 |
79 |
80 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/"
81 |
82 | # TODO: MNLI is missing here
83 | # this is a tuple of trained model and task, you can add more tuples
84 | for config in "QNLIV2_16_3e-05_3/model.ckpt-19639","QNLIV2" "QQP_16_3e-05_4/model.ckpt-90962","QQP"; do
85 | IFS=","
86 | set -- $config
87 | echo $1 and $2
88 | TASK=$2
89 |
90 | # location of the checkpoint which was best on dev
91 | TRAINED_CLASSIFIER=${ROOT}${1}
92 | OUTPUT_DIR=${ROOT}predictions/${TASK}
93 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
94 | python run_classifier.py \
95 | --task_name=${TASK} \
96 | --do_predict=true \
97 | --do_train=false \
98 | --do_eval=false \
99 | --data_dir=$GLUE_DIR/${TASK} \
100 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
101 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
102 | --init_checkpoint=$TRAINED_CLASSIFIER \
103 | --do_early_stopping=false \
104 | --max_seq_length=128 \
105 | --original_model=True \
106 | --matched=False \
107 | --output_dir=${OUTPUT_DIR}
108 |
109 | # this is a parser I wrote which should output the predictions in the glue platform format
110 | python parse_predictions.py \
111 | --task=${TASK} \
112 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
113 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
114 | done
115 |
116 |
117 |
118 |
119 |
--------------------------------------------------------------------------------
/archive/predictions_rw_100000_all.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # general config
3 | export CUDA_VISIBLE_DEVICES=1;
4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt
6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json
7 | GLUE_DATA="$GLUE_DIR"
8 | STEP_NUMBER=100000
9 |
10 | # root dir of your checkpoints
11 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/${STEP_NUMBER}/"
12 |
13 | # this is a tuple of trained model and task, you can add more tuples
14 |
15 | for config in "SST2_16_3e-05_3/model.ckpt-12627","SST2" "CoLA_16_2e-05_4/model.ckpt-2137","CoLA" "MRPC_16_2e-05_3/model.ckpt-687","MRPC" "RTE_16_3e-05_4/model.ckpt-622","RTE"; do
16 | IFS=","
17 | set -- $config
18 | echo $1 and $2
19 | TASK=$2
20 |
21 | # location of the checkpoint which was best on dev
22 | TRAINED_CLASSIFIER=${ROOT}${1}
23 | OUTPUT_DIR=${ROOT}predictions/${TASK}
24 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
25 | python run_classifier_adapter_tune_all.py \
26 | --task_name=${TASK} \
27 | --do_predict=true \
28 | --do_train=false \
29 | --do_eval=false \
30 | --data_dir=$GLUE_DIR/${TASK} \
31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
32 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
33 | --init_checkpoint=$TRAINED_CLASSIFIER \
34 | --do_early_stopping=false \
35 | --max_seq_length=128 \
36 | --original_model=True \
37 | --matched=True \
38 | --output_dir=${OUTPUT_DIR}
39 |
40 | # this is a parser I wrote which should output the predictions in the glue platform format
41 | python parse_predictions.py \
42 | --task=${TASK} \
43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
45 | done
46 |
47 | for config in "STSB_16_2e-05_4/model.ckpt-1437","STSB"; do
48 | IFS=","
49 | set -- $config
50 | echo $1 and $2
51 | TASK=$2
52 |
53 | # location of the checkpoint which was best on dev
54 | TRAINED_CLASSIFIER=${ROOT}${1}
55 | OUTPUT_DIR=${ROOT}predictions/${TASK}
56 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
57 | python run_regression_adapter_tune_all.py \
58 | --task_name=${TASK} \
59 | --do_predict=true \
60 | --do_train=false \
61 | --do_eval=false \
62 | --data_dir=$GLUE_DIR/${TASK} \
63 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
64 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
65 | --init_checkpoint=$TRAINED_CLASSIFIER \
66 | --do_early_stopping=false \
67 | --max_seq_length=128 \
68 | --original_model=True \
69 | --matched=True \
70 | --output_dir=${OUTPUT_DIR}
71 |
72 | # this is a parser I wrote which should output the predictions in the glue platform format
73 | python parse_predictions.py \
74 | --task=${TASK} \
75 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
76 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
77 | done
78 |
79 |
80 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/"
81 |
82 |
83 | # this is a tuple of trained model and task, you can add more tuples
84 | for config in "QQP_16_3e-05_4/model.ckpt-90962","QQP" "MNLI_16_3e-05_4/model.ckpt-98175","MNLI" "QNLIV2_16_3e-05_3/model.ckpt-19639","QNLIV2"; do
85 | IFS=","
86 | set -- $config
87 | echo $1 and $2
88 | TASK=$2
89 |
90 | # location of the checkpoint which was best on dev
91 | TRAINED_CLASSIFIER=${ROOT}${1}
92 | OUTPUT_DIR=${ROOT}predictions/${TASK}
93 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
94 | python run_classifier_adapter_tune_all.py \
95 | --task_name=${TASK} \
96 | --do_predict=true \
97 | --do_train=false \
98 | --do_eval=false \
99 | --data_dir=$GLUE_DIR/${TASK} \
100 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
101 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
102 | --init_checkpoint=$TRAINED_CLASSIFIER \
103 | --do_early_stopping=false \
104 | --max_seq_length=128 \
105 | --original_model=True \
106 | --matched=True \
107 | --output_dir=${OUTPUT_DIR}
108 |
109 | # this is a parser I wrote which should output the predictions in the glue platform format
110 | python parse_predictions.py \
111 | --task=${TASK} \
112 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
113 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
114 | done
115 |
116 |
117 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/"
118 |
119 |
120 | # this is a tuple of trained model and task, you can add more tuples
121 | # TODO: Do another dev set evaluation
122 | for config in "MNLI_16_3e-05_4/model.ckpt-98175","MNLI"; do
123 | IFS=","
124 | set -- $config
125 | echo $1 and $2
126 | TASK=$2
127 |
128 | # location of the checkpoint which was best on dev
129 | TRAINED_CLASSIFIER=${ROOT}${1}
130 | OUTPUT_DIR=${ROOT}predictions/${TASK}-mm
131 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
132 | python run_classifier_adapter_tune_all.py \
133 | --task_name=${TASK} \
134 | --do_predict=true \
135 | --do_train=false \
136 | --do_eval=false \
137 | --data_dir=$GLUE_DIR/${TASK} \
138 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
139 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
140 | --init_checkpoint=$TRAINED_CLASSIFIER \
141 | --do_early_stopping=false \
142 | --max_seq_length=128 \
143 | --original_model=True \
144 | --matched=False \
145 | --output_dir=${OUTPUT_DIR}
146 |
147 | # this is a parser I wrote which should output the predictions in the glue platform format
148 | python parse_predictions.py \
149 | --task=${TASK} \
150 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
151 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
152 | done
153 |
154 | # this is the mnli model which was best on the matched dataset
155 | for config in "MNLI_16_3e-05_4/model.ckpt-98175","diagnostic"; do
156 | IFS=","
157 | set -- $config
158 | echo $1 and $2
159 | TASK=$2
160 |
161 | # location of the checkpoint which was best on dev
162 | TRAINED_CLASSIFIER=${ROOT}${1}
163 | OUTPUT_DIR=${ROOT}predictions/${TASK}
164 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
165 | python run_classifier_adapter_tune_all.py \
166 | --task_name=${TASK} \
167 | --do_predict=true \
168 | --do_train=false \
169 | --do_eval=false \
170 | --data_dir=$GLUE_DIR/${TASK} \
171 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
172 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
173 | --init_checkpoint=$TRAINED_CLASSIFIER \
174 | --do_early_stopping=false \
175 | --max_seq_length=128 \
176 | --original_model=True \
177 | --matched=True \
178 | --output_dir=${OUTPUT_DIR}
179 |
180 | # this is a parser I wrote which should output the predictions in the glue platform format
181 | python parse_predictions.py \
182 | --task=${TASK} \
183 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
184 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
185 | done
186 |
187 |
188 |
189 |
--------------------------------------------------------------------------------
/archive/predictions_rw_25000_all.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # general config
3 | export CUDA_VISIBLE_DEVICES=1;
4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12"
5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt
6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json
7 | GLUE_DATA="$GLUE_DIR"
8 | STEP_NUMBER=25000
9 |
10 | # root dir of your checkpoints
11 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/${STEP_NUMBER}/"
12 |
13 | # this is a tuple of trained model and task, you can add more tuples
14 |
15 | for config in "SST2_16_2e-05_3/model.ckpt-12627","SST2" "CoLA_16_2e-05_4/model.ckpt-2137","CoLA" "MRPC_16_2e-05_4/model.ckpt-917","MRPC" "RTE_16_3e-05_4/model.ckpt-622","RTE"; do
16 | IFS=","
17 | set -- $config
18 | echo $1 and $2
19 | TASK=$2
20 |
21 | # location of the checkpoint which was best on dev
22 | TRAINED_CLASSIFIER=${ROOT}${1}
23 | OUTPUT_DIR=${ROOT}predictions/${TASK}
24 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
25 | python run_classifier_adapter_tune_all.py \
26 | --task_name=${TASK} \
27 | --do_predict=true \
28 | --do_train=false \
29 | --do_eval=false \
30 | --data_dir=$GLUE_DIR/${TASK} \
31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
32 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
33 | --init_checkpoint=$TRAINED_CLASSIFIER \
34 | --do_early_stopping=false \
35 | --max_seq_length=128 \
36 | --original_model=True \
37 | --matched=True \
38 | --output_dir=${OUTPUT_DIR}
39 |
40 | # this is a parser I wrote which should output the predictions in the glue platform format
41 | python parse_predictions.py \
42 | --task=${TASK} \
43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
45 | done
46 |
47 | for config in "STSB_16_3e-05_4/model.ckpt-1437","STSB"; do
48 | IFS=","
49 | set -- $config
50 | echo $1 and $2
51 | TASK=$2
52 |
53 | # location of the checkpoint which was best on dev
54 | TRAINED_CLASSIFIER=${ROOT}${1}
55 | OUTPUT_DIR=${ROOT}predictions/${TASK}
56 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
57 | python run_regression_adapter_tune_all.py \
58 | --task_name=${TASK} \
59 | --do_predict=true \
60 | --do_train=false \
61 | --do_eval=false \
62 | --data_dir=$GLUE_DIR/${TASK} \
63 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
64 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
65 | --init_checkpoint=$TRAINED_CLASSIFIER \
66 | --do_early_stopping=false \
67 | --max_seq_length=128 \
68 | --original_model=True \
69 | --matched=True \
70 | --output_dir=${OUTPUT_DIR}
71 |
72 | # this is a parser I wrote which should output the predictions in the glue platform format
73 | python parse_predictions.py \
74 | --task=${TASK} \
75 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
76 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
77 | done
78 |
79 |
80 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/"
81 |
82 |
83 | # this is a tuple of trained model and task, you can add more tuples
84 | for config in "QQP_16_2e-05_3/model.ckpt-68221","QQP" "MNLI_16_2e-05_3/model.ckpt-73631","MNLI" "QNLIV2_16_2e-05_3/model.ckpt-19639","QNLIV2"; do
85 | IFS=","
86 | set -- $config
87 | echo $1 and $2
88 | TASK=$2
89 |
90 | # location of the checkpoint which was best on dev
91 | TRAINED_CLASSIFIER=${ROOT}${1}
92 | OUTPUT_DIR=${ROOT}predictions/${TASK}
93 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
94 | python run_classifier_adapter_tune_all.py \
95 | --task_name=${TASK} \
96 | --do_predict=true \
97 | --do_train=false \
98 | --do_eval=false \
99 | --data_dir=$GLUE_DIR/${TASK} \
100 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
101 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
102 | --init_checkpoint=$TRAINED_CLASSIFIER \
103 | --do_early_stopping=false \
104 | --max_seq_length=128 \
105 | --original_model=True \
106 | --matched=True \
107 | --output_dir=${OUTPUT_DIR}
108 |
109 | # this is a parser I wrote which should output the predictions in the glue platform format
110 | python parse_predictions.py \
111 | --task=${TASK} \
112 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
113 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
114 | done
115 |
116 |
117 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/"
118 |
119 |
120 | # this is a tuple of trained model and task, you can add more tuples
121 | for config in "MNLI_16_2e-05_3/model.ckpt-73631","MNLI"; do
122 | IFS=","
123 | set -- $config
124 | echo $1 and $2
125 | TASK=$2
126 |
127 | # location of the checkpoint which was best on dev
128 | TRAINED_CLASSIFIER=${ROOT}${1}
129 | OUTPUT_DIR=${ROOT}predictions/${TASK}-mm
130 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
131 | python run_classifier_adapter_tune_all.py \
132 | --task_name=${TASK} \
133 | --do_predict=true \
134 | --do_train=false \
135 | --do_eval=false \
136 | --data_dir=$GLUE_DIR/${TASK} \
137 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
138 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
139 | --init_checkpoint=$TRAINED_CLASSIFIER \
140 | --do_early_stopping=false \
141 | --max_seq_length=128 \
142 | --original_model=True \
143 | --matched=False \
144 | --output_dir=${OUTPUT_DIR}
145 |
146 | # this is a parser I wrote which should output the predictions in the glue platform format
147 | python parse_predictions.py \
148 | --task=${TASK} \
149 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
150 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
151 | done
152 |
153 | # this is the mnli model which was best on the matched dataset
154 | for config in "MNLI_16_2e-05_3/model.ckpt-73631","diagnostic"; do
155 | IFS=","
156 | set -- $config
157 | echo $1 and $2
158 | TASK=$2
159 |
160 | # location of the checkpoint which was best on dev
161 | TRAINED_CLASSIFIER=${ROOT}${1}
162 | OUTPUT_DIR=${ROOT}predictions/${TASK}
163 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true
164 | python run_classifier_adapter_tune_all.py \
165 | --task_name=${TASK} \
166 | --do_predict=true \
167 | --do_train=false \
168 | --do_eval=false \
169 | --data_dir=$GLUE_DIR/${TASK} \
170 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
171 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
172 | --init_checkpoint=$TRAINED_CLASSIFIER \
173 | --do_early_stopping=false \
174 | --max_seq_length=128 \
175 | --original_model=True \
176 | --matched=True \
177 | --output_dir=${OUTPUT_DIR}
178 |
179 | # this is a parser I wrote which should output the predictions in the glue platform format
180 | python parse_predictions.py \
181 | --task=${TASK} \
182 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \
183 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0"
184 | done
185 |
186 |
187 |
188 |
--------------------------------------------------------------------------------
/copa_1_download_copa.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | waws --downloadS3 -f copa_en.zip -b wluper-retrograph
3 | mkdir -p data/COPA
4 | unzip copa_en.zip
5 | mv test_gold.jsonl data/COPA
6 | mv train.en.jsonl data/COPA
7 | mv val.en.jsonl data/COPA
8 | mv copa_en.zip data
9 |
--------------------------------------------------------------------------------
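The waws call above assumes the AWS helper is already installed and configured; the docstring of download_utility/download_relations.py further below spells out the prerequisite:

    pip3 install waws
    waws --configure
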
/copa_2_finetune_adapter.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser, as it optimises BERT
9 | TRAINING_UTILITY=training_utility
10 |
11 | export CUDA_VISIBLE_DEVICES=0
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | BERT_VOCAB=$BERT_DIR/vocab.txt
16 |
17 | TASKNAME='COPA'
18 | DATA_DIR=data/$TASKNAME
19 |
20 | LEARNING_RATE=2e-5
21 | EPOCHS=3.0
22 | VARIANT=A
23 |
24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT
25 | STEP="150000"
26 |
27 | PRETRAINED_NAME="RW30"
28 | BERT_EXTENDED_DIR="models/1.0_1.0_5_30_full_assertions_nl"
29 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter"
30 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
31 |
32 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/${PRETRAINED_NAME}/${STEP}/${EXPERIMENT_NAME}"
33 |
34 |
35 | python3.6 $TRAINING_UTILITY/run_copa_adapter.py \
36 | --do_train=true \
37 | --do_eval=true \
38 | --data_dir=$DATA_DIR \
39 | --vocab_file=$BERT_VOCAB \
40 | --bert_config_file=$BERT_CONFIG \
41 | --init_checkpoint=$CHECKPOINT \
42 | --max_seq_length=128 \
43 | --train_batch_size=8 \
44 | --learning_rate=$LEARNING_RATE \
45 | --num_train_epochs=$EPOCHS \
46 | --variant=$VARIANT \
47 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out
48 |
--------------------------------------------------------------------------------
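For reference, a small sketch of how the naming variables in the script above compose into the output location (all values taken from the script itself):

    # sketch: reproduce the experiment/output naming used above
    TASKNAME='COPA'; PRETRAINED_NAME="RW30"; STEP="150000"
    LEARNING_RATE=2e-5; EPOCHS=3.0; VARIANT=A
    EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT    # -> 2e-5.3.0A
    echo "models/output_model_finetunning/${TASKNAME}/${PRETRAINED_NAME}/${STEP}/${EXPERIMENT_NAME}"
    # prints models/output_model_finetunning/COPA/RW30/150000/2e-5.3.0A ; the tee'd log lands at .../2e-5.3.0A.out
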
/copa_2_finetune_bert.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser, as it optimises BERT
9 | TRAINING_UTILITY=training_utility
10 |
11 | export CUDA_VISIBLE_DEVICES=0
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | BERT_VOCAB=$BERT_DIR/vocab.txt
16 |
17 | TASKNAME='COPA'
18 | DATA_DIR=data/$TASKNAME
19 |
20 | LEARNING_RATE=2e-5
21 | EPOCHS=3.0
22 | VARIANT=A
23 |
24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT
25 |
26 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter"
27 | # CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
28 |
29 | BERT_EXTENDED_DIR=$BERT_DIR
30 | CHECKPOINT=${BERT_EXTENDED_DIR}/bert_model.ckpt
31 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/BERT_BASE/${EXPERIMENT_NAME}"
32 |
33 |
34 | python3.6 $TRAINING_UTILITY/run_copa.py \
35 | --do_train=true \
36 | --do_eval=true \
37 | --data_dir=$DATA_DIR \
38 | --vocab_file=$BERT_VOCAB \
39 | --bert_config_file=$BERT_CONFIG \
40 | --init_checkpoint=$CHECKPOINT \
41 | --max_seq_length=128 \
42 | --train_batch_size=8 \
43 | --learning_rate=$LEARNING_RATE \
44 | --num_train_epochs=$EPOCHS \
45 | --variant=$VARIANT \
46 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out
47 |
--------------------------------------------------------------------------------
/csqa_1_download_commonsenseqa.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | python3.6 download_utility/download_commonsenseqa.py
4 |
--------------------------------------------------------------------------------
/csqa_2_finetune_adapter.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser, as it optimises BERT
9 | TRAINING_UTILITY=training_utility
10 |
11 | export CUDA_VISIBLE_DEVICES=0
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | BERT_VOCAB=$BERT_DIR/vocab.txt
16 |
17 | BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter"
18 | OUTPUT_DIR="models/output_model_finetunning"
19 | OUTPUT_SUFFIX=_tune_all
20 |
21 | TASKNAME='COMMONSENSEQA'
22 | DATA_DIR=data/$TASKNAME
23 |
24 | SPLIT="rand"
25 |
26 | STEP="25000"
27 |
28 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
29 |
30 | python3.6 $TRAINING_UTILITY/run_commonsenseqa_adapter.py \
31 | --split=$SPLIT \
32 | --do_train=true \
33 | --do_eval=true \
34 | --data_dir=$DATA_DIR \
35 | --vocab_file=$BERT_VOCAB \
36 | --bert_config_file=$BERT_CONFIG \
37 | --init_checkpoint=$CHECKPOINT \
38 | --max_seq_length=128 \
39 | --train_batch_size=8 \
40 | --learning_rate=2e-5 \
41 | --num_train_epochs=3.0 \
42 | --output_dir=$OUTPUT_DIR/$TASKNAME/ | tee $OUTPUT_DIR/$TASKNAME.out
43 |
--------------------------------------------------------------------------------
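The SPLIT="rand" setting above presumably corresponds to the *_rand_split.jsonl files fetched by csqa_1_download_commonsenseqa.sh; a quick sanity check that the expected files are in place:

    # the "rand" split expects the files downloaded into data/COMMONSENSEQA
    ls data/COMMONSENSEQA
    # expected: dev_rand_split.jsonl  test_rand_split_no_answers.jsonl  train_rand_split.jsonl
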
/csqa_3_eval_adapter.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser, as it optimises BERT
9 | TRAINING_UTILITY=training_utility
10 |
11 | export CUDA_VISIBLE_DEVICES=0
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | BERT_VOCAB=$BERT_DIR/vocab.txt
16 |
17 | BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter"
18 | OUTPUT_DIR="models/output_model_finetunning"
19 | OUTPUT_SUFFIX=_tune_all
20 |
21 | TASKNAME='COMMONSENSEQA'
22 | DATA_DIR=data/$TASKNAME
23 |
24 | SPLIT="rand"
25 |
26 | STEP="25000"
27 |
28 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
29 |
30 | TRAINED_MODEL=$OUTPUT_DIR/$TASKNAME/model.ckpt-3000
31 |
32 | python3.6 $TRAINING_UTILITY/run_commonsenseqa_adapter.py \
33 | --split=$SPLIT \
34 | --do_train=false \
35 | --do_eval=true \
36 | --data_dir=$DATA_DIR \
37 | --vocab_file=$BERT_VOCAB \
38 | --bert_config_file=$BERT_CONFIG \
39 | --init_checkpoint=$TRAINED_MODEL \
40 | --max_seq_length=128 \
41 | --train_batch_size=8 \
42 | --learning_rate=2e-5 \
43 | --num_train_epochs=3.0 \
44 | --output_dir=$OUTPUT_DIR/$TASKNAME/ | tee $OUTPUT_DIR/$TASKNAME.out
45 |
--------------------------------------------------------------------------------
/download_utility/download_bert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019 Wluper Ltd. Team, Nikolai Rozanov.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | # ##############################################################################
18 | # Import
19 | ##############################################################################
20 |
21 | # Native
22 | import urllib.request
23 | import os
24 | import zipfile
25 |
26 | # Packages
27 | import shutil
28 |
29 | # Local
30 |
31 |
32 | # #############################################################################
33 | # Code
34 | ##############################################################################
35 | BERT_TO_URL_MAPPING = {
36 | "BERT_LARGE_UNCASED_WHOLEWORD" : "https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip",
37 | "BERT_LARGE_CASED_WHOLEWORD" : "https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip",
38 |
39 | "BERT_LARGE_UNCASED" : "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip",
40 | "BERT_LARGE_CASED" : "https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip",
41 |
42 | "BERT_BASE_UNCASED" : "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip",
43 | "BERT_BASE_CASED" : "https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip",
44 |
45 | "BERT_BASE_CASED_MULTI" : "https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip",#re
46 | "BERT_BASE_UNCASED_MULTI" : "https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip",
47 |
48 | "BERT_BASE_CHINESE" : "https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip"
49 | }
50 |
51 |
52 | def download_bert_zip(target_file_name:str, which_bert:str="BERT_BASE_CASED"):
53 | """
54 | Downloads the officially pre-trained model from google.
55 | File is a zip and contains:
56 | 1. A TensorFlow checkpoint (bert_model.ckpt) containing the pre-trained weights (which is actually 3 files).
57 | 2. A vocab file (vocab.txt) to map WordPiece to word id.
58 | 3. A config file (bert_config.json) which specifies the hyperparameters of the model.
59 |
60 | Part Reference:
61 | https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3
62 | """
63 | try:
64 | url = BERT_TO_URL_MAPPING[which_bert]
65 | except KeyError:
66 | print("Seems like this BERT model doesn't exist. Please specify a possible option.")
67 | exit()
68 | os.makedirs(os.path.dirname(target_file_name),exist_ok=True) #creates path if not in existence.
69 | with urllib.request.urlopen(url) as response, open(target_file_name, 'wb') as out_file:
70 | print(f"Downloading: {which_bert}. Target_file: {target_file_name}\nThis may take some time.")
71 | shutil.copyfileobj(response, out_file)
72 | print("Finished the Download.")
73 |
74 |
75 | def unzip_bert(path_to_zip_file: str, target_folder_name: str):
76 | """
77 | unzips the bert and places the content into target_folder_name.
78 |
79 | Part Reference:
80 | https://stackoverflow.com/questions/3451111/unzipping-files-in-python
81 | """
82 | print(f"Unzipping Bert zip {path_to_zip_file}.")
83 | with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
84 | zip_ref.extractall(target_folder_name)
85 | print("Finished Unzipping.")
86 | print(f"Moving Content to: {target_folder_name}")
87 | _move_unzipped_content(target_folder_name)
88 | print("Finished Moving. Finished Process.")
89 |
90 |
91 | def _move_unzipped_content(target_folder_name:str):
92 | """ Helper function to move content for function unzip_bert. (has assumptions)"""
93 | bert_data = os.listdir(target_folder_name)[0]
94 | final_bert_data_path = os.path.join(target_folder_name, bert_data)
95 | for file in os.listdir(final_bert_data_path):
96 | shutil.move(os.path.join(final_bert_data_path,file), target_folder_name)
97 | os.rmdir(final_bert_data_path)
98 |
99 | # #############################################################################
100 | # MAIN
101 | ##############################################################################
102 | if __name__=="__main__":
103 | which_bert = "BERT_BASE_UNCASED"
104 | target_file_name = os.path.join("models","bert_pretrained.zip")
105 | target_folder_name = os.path.join("models",which_bert)
106 | download_bert_zip(target_file_name=target_file_name, which_bert=which_bert)
107 | unzip_bert(target_file_name, target_folder_name)
108 |
--------------------------------------------------------------------------------
/download_utility/download_commonsenseqa.py:
--------------------------------------------------------------------------------
1 | ''' Script for downloading all CommonsenseQA data.
2 | Author: Nikolai Rozanov
3 | '''
4 |
5 | import os
6 | import sys
7 | import shutil
8 | import argparse
9 | import tempfile
10 | import urllib.request
11 |
12 |
13 | LINKS = [
14 | "https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl",
15 | "https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl",
16 | "https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl"
17 | ]
18 |
19 | def download_and_extract(link, data_dir):
20 | """ downloads and moves. """
21 | print("Downloading and extracting %s..." % link)
22 | data_file = get_name_from_link(link)
23 | urllib.request.urlretrieve(link,data_file)
24 | shutil.move(data_file,os.path.join(data_dir,data_file))
25 | print("\tCompleted!")
26 |
27 | def get_name_from_link(link):
28 | """ returns name from link. """
29 | name = link.split("/")[-1]
30 | return name
31 |
32 | def make_dir(directory_path, directory_name):
33 | """ Makes a directory if it doesn't exist. """
34 | directory = os.path.join(directory_path, directory_name)
35 | if not os.path.exists(directory):
36 | os.makedirs(directory)
37 |
38 | def main():
39 | DATA="data"
40 | TARGET_FOLDER="COMMONSENSEQA"
41 | data_dir = os.path.join(DATA, TARGET_FOLDER)
42 |
43 | make_dir(DATA, TARGET_FOLDER)
44 | for link in LINKS:
45 | download_and_extract(link, data_dir)
46 |
47 |
48 |
49 | if __name__ == '__main__':
50 | main()
51 |
--------------------------------------------------------------------------------
/download_utility/download_glue.py:
--------------------------------------------------------------------------------
1 | ''' Script for downloading all GLUE data.
2 | Note: for legal reasons, we are unable to host MRPC.
3 | You can either use the version hosted by the SentEval team, which is already tokenized,
4 | or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
5 | For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
6 | You should then rename and place specific files in a folder (see below for an example).
7 | mkdir MRPC
8 | cabextract MSRParaphraseCorpus.msi -d MRPC
9 | cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
10 | cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
11 | rm MRPC/_*
12 | rm MSRParaphraseCorpus.msi
13 | 1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
14 | 2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
15 |
16 | Part Source:
17 | https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e
18 |
19 | Instructions:
20 | python3 download_glue.py --data_dir data/GLUE --tasks all
21 | '''
22 |
23 | import os
24 | import sys
25 | import shutil
26 | import argparse
27 | import tempfile
28 | import urllib.request
29 | import zipfile
30 |
31 | TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS-B", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic", "COPA"]
32 | TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
33 | "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
34 | "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
35 | "QQP":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
36 | "STS-B":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
37 | "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
38 | "SNLI":'https://dl.fbaipublicfiles.com/glue/data/SNLI.zip',
39 | "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
40 | "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
41 | "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
42 | "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv',
43 | "COPA": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip"}
44 |
45 | MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
46 | MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
47 |
48 | def download_and_extract(task, data_dir):
49 | print("Downloading and extracting %s..." % task)
50 | data_file = "%s.zip" % task
51 | urllib.request.urlretrieve(TASK2PATH[task], data_file)
52 | with zipfile.ZipFile(data_file) as zip_ref:
53 | zip_ref.extractall(data_dir)
54 | os.remove(data_file)
55 | print("\tCompleted!")
56 |
57 | def format_mrpc(data_dir, path_to_data):
58 | print("Processing MRPC...")
59 | mrpc_dir = os.path.join(data_dir, "MRPC")
60 | if not os.path.isdir(mrpc_dir):
61 | os.mkdir(mrpc_dir)
62 | if path_to_data:
63 | mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
64 | mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
65 | else:
66 | print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
67 | mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
68 | mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
69 | urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
70 | urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
71 | assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
72 | assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
73 | urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
74 |
75 | dev_ids = []
76 | with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
77 | for row in ids_fh:
78 | dev_ids.append(row.strip().split('\t'))
79 |
80 | with open(mrpc_train_file, encoding="utf8") as data_fh, \
81 | open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
82 | open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
83 | header = data_fh.readline()
84 | train_fh.write(header)
85 | dev_fh.write(header)
86 | for row in data_fh:
87 | label, id1, id2, s1, s2 = row.strip().split('\t')
88 | if [id1, id2] in dev_ids:
89 | dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
90 | else:
91 | train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
92 |
93 | with open(mrpc_test_file, encoding="utf8") as data_fh, \
94 | open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
95 | header = data_fh.readline()
96 | test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
97 | for idx, row in enumerate(data_fh):
98 | label, id1, id2, s1, s2 = row.strip().split('\t')
99 | test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
100 | print("\tCompleted!")
101 |
102 | def download_diagnostic(data_dir):
103 | print("Downloading and extracting diagnostic...")
104 | if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
105 | os.mkdir(os.path.join(data_dir, "diagnostic"))
106 | data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
107 | urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
108 | print("\tCompleted!")
109 | return
110 |
111 | def get_tasks(task_names):
112 | task_names = task_names.split(',')
113 | if "all" in task_names:
114 | tasks = TASKS
115 | else:
116 | tasks = []
117 | for task_name in task_names:
118 | assert task_name in TASKS, "Task %s not found!" % task_name
119 | tasks.append(task_name)
120 | return tasks
121 |
122 | def main(arguments):
123 | parser = argparse.ArgumentParser()
124 | parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
125 | parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
126 | type=str, default='all')
127 |     parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
128 | type=str, default='')
129 | args = parser.parse_args(arguments)
130 |
131 | if not os.path.isdir(args.data_dir):
132 | os.mkdir(args.data_dir)
133 | tasks = get_tasks(args.tasks)
134 |
135 | for task in tasks:
136 | if task == 'MRPC':
137 | format_mrpc(args.data_dir, args.path_to_mrpc)
138 | elif task == 'diagnostic':
139 | download_diagnostic(args.data_dir)
140 | else:
141 | download_and_extract(task, args.data_dir)
142 |
143 |
144 | if __name__ == '__main__':
145 | sys.exit(main(sys.argv[1:]))
146 |
--------------------------------------------------------------------------------
/download_utility/download_relations.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the script for downloading ConceptNet relations from S3.
3 | Make sure waws is installed and configured:
4 |
5 | pip3 install waws
6 | waws --configure
7 |
8 | To run: python3 download_relations.py
9 | To download all (31): -r all (default)
10 | To download specific relations, provide a comma-separated list of relations: -r isA,formOf
11 | To specify/create a local download directory, use: -d directory_name.
12 | """
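# Example invocation (assuming waws is configured and the S3 bucket above is reachable):
#   python3 download_relations.py -d relations/ -r isA,mannerOf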
13 |
14 | import os
15 | import sys
16 | import argparse
17 | import waws
18 |
19 | RELATIONS = ['relatedTo', 'formOf', 'isA', 'partOf', 'hasA', 'usedFor', 'capableOf',
20 | 'atLocation', 'causes', 'hasSubevent', 'hasFirstSubevent', 'hasLastSubevent',
21 | 'hasPrerequisite', 'hasProperty', 'motivatedByGoal', 'obstructedBy', 'desires',
22 | 'createdBy', 'synonyms', 'antonyms', 'distinctFrom', 'derivedFrom', 'symbolOf',
23 | 'definedAs', 'mannerOf', 'locatedNear', 'hasContext', 'similarTo', 'causesDesire',
24 | 'madeOf', 'receivesAction']
25 |
26 | s3 = waws.BucketManager()
27 |
28 | def download(relation, data_dir):
29 |     print("Downloading %s..." % relation)
30 | data_file = "cn_%s.txt" % relation
31 |
32 | s3.download_file(
33 | file_name=data_file,
34 | local_path=data_dir,
35 | remote_path="",
36 | bucket_name="wluper-retrograph"
37 | )
38 |
39 | print("\tDone!")
40 |
41 | def get_relations(relation_names):
42 | relation_names = relation_names.split(',')
43 | if "all" in relation_names:
44 | relations = RELATIONS
45 | else:
46 | relations = []
47 | for rel_name in relation_names:
48 | assert rel_name in RELATIONS, "Relation %s not found!" % rel_name
49 | relations.append(rel_name)
50 | return relations
51 |
52 | def main(arguments):
53 | parser = argparse.ArgumentParser()
54 | parser.add_argument('-d', '--data_dir', help='directory to save data to', type=str, default='./')
55 | parser.add_argument('-r', '--relations', help='relations to download as a comma separated string',
56 | type=str, default='all')
57 | args = parser.parse_args(arguments)
58 |
59 | if not os.path.isdir(args.data_dir):
60 | os.mkdir(args.data_dir)
61 | relations = get_relations(args.relations)
62 |
63 | for rel in relations:
64 | download(rel, args.data_dir)
65 |
66 |
67 | if __name__ == '__main__':
68 | sys.exit(main(sys.argv[1:]))
69 |
--------------------------------------------------------------------------------
/glue_1_download_glue.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DOWNLOAD_UTILITY_SCRIPTS=download_utility
4 |
5 |
6 | DIR_SAVE_RELATIONS='relations/'
7 | mkdir -p $DIR_SAVE_RELATIONS
8 |
9 | # DOWNLOAD RELATIONS
10 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_relations.py --data_dir $DIR_SAVE_RELATIONS --relations all
11 |
12 | mkdir -p 'data/GLUE'
13 | mkdir -p 'models/BERT_BASE_UNCASED'
14 |
15 | # DOWNLOAD BERT
16 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_bert.py
17 |
18 | # DOWNLOAD GLUE
19 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_glue.py --data_dir data/GLUE --tasks all
20 |
--------------------------------------------------------------------------------
/glue_2_finetune_adapter.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser as it optimises BERT
9 | TRAINING_UTILITY=training_utility
10 |
11 | export CUDA_VISIBLE_DEVICES=8
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | VOCAB_DIR=$BERT_DIR/vocab.txt
16 |
17 | BERT_EXTENDED_DIR="models/output_pretrain_adapter"
18 | OUTPUT_DIR="models/output_model_finetunning"
19 | OUTPUT_SUFFIX=_tune_all
20 |
21 | GLUE_DIR='data/GLUE'
22 |
23 | ### the second finetuning variant
24 | for STEP in "0" "99000"; do
25 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
26 | for task_name in "QNLI" "QQP" "MNLI"; do
27 | echo $task_name
28 | echo $CHECKPOINT
29 |
30 | GLUE_DATA="$GLUE_DIR/$task_name"
31 |
32 | python3.6 $TRAINING_UTILITY/run_classifier_adapter_tune_all.py \
33 | --task_name=$task_name \
34 | --do_train=true \
35 | --do_eval=true \
36 | --do_early_stopping=false \
37 | --data_dir=$GLUE_DATA \
38 | --vocab_file=$VOCAB_DIR \
39 | --bert_config_file=$BERT_CONFIG \
40 | --init_checkpoint=$CHECKPOINT\
41 | --max_seq_length=128 \
42 | --train_batch_size="[16]" \
43 | --learning_rate="[2e-5, 3e-5]" \
44 | --num_train_epochs="[3,4]" \
45 | --original_model=True \
46 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} | tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out
47 | done
48 | done
49 |
--------------------------------------------------------------------------------
/images/Retrograph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wluper/Retrograph/d275e45c9127e645e4f02f32f42a62c2636f6c3a/images/Retrograph.png
--------------------------------------------------------------------------------
/randomwalks_utility/create_corpora_from_random_walks.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import codecs
3 | from tqdm import tqdm
4 | from concurrent.futures import ProcessPoolExecutor, as_completed
5 |
6 | def load_walks(path="./randomwalks/random_walk_1.0_1.0_2_10.p"):
7 | return pickle.load(open(path, "rb"))
8 |
9 |
10 | def create_relationship_token(text):
11 |     # changed: keep the relation text as-is (the bracketed single-token form is disabled)
12 |     return text  # previously: "<" + "".join(text.split(" ")) + ">"
13 |
14 | def process_walks(walks):
15 | text = ""
16 | for walk in walks:
17 | previous_token = ""
18 | for i, token in enumerate(walk):
19 | # every first token is a node and every second is a relationship
20 |             # we don't need to capitalize anything, as we are working with uncased BERT anyway
21 | if (i % 2 == 0 and previous_token != "" and i != 0 and i != 2) or (i == 3 and previous_token != ""):
22 | # we have reached the end of a valid sentence sequence, so we put a period
23 | if i == 3:
24 | text = text[:-1] + ".\n"
25 | else:
26 | text = text + token + ".\n"
27 | if i != len(walk) - 1 and i == 3:
28 | # if the walk is not finished yet, we duplicate the token
29 | text = text + previous_token + " " + create_relationship_token(token) + " "
30 | elif i != len(walk) - 1:
31 | # if the walk is not finished yet, we duplicate the token
32 | text = text + token + " "
33 | else:
34 | # otherwise we can put a new line to mark the end of a document
35 | text = text + "\n\n"
36 | elif i % 2 == 0:
37 | text = text + token + " "
38 | elif i % 1 == 0:
39 | text = text + create_relationship_token(token) + " "
40 | previous_token = token
41 |
42 | return text
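# Worked example (hand-traced with made-up tokens, not repository data): the walk
# ["dog", "is a", "animal", "is a synonym of", "creature"] alternates node/relation,
# and process_walks renders it as two sentences, duplicating the middle node:
#   dog is a animal.
#   animal is a synonym of creature.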
43 |
44 | def chunks(lst, n):
45 | """Yield successive n-sized chunks from lst."""
46 | for i in range(0, len(lst), n):
47 | yield lst[i:i + n]
48 |
49 | def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpus_", output_path_suffix=""):
50 | # how do we actually want to generate the corpus?
51 |     # one option is to always duplicate the node in the middle
52 |     # Goran also suggests keeping the relations as separate tokens in the vocab; I do not necessarily agree, but we try it.
53 |     # What counts as one document? Is it always one walk? Maybe yes...
54 | output_path = output_path_prefix + output_path_suffix + ".txt"
55 | text = ""
56 | print('size of walks', len(walks))
57 | print('processing RWs...')
58 |
59 | workers = 10
60 | splits = 1000
61 | text = ""
62 | with ProcessPoolExecutor(max_workers=workers) as executor:
63 | futures = {}
64 | for i, ws in enumerate(chunks(walks, splits)):
65 | job = executor.submit(process_walks, ws)
66 | futures[job] = i
67 |
68 | for job in tqdm(as_completed(futures)):
69 | t = job.result()
70 | text += t
71 | r = futures[job]
72 | del futures[job]
73 |
74 |
75 | with codecs.open(output_path, "w", "utf8") as out:
76 | out.write(text)
77 |
78 |
79 | def main():
80 | in_prefix = "randomwalks/random_walk_"
81 | in_suffix = "1.0_1.0_2_15"
82 | walks = load_walks(in_prefix + in_suffix + ".p")
83 | generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "_nl")
84 |
85 |
86 | if __name__=="__main__":
87 | main()
88 |
--------------------------------------------------------------------------------
/randomwalks_utility/preprocess_cn.py:
--------------------------------------------------------------------------------
1 | import codecs
2 |
3 | """
4 | Since we got the relations to consider from Olga, we no longer need this.
5 | """
6 | # def filter_assertions(path="./relations/assertions.csv"):
7 | # assertions = []
8 | # with codecs.open(path, "r", "utf8") as f:
9 | # reader = csv.DictReader(f, dialect=csv.excel_tab, fieldnames=["URI", "relation", "node_a", "node_b", "info"])
10 | # for i,row in enumerate(reader):
11 | # node_a = row["node_a"].split("/c/en/")
12 | # node_b = row["node_b"].split("/c/en/")
13 | # if len(node_a) > 1 and len(node_b) > 1:
14 | # # these should be nodes in english
15 | # node_a = node_a[1].split("/")[-1].replace("_", "-")
16 | # node_b = node_b[1].split("/")[-1].replace("_", "-")
17 | # print(node_a)
18 | # print(node_b)
19 |
20 | """
21 | Based on the relations from Olga
22 | """
23 | def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.txt", "./relations/cn_isA.txt", "./relations/cn_mannerOf.txt","./relations/cn_synonyms.txt"], output_path="./randomwalks/cn_assertions_filtered.tsv"):
24 | # we ideally want to have a "natural language representation" of the relations
25 |     # TODO: keep in mind that antonymy and synonymy are bidirectional relationships, so we may want to account for this, e.g., by also creating the corresponding pairs in the opposite direction
26 | # TODO: As an alternative of random walks, we can also just use the natural language representation of the relationships
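# Example (hypothetical line in cn_antonyms.txt): "hot\tcold" yields the assertion
# ["hot", "cold", "is an antonym of"] and, since antonymy is treated as bidirectional,
# also the reverse ["cold", "hot", "is an antonym of"].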
27 | relation_dict = {
28 | "antonyms": "is an antonym of",
29 | "isA": "is a",
30 | "mannerOf": "is a manner of",
31 | "synonyms": "is a synonym of"
32 | }
33 | all_assertions = []
34 | for path in paths:
35 | relation = path.split("cn_")[1].split(".txt")[0]
36 | nl_relation = relation_dict[relation]
37 | with codecs.open(path, "r", "utf8") as f:
38 | for line in f.readlines():
39 | word_a, word_b = line.strip().split("\t")
40 | full_assertion = [word_a, word_b, nl_relation]
41 | all_assertions.append(full_assertion)
42 | # TODO: here is an attempt to account for bidirectionality; Does it make sense?
43 | if relation == "antonyms" or relation == "synonyms":
44 | full_assertion_b = [word_b, word_a, nl_relation]
45 | all_assertions.append(full_assertion_b)
46 | # In total, we have 293105 assertions
47 | print("In total, we have %d assertions" % len(all_assertions))
48 | with codecs.open(output_path, "w", "utf8") as out:
49 | for assertion in all_assertions:
50 | out.write(assertion[0] + "\t" + assertion[1] + "\t" + assertion[2] + "\n")
51 |
52 |
53 |
54 | def main():
55 | create_joined_assertions_for_random_walks()
56 | #profile_data()
57 | #filter_assertions()
58 |
59 | if __name__ == "__main__":
60 | main()
61 |
--------------------------------------------------------------------------------
/randomwalks_utility/random_walks.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import networkx as nx
3 | import random
4 | import pickle
5 |
6 | def read_graph(path="./randomwalks/cn_assertions_filtered.tsv"):
7 | '''
8 | Reads the input network in networkx.
9 | '''
10 |
11 | G = nx.read_edgelist(path, nodetype=str, data=(('edge_type', str),), create_using=nx.DiGraph(), delimiter="\t")
12 | for edge in G.edges():
13 | G[edge[0]][edge[1]]['weight'] = 1
14 | return G
15 |
16 |
17 | class Graph():
18 | def __init__(self, nx_G, is_directed, p, q):
19 | self.G = nx_G
20 | self.is_directed = is_directed
21 | self.p = p
22 | self.q = q
23 |
24 | def node2vec_walk(self, walk_length, start_node):
25 | '''
26 | Simulate a random walk starting from start node.
27 | '''
28 | G = self.G
29 | alias_nodes = self.alias_nodes
30 | alias_edges = self.alias_edges
31 |
32 | walk = [start_node]
33 |
34 | while len(walk) < walk_length:
35 | cur = walk[-1]
36 | cur_nbrs = sorted(G.neighbors(cur))
37 | if len(cur_nbrs) > 0:
38 | if len(walk) == 1:
39 |                     # TODO: This is Anne's main change to the code, the rest is original node2vec code
40 | # NEW
41 | n = cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]
42 | walk.append(G.get_edge_data(cur, n)["edge_type"])
43 | walk.append(n)
44 |
45 | #walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
46 | else:
47 | #prev = walk[-2]
48 | prev = walk[-3]
49 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
50 | alias_edges[(prev, cur)][1])]
51 | ## new
52 | walk.append(G.get_edge_data(cur, next)["edge_type"])
53 | ####
54 | walk.append(next)
55 | else:
56 | break
57 |
58 | return walk
59 |
60 | def simulate_walks(self, num_walks, walk_length):
61 | '''
62 | Repeatedly simulate random walks from each node.
63 | '''
64 | G = self.G
65 | walks = []
66 | nodes = list(G.nodes())
67 | 
68 |         print('Walk iteration:')
69 | 
70 |         for walk_iter in range(num_walks):
71 |             print(str(walk_iter + 1), '/', str(num_walks))
72 | random.shuffle(nodes)
73 | for node in nodes:
74 | walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
75 |
76 | return walks
77 |
78 | def get_alias_edge(self, src, dst):
79 | '''
80 | Get the alias edge setup lists for a given edge.
81 | '''
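# node2vec second-order bias (p = return parameter, q = in-out parameter): stepping
# straight back to the source is weighted weight/p, neighbours of dst that also link
# back to the source keep weight, and all other neighbours are weighted weight/q.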
82 | G = self.G
83 | p = self.p
84 | q = self.q
85 |
86 | unnormalized_probs = []
87 | for dst_nbr in sorted(G.neighbors(dst)):
88 | if dst_nbr == src:
89 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
90 | elif G.has_edge(dst_nbr, src):
91 | unnormalized_probs.append(G[dst][dst_nbr]['weight'])
92 | else:
93 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
94 | norm_const = sum(unnormalized_probs)
95 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
96 |
97 | return alias_setup(normalized_probs)
98 |
99 | def preprocess_transition_probs(self):
100 | '''
101 | Preprocessing of transition probabilities for guiding the random walks.
102 | '''
103 | G = self.G
104 | is_directed = self.is_directed
105 |
106 | alias_nodes = {}
107 | for node in G.nodes():
108 | unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
109 | norm_const = sum(unnormalized_probs)
110 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
111 | alias_nodes[node] = alias_setup(normalized_probs)
112 |
113 | alias_edges = {}
114 | triads = {}
115 |
116 | if is_directed:
117 | for edge in G.edges():
118 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
119 | else:
120 | for edge in G.edges():
121 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
122 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
123 |
124 | self.alias_nodes = alias_nodes
125 | self.alias_edges = alias_edges
126 |
127 | return
128 |
129 |
130 | def alias_setup(probs):
131 | '''
132 | Compute utility lists for non-uniform sampling from discrete distributions.
133 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
134 | for details
135 | '''
136 | K = len(probs)
137 | q = np.zeros(K)
138 | J = np.zeros(K, dtype=np.int)
139 |
140 | smaller = []
141 | larger = []
142 | for kk, prob in enumerate(probs):
143 | q[kk] = K * prob
144 | if q[kk] < 1.0:
145 | smaller.append(kk)
146 | else:
147 | larger.append(kk)
148 |
149 | while len(smaller) > 0 and len(larger) > 0:
150 | small = smaller.pop()
151 | large = larger.pop()
152 |
153 | J[small] = large
154 | q[large] = q[large] + q[small] - 1.0
155 | if q[large] < 1.0:
156 | smaller.append(large)
157 | else:
158 | larger.append(large)
159 |
160 | return J, q
161 |
162 |
163 | def alias_draw(J, q):
164 | '''
165 | Draw sample from a non-uniform discrete distribution using alias sampling.
166 | '''
167 | K = len(J)
168 |
169 | kk = int(np.floor(np.random.rand() * K))
170 | if np.random.rand() < q[kk]:
171 | return kk
172 | else:
173 | return J[kk]
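# Usage sketch (hypothetical probabilities): build the tables once with alias_setup,
# then each alias_draw call samples an index in O(1), e.g.
#   J, q = alias_setup([0.5, 0.3, 0.2])
#   idx = alias_draw(J, q)  # 0, 1 or 2, drawn approximately with those probabilities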
174 |
175 |
176 | """
177 | parser.add_argument('--walk-length', type=int, default=80,
178 | help='Length of walk per source. Default is 80.')
179 |
180 | parser.add_argument('--num-walks', type=int, default=10,
181 | help='Number of walks per source. Default is 10.')
182 |
183 | parser.add_argument('--workers', type=int, default=8,
184 | help='Number of parallel workers. Default is 8.')
185 |
186 | parser.add_argument('--p', type=float, default=1,
187 | help='Return hyperparameter. Default is 1.')
188 |
189 | parser.add_argument('--q', type=float, default=1,
190 | help='Inout hyperparameter. Default is 1.')
191 |
192 | """
193 |
194 | def generate_random_walks_from_assertions():
195 | p = 1.0 # return hyperparameter
196 | q = 1.0 # inout hyperparameter
197 | is_directed = True # whether the graph is directed
198 |     num_walks = 2 # number of random walks per source def. 10
199 | walk_length = 15 # length of walk per source def. 80
200 |
201 | nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
202 | G = Graph(nx_G, is_directed, p, q)
203 | G.preprocess_transition_probs()
204 | walks = G.simulate_walks(num_walks, walk_length)
205 | filename = "./randomwalks/random_walk_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p"
206 | with open(filename, 'wb') as handle:
207 | pickle.dump(walks, handle)
208 | print(len(walks))
209 |
210 |
211 | def analyze_graph():
212 | nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
213 | print("%d nodes in the graph" % nx_G.number_of_nodes())
214 | print("%d edges in the graph" % nx_G.number_of_edges())
215 | print("%f density of graph" % nx.density(nx_G))
216 | #print("%f density of graph" % nx.number_of_selfloops(nx_G))
217 | print("%s" % nx.info(nx_G))
218 | print("%f avg in-degree" % float(float(sum(nx_G.in_degree().values()))/float(len(nx_G.in_degree().values()))))
219 | print("%f min in-degree" % float(float(min(nx_G.in_degree().values()))))
220 | print("%f max in-degree" % float(float(max(nx_G.in_degree().values()))))
221 | print("%f std in-degree" % float(float(np.std(np.array([float(v) for v in nx_G.in_degree().values()], dtype=np.float)))))
222 | print("%f avg in-degree" % float(float(np.average(np.array([float(v) for v in nx_G.in_degree().values()], dtype=np.float)))))
223 |
224 | print("%f avg out-degree" % float(float(sum(nx_G.out_degree().values()))/float(len(nx_G.out_degree().values()))))
225 | print("%f min out-degree" % float(float(min(nx_G.out_degree().values()))))
226 | print("%f max out-degree" % float(float(max(nx_G.out_degree().values()))))
227 | print("%f std out-degree" % float(float(np.std(np.array([float(v) for v in nx_G.out_degree().values()], dtype=np.float)))))
228 | print("%f avg out-degree" % float(float(np.average(np.array([float(v) for v in nx_G.out_degree().values()], dtype=np.float)))))
229 |
230 |
231 | comps_strong = list(nx.strongly_connected_component_subgraphs(nx_G))
232 | print("%d num strongly connected components" % len(comps_strong))
233 | comps_weak = list(nx.weakly_connected_component_subgraphs(nx_G))
234 | print("%d num weakly connected components" % len(comps_weak))
235 | diameters=[]
236 | for c in comps_strong:
237 | diameters.append(nx.diameter(c))
238 | print("Avg diameter %f for strongly connected components" % float(sum(diameters)/len(diameters)))
239 | print("Max diameter %f for strongly connected components" % max(diameters))
240 | print("Min diameter %f for strongly connected components" % min(diameters))
241 | print("%f std diameter" % float(float(np.std(np.array(diameters, dtype=np.float)))))
242 | print("%f avg diameter" % float(float(np.average(np.array(diameters, dtype=np.float)))))
243 |
244 |
245 | def load_random_walk(p):
246 | walk = pickle.load(open(p, 'rb'))
247 | return walk
248 |
249 |
250 | def main():
251 | generate_random_walks_from_assertions()
252 | #analyze_graph()
253 | # load_random_walk(p="./randomwalks/random_walk_1.0_1.0_2_10.p")
254 |
255 | if __name__=="__main__":
256 | main()
257 |
--------------------------------------------------------------------------------
/results_utility/fetcher.py:
--------------------------------------------------------------------------------
1 | import os
2 | import codecs
3 | import csv
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import seaborn as sns
7 | import pandas as pd
8 |
9 | def fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/", subdir="free-wo-nsp/"):  # alternative: subdir="base_16_longer/"
10 | """
11 | :param base_path:
12 | :param subdir:
13 | :return:
14 | >>> fetch_results()
15 | """
16 | path = base_path + subdir
17 | result_dict = {}
18 | for root, dirs, files in os.walk(path):
19 | for f in files:
20 | if f == "eval_results.txt":
21 | id = root.split("/")[-1]
22 | task = "_".join(id.split("_")[:1])
23 | hyperparams = "_".join(id.split("_")[1:])
24 | train_step = root.split("/")[-2]
25 | if train_step in result_dict:
26 | train_step_dict = result_dict[train_step]
27 | else:
28 | train_step_dict = {}
29 | with codecs.open(os.path.join(root, f), "r", "utf8") as file:
30 | file_dict = {}
31 | for line in file.readlines():
32 | key = line.split(" = ")[0]
33 | try:
34 | value = float(line.split(" = ")[1].strip())
35 | file_dict[key] = value
36 | except Exception as e:
37 | print(e)
38 | if task not in train_step_dict:
39 | train_step_dict[task] = {}
40 | train_step_dict[task][hyperparams] = file_dict
41 | result_dict[train_step] = train_step_dict
42 | filtered_list = []
43 | for train_step, train_step_dict in result_dict.items():
44 | for task, task_dict in train_step_dict.items():
45 | if task in ["MRPC", "RTE", "MNLI", "QQP", "SST2", "QNLI", "QNLIV2"]:
46 | measure = "eval_accuracy"
47 | elif task in ["CoLA"]:
48 | measure = "mcc"
49 | elif task in ["STSB"]:
50 | measure = "spearman"
51 | else:
52 |                 print("Task name not in list: %s" % task)
53 | if len(task_dict) < 4:
54 |                 print("Task %s result dict for train step %s does not have all hyperparam results" % (task, train_step))
55 | break
56 | else:
57 | best_config = ""
58 | best_result = 0.0
59 | for i, (config, result) in enumerate(task_dict.items()):
60 | if result[measure] >= best_result:
61 | best_config = config
62 | best_result = result[measure]
63 | filtered_list.append({"train_step": train_step, "task": task, "hyperparams": best_config, "score": best_result})
64 | return filtered_list
65 |
66 |
67 | def output_results_as_csv(filtered_list, output_path="./../finetuning/poc_over_time/wn_binary.csv"):
68 | csv_keys = list(filtered_list[0].keys())
69 | with open(output_path, 'w') as output_file:
70 | dict_writer = csv.DictWriter(output_file, csv_keys)
71 | dict_writer.writeheader()
72 | dict_writer.writerows(filtered_list)
73 |
74 | def plot_task(task="CoLA", output_path="./../finetuning/poc_over_time/cola2.pdf"):
75 | """
76 | :param output_path:
77 | :param task:
78 | :return:
79 | >>> plot_task()
80 | """
81 | filtered_list_wn = fetch_results(subdir="wn_binary") + fetch_results(subdir="wn_binary_16_longer")
82 |     # drop the ill-formed "stsb_first" STS-B runs
83 | for d in filtered_list_wn:
84 | if d["train_step"] != "stsb_first":
85 | d["train_step"] = int(d["train_step"])/2
86 | d["model"] = "informed"
87 | filtered_list_base = fetch_results(subdir="base_16") + fetch_results(subdir="base_16_longer")
88 | for d in filtered_list_base:
89 | d["train_step"] = int(d["train_step"])
90 | d["model"] = "base"
91 | filtered_list_wn = [d for d in filtered_list_wn if d["train_step"] != "stsb_first" and d["task"]==task and d["train_step"] == 1000000]
92 | filtered_list_base = [d for d in filtered_list_base if d["task"] == task and d["train_step"] == 1000000]
93 | filtered_list_wn = sorted(filtered_list_wn, key=lambda k: k['train_step'])
94 | filtered_list_base = sorted(filtered_list_base, key=lambda k: k['train_step'])
95 |
96 | # aligned_wn = []
97 | # aligned_base = []
98 | # ind = []
99 | # for d_base in filtered_list_base:
100 | # for d_wn in filtered_list_wn:
101 | # if int(d_base["train_step"])*2 == int(d_wn["train_step"]):
102 | # aligned_base.append(d_base["score"])
103 | # aligned_wn.append(d_wn["score"])
104 | # ind.append(d_base["train_step"])
105 | # break
106 | all = filtered_list_wn + filtered_list_base
107 | df = pd.DataFrame(all)
108 |
109 | sns.set()
110 |
111 | with sns.plotting_context("paper"):
112 | #ind = lm_steps # the x locations for the groups
113 |
114 | fig, ax = plt.subplots()
115 |
116 | sns.lineplot(x="train_step", y="score", hue="model", style="model", data=df)
117 | #plt.title(task)
118 |
119 | ax.set(xlabel='Language Modeling Steps', ylabel='Accuracy')
120 | ax.yaxis.grid(True, linestyle="dotted")
121 | ax.xaxis.grid(True, linestyle="dotted")
122 |
123 | fig.savefig(output_path)
124 | #plt.show()
125 | print("Done")
126 |
127 |
128 | def main():
129 | #filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/", subdir="nl-adapter/")
130 | filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/",
131 | subdir="nl-adapter_tune_all")
132 | output_results_as_csv(filtered_list,
133 | output_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/results_filtered.csv")
134 | filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/",
135 | subdir="nl-adapter_tune_all_quick_insight")
136 | output_results_as_csv(filtered_list,
137 | output_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/results_filtered.csv")
138 | filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/", subdir="free-wo-nsp-adapter_tune_all/")
139 | output_results_as_csv(filtered_list, output_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/free-wo-nsp-adapter_tune_all/results_filtered.csv")
140 | filtered_list = fetch_results(base_path="/work/anlausch/replant/bert/finetuning/poc_over_time/", subdir="wn_binary_16_longer/")
141 | output_results_as_csv(filtered_list, output_path="/work/anlausch/replant/bert/finetuning/poc_over_time/wn_binary_16_longer/results_filtered.csv")
142 | filtered_list = fetch_results(base_path="/work/anlausch/replant/bert/finetuning/poc_over_time/", subdir="base_16_longer/")
143 | output_results_as_csv(filtered_list, output_path="/work/anlausch/replant/bert/finetuning/poc_over_time/base_16_longer/results_filtered.csv")
144 | #output_results_as_csv(filtered_list, output_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/free-wo-nsp-adapter_tune_all/results_filtered.csv")
145 |
146 | if __name__=="__main__":
147 | main()
--------------------------------------------------------------------------------
/results_utility/parse_predictions.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import numpy as np
3 | import os
4 | import argparse
5 |
6 | def parse_predictions(input_path, output_path, task="STSB"):
7 | """
8 | :param input_path:
9 | :param output_path:
10 | :param task:
11 | :return:
12 | >>> parse_predictions("/work/anlausch/replant/bert/predictions/wn_binary/mnli_neu_32_5e-05_3.0/test_results.tsv", "/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/MNLI-mm-neu.tsv", task="MNLI")
13 | """
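# Example (hypothetical test_results.tsv row for a binary task): "0.1\t0.9" has its
# argmax at index 1, so the second label returned by the processor's get_labels() is
# written for that row; for STSB the first column is used directly as the regression value.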
14 | if task != "STSB":
15 | import run_classifier
16 | else:
17 | import run_regression
18 | predicted_labels = []
19 | if task == "MRPC":
20 | #ids = MrpcProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/MRPC")
21 | labels = run_classifier.MrpcProcessor().get_labels()
22 | if task == "RTE":
23 | labels = run_classifier.RTEProcessor().get_labels()
24 | if task == "QNLI":
25 | labels = run_classifier.QNLIProcessor().get_labels()
26 | if task == "QNLIV2":
27 | labels = run_classifier.QNLIProcessor().get_labels()
28 | if task == "MNLI":
29 | labels = run_classifier.MnliProcessor().get_labels()
30 | if task == "SST2":
31 | labels = run_classifier.SST2Processor().get_labels()
32 | if task == "CoLA":
33 | labels = run_classifier.ColaProcessor().get_labels()
34 | if task == "QQP":
35 | labels = run_classifier.QQPProcessor().get_labels()
36 | if task == "diagnostic":
37 | labels = run_classifier.DiagnosticProcessor().get_labels()
38 | with codecs.open(input_path, "r", "utf8") as f_in:
39 | for line in f_in.readlines():
40 | predictions = np.array(line.split("\t"), dtype=np.float32)
41 | if task != "STSB":
42 | predicted_index = np.argmax(predictions)
43 | predicted_labels.append(labels[predicted_index])
44 | else:
45 | predicted_labels.append(predictions[0])
46 | f_in.close()
47 | with codecs.open(output_path, "w", "utf8") as f_out:
48 | f_out.write("index\tprediction\n")
49 | for i, prediction in enumerate(predicted_labels):
50 | f_out.write(str(i) + "\t" + str(prediction) + "\n")
51 | f_out.close()
52 |
53 |
54 | def write_fake_predictions(output_path, task="MRPC"):
55 | """
56 | :param input_path:
57 | :param output_path:
58 | :param task:
59 | :return:
60 | >>> write_fake_predictions("/work/anlausch/replant/bert/predictions/base_32_5e-05_3.0/copy_for_submission/fakes/STS-B.tsv", task="STSB")
61 | """
62 | if task != "STSB":
63 | import run_classifier
64 | else:
65 | import run_regression
66 | if task == "MNLI":
67 | test_examples = run_classifier.MnliProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task, False)
68 | labels = run_classifier.MnliProcessor().get_labels()
69 | elif task == "QQP":
70 | test_examples = run_classifier.QQPProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task)
71 | labels = run_classifier.QQPProcessor().get_labels()
72 | elif task == "WNLI":
73 | test_examples = run_classifier.WNLIProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task)
74 | labels = run_classifier.WNLIProcessor().get_labels()
75 | elif task == "CoLA":
76 | test_examples = run_classifier.ColaProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task)
77 | labels = run_classifier.ColaProcessor().get_labels()
78 | elif task == "STSB":
79 | test_examples = run_regression.STSBProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task)
80 | elif task == "diagnostic":
81 | test_examples = run_classifier.DiagnosticProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task)
82 | labels = run_classifier.DiagnosticProcessor().get_labels()
83 | with codecs.open(output_path, "w", "utf8") as f_out:
84 | f_out.write("index\tprediction\n")
85 | if task != "STSB":
86 | for i, data in enumerate(test_examples):
87 | f_out.write(str(i) + "\t" + str(labels[0]) + "\n")
88 | else:
89 | for i, data in enumerate(test_examples):
90 | f_out.write(str(i) + "\t" + str(2.5) + "\n")
91 | f_out.close()
92 |
93 |
94 |
95 | def main():
96 | parser = argparse.ArgumentParser(description="Running prediction parser")
97 | parser.add_argument("--task", type=str, default=None,
98 |                         help="Task name to parse, e.g. MRPC, RTE, QNLI, MNLI, SST2, CoLA, QQP, STSB or diagnostic", required=True)
99 | parser.add_argument("--input_path", type=str, default="/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/test_results.tsv",
100 |                         help="Path to the test_results.tsv file containing the raw model predictions", required=False)
101 | parser.add_argument("--output_path_root", type=str, default="/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/",
102 |                         help="Directory in which the parsed <task>.tsv predictions file is written", required=False)
103 |
104 | args = parser.parse_args()
105 | task = args.task
106 | input_path = args.input_path
107 | root = args.output_path_root
108 | output_path = root + str(task) + ".tsv"
109 | parse_predictions(input_path, output_path, task)
110 |
111 | if __name__ == "__main__":
112 | main()
--------------------------------------------------------------------------------
/retrograph/__init__.py:
--------------------------------------------------------------------------------
1 | #####################################################
2 | # coding=utf-8
3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | ####################################################
17 |
18 |
19 | ####################################################
20 | # IMPORT STATEMENTS
21 | ####################################################
22 |
23 | # >>>>>> Native Imports <<<<<<<
24 |
25 | # >>>>>> Package Imports <<<<<<<
26 |
27 | # >>>>>> Local Imports <<<<<<<
28 |
29 |
30 | ####################################################
31 | # CODE
32 | ####################################################
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | ####################################################
41 | # MAIN
42 | ####################################################
43 |
44 |
45 | # EOF
46 |
--------------------------------------------------------------------------------
/retrograph/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | #####################################################
2 | # coding=utf-8
3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | ####################################################
17 |
18 |
19 | ####################################################
20 | # IMPORT STATEMENTS
21 | ####################################################
22 |
23 | # >>>>>> Native Imports <<<<<<<
24 |
25 | # >>>>>> Package Imports <<<<<<<
26 |
27 | # >>>>>> Local Imports <<<<<<<
28 |
29 |
30 | ####################################################
31 | # CODE
32 | ####################################################
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | ####################################################
41 | # MAIN
42 | ####################################################
43 |
44 |
45 | # EOF
46 |
--------------------------------------------------------------------------------
/retrograph/modeling/metrics_extension.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.eager import context
3 | from tensorflow.python.framework import dtypes
4 | from tensorflow.python.framework import ops
5 | from tensorflow.python.framework import sparse_tensor
6 | from tensorflow.python.ops import array_ops
7 | from tensorflow.python.ops import check_ops
8 | from tensorflow.python.ops import confusion_matrix
9 | from tensorflow.python.ops import control_flow_ops
10 | from tensorflow.python.ops import math_ops
11 | from tensorflow.python.ops import nn
12 | from tensorflow.python.ops import sets
13 | from tensorflow.python.ops import sparse_ops
14 | from tensorflow.python.ops import state_ops
15 | from tensorflow.python.ops import variable_scope
16 | from tensorflow.python.ops import weights_broadcast_ops
17 | from tensorflow.python.platform import tf_logging as logging
18 | from tensorflow.python.training import distribution_strategy_context
19 | from tensorflow.python.util.deprecation import deprecated
20 | from tensorflow.python.util.tf_export import tf_export
21 |
22 | def mcc(labels,
23 | predictions,
24 | weights=None,
25 | metrics_collections=None,
26 | updates_collections=None,
27 | name=None):
28 |
29 | if context.executing_eagerly():
30 | raise RuntimeError('mcc is not '
31 | 'supported when eager execution is enabled.')
32 |
33 | with tf.variable_scope(name, 'mcc',(predictions, labels, weights)):
34 |
35 | predictions, labels, weights = _remove_squeezable_dimensions(
36 | predictions=math_ops.cast(predictions, dtype=dtypes.bool),
37 | labels=math_ops.cast(labels, dtype=dtypes.bool),
38 | weights=weights)
39 |
40 | true_p, true_positives_update_op = tf.metrics.true_positives(
41 | labels,
42 | predictions,
43 | weights,
44 | metrics_collections=None,
45 | updates_collections=None,
46 | name=None)
47 | false_p, false_positives_update_op = tf.metrics.false_positives(
48 | labels,
49 | predictions,
50 | weights,
51 | metrics_collections=None,
52 | updates_collections=None,
53 | name=None)
54 | true_n, true_negatives_update_op = tf.metrics.true_negatives(
55 | labels,
56 | predictions,
57 | weights,
58 | metrics_collections=None,
59 | updates_collections=None,
60 | name=None)
61 | false_n, false_negatives_update_op = tf.metrics.false_negatives(
62 | labels,
63 | predictions,
64 | weights,
65 | metrics_collections=None,
66 | updates_collections=None,
67 | name=None)
68 |
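# Matthews correlation coefficient, computed from the streaming counts above:
#   MCC = (TP*TN - FP*FN) / sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))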
69 | def compute_mcc(tp, fp, tn, fn, name):
70 | return tf.math.divide(
71 | tf.math.subtract(
72 | tf.math.multiply(tp,tn),
73 | tf.math.multiply(fp,fn))
74 | ,tf.sqrt(
75 | tf.math.multiply(
76 | tf.math.multiply(tf.math.add(tp,fp),tf.math.add(tp,fn)),
77 | tf.math.multiply(tf.math.add(tn,fp),tf.math.add(tn,fn)))), name=name)
78 |
79 | def once_across_towers(_, true_p, false_p, true_n, false_n):
80 | return compute_mcc(true_p, false_p, true_n, false_n, 'value')
81 |
82 | mcc = _aggregate_across_towers(metrics_collections, once_across_towers,
83 | true_p, false_p, true_n, false_n)
84 |
85 | update_op = compute_mcc(true_positives_update_op,
86 | false_positives_update_op, true_negatives_update_op, false_negatives_update_op, 'update_op')
87 | if updates_collections:
88 | ops.add_to_collections(updates_collections, update_op)
89 |
90 | return mcc, update_op
91 |
92 |
93 | def _remove_squeezable_dimensions(predictions, labels, weights):
94 | """Squeeze or expand last dim if needed.
95 |
96 | Squeezes last dim of `predictions` or `labels` if their rank differs by 1
97 | (using confusion_matrix.remove_squeezable_dimensions).
98 | Squeezes or expands last dim of `weights` if its rank differs by 1 from the
99 | new rank of `predictions`.
100 |
101 | If `weights` is scalar, it is kept scalar.
102 |
103 | This will use static shape if available. Otherwise, it will add graph
104 | operations, which could result in a performance hit.
105 |
106 | Args:
107 | predictions: Predicted values, a `Tensor` of arbitrary dimensions.
108 | labels: Optional label `Tensor` whose dimensions match `predictions`.
109 | weights: Optional weight scalar or `Tensor` whose dimensions match
110 | `predictions`.
111 |
112 | Returns:
113 | Tuple of `predictions`, `labels` and `weights`. Each of them possibly has
114 | the last dimension squeezed, `weights` could be extended by one dimension.
115 | """
116 | predictions = ops.convert_to_tensor(predictions)
117 | if labels is not None:
118 | labels, predictions = confusion_matrix.remove_squeezable_dimensions(
119 | labels, predictions)
120 | predictions.get_shape().assert_is_compatible_with(labels.get_shape())
121 |
122 | if weights is None:
123 | return predictions, labels, None
124 |
125 | weights = ops.convert_to_tensor(weights)
126 | weights_shape = weights.get_shape()
127 | weights_rank = weights_shape.ndims
128 | if weights_rank == 0:
129 | return predictions, labels, weights
130 |
131 | predictions_shape = predictions.get_shape()
132 | predictions_rank = predictions_shape.ndims
133 | if (predictions_rank is not None) and (weights_rank is not None):
134 | # Use static rank.
135 | if weights_rank - predictions_rank == 1:
136 | weights = array_ops.squeeze(weights, [-1])
137 | elif predictions_rank - weights_rank == 1:
138 | weights = array_ops.expand_dims(weights, [-1])
139 | else:
140 | # Use dynamic rank.
141 | weights_rank_tensor = array_ops.rank(weights)
142 | rank_diff = weights_rank_tensor - array_ops.rank(predictions)
143 |
144 | def _maybe_expand_weights():
145 | return control_flow_ops.cond(
146 | math_ops.equal(rank_diff, -1),
147 | lambda: array_ops.expand_dims(weights, [-1]), lambda: weights)
148 |
149 | # Don't attempt squeeze if it will fail based on static check.
150 | if ((weights_rank is not None) and
151 | (not weights_shape.dims[-1].is_compatible_with(1))):
152 | maybe_squeeze_weights = lambda: weights
153 | else:
154 | maybe_squeeze_weights = lambda: array_ops.squeeze(weights, [-1])
155 |
156 | def _maybe_adjust_weights():
157 | return control_flow_ops.cond(
158 | math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
159 | _maybe_expand_weights)
160 |
161 | # If weights are scalar, do nothing. Otherwise, try to add or remove a
162 | # dimension to match predictions.
163 | weights = control_flow_ops.cond(
164 | math_ops.equal(weights_rank_tensor, 0), lambda: weights,
165 | _maybe_adjust_weights)
166 | return predictions, labels, weights
167 |
168 | def _aggregate_across_towers(metrics_collections, metric_value_fn, *args):
169 | """Aggregate metric value across towers."""
170 | def fn(distribution, *a):
171 | """Call `metric_value_fn` in the correct control flow context."""
172 | if hasattr(distribution, '_outer_control_flow_context'):
173 | # If there was an outer context captured before this method was called,
174 | # then we enter that context to create the metric value op. If the
175 |       # captured context is `None`, ops.control_dependencies(None) gives the
176 | # desired behavior. Else we use `Enter` and `Exit` to enter and exit the
177 | # captured context.
178 | # This special handling is needed because sometimes the metric is created
179 | # inside a while_loop (and perhaps a TPU rewrite context). But we don't
180 | # want the value op to be evaluated every step or on the TPU. So we
181 | # create it outside so that it can be evaluated at the end on the host,
182 |       # once the update ops have been evaluated.
183 |
184 | # pylint: disable=protected-access
185 | if distribution._outer_control_flow_context is None:
186 | with ops.control_dependencies(None):
187 | metric_value = metric_value_fn(distribution, *a)
188 | else:
189 | distribution._outer_control_flow_context.Enter()
190 | metric_value = metric_value_fn(distribution, *a)
191 | distribution._outer_control_flow_context.Exit()
192 | # pylint: enable=protected-access
193 | else:
194 | metric_value = metric_value_fn(distribution, *a)
195 | if metrics_collections:
196 | ops.add_to_collections(metrics_collections, metric_value)
197 | return metric_value
198 |
199 | return distribution_strategy_context.get_tower_context().merge_call(fn, *args)
200 |
--------------------------------------------------------------------------------
/retrograph/modeling/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | new_global_step = global_step + 1
80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
81 | return train_op
82 |
83 |
84 | def create_optimizer_multitask(standard_loss, wn_loss, selected_task_id, wn_upper_bound, init_lr, num_train_steps, num_warmup_steps, use_tpu):
85 | """Creates an optimizer training op."""
86 | global_step = tf.train.get_or_create_global_step()
87 |
88 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
89 |
90 | # Implements linear decay of the learning rate.
91 | learning_rate = tf.train.polynomial_decay(
92 | learning_rate,
93 | global_step,
94 | num_train_steps,
95 | end_learning_rate=0.0,
96 | power=1.0,
97 | cycle=False)
98 |
99 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
100 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
101 | if num_warmup_steps:
102 | global_steps_int = tf.cast(global_step, tf.int32)
103 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
104 |
105 | global_steps_float = tf.cast(global_steps_int, tf.float32)
106 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
107 |
108 | warmup_percent_done = global_steps_float / warmup_steps_float
109 | warmup_learning_rate = init_lr * warmup_percent_done
110 |
111 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
112 | learning_rate = (
113 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
114 |
115 | # It is recommended that you use this optimizer for fine tuning, since this
116 | # is how the model was trained (note that the Adam m/v variables are NOT
117 | # loaded from init_checkpoint.)
118 | optimizer = AdamWeightDecayOptimizer(
119 | learning_rate=learning_rate,
120 | weight_decay_rate=0.01,
121 | beta_1=0.9,
122 | beta_2=0.999,
123 | epsilon=1e-6,
124 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
125 |
126 | if use_tpu:
127 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
128 |
129 | multitask_optimizer = tf.contrib.opt.MultitaskOptimizerWrapper(optimizer)
130 | tvars = tf.trainable_variables()
131 |
132 | standard_grads = tf.gradients(standard_loss, tvars)
133 | wn_grads = tf.gradients(wn_loss, tvars)
134 |
135 | # This is how the model was pre-trained.
136 | (standard_grads, _) = tf.clip_by_global_norm(standard_grads, clip_norm=1.0)
137 | (wn_grads, _) = tf.clip_by_global_norm(wn_grads, clip_norm=1.0)
138 |
139 | train_op_standard = multitask_optimizer.apply_gradients(
140 | zip(standard_grads, tvars), global_step=global_step)
141 | train_op_wn = multitask_optimizer.apply_gradients(
142 | zip(wn_grads, tvars), global_step=global_step)
143 |
144 | new_global_step = global_step + 1
145 |
146 | train_op_standard = tf.group(train_op_standard, [global_step.assign(new_global_step)])
147 | train_op_wn = tf.group(train_op_wn, [global_step.assign(new_global_step)])
148 |
149 | # TODO: Check this
150 | #wn_step = tf.Variable(name='wn_step', trainable=False, dtype=tf.int32, initial_value=tf.constant(0))
151 | #bert_step = tf.Variable(name='bert_step', trainable=False, dtype=tf.int32, initial_value=tf.constant(0))
152 |
153 | #(increment_wn, increment_bert) = tf.case(
154 | # [(tf.less(selected_task_id, wn_upper_bound), lambda: (tf.constant(1), tf.constant(0)))],
155 | # default=lambda: (tf.constant(0), tf.constant(1)),
156 | # exclusive=True)
157 |
158 | #bert_step = tf.assign_add(bert_step, increment_bert)
159 | #wn_step = tf.assign_add(wn_step, increment_wn)
160 | #tf.summary.scalar(name='selected_task_id', tensor=selected_task_id)
161 | #tf.summary.scalar(name='wn_step', tensor=wn_step)
162 | #tf.summary.scalar(name='bert_step', tensor=bert_step)
163 | #tf.summary.scalar('gs', global_step)
164 |
165 | #train_op = tf.cond(tf.less(selected_task_id, wn_upper_bound), lambda: train_op_wn, lambda: train_op_standard, name="multitask_train")
166 |
167 | train_op = tf.case([(tf.less(selected_task_id, wn_upper_bound), lambda: train_op_wn)], default=lambda: train_op_standard, exclusive=True)
168 |
169 | return train_op
170 |
171 |
172 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
173 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
174 |
175 | def __init__(self,
176 | learning_rate,
177 | weight_decay_rate=0.0,
178 | beta_1=0.9,
179 | beta_2=0.999,
180 | epsilon=1e-6,
181 | exclude_from_weight_decay=None,
182 | name="AdamWeightDecayOptimizer"):
183 | """Constructs a AdamWeightDecayOptimizer."""
184 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
185 |
186 | self.learning_rate = learning_rate
187 | self.weight_decay_rate = weight_decay_rate
188 | self.beta_1 = beta_1
189 | self.beta_2 = beta_2
190 | self.epsilon = epsilon
191 | self.exclude_from_weight_decay = exclude_from_weight_decay
192 |
193 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
194 | """See base class."""
195 | assignments = []
196 | for (grad, param) in grads_and_vars:
197 | if grad is None or param is None:
198 | continue
199 |
200 | param_name = self._get_variable_name(param.name)
201 |
202 | # TODO: Check this carefully, because I added the variable reuse
203 | with tf.variable_scope("adam", reuse=tf.AUTO_REUSE):
204 | m = tf.get_variable(
205 | name=param_name + "/adam_m",
206 | shape=param.shape.as_list(),
207 | dtype=tf.float32,
208 | trainable=False,
209 | initializer=tf.zeros_initializer())
210 | v = tf.get_variable(
211 | name=param_name + "/adam_v",
212 | shape=param.shape.as_list(),
213 | dtype=tf.float32,
214 | trainable=False,
215 | initializer=tf.zeros_initializer())
216 |
217 | # Standard Adam update.
218 | next_m = (
219 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
220 | next_v = (
221 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
222 | tf.square(grad)))
223 |
224 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
225 |
226 | # Just adding the square of the weights to the loss function is *not*
227 | # the correct way of using L2 regularization/weight decay with Adam,
228 | # since that will interact with the m and v parameters in strange ways.
229 | #
230 |       # Instead we want to decay the weights in a manner that doesn't interact
231 | # with the m/v parameters. This is equivalent to adding the square
232 | # of the weights to the loss with plain (non-momentum) SGD.
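# The resulting update (with no Adam bias correction, as in the original BERT code) is:
#   param <- param - learning_rate * (m / (sqrt(v) + epsilon) + weight_decay_rate * param)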
233 | if self._do_use_weight_decay(param_name):
234 | update += self.weight_decay_rate * param
235 |
236 | update_with_lr = self.learning_rate * update
237 |
238 | next_param = param - update_with_lr
239 |
240 | assignments.extend(
241 | [param.assign(next_param),
242 | m.assign(next_m),
243 | v.assign(next_v)])
244 | return tf.group(*assignments, name=name)
245 |
246 | def _do_use_weight_decay(self, param_name):
247 | """Whether to use L2 weight decay for `param_name`."""
248 | if not self.weight_decay_rate:
249 | return False
250 | if self.exclude_from_weight_decay:
251 | for r in self.exclude_from_weight_decay:
252 | if re.search(r, param_name) is not None:
253 | return False
254 | return True
255 |
256 | def _get_variable_name(self, param_name):
257 | """Get the variable name from the tensor name."""
258 | m = re.match("^(.*):\\d+$", param_name)
259 | if m is not None:
260 | param_name = m.group(1)
261 | return param_name
262 |
--------------------------------------------------------------------------------
/retrograph/modeling/optimization_adapter.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | adapter_weight_decay_rate=0.01,
63 | beta_1=0.9,
64 | beta_2=0.999,
65 | epsilon=1e-6,
66 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
67 |
68 | if use_tpu:
69 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
70 |
71 | tvars = []
72 | for collection in ["adapters", "layer_norm", "head"]:
73 | tvars += tf.get_collection(collection)
74 | grads = tf.gradients(loss, tvars)
75 |
76 | # This is how the model was pre-trained.
77 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
78 |
79 | train_op = optimizer.apply_gradients(
80 | zip(grads, tvars), global_step=global_step)
81 |
82 | # Normally the global step update is done inside of `apply_gradients`.
83 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
84 | # a different optimizer, you should probably take this line out.
85 | new_global_step = global_step + 1
86 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
87 | return train_op
88 |
89 |
90 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
91 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
92 |
93 | def __init__(self,
94 | learning_rate,
95 | weight_decay_rate=0.0,
96 | adapter_weight_decay_rate=0.0,
97 | beta_1=0.9,
98 | beta_2=0.999,
99 | epsilon=1e-6,
100 | exclude_from_weight_decay=None,
101 | name="AdamWeightDecayOptimizer"):
102 | """Constructs a AdamWeightDecayOptimizer."""
103 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
104 |
105 | self.learning_rate = learning_rate
106 | self.weight_decay_rate = weight_decay_rate
107 | self.adapter_weight_decay_rate = adapter_weight_decay_rate
108 | self.beta_1 = beta_1
109 | self.beta_2 = beta_2
110 | self.epsilon = epsilon
111 | self.exclude_from_weight_decay = exclude_from_weight_decay
112 | self._adapter_variable_names = {
113 | self._get_variable_name(v.name) for v in tf.get_collection("adapters")
114 | }
115 |
116 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
117 | """See base class."""
118 | assignments = []
119 | for (grad, param) in grads_and_vars:
120 | if grad is None or param is None:
121 | continue
122 |
123 | param_name = self._get_variable_name(param.name)
124 |
125 | m = tf.get_variable(
126 | name=param_name + "/adam_m",
127 | shape=param.shape.as_list(),
128 | dtype=tf.float32,
129 | trainable=False,
130 | initializer=tf.zeros_initializer())
131 | v = tf.get_variable(
132 | name=param_name + "/adam_v",
133 | shape=param.shape.as_list(),
134 | dtype=tf.float32,
135 | trainable=False,
136 | initializer=tf.zeros_initializer())
137 |
138 | # Standard Adam update.
139 | next_m = (
140 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
141 | next_v = (
142 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
143 | tf.square(grad)))
144 |
145 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
146 |
147 | # Just adding the square of the weights to the loss function is *not*
148 | # the correct way of using L2 regularization/weight decay with Adam,
149 | # since that will interact with the m and v parameters in strange ways.
150 | #
151 | # Instead we want to decay the weights in a manner that doesn't interact
152 | # with the m/v parameters. This is equivalent to adding the square
153 | # of the weights to the loss with plain (non-momentum) SGD.
154 | if self._do_use_weight_decay(param_name):
155 | if param_name in self._adapter_variable_names:
156 | update += self.adapter_weight_decay_rate * param
157 | else:
158 | update += self.weight_decay_rate * param
159 |
160 | update_with_lr = self.learning_rate * update
161 |
162 | next_param = param - update_with_lr
163 |
164 | assignments.extend(
165 | [param.assign(next_param),
166 | m.assign(next_m),
167 | v.assign(next_v)])
168 | return tf.group(*assignments, name=name)
169 |
170 | def _do_use_weight_decay(self, param_name):
171 | """Whether to use L2 weight decay for `param_name`."""
172 | if param_name in self._adapter_variable_names:
173 | if not self.adapter_weight_decay_rate:
174 | return False
175 | else:
176 | if not self.weight_decay_rate:
177 | return False
178 |
179 | if self.exclude_from_weight_decay:
180 | for r in self.exclude_from_weight_decay:
181 | if re.search(r, param_name) is not None:
182 | return False
183 |
184 | return True
185 |
186 | def _get_variable_name(self, param_name):
187 | """Get the variable name from the tensor name."""
188 | m = re.match("^(.*):\\d+$", param_name)
189 | if m is not None:
190 | param_name = m.group(1)
191 | return param_name
--------------------------------------------------------------------------------
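
create_optimizer above combines linear warmup with polynomial_decay(power=1.0), i.e. a linear ramp from 0 to init_lr over num_warmup_steps followed by a linear decay to 0 at num_train_steps. A plain-Python sketch of that schedule, for reference only (the function name and the example numbers are made up):

    def bert_learning_rate(step, init_lr, num_train_steps, num_warmup_steps):
        # Linear warmup: global_step / num_warmup_steps * init_lr.
        if num_warmup_steps and step < num_warmup_steps:
            return init_lr * step / num_warmup_steps
        # polynomial_decay with power=1.0 and end_learning_rate=0.0 is linear decay.
        progress = min(step, num_train_steps) / num_train_steps
        return init_lr * (1.0 - progress)

    # Example: peak LR 2e-5, 10000 training steps, 1000 warmup steps.
    for step in (0, 500, 1000, 5000, 10000):
        print(step, bert_learning_rate(step, 2e-5, 10000, 1000))
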
/retrograph/modeling/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import unicodedata
23 | import six
24 | import tensorflow as tf
25 | import codecs
26 |
27 | def write_vocab_from_fasttext(path_in, path_out):
28 | """Converts fasttext vectors into simple vocab file
29 | >>> write_vocab_from_fasttext("./../data/fasttext/wiki-news-300d-1M.vec", "./../data/vocab_word_level.txt")
30 | """
31 | with codecs.open(path_in, "r", "utf8") as f_in:
32 | with codecs.open(path_out, "w", "utf8") as f_out:
33 | for i, line in enumerate(f_in.readlines()):
34 | if i == 0:
35 | print(line)
36 | elif i <= 200000:
37 | word = line.split(' ')[0]
38 | print(word)
39 | f_out.write(word)
40 | f_out.write("\n")
41 | f_out.close()
42 | f_in.close()
43 |
44 |
45 | def convert_to_unicode(text):
46 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
47 | if six.PY3:
48 | if isinstance(text, str):
49 | return text
50 | elif isinstance(text, bytes):
51 | return text.decode("utf-8", "ignore")
52 | else:
53 | raise ValueError("Unsupported string type: %s" % (type(text)))
54 | elif six.PY2:
55 | if isinstance(text, str):
56 | return text.decode("utf-8", "ignore")
57 | elif isinstance(text, unicode):
58 | return text
59 | else:
60 | raise ValueError("Unsupported string type: %s" % (type(text)))
61 | else:
62 | raise ValueError("Not running on Python2 or Python 3?")
63 |
64 |
65 | def printable_text(text):
66 | """Returns text encoded in a way suitable for print or `tf.logging`."""
67 |
68 | # These functions want `str` for both Python2 and Python3, but in one case
69 | # it's a Unicode string and in the other it's a byte string.
70 | if six.PY3:
71 | if isinstance(text, str):
72 | return text
73 | elif isinstance(text, bytes):
74 | return text.decode("utf-8", "ignore")
75 | else:
76 | raise ValueError("Unsupported string type: %s" % (type(text)))
77 | elif six.PY2:
78 | if isinstance(text, str):
79 | return text
80 | elif isinstance(text, unicode):
81 | return text.encode("utf-8")
82 | else:
83 | raise ValueError("Unsupported string type: %s" % (type(text)))
84 | else:
85 | raise ValueError("Not running on Python2 or Python 3?")
86 |
87 |
88 | def load_vocab(vocab_file):
89 | """Loads a vocabulary file into a dictionary."""
90 | vocab = collections.OrderedDict()
91 | index = 0
92 | with tf.gfile.GFile(vocab_file, "r") as reader:
93 | while True:
94 | token = convert_to_unicode(reader.readline())
95 | if not token:
96 | break
97 | token = token.strip()
98 | vocab[token] = index
99 | index += 1
100 | return vocab
101 |
102 |
103 | def convert_by_vocab(vocab, items):
104 | """Converts a sequence of [tokens|ids] using the vocab."""
105 | output = []
106 | for item in items:
107 | if item in vocab:
108 | output.append(vocab[item])
109 | else:
110 | return []
111 | return output
112 |
113 |
114 | def convert_tokens_to_ids(vocab, tokens):
115 | return convert_by_vocab(vocab, tokens)
116 |
117 |
118 | def convert_ids_to_tokens(inv_vocab, ids):
119 | return convert_by_vocab(inv_vocab, ids)
120 |
121 |
122 | def whitespace_tokenize(text):
123 | """Runs basic whitespace cleaning and splitting on a piece of text."""
124 | text = text.strip()
125 | if not text:
126 | return []
127 | tokens = text.split()
128 | return tokens
129 |
130 |
131 | class FullTokenizer(object):
132 | """Runs end-to-end tokenziation."""
133 |
134 | def __init__(self, vocab_file, do_lower_case=True):
135 | self.vocab = load_vocab(vocab_file)
136 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
137 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
138 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
139 |
140 | def tokenize(self, text):
141 | split_tokens = []
142 | for token in self.basic_tokenizer.tokenize(text):
143 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
144 | split_tokens.append(sub_token)
145 |
146 | return split_tokens
147 |
148 | def convert_tokens_to_ids(self, tokens):
149 | return convert_by_vocab(self.vocab, tokens)
150 |
151 | def convert_ids_to_tokens(self, ids):
152 | return convert_by_vocab(self.inv_vocab, ids)
153 |
154 |
155 | class BasicTokenizer(object):
156 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
157 |
158 | def __init__(self, do_lower_case=True):
159 | """Constructs a BasicTokenizer.
160 |
161 | Args:
162 | do_lower_case: Whether to lower case the input.
163 | """
164 | self.do_lower_case = do_lower_case
165 |
166 | def tokenize(self, text):
167 | """Tokenizes a piece of text."""
168 | text = convert_to_unicode(text)
169 | text = self._clean_text(text)
170 |
171 | # This was added on November 1st, 2018 for the multilingual and Chinese
172 | # models. This is also applied to the English models now, but it doesn't
173 | # matter since the English models were not trained on any Chinese data
174 | # and generally don't have any Chinese data in them (there are Chinese
175 | # characters in the vocabulary because Wikipedia does have some Chinese
176 | # words in the English Wikipedia.).
177 | text = self._tokenize_chinese_chars(text)
178 |
179 | orig_tokens = whitespace_tokenize(text)
180 | split_tokens = []
181 | for token in orig_tokens:
182 | if self.do_lower_case:
183 | token = token.lower()
184 | token = self._run_strip_accents(token)
185 | split_tokens.extend(self._run_split_on_punc(token))
186 |
187 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
188 | return output_tokens
189 |
190 | def _run_strip_accents(self, text):
191 | """Strips accents from a piece of text."""
192 | text = unicodedata.normalize("NFD", text)
193 | output = []
194 | for char in text:
195 | cat = unicodedata.category(char)
196 | if cat == "Mn":
197 | continue
198 | output.append(char)
199 | return "".join(output)
200 |
201 | def _run_split_on_punc(self, text):
202 | """Splits punctuation on a piece of text."""
203 | chars = list(text)
204 | i = 0
205 | start_new_word = True
206 | output = []
207 | while i < len(chars):
208 | char = chars[i]
209 | if _is_punctuation(char):
210 | output.append([char])
211 | start_new_word = True
212 | else:
213 | if start_new_word:
214 | output.append([])
215 | start_new_word = False
216 | output[-1].append(char)
217 | i += 1
218 |
219 | return ["".join(x) for x in output]
220 |
221 | def _tokenize_chinese_chars(self, text):
222 | """Adds whitespace around any CJK character."""
223 | output = []
224 | for char in text:
225 | cp = ord(char)
226 | if self._is_chinese_char(cp):
227 | output.append(" ")
228 | output.append(char)
229 | output.append(" ")
230 | else:
231 | output.append(char)
232 | return "".join(output)
233 |
234 | def _is_chinese_char(self, cp):
235 | """Checks whether CP is the codepoint of a CJK character."""
236 | # This defines a "chinese character" as anything in the CJK Unicode block:
237 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
238 | #
239 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
240 | # despite its name. The modern Korean Hangul alphabet is a different block,
241 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
242 | # space-separated words, so they are not treated specially and handled
243 | # like all of the other languages.
244 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
245 | (cp >= 0x3400 and cp <= 0x4DBF) or #
246 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
247 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
248 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
249 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
250 | (cp >= 0xF900 and cp <= 0xFAFF) or #
251 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
252 | return True
253 |
254 | return False
255 |
256 | def _clean_text(self, text):
257 | """Performs invalid character removal and whitespace cleanup on text."""
258 | output = []
259 | for char in text:
260 | cp = ord(char)
261 | if cp == 0 or cp == 0xfffd or _is_control(char):
262 | continue
263 | if _is_whitespace(char):
264 | output.append(" ")
265 | else:
266 | output.append(char)
267 | return "".join(output)
268 |
269 |
270 | class WordpieceTokenizer(object):
271 | """Runs WordPiece tokenziation."""
272 |
273 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
274 | self.vocab = vocab
275 | self.unk_token = unk_token
276 | self.max_input_chars_per_word = max_input_chars_per_word
277 |
278 | def tokenize(self, text):
279 | """Tokenizes a piece of text into its word pieces.
280 |
281 | This uses a greedy longest-match-first algorithm to perform tokenization
282 | using the given vocabulary.
283 |
284 | For example:
285 | input = "unaffable"
286 | output = ["un", "##aff", "##able"]
287 |
288 | Args:
289 | text: A single token or whitespace separated tokens. This should have
290 | already been passed through `BasicTokenizer`.
291 |
292 | Returns:
293 | A list of wordpiece tokens.
294 | """
295 |
296 | text = convert_to_unicode(text)
297 |
298 | output_tokens = []
299 | for token in whitespace_tokenize(text):
300 | chars = list(token)
301 | if len(chars) > self.max_input_chars_per_word:
302 | output_tokens.append(self.unk_token)
303 | continue
304 |
305 | is_bad = False
306 | start = 0
307 | sub_tokens = []
308 | while start < len(chars):
309 | end = len(chars)
310 | cur_substr = None
311 | while start < end:
312 | substr = "".join(chars[start:end])
313 | if start > 0:
314 | substr = "##" + substr
315 | if substr in self.vocab:
316 | cur_substr = substr
317 | break
318 | end -= 1
319 | if cur_substr is None:
320 | is_bad = True
321 | break
322 | sub_tokens.append(cur_substr)
323 | start = end
324 |
325 | if is_bad:
326 | output_tokens.append(self.unk_token)
327 | else:
328 | output_tokens.extend(sub_tokens)
329 | return output_tokens
330 |
331 |
332 | def _is_whitespace(char):
333 | """Checks whether `chars` is a whitespace character."""
334 | # \t, \n, and \r are technically control characters but we treat them
335 | # as whitespace since they are generally considered as such.
336 | if char == " " or char == "\t" or char == "\n" or char == "\r":
337 | return True
338 | cat = unicodedata.category(char)
339 | if cat == "Zs":
340 | return True
341 | return False
342 |
343 |
344 | def _is_control(char):
345 | """Checks whether `chars` is a control character."""
346 | # These are technically control characters but we count them as whitespace
347 | # characters.
348 | if char == "\t" or char == "\n" or char == "\r":
349 | return False
350 | cat = unicodedata.category(char)
351 | if cat.startswith("C"):
352 | return True
353 | return False
354 |
355 |
356 | def _is_punctuation(char):
357 | """Checks whether `chars` is a punctuation character."""
358 | cp = ord(char)
359 | # We treat all non-letter/number ASCII as punctuation.
360 | # Characters such as "^", "$", and "`" are not in the Unicode
361 | # Punctuation class but we treat them as punctuation anyways, for
362 | # consistency.
363 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
364 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
365 | return True
366 | cat = unicodedata.category(char)
367 | if cat.startswith("P"):
368 | return True
369 | return False
370 |
--------------------------------------------------------------------------------
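
To see the greedy longest-match-first rule of WordpieceTokenizer.tokenize in isolation, here is a self-contained re-implementation of its inner loop with a hand-built toy vocabulary; the vocabulary and the example word are invented for illustration and are not shipped with the repository.

    def wordpiece(token, vocab, unk_token="[UNK]"):
        # Greedy longest-match-first split of a single token,
        # mirroring WordpieceTokenizer.tokenize above.
        chars = list(token)
        sub_tokens = []
        start = 0
        while start < len(chars):
            end = len(chars)
            cur_substr = None
            while start < end:
                substr = "".join(chars[start:end])
                if start > 0:
                    substr = "##" + substr  # continuation pieces carry a ## prefix
                if substr in vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                return [unk_token]  # nothing matched: the whole token becomes [UNK]
            sub_tokens.append(cur_substr)
            start = end
        return sub_tokens

    toy_vocab = {"un", "##aff", "##able", "[UNK]"}
    print(wordpiece("unaffable", toy_vocab))  # ['un', '##aff', '##able']
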
/retrograph/training/__init__.py:
--------------------------------------------------------------------------------
1 | #####################################################
2 | # coding=utf-8
3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | ####################################################
17 |
18 |
19 | ####################################################
20 | # IMPORT STATEMENTS
21 | ####################################################
22 |
23 | # >>>>>> Native Imports <<<<<<<
24 |
25 | # >>>>>> Package Imports <<<<<<<
26 |
27 | # >>>>>> Local Imports <<<<<<<
28 |
29 |
30 | ####################################################
31 | # CODE
32 | ####################################################
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | ####################################################
41 | # MAIN
42 | ####################################################
43 |
44 |
45 | # EOF
46 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #####################################################
2 | # coding=utf-8
3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | ####################################################
17 |
18 | from setuptools import setup, find_packages
19 |
20 | setup(
21 | #
22 | # SETUP
23 | #
24 | name ='retrograph',
25 | version ='0.0.0.1',
26 |
27 | description ='Retrograph',
28 | url ='https://github.com/ai-nikolai/Retrograph',
29 | author ='Anne Lauscher, Nikolai Rozanov',
30 | author_email ='nikolai@wluper.com',
31 | license ='Apache 2.0',
32 | #
33 | # Actual packages, data and scripts
34 | #
35 | packages = find_packages(),
36 |
37 | scripts =[],
38 | #
39 | # Requirements
40 | #
41 | install_requires=[],
42 | )
43 |
--------------------------------------------------------------------------------
/siqa_1_download_siqa.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | waws --downloadS3 -f socialIQa_v1.4.zip -b wluper-retrograph
3 | mkdir -p data/SIQA
4 | unzip socialIQa_v1.4.zip
5 | mv socialIQa_v1.4_dev.jsonl data/SIQA
6 | mv socialIQa_v1.4_trn.jsonl data/SIQA
7 | mv socialIQa_v1.4_tst.jsonl data/SIQA
8 | mv socialIQa_v1.4.zip data
9 |
--------------------------------------------------------------------------------
/siqa_2_finetune_adapters.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser as it optimises BERT
9 | TRAINING_UTILITY=training_utility
10 |
11 | export CUDA_VISIBLE_DEVICES=0
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | BERT_VOCAB=$BERT_DIR/vocab.txt
16 |
17 | TASKNAME='SIQA'
18 | DATA_DIR=data/$TASKNAME
19 |
20 | LEARNING_RATE=2e-5
21 | EPOCHS=3.0
22 | VARIANT=A
23 |
24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT
25 | STEP="150000"
26 |
27 | PRETRAINED_NAME="RW30"
28 | BERT_EXTENDED_DIR="models/1.0_1.0_5_30_full_assertions_nl"
29 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter"
30 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
31 |
32 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/${PRETRAINED_NAME}/${STEP}/${EXPERIMENT_NAME}"
33 |
34 |
35 | python3.6 $TRAINING_UTILITY/run_siqa_adapters.py \
36 | --do_train=true \
37 | --do_eval=true \
38 | --data_dir=$DATA_DIR \
39 | --vocab_file=$BERT_VOCAB \
40 | --bert_config_file=$BERT_CONFIG \
41 | --init_checkpoint=$CHECKPOINT \
42 | --max_seq_length=128 \
43 | --train_batch_size=8 \
44 | --learning_rate=$LEARNING_RATE \
45 | --num_train_epochs=$EPOCHS \
46 | --variant=$VARIANT \
47 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out
48 |
--------------------------------------------------------------------------------
/siqa_2_finetune_bert.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #Step1:
4 | #run_classifier_adapter_tune_all.py ->
5 | #
6 | #
7 | #Need to load the Adapter Model
8 | #Here it is probably recommended to use the original optimiser as it optimises BERT
9 | TRAINING_UTILITY=training_utility
10 |
11 | export CUDA_VISIBLE_DEVICES=7
12 |
13 | BERT_DIR="models/BERT_BASE_UNCASED"
14 | BERT_CONFIG=$BERT_DIR/bert_config.json
15 | BERT_VOCAB=$BERT_DIR/vocab.txt
16 |
17 | TASKNAME='SIQA'
18 | DATA_DIR=data/$TASKNAME
19 |
20 | LEARNING_RATE=1e-5
21 | EPOCHS=2.0
22 | VARIANT=A
23 |
24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT
25 |
26 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter"
27 | # CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP}
28 |
29 | BERT_EXTENDED_DIR=$BERT_DIR
30 | CHECKPOINT=${BERT_EXTENDED_DIR}/bert_model.ckpt
31 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/BERT_BASE/${EXPERIMENT_NAME}"
32 |
33 |
34 | python3.6 $TRAINING_UTILITY/run_siqa.py \
35 | --do_train=true \
36 | --do_eval=true \
37 | --do_predict=true \
38 | --data_dir=$DATA_DIR \
39 | --vocab_file=$BERT_VOCAB \
40 | --bert_config_file=$BERT_CONFIG \
41 | --init_checkpoint=$CHECKPOINT \
42 | --max_seq_length=128 \
43 | --train_batch_size=8 \
44 | --learning_rate=$LEARNING_RATE \
45 | --num_train_epochs=$EPOCHS \
46 | --variant=$VARIANT \
47 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out
48 |
--------------------------------------------------------------------------------
/siqa_calc_acc_testset.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import jsonlines
3 | import numpy as np
4 |
5 | file_dataset = list(jsonlines.open(sys.argv[1]))
6 |
7 | file_testresults = open(sys.argv[2], 'r').readlines()
8 |
9 | assert len(file_dataset) == len(file_testresults)
10 |
11 | print("Number of datapoints:", len(file_dataset))
12 |
13 | acc = 0
14 | for f_d, f_t in zip(file_dataset, file_testresults):
15 | if int(f_d['label']) == int(f_t.split(',')[1]):
16 | acc += 1
17 |
18 | print("acc:", acc / len(file_dataset))
19 |
--------------------------------------------------------------------------------
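
siqa_calc_acc_testset.py above takes the gold SIQA jsonl as its first argument and a comma-separated predictions file as its second, e.g. python3.6 siqa_calc_acc_testset.py data/SIQA/socialIQa_v1.4_tst.jsonl <predictions file>, where the predictions path depends on the fine-tuning run. The sketch below reproduces the comparison on invented in-memory records; the assumption that the first CSV column holds an example id is ours, the script only reads the second column.

    # Gold records as parsed from the jsonl file: each carries a "label" field.
    gold = [{"label": "0"}, {"label": "2"}, {"label": "1"}]

    # Prediction lines as read from the results file; only the value after the
    # first comma is used by the script (assumed here to follow an example id).
    predictions = ["0,0\n", "1,1\n", "2,1\n"]

    correct = sum(
        int(g["label"]) == int(p.split(",")[1])
        for g, p in zip(gold, predictions)
    )
    print("acc:", correct / len(gold))  # 2 of 3 correct
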
/training_utility/copa_preprocessor.py:
--------------------------------------------------------------------------------
1 | # Nikolai Rozanov
2 | from retrograph.modeling import tokenization
3 | import tensorflow as tf
4 | import os
5 | import json
6 | import numpy as np
7 |
8 | class InputExample(object):
9 | """A single multiple choice question."""
10 |
11 | def __init__(
12 | self,
13 | qid,
14 | question,
15 | answers,
16 | label):
17 | """Construct an instance."""
18 | self.qid = qid
19 | self.question = question
20 | self.answers = answers
21 | self.label = label
22 |
23 |
24 | class DataProcessor(object):
25 | """Base class for data converters for sequence classification data sets."""
26 |
27 | def get_train_examples(self, data_dir):
28 | """Gets a collection of `InputExample`s for the train set."""
29 | raise NotImplementedError()
30 |
31 | def get_dev_examples(self, data_dir):
32 | """Gets a collection of `InputExample`s for the dev set."""
33 | raise NotImplementedError()
34 |
35 | def get_test_examples(self, data_dir):
36 | """Gets a collection of `InputExample`s for prediction."""
37 | raise NotImplementedError()
38 |
39 | def get_labels(self):
40 | """Gets the list of labels for this data set."""
41 | raise NotImplementedError()
42 |
43 | @classmethod
44 | def _read_json(cls, input_file):
45 | """Reads a JSON file."""
46 | with tf.gfile.Open(input_file, "r") as f:
47 | return json.load(f)
48 |
49 | @classmethod
50 | def _read_jsonl(cls, input_file):
51 | """Reads a JSON Lines file."""
52 | with tf.gfile.Open(input_file, "r") as f:
53 | return [json.loads(ln) for ln in f]
54 |
55 |
56 | class COPAProcessor(DataProcessor):
57 | """Processor for the CommonsenseQA data set."""
58 |
59 | LABELS = [0, 1]
60 |
61 | TRAIN_FILE_NAME = 'train.en.jsonl'
62 | DEV_FILE_NAME = 'val.en.jsonl'
63 | TEST_FILE_NAME = 'test_gold.jsonl'
64 |
65 | def __init__(self, variant="A"):
66 | """ There are four variants:
67 | Variant A: PREMISE [SEP] The cause/result was that ANSWER1 [SEP] The cause/result was that ANSWER2
68 | Variant B: PREMISE [SEP] What was the cause/result of ANSWER1 [SEP] What was the cause/result of ANSWER2
69 | Variant C: What was the cause/result of PREMISE [SEP] ANSWER1 [SEP] ANSWER2
70 | Variant D: PREMISE [SEP] ANSWER1 [SEP] ANSWER2
71 |
72 | """
73 | self.variant = variant
74 |
75 |
76 | def get_train_examples(self, data_dir):
77 | train_file_name = self.TRAIN_FILE_NAME
78 |
79 | return self._create_examples(
80 | self._read_jsonl(os.path.join(data_dir, train_file_name)),
81 | 'train')
82 |
83 | def get_dev_examples(self, data_dir):
84 | dev_file_name = self.DEV_FILE_NAME
85 |
86 | return self._create_examples(
87 | self._read_jsonl(os.path.join(data_dir, dev_file_name)),
88 | 'dev')
89 |
90 | def get_test_examples(self, data_dir):
91 | test_file_name = self.TEST_FILE_NAME
92 |
93 | return self._create_examples(
94 | self._read_jsonl(os.path.join(data_dir, test_file_name)),
95 | 'test')
96 |
97 | def get_labels(self):
98 | return [0, 1]
99 |
100 | def _create_examples(self,lines, set_type):
101 | """ Calls one of the variants"""
102 | if self.variant=="A":
103 | return self._create_examples_variant_A(lines, set_type)
104 | elif self.variant=="B":
105 | return self._create_examples_variant_B(lines, set_type)
106 | elif self.variant=="C":
107 | return self._create_examples_variant_C(lines, set_type)
108 | elif self.variant=="D":
109 | return self._create_examples_variant_D(lines, set_type)
110 | else:
111 | raise Exception("NO SUCH VARIAN FOR COPA PREPROCESSING")
112 |
113 |
114 | ## VARIANT_A Premise [SEP] STATEMENT_Answer [SEP] STATEMENT_Answer
115 | def _create_examples_variant_A(self, lines, set_type):
116 | examples = []
117 | for line in lines:
118 | qid = line['idx']
119 | premise = tokenization.convert_to_unicode(line['premise'])
120 |
121 | question = "The cause was that " if line["question"]=="cause" else "The result was that "
122 | answers = np.array([
123 | tokenization.convert_to_unicode(question + line["choice1"]),
124 | tokenization.convert_to_unicode(question + line["choice2"])
125 | ])
126 |
127 | # the test set has no answer key so use '0' as a dummy label
128 | label = line.get('label', 0)
129 |
130 | examples.append(
131 | InputExample(
132 | qid=qid,
133 | question=premise,
134 | answers=answers,
135 | label=label))
136 |
137 | return examples
138 |
139 | ## VARIANT_B Premise [SEP] WH-Question_Answer [SEP] WH_Q Answer
140 | def _create_examples_variant_B(self, lines, set_type):
141 | examples = []
142 | for line in lines:
143 | qid = line['idx']
144 | question = "What was the cause of " if line["question"]=="cause" else "What was the result of"
145 | premise = tokenization.convert_to_unicode(line['premise'])
146 |
147 | answers = np.array([
148 | tokenization.convert_to_unicode(question + line["choice1"]),
149 | tokenization.convert_to_unicode(question + line["choice2"])
150 | ])
151 |
152 | # the test set has no answer key so use '0' as a dummy label
153 | label = line.get('label', 0)
154 |
155 | examples.append(
156 | InputExample(
157 | qid=qid,
158 | question=premise,
159 | answers=answers,
160 | label=label))
161 |
162 | return examples
163 |
164 |
165 | ## VARIANT_C WH-Question_Premise [SEP] Answer [SEP] Answer
166 | def _create_examples_variant_C(self, lines, set_type):
167 | examples = []
168 | for line in lines:
169 | qid = line['idx']
170 | question = "What was the cause of " if line["question"]=="cause" else "What was the result of"
171 | premise = tokenization.convert_to_unicode(question + line['premise'])
172 |
173 | answers = np.array([
174 | tokenization.convert_to_unicode(line["choice1"]),
175 | tokenization.convert_to_unicode(line["choice2"])
176 | ])
177 |
178 | # the test set has no answer key so use '0' as a dummy label
179 | label = line.get('label', 0)
180 |
181 | examples.append(
182 | InputExample(
183 | qid=qid,
184 | question=premise,
185 | answers=answers,
186 | label=label))
187 |
188 | return examples
189 |
190 |
191 | ## VARIANT_D Premise [SEP] Answer [SEP] Answer
192 | def _create_examples_variant_D(self, lines, set_type):
193 | examples = []
194 | for line in lines:
195 | qid = line['idx']
196 |
197 | premise = tokenization.convert_to_unicode(line['premise'])
198 |
199 | answers = np.array([
200 | tokenization.convert_to_unicode(line["choice1"]),
201 | tokenization.convert_to_unicode(line["choice2"])
202 | ])
203 |
204 | # the test set has no answer key so use '0' as a dummy label
205 | label = line.get('label', 0)
206 |
207 | examples.append(
208 | InputExample(
209 | qid=qid,
210 | question=premise,
211 | answers=answers,
212 | label=label))
213 |
214 | return examples
215 |
--------------------------------------------------------------------------------
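
As a concrete illustration of Variant A, the snippet below mimics _create_examples_variant_A on a single made-up COPA-style record (the record itself is invented; only the field names match what the processor reads):

    line = {
        "idx": 0,
        "premise": "The man turned on the faucet.",
        "question": "effect",  # COPA questions are "cause" or "effect"
        "choice1": "The toilet filled with water.",
        "choice2": "Water flowed from the spout.",
        "label": 1,
    }

    prefix = "The cause was that " if line["question"] == "cause" else "The result was that "
    answers = [prefix + line["choice1"], prefix + line["choice2"]]

    print(line["premise"])
    for a in answers:
        print("  ", a)
    # The premise becomes the InputExample "question" field and each prefixed
    # choice becomes one answer, giving the Variant A shape from the docstring:
    # PREMISE [SEP] The result was that CHOICE1 [SEP] The result was that CHOICE2
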
/training_utility/siqa_preprocessor.py:
--------------------------------------------------------------------------------
1 | # Nikolai Rozanov
2 | from retrograph.modeling import tokenization
3 | import tensorflow as tf
4 | import os
5 | import json
6 | import numpy as np
7 |
8 | class InputExample(object):
9 | """A single multiple choice question."""
10 |
11 | def __init__(
12 | self,
13 | qid,
14 | question,
15 | answers,
16 | label):
17 | """Construct an instance."""
18 | self.qid = qid
19 | self.question = question
20 | self.answers = answers
21 | self.label = label
22 |
23 |
24 | class DataProcessor(object):
25 | """Base class for data converters for sequence classification data sets."""
26 |
27 | def get_train_examples(self, data_dir):
28 | """Gets a collection of `InputExample`s for the train set."""
29 | raise NotImplementedError()
30 |
31 | def get_dev_examples(self, data_dir):
32 | """Gets a collection of `InputExample`s for the dev set."""
33 | raise NotImplementedError()
34 |
35 | def get_test_examples(self, data_dir):
36 | """Gets a collection of `InputExample`s for prediction."""
37 | raise NotImplementedError()
38 |
39 | def get_labels(self):
40 | """Gets the list of labels for this data set."""
41 | raise NotImplementedError()
42 |
43 | @classmethod
44 | def _read_json(cls, input_file):
45 | """Reads a JSON file."""
46 | with tf.gfile.Open(input_file, "r") as f:
47 | return json.load(f)
48 |
49 | @classmethod
50 | def _read_jsonl(cls, input_file):
51 | """Reads a JSON Lines file."""
52 | with tf.gfile.Open(input_file, "r") as f:
53 | return [json.loads(ln) for ln in f]
54 |
55 |
56 | class SIQAProcessor(DataProcessor):
57 | """Processor for the CommonsenseQA data set."""
58 |
59 | LABELS = [0, 1, 2]
60 |
61 | TRAIN_FILE_NAME = 'socialIQa_v1.4_trn.jsonl'
62 | DEV_FILE_NAME = 'socialIQa_v1.4_dev.jsonl'
63 | TEST_FILE_NAME = 'socialIQa_v1.4_tst.jsonl'
64 |
65 | def __init__(self, variant="A"):
66 | """ There are four variants:
67 | Variant A: PREMISE [SEP] QUESTION ANSWER1 [SEP] QUESTION ANSWER2 [SEP] QUESTION ANSWER3
68 | Variant B: as implemented, identical to Variant A (the question is prepended to each answer)
69 | Variant C: QUESTION PREMISE [SEP] ANSWER1 [SEP] ANSWER2 [SEP] ANSWER3
70 | Variant D: PREMISE QUESTION [SEP] ANSWER1 [SEP] ANSWER2 [SEP] ANSWER3
71 |
72 | """
73 | self.variant = variant
74 |
75 |
76 | def get_train_examples(self, data_dir):
77 | train_file_name = self.TRAIN_FILE_NAME
78 |
79 | return self._create_examples(
80 | self._read_jsonl(os.path.join(data_dir, train_file_name)),
81 | 'train')
82 |
83 | def get_dev_examples(self, data_dir):
84 | dev_file_name = self.DEV_FILE_NAME
85 |
86 | return self._create_examples(
87 | self._read_jsonl(os.path.join(data_dir, dev_file_name)),
88 | 'dev')
89 |
90 | def get_test_examples(self, data_dir):
91 | test_file_name = self.TEST_FILE_NAME
92 |
93 | return self._create_examples(
94 | self._read_jsonl(os.path.join(data_dir, test_file_name)),
95 | 'test')
96 |
97 | def get_labels(self):
98 | return [0, 1, 2]
99 |
100 | def _create_examples(self,lines, set_type):
101 | """ Calls one of the variants"""
102 | if self.variant=="A":
103 | return self._create_examples_variant_A(lines, set_type)
104 | elif self.variant=="B":
105 | return self._create_examples_variant_B(lines, set_type)
106 | elif self.variant=="C":
107 | return self._create_examples_variant_C(lines, set_type)
108 | elif self.variant=="D":
109 | return self._create_examples_variant_D(lines, set_type)
110 | else:
111 | raise Exception("NO SUCH VARIANT FOR SIQA PREPROCESSING")
112 |
113 |
114 | ## VARIANT_A Premise [SEP] Question_Answer [SEP] Question_Answer [SEP] Question_Answer
115 | def _create_examples_variant_A(self, lines, set_type):
116 | examples = []
117 | for line in lines:
118 | qid = line['idx']
119 | premise = tokenization.convert_to_unicode(line['premise'])
120 |
121 | question = line["question"]
122 | answers = np.array([
123 | tokenization.convert_to_unicode(question + line["choice1"]),
124 | tokenization.convert_to_unicode(question + line["choice2"]),
125 | tokenization.convert_to_unicode(question + line["choice3"])
126 | ])
127 |
128 | # the test set has no answer key so use '0' as a dummy label
129 | label = line.get('label', 0)
130 |
131 | examples.append(
132 | InputExample(
133 | qid=qid,
134 | question=premise,
135 | answers=answers,
136 | label=label))
137 |
138 | return examples
139 |
140 | ## VARIANT_B Premise [SEP] WH-Question_Answer [SEP] WH_Q Answer
141 | def _create_examples_variant_B(self, lines, set_type):
142 | examples = []
143 | for line in lines:
144 | qid = line['idx']
145 | question = line["question"]
146 | premise = tokenization.convert_to_unicode(line['premise'])
147 |
148 | answers = np.array([
149 | tokenization.convert_to_unicode(question + line["choice1"]),
150 | tokenization.convert_to_unicode(question + line["choice2"]),
151 | tokenization.convert_to_unicode(question + line["choice3"])
152 | ])
153 |
154 | # the test set has no answer key so use '0' as a dummy label
155 | label = line.get('label', 0)
156 |
157 | examples.append(
158 | InputExample(
159 | qid=qid,
160 | question=premise,
161 | answers=answers,
162 | label=label))
163 |
164 | return examples
165 |
166 |
167 | ## VARIANT_C WH-Question_Premise [SEP] Answer [SEP] Answer
168 | def _create_examples_variant_C(self, lines, set_type):
169 | examples = []
170 | for line in lines:
171 | qid = line['idx']
172 | question = line["question"]
173 | premise = tokenization.convert_to_unicode(question + line['premise'])
174 |
175 | answers = np.array([
176 | tokenization.convert_to_unicode(line["choice1"]),
177 | tokenization.convert_to_unicode(line["choice2"]),
178 | tokenization.convert_to_unicode(line["choice3"])
179 | ])
180 |
181 | # the test set has no answer key so use '0' as a dummy label
182 | label = line.get('label', 0)
183 |
184 | examples.append(
185 | InputExample(
186 | qid=qid,
187 | question=premise,
188 | answers=answers,
189 | label=label))
190 |
191 | return examples
192 |
193 |
194 | ## Premise WH-Question [SEP] Answer [SEP] Answer
195 | def _create_examples_variant_D(self, lines, set_type):
196 | examples = []
197 | for line in lines:
198 | qid = line['idx']
199 | question = line["question"]
200 | premise = tokenization.convert_to_unicode(line['premise'] + question)
201 |
202 | answers = np.array([
203 | tokenization.convert_to_unicode(line["choice1"]),
204 | tokenization.convert_to_unicode(line["choice2"]),
205 | tokenization.convert_to_unicode(line["choice3"])
206 | ])
207 |
208 | # the test set has no answer key so use '0' as a dummy label
209 | label = line.get('label', 0)
210 |
211 | examples.append(
212 | InputExample(
213 | qid=qid,
214 | question=premise,
215 | answers=answers,
216 | label=label))
217 |
218 | return examples
219 |
--------------------------------------------------------------------------------
/utility/ec2.py:
--------------------------------------------------------------------------------
1 | import waws
2 |
3 | inst = waws.InstanceManager()
4 |
5 | #inst.upload_to_EC2(folder_file_name=".", instance="sunshine-1")
6 | #inst.upload_to_EC2(folder_file_name="./modeling.py", optional_remote_path="./ConceptBERT/", instance="sunshine-1")
7 | #inst.upload_to_EC2(folder_file_name="./data/glue_data/", optional_remote_path="./ConceptBERT/data/", instance="sunshine-1")
8 | inst.upload_to_EC2(folder_file_name="./poc_finetuning.sh", optional_remote_path="./ConceptBERT/", instance="sunshine-1")
9 | inst.upload_to_EC2(folder_file_name="./run_regression.py", optional_remote_path="./ConceptBERT/", instance="sunshine-1")
10 | inst.upload_to_EC2(folder_file_name="./run_classifier.py", optional_remote_path="./ConceptBERT/", instance="sunshine-1")
11 | inst.upload_to_EC2(folder_file_name="./poc_bash_test.sh", optional_remote_path="./ConceptBERT/", instance="sunshine-1")
12 | #inst.upload_to_EC2(folder_file_name="/c/Users/anlausch/Downloads/uncased_L-12_H-768_A-12/", instance="sunshine-1")
13 |
14 | #inst.download_from_EC2(folder_file_name="CODE_FOLDER", local_path="./training", optional_remote_path="EXPERIMENT2", instance="sunshine-1")
--------------------------------------------------------------------------------
/utility/ec2_download.py:
--------------------------------------------------------------------------------
1 | import waws
2 |
3 | inst = waws.InstanceManager()
4 |
5 | inst.download_from_EC2(folder_file_name="~/ConceptBERT/output/pretraining/sentences/free-wo-nsp", local_path="/c/Users/anlausch/Downloads/omcs", instance="sunshine-1")
6 |
--------------------------------------------------------------------------------
/utility/s3_download.py:
--------------------------------------------------------------------------------
1 | import waws
2 |
3 | s3 = waws.BucketManager()
4 |
5 | # Download files
6 | s3.download_file(
7 | file_name="test.txt",
8 | local_path="some/local/path",
9 | remote_path="SOME/S3/PATH",
10 | bucket_name="some_bucket_name"
11 | )
--------------------------------------------------------------------------------
/utility/s3_upload.py:
--------------------------------------------------------------------------------
1 | import waws
2 | import os
3 |
4 | s3 = waws.BucketManager()
5 |
6 |
7 |
8 | path = os.getcwd()
9 |
10 | files = []
11 | # r=root, d=directories, f = files
12 | for r, d, f in os.walk(path):
13 | for file in f:
14 | #if ".iml" not in file and ".xml" not in file:
15 | # Upload files
16 | s3.upload_file(
17 | file_name=file,
18 | local_path=r,
19 | remote_path="~/retrograph"
20 | )
21 |
22 |
23 |
--------------------------------------------------------------------------------
/utility/upload_s3.sh:
--------------------------------------------------------------------------------
1 | waws --uploadS3 -b wluper-retrograph -f all -l "./../ConceptBERT/"
--------------------------------------------------------------------------------