├── .editor-settings ├── .gitignore ├── 0_download_bert.sh ├── 1_download_relations.sh ├── 2_create_random_walks.sh ├── 3_generate_corpus.sh ├── 4_pretrain_adapter.sh ├── 9_download_pretrained_adapters_omcs.sh ├── 9_download_pretrained_adapters_rw30.sh ├── LICENSE ├── README.md ├── archive ├── create_pretraining_data.py ├── poc_bash_test.sh ├── poc_create_pretraining_data.sh ├── poc_create_pretraining_data_rw.sh ├── poc_create_pretraining_data_wo_nsp.sh ├── poc_finetuning.sh ├── poc_finetuning_adapter.sh ├── poc_finetuning_adapter_longer.sh ├── poc_finetuning_adapter_longer_2.sh ├── poc_finetuning_adapter_quick_insight.sh ├── poc_finetuning_adapter_sst2.sh ├── poc_finetuning_dws.sh ├── poc_finetuning_rw.sh ├── poc_pretraining.sh ├── poc_pretraining_dws.sh ├── poc_pretraining_rw.sh ├── prediction_diagnostic.sh ├── predictions_rw_100000.sh ├── predictions_rw_100000_all.sh ├── predictions_rw_25000_all.sh ├── run_classifier_adapter.py ├── run_pretraining_adapter.py └── run_regression_adapter.py ├── copa_1_download_copa.sh ├── copa_2_finetune_adapter.sh ├── copa_2_finetune_bert.sh ├── csqa_1_download_commonsenseqa.sh ├── csqa_2_finetune_adapter.sh ├── csqa_3_eval_adapter.sh ├── data_utility ├── create_pretraining_data.py └── create_pretraining_data_wo_nsp.py ├── download_utility ├── download_bert.py ├── download_commonsenseqa.py ├── download_glue.py └── download_relations.py ├── glue_1_download_glue.sh ├── glue_2_finetune_adapter.sh ├── images └── Retrograph.png ├── randomwalks_utility ├── create_corpora_from_random_walks.py ├── preprocess_cn.py └── random_walks.py ├── results_utility ├── fetcher.py └── parse_predictions.py ├── retrograph ├── __init__.py ├── modeling │ ├── __init__.py │ ├── metrics_extension.py │ ├── modeling.py │ ├── modeling_adapter.py │ ├── optimization.py │ ├── optimization_adapter.py │ └── tokenization.py └── training │ ├── __init__.py │ └── preprocessors.py ├── setup.py ├── siqa_1_download_siqa.sh ├── siqa_2_finetune_adapters.sh ├── siqa_2_finetune_bert.sh ├── siqa_calc_acc_testset.py ├── training_utility ├── copa_preprocessor.py ├── run_classifier.py ├── run_classifier_adapter_tune_all.py ├── run_commonsenseqa.py ├── run_commonsenseqa_adapter.py ├── run_copa.py ├── run_copa_adapter.py ├── run_pretraining.py ├── run_pretraining_adapter.py ├── run_pretraining_wo_nsp.py ├── run_pretraining_wo_nsp_adapter.py ├── run_regression.py ├── run_regression_adapter_tune_all.py ├── run_siqa.py ├── run_siqa_adapters.py └── siqa_preprocessor.py └── utility ├── ec2.py ├── ec2_download.py ├── s3_download.py ├── s3_upload.py └── upload_s3.sh /.editor-settings: -------------------------------------------------------------------------------- 1 | tabLength: 2 2 | softTabs: true 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | data/ 4 | models/ 5 | relations/ 6 | randomwalks/ 7 | -------------------------------------------------------------------------------- /0_download_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOWNLOAD_UTILITY_SCRIPTS=download_utility 4 | 5 | mkdir -p 'models/BERT_BASE_UNCASED' 6 | 7 | # DOWNLOAD BERT 8 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_bert.py 9 | -------------------------------------------------------------------------------- /1_download_relations.sh: -------------------------------------------------------------------------------- 
1 | #!/bin/bash 2 | 3 | DOWNLOAD_UTILITY_SCRIPTS=download_utility 4 | 5 | 6 | DIR_SAVE_RELATIONS='relations/' 7 | mkdir -p $DIR_SAVE_RELATIONS 8 | 9 | # DOWNLOAD RELATIONS 10 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_relations.py --data_dir $DIR_SAVE_RELATIONS --relations all 11 | -------------------------------------------------------------------------------- /2_create_random_walks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANDOM_WALKS_SCRIPTS=randomwalks_utility 4 | 5 | mkdir -p 'randomwalks' 6 | 7 | # Preprocess the relations 8 | python3.6 $RANDOM_WALKS_SCRIPTS/preprocess_cn.py 9 | 10 | # Create the random walks using node2vec 11 | python3.6 $RANDOM_WALKS_SCRIPTS/random_walks.py 12 | -------------------------------------------------------------------------------- /3_generate_corpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create natural language text from the RWs 4 | # create_corpora_from_random_walks.py -> takes as input the pickle file and generates the corpus 5 | # -> output corpus "rw_.txt" 6 | # || you could change how sentences are generated; at the moment sentences are always 3-word sentences 7 | # -> if you want extra vocab in BERT, change the function "create_realtionship_token" 8 | 9 | RANDOM_WALKS_SCRIPTS=randomwalks_utility 10 | DATA_SCRIPTS=data_utility 11 | 12 | python3.6 $RANDOM_WALKS_SCRIPTS/create_corpora_from_random_walks.py 13 | 14 | # COMMENTS - NIKOLAI 15 | #create_pretraining_data.py OR 16 | #create_pretraining_data_wo_nsp.py (without Next Sentence Prediction) 17 | # 18 | #For OMCS you only need to create the pretraining data 19 | ## 4 - Pretraining BERT using RW Corpus 20 | 21 | ## 1.1 - OMCS Pretraining Data 22 | #Step1: (create pretraining data out of the corpus) 23 | #create_pretraining_data.py OR 24 | #create_pretraining_data_wo_nsp.py (without Next Sentence Prediction) 25 | 26 | VOCAB_FILE=models/BERT_BASE_UNCASED/vocab.txt 27 | 28 | # TODO: change this to create different pre-training data 29 | INPUT_FILE=randomwalks/rw_corpus_1.0_1.0_2_15_nl.txt 30 | OUTPUT_FILE=randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf 31 | 32 | 33 | python3.6 $DATA_SCRIPTS/create_pretraining_data_wo_nsp.py --input_file $INPUT_FILE --output_file $OUTPUT_FILE --vocab_file $VOCAB_FILE 34 | 35 | # python3.6 $DATA_SCRIPTS/create_pretraining_data.py --input_file $INPUT_FILE --output_file $OUTPUT_FILE --vocab_file $VOCAB_FILE 36 | -------------------------------------------------------------------------------- /4_pretrain_adapter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Step1: (run the pretraining) 4 | #run_pretraining_adapter.py OR 5 | #run_pretraining_wo_nsp_adapter.py (without Next Sentence Prediction) 6 | # 7 | # 8 | #Need to load the Adapter Model 9 | #And need to load the Adapter Optimiser for that.
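#
# A minimal sketch of a fuller invocation (an assumption, not the original command below):
# run_pretraining_wo_nsp_adapter.py is derived from BERT's run_pretraining.py, so it likely
# accepts the same flags -- verify the flag names against the script before using them.
# --max_seq_length and --max_predictions_per_seq should match the values used when the TF
# records were created (the archive scripts use 128 and 20), and --init_checkpoint should
# point to the downloaded BERT checkpoint (standard checkpoint name assumed here) so the
# adapters are trained on top of the pretrained weights:
#
#   python3.6 training_utility/run_pretraining_wo_nsp_adapter.py \
#     --input_file randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf \
#     --output_dir models/output_pretrain_adapter \
#     --bert_config_file models/BERT_BASE_UNCASED/bert_config.json \
#     --init_checkpoint models/BERT_BASE_UNCASED/bert_model.ckpt \
#     --do_train True \
#     --max_seq_length 128 \
#     --max_predictions_per_seq 20 \
#     --train_batch_size 32 \
#     --learning_rate 2e-5 \
#     --num_train_steps 100000
#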
10 | 11 | TRAINING_UTILITY=training_utility 12 | 13 | export CUDA_VISIBLE_DEVICES=8 14 | 15 | BERT_CONFIG_FILE=models/BERT_BASE_UNCASED/bert_config.json 16 | INPUT_FILE=randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf 17 | OUTPUT_DIR=models/output_pretrain_adapter 18 | 19 | mkdir -p $OUTPUT_DIR 20 | 21 | python3.6 $TRAINING_UTILITY/run_pretraining_wo_nsp_adapter.py --input_file $INPUT_FILE --output_dir $OUTPUT_DIR \ 22 | --bert_config_file $BERT_CONFIG_FILE \ 23 | --do_train True 24 | -------------------------------------------------------------------------------- /9_download_pretrained_adapters_omcs.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | waws --downloadS3 -f omcs_pretraining_free_wo_nsp_adapter.zip -b wluper-retrograph 3 | unzip omcs_pretraining_free_wo_nsp_adapter.zip 4 | mv omcs_pretraining_free_wo_nsp_adapter.zip models 5 | mv omcs_pretraining_free_wo_nsp_adapter models 6 | -------------------------------------------------------------------------------- /9_download_pretrained_adapters_rw30.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | waws --downloadS3 -f 1.0_1.0_5_30_full_assertions_nl.zip -b wluper-retrograph 3 | unzip 1.0_1.0_5_30_full_assertions_nl.zip 4 | mv 1.0_1.0_5_30_full_assertions_nl.zip models 5 | mv 1.0_1.0_5_30_full_assertions_nl models 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019-Present Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 
41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Retrograph 2 | # (aka Common Sense or World Knowledge? Investigating Adapter-Based Knowledge Injection into Pretrained Transformers) 3 | # Accepted at EMNLP/DeeLIO 2020.
4 | ![Retrograph (C) Wluper](images/Retrograph.png) 5 | 6 | ## Abstract 7 | Following the major success of neural language models (LMs) such as BERT or GPT-2 on a variety of language understanding tasks, recent work focused on injecting (structured) knowledge from external resources into these models. While on the one hand, joint pretraining (i.e., training from scratch, adding objectives based on external knowledge to the primary LM objective) may be prohibitively computationally expensive, post-hoc fine-tuning on external knowledge, on the other hand, may lead to the catastrophic forgetting of distributional knowledge. In this work, we investigate models for complementing the distributional knowledge of BERT with conceptual knowledge from ConceptNet and its corresponding Open Mind Common Sense (OMCS) corpus, respectively, using adapter training. While overall results on the GLUE benchmark paint an inconclusive picture, a deeper analysis reveals that our adapter-based models substantially outperform BERT (up to 15-20 performance points) on inference tasks that require the type of conceptual knowledge explicitly present in ConceptNet and OMCS. 8 | 9 | ## Paper (EMNLP/DeeLIO 2020 Proceedings to follow) 10 | [Link To Paper](https://arxiv.org/abs/2005.11787) 11 | 12 | ## Key people 13 | [Anne Lauscher](https://www.uni-mannheim.de/dws/people/researchers/phd-students/anne-lauscher/) 14 | 15 | [Olga Majewska](https://om304.github.io/) 16 | 17 | [Leonardo Ribeiro](https://github.com/leoribeiro) 18 | 19 | [Goran Glavaš](https://www.uni-mannheim.de/dws/people/professors/prof-dr-goran-glavas/) 20 | 21 | [Nikolai Rozanov](https://github.com/ai-nikolai) 22 | 23 | [Iryna Gurevych](https://www.informatik.tu-darmstadt.de/ukp/ukp_home/staff_ukp/prof_dr_iryna_gurevych/index.en.jsp) 24 | 25 | ## Description 26 | Retrograph is the official repository behind the University of Mannheim's, TU Darmstadt's, and Wluper's Commonsense Adapter paper. 27 | 28 | The key idea is that one can inject knowledge into pretrained language models using adapters. 29 | 30 | We try two methods to generate training data for the adapters: 31 | 1. OMCS 32 | 2. Random walks from ConceptNet 33 | 34 | We evaluate on: 35 | 1. GLUE 36 | 2. CSQA 37 | 3. COPA 38 | 4. SIQA 39 | 40 | You can find the key results in the paper: 41 | [Link To Paper](https://arxiv.org/abs/2005.11787) 42 | 43 | 44 | 45 | 46 | ## A - Getting it running: 47 | 48 | Environment: Python 3.6 49 | 50 | Please follow these instructions to run the experiments. 51 | 52 | ### 0 - Download BERT (This needs to be done for all experiments) 53 | Step 0: Download BERT 54 | ``` 55 | bash ./0_download_bert.sh 56 | ``` 57 | It creates: 58 | 1. `models/BERT_BASE_UNCASED` 59 | 60 | 61 | 62 | ### Next Steps: 63 | 1. Generate random walks and pretrain an adapter -> go to [B - Random Walks and Pretraining](#random_walk) 64 | 65 | 2. Finetune on existing adapters -> go to [C - Finetuning on Pretrained Adapters](#finetuning): 66 | - [GLUE](#glue) 67 | - [CSQA](#csqa) 68 | - [COPA](#copa) 69 | - [SIQA](#siqa) 70 | 71 | 72 | 73 | ## B - Random Walks and Pretraining 74 | Follow these steps to pretrain an adapter. 75 | 76 | 77 | ### 1 - Download Relations 78 | Step 1: Download the ConceptNet relations 79 | ``` 80 | bash ./1_download_relations.sh 81 | ``` 82 | It creates: 83 | 1.
`relations/cn_relationType*.txt` 84 | 85 | 86 | ### 2 - Creating Random Walks 87 | 88 | Step 2: Create the sequences of tokens using random walks generated by node2vec: 89 | ``` 90 | bash ./2_create_random_walks.sh 91 | ``` 92 | 93 | It creates the main file `randomwalks/random_walk_1.0_1.0_2_15.p`, as well as other files such as `randomwalks/cn_assertions_filtered.tsv`. 94 | 95 | 96 | 97 | ### 3 - Generating the Corpus (This takes quite a while) 98 | Step 3: Create natural language text from the random walks: 99 | ``` 100 | bash ./3_generate_corpus.sh 101 | ``` 102 | The generated corpus will be used as input for BERT + adapters. It creates a file in TF record format, `randomwalks/rw_corpus_1.0_1.0_2_15_nl.tf`, as well as the plain-text corpus `randomwalks/rw_corpus_1.0_1.0_2_15_nl.txt`. 103 | 104 | 105 | ### 4 - Pretraining Adapter 106 | 107 | Step 4: Pretrain the adapter using the RW corpus: 108 | ``` 109 | bash ./4_pretrain_adapter.sh 110 | ``` 111 | It creates a model in: `models/output_pretrain_adapter` 112 | 113 | 114 | 115 | 116 | 117 | ## C - Finetuning on Pretrained Adapters 118 | 119 | 120 | ### 9 - Download Pretrained Adapters (needs to be done if you don't already have pretrained adapters) 121 | 122 | ``` 123 | bash ./9_download_pretrained_adapters_rw30.sh 124 | bash ./9_download_pretrained_adapters_omcs.sh 125 | ``` 126 | 127 | **All models will be saved in `models/output_model_finetunning`.** 128 | **Modify the `<task>_2_*.sh` files if you want to change hyperparameters.** 129 | 130 | 131 | 132 | ## GLUE 133 | 134 | **Run the `glue_1_*.sh` and `glue_2_*.sh` scripts in that order.** 135 | 136 | 137 | 138 | 139 | ## CommonsenseQA 140 | 141 | **Run the `csqa_1_*.sh` and `csqa_2_*.sh` scripts in that order.** 142 | 143 | 144 | 145 | ## COPA 146 | 147 | **Run the `copa_1_*.sh` and `copa_2_*.sh` scripts in that order.** 148 | 149 | 150 | 151 | ## SIQA 152 | **Run the `siqa_1_*.sh` and `siqa_2_*.sh` scripts in that order.** 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /archive/create_pretraining_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Create masked LM/next sentence masked_lm TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import random 23 | 24 | import tokenization 25 | import tensorflow as tf 26 | 27 | flags = tf.flags 28 | 29 | FLAGS = flags.FLAGS 30 | 31 | flags.DEFINE_string("input_file", None, 32 | "Input raw text file (or comma-separated list of files).") 33 | 34 | flags.DEFINE_string( 35 | "output_file", None, 36 | "Output TF example file (or comma-separated list of files).") 37 | 38 | flags.DEFINE_string("vocab_file", None, 39 | "The vocabulary file that the BERT model was trained on.") 40 | 41 | flags.DEFINE_bool( 42 | "do_lower_case", True, 43 | "Whether to lower case the input text. Should be True for uncased " 44 | "models and False for cased models.") 45 | 46 | flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") 47 | 48 | flags.DEFINE_integer("max_predictions_per_seq", 20, 49 | "Maximum number of masked LM predictions per sequence.") 50 | 51 | flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") 52 | 53 | flags.DEFINE_integer( 54 | "dupe_factor", 10, 55 | "Number of times to duplicate the input data (with different masks).") 56 | 57 | flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") 58 | 59 | flags.DEFINE_float( 60 | "short_seq_prob", 0.1, 61 | "Probability of creating sequences which are shorter than the " 62 | "maximum length.") 63 | 64 | 65 | class TrainingInstance(object): 66 | """A single training instance (sentence pair).""" 67 | 68 | def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, 69 | is_random_next): 70 | self.tokens = tokens 71 | self.segment_ids = segment_ids 72 | self.is_random_next = is_random_next 73 | self.masked_lm_positions = masked_lm_positions 74 | self.masked_lm_labels = masked_lm_labels 75 | 76 | def __str__(self): 77 | s = "" 78 | s += "tokens: %s\n" % (" ".join( 79 | [tokenization.printable_text(x) for x in self.tokens])) 80 | s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) 81 | s += "is_random_next: %s\n" % self.is_random_next 82 | s += "masked_lm_positions: %s\n" % (" ".join( 83 | [str(x) for x in self.masked_lm_positions])) 84 | s += "masked_lm_labels: %s\n" % (" ".join( 85 | [tokenization.printable_text(x) for x in self.masked_lm_labels])) 86 | s += "\n" 87 | return s 88 | 89 | def __repr__(self): 90 | return self.__str__() 91 | 92 | 93 | def write_instance_to_example_files(instances, tokenizer, max_seq_length, 94 | max_predictions_per_seq, output_files): 95 | """Create TF example files from `TrainingInstance`s.""" 96 | writers = [] 97 | for output_file in output_files: 98 | writers.append(tf.python_io.TFRecordWriter(output_file)) 99 | 100 | writer_index = 0 101 | 102 | total_written = 0 103 | for (inst_index, instance) in enumerate(instances): 104 | input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) 105 | input_mask = [1] * len(input_ids) 106 | segment_ids = list(instance.segment_ids) 107 | assert len(input_ids) <= max_seq_length 108 | 109 | while len(input_ids) < max_seq_length: 110 | input_ids.append(0) 111 | input_mask.append(0) 112 | segment_ids.append(0) 113 | 114 | assert len(input_ids) == max_seq_length 115 | assert len(input_mask) == max_seq_length 116 | assert len(segment_ids) == max_seq_length 117 | 118 | masked_lm_positions = list(instance.masked_lm_positions) 119 | masked_lm_ids = 
tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) 120 | masked_lm_weights = [1.0] * len(masked_lm_ids) 121 | 122 | while len(masked_lm_positions) < max_predictions_per_seq: 123 | masked_lm_positions.append(0) 124 | masked_lm_ids.append(0) 125 | masked_lm_weights.append(0.0) 126 | 127 | next_sentence_label = 1 if instance.is_random_next else 0 128 | 129 | features = collections.OrderedDict() 130 | features["input_ids"] = create_int_feature(input_ids) 131 | features["input_mask"] = create_int_feature(input_mask) 132 | features["segment_ids"] = create_int_feature(segment_ids) 133 | features["masked_lm_positions"] = create_int_feature(masked_lm_positions) 134 | features["masked_lm_ids"] = create_int_feature(masked_lm_ids) 135 | features["masked_lm_weights"] = create_float_feature(masked_lm_weights) 136 | features["next_sentence_labels"] = create_int_feature([next_sentence_label]) 137 | 138 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 139 | 140 | writers[writer_index].write(tf_example.SerializeToString()) 141 | writer_index = (writer_index + 1) % len(writers) 142 | 143 | total_written += 1 144 | 145 | if inst_index < 20: 146 | tf.logging.info("*** Example ***") 147 | tf.logging.info("tokens: %s" % " ".join( 148 | [tokenization.printable_text(x) for x in instance.tokens])) 149 | 150 | for feature_name in features.keys(): 151 | feature = features[feature_name] 152 | values = [] 153 | if feature.int64_list.value: 154 | values = feature.int64_list.value 155 | elif feature.float_list.value: 156 | values = feature.float_list.value 157 | tf.logging.info( 158 | "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) 159 | 160 | for writer in writers: 161 | writer.close() 162 | 163 | tf.logging.info("Wrote %d total instances", total_written) 164 | 165 | 166 | def create_int_feature(values): 167 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 168 | return feature 169 | 170 | 171 | def create_float_feature(values): 172 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) 173 | return feature 174 | 175 | 176 | def create_training_instances(input_files, tokenizer, max_seq_length, 177 | dupe_factor, short_seq_prob, masked_lm_prob, 178 | max_predictions_per_seq, rng): 179 | """Create `TrainingInstance`s from raw text.""" 180 | all_documents = [[]] 181 | 182 | # Input file format: 183 | # (1) One sentence per line. These should ideally be actual sentences, not 184 | # entire paragraphs or arbitrary spans of text. (Because we use the 185 | # sentence boundaries for the "next sentence prediction" task). 186 | # (2) Blank lines between documents. Document boundaries are needed so 187 | # that the "next sentence prediction" task doesn't span between documents. 
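  # As a concrete illustration (hypothetical sentences, not taken from any real
  # corpus), an input file containing two documents would look like this:
  #
  #   a kitchen is a room used for cooking.
  #   a kitchen usually contains an oven.
  #
  #   a dog is a loyal pet.
  #   dogs like to chase balls.
  #
  # i.e. one sentence per line, with a single blank line separating documents.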
188 | for input_file in input_files: 189 | with tf.gfile.GFile(input_file, "r") as reader: 190 | while True: 191 | line = tokenization.convert_to_unicode(reader.readline()) 192 | if not line: 193 | break 194 | line = line.strip() 195 | 196 | # Empty lines are used as document delimiters 197 | if not line: 198 | all_documents.append([]) 199 | tokens = tokenizer.tokenize(line) 200 | if tokens: 201 | all_documents[-1].append(tokens) 202 | 203 | # Remove empty documents 204 | all_documents = [x for x in all_documents if x] 205 | rng.shuffle(all_documents) 206 | 207 | vocab_words = list(tokenizer.vocab.keys()) 208 | instances = [] 209 | for _ in range(dupe_factor): 210 | for document_index in range(len(all_documents)): 211 | instances.extend( 212 | create_instances_from_document( 213 | all_documents, document_index, max_seq_length, short_seq_prob, 214 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) 215 | 216 | rng.shuffle(instances) 217 | return instances 218 | 219 | 220 | def create_instances_from_document( 221 | all_documents, document_index, max_seq_length, short_seq_prob, 222 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng): 223 | """Creates `TrainingInstance`s for a single document.""" 224 | document = all_documents[document_index] 225 | 226 | # Account for [CLS], [SEP], [SEP] 227 | max_num_tokens = max_seq_length - 3 228 | 229 | # We *usually* want to fill up the entire sequence since we are padding 230 | # to `max_seq_length` anyways, so short sequences are generally wasted 231 | # computation. However, we *sometimes* 232 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 233 | # sequences to minimize the mismatch between pre-training and fine-tuning. 234 | # The `target_seq_length` is just a rough target however, whereas 235 | # `max_seq_length` is a hard limit. 236 | target_seq_length = max_num_tokens 237 | if rng.random() < short_seq_prob: 238 | target_seq_length = rng.randint(2, max_num_tokens) 239 | 240 | # We DON'T just concatenate all of the tokens from a document into a long 241 | # sequence and choose an arbitrary split point because this would make the 242 | # next sentence prediction task too easy. Instead, we split the input into 243 | # segments "A" and "B" based on the actual "sentences" provided by the user 244 | # input. 245 | instances = [] 246 | current_chunk = [] 247 | current_length = 0 248 | i = 0 249 | while i < len(document): 250 | segment = document[i] 251 | current_chunk.append(segment) 252 | current_length += len(segment) 253 | if i == len(document) - 1 or current_length >= target_seq_length: 254 | if current_chunk: 255 | # `a_end` is how many segments from `current_chunk` go into the `A` 256 | # (first) sentence. 257 | a_end = 1 258 | if len(current_chunk) >= 2: 259 | a_end = rng.randint(1, len(current_chunk) - 1) 260 | 261 | tokens_a = [] 262 | for j in range(a_end): 263 | tokens_a.extend(current_chunk[j]) 264 | 265 | tokens_b = [] 266 | # Random next 267 | is_random_next = False 268 | if len(current_chunk) == 1 or rng.random() < 0.5: 269 | is_random_next = True 270 | target_b_length = target_seq_length - len(tokens_a) 271 | 272 | # This should rarely go for more than one iteration for large 273 | # corpora. However, just to be careful, we try to make sure that 274 | # the random document is not the same as the document 275 | # we're processing. 
276 | for _ in range(10): 277 | random_document_index = rng.randint(0, len(all_documents) - 1) 278 | if random_document_index != document_index: 279 | break 280 | 281 | random_document = all_documents[random_document_index] 282 | random_start = rng.randint(0, len(random_document) - 1) 283 | for j in range(random_start, len(random_document)): 284 | tokens_b.extend(random_document[j]) 285 | if len(tokens_b) >= target_b_length: 286 | break 287 | # We didn't actually use these segments so we "put them back" so 288 | # they don't go to waste. 289 | num_unused_segments = len(current_chunk) - a_end 290 | i -= num_unused_segments 291 | # Actual next 292 | else: 293 | is_random_next = False 294 | for j in range(a_end, len(current_chunk)): 295 | tokens_b.extend(current_chunk[j]) 296 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) 297 | 298 | assert len(tokens_a) >= 1 299 | assert len(tokens_b) >= 1 300 | 301 | tokens = [] 302 | segment_ids = [] 303 | tokens.append("[CLS]") 304 | segment_ids.append(0) 305 | for token in tokens_a: 306 | tokens.append(token) 307 | segment_ids.append(0) 308 | 309 | tokens.append("[SEP]") 310 | segment_ids.append(0) 311 | 312 | for token in tokens_b: 313 | tokens.append(token) 314 | segment_ids.append(1) 315 | tokens.append("[SEP]") 316 | segment_ids.append(1) 317 | 318 | (tokens, masked_lm_positions, 319 | masked_lm_labels) = create_masked_lm_predictions( 320 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) 321 | instance = TrainingInstance( 322 | tokens=tokens, 323 | segment_ids=segment_ids, 324 | is_random_next=is_random_next, 325 | masked_lm_positions=masked_lm_positions, 326 | masked_lm_labels=masked_lm_labels) 327 | instances.append(instance) 328 | current_chunk = [] 329 | current_length = 0 330 | i += 1 331 | 332 | return instances 333 | 334 | 335 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 336 | ["index", "label"]) 337 | 338 | 339 | def create_masked_lm_predictions(tokens, masked_lm_prob, 340 | max_predictions_per_seq, vocab_words, rng): 341 | """Creates the predictions for the masked LM objective.""" 342 | 343 | cand_indexes = [] 344 | for (i, token) in enumerate(tokens): 345 | if token == "[CLS]" or token == "[SEP]": 346 | continue 347 | cand_indexes.append(i) 348 | 349 | rng.shuffle(cand_indexes) 350 | 351 | output_tokens = list(tokens) 352 | 353 | num_to_predict = min(max_predictions_per_seq, 354 | max(1, int(round(len(tokens) * masked_lm_prob)))) 355 | 356 | masked_lms = [] 357 | covered_indexes = set() 358 | for index in cand_indexes: 359 | if len(masked_lms) >= num_to_predict: 360 | break 361 | if index in covered_indexes: 362 | continue 363 | covered_indexes.add(index) 364 | 365 | masked_token = None 366 | # 80% of the time, replace with [MASK] 367 | if rng.random() < 0.8: 368 | masked_token = "[MASK]" 369 | else: 370 | # 10% of the time, keep original 371 | if rng.random() < 0.5: 372 | masked_token = tokens[index] 373 | # 10% of the time, replace with random word 374 | else: 375 | masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] 376 | 377 | output_tokens[index] = masked_token 378 | 379 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) 380 | 381 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 382 | 383 | masked_lm_positions = [] 384 | masked_lm_labels = [] 385 | for p in masked_lms: 386 | masked_lm_positions.append(p.index) 387 | masked_lm_labels.append(p.label) 388 | 389 | return (output_tokens, masked_lm_positions, masked_lm_labels) 390 | 391 | 392 | def 
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): 393 | """Truncates a pair of sequences to a maximum sequence length.""" 394 | while True: 395 | total_length = len(tokens_a) + len(tokens_b) 396 | if total_length <= max_num_tokens: 397 | break 398 | 399 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 400 | assert len(trunc_tokens) >= 1 401 | 402 | # We want to sometimes truncate from the front and sometimes from the 403 | # back to add more randomness and avoid biases. 404 | if rng.random() < 0.5: 405 | del trunc_tokens[0] 406 | else: 407 | trunc_tokens.pop() 408 | 409 | 410 | def main(_): 411 | tf.logging.set_verbosity(tf.logging.INFO) 412 | 413 | tokenizer = tokenization.FullTokenizer( 414 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 415 | 416 | input_files = [] 417 | for input_pattern in FLAGS.input_file.split(","): 418 | input_files.extend(tf.gfile.Glob(input_pattern)) 419 | 420 | tf.logging.info("*** Reading from input files ***") 421 | for input_file in input_files: 422 | tf.logging.info(" %s", input_file) 423 | 424 | rng = random.Random(FLAGS.random_seed) 425 | instances = create_training_instances( 426 | input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, 427 | FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, 428 | rng) 429 | 430 | output_files = FLAGS.output_file.split(",") 431 | tf.logging.info("*** Writing to output files ***") 432 | for output_file in output_files: 433 | tf.logging.info(" %s", output_file) 434 | 435 | write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, 436 | FLAGS.max_predictions_per_seq, output_files) 437 | 438 | 439 | if __name__ == "__main__": 440 | flags.mark_flag_as_required("input_file") 441 | flags.mark_flag_as_required("output_file") 442 | flags.mark_flag_as_required("vocab_file") 443 | tf.app.run() 444 | -------------------------------------------------------------------------------- /archive/poc_bash_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #export CUDA_VISIBLE_DEVICES=1 3 | export BERT_DIR="/home/Anne/uncased_L-12_H-768_A-12" 4 | export BERT_CONFIG=$BERT_DIR/bert_config.json 5 | export VOCAB_DIR=$BERT_DIR/vocab.txt 6 | export PATH_SUFFIX="/sentences/free-wo-nsp" 7 | export BERT_EXTENDED_DIR="/home/Anne/ConceptBERT/output/pretraining${PATH_SUFFIX}" 8 | export OUTPUT_DIR="/home/Anne/ConceptBERT/output/finetuning${PATH_SUFFIX}" 9 | export GLUE_DIR="/home/Anne/ConceptBERT/data/glue_data" 10 | export S3_PATH="~/test/output/finetuning${PATH_SUFFIX}" 11 | 12 | for STEP in "25000"; do 13 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 14 | for task_name in "SST2"; do 15 | 16 | # Copy the data to s3 17 | for dir in ${OUTPUT_DIR}/${STEP}/*/; do 18 | #for dir in /home/Anne/ConceptBERT/output/finetuning/sentences/free-wo-nsp/25000/CoLA*; do 19 | echo "DIR ${dir}" 20 | for filename in ${dir}*; do 21 | echo "FILENAME ${filename}" 22 | 23 | #IFS='/' # hyphen (-) is set as delimiter 24 | #declare -a PARTS 25 | #read -ra PARTS <<< ${FILE} # str is read into an array as tokens separated by IFS 26 | #echo "PARTS ${PARTS}" 27 | FILE=${filename##*/} 28 | echo ${FILE} 29 | temp=${filename%/*} 30 | SUBDIR=${temp##*/} 31 | echo ${SUBDIR} 32 | 33 | #S3="${S3_PATH}/${STEP}/${PARTS[${#PARTS[@]}-2]}/${PARTS[${#PARTS[@]}-1]}" 34 | S3=${S3_PATH}/${STEP}/${SUBDIR}/${FILE} 35 | #S3="${S3_PATH}/${STEP}/${filename}" 36 | echo "S3 ${S3}" 37 | waws --uploadS3 -b wluper-retrograph -f 
"${filename}" -l "${S3}" 38 | done 39 | done 40 | #waws --uploadS3 -b wluper-retrograph -f $OUTPUT_DIR/${STEP}/${task_name}/ -l $S3_PATH/${STEP}/${task_name}/ 41 | #rm -r $OUTPUT_DIR/${STEP}/${task_name}* 42 | done 43 | done -------------------------------------------------------------------------------- /archive/poc_create_pretraining_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #--input_file=./data/omcs-sentences-more-filtered.txt \ 3 | #--output_file=./data/omcs-sentences-more-filtered.tfrecord \ 4 | 5 | python create_pretraining_data.py \ 6 | --input_file=./data/omcs-sentences-free-filtered-3.txt \ 7 | --output_file=./data/omcs-sentences-free-filtered.tfrecord \ 8 | --vocab_file=/c/Users/anlausch/Downloads/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/vocab.txt \ 9 | --do_lower_case=True \ 10 | --max_seq_length=128 \ 11 | --max_predictions_per_seq=20 \ 12 | --masked_lm_prob=0.15 \ 13 | --random_seed=12345 \ 14 | --dupe_factor=5 15 | -------------------------------------------------------------------------------- /archive/poc_create_pretraining_data_rw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python create_pretraining_data.py \ 4 | --input_file=./data/rw_corpus_1.0_1.0_2_10_2.txt \ 5 | --output_file=./data/rw_corpus_1.0_1.0_2_10_cn_relations_2.tfrecord \ 6 | --vocab_file=/work/anlausch/uncased_L-12_H-768_A-12/vocab_cn_relations.txt \ 7 | --do_lower_case=True \ 8 | --max_seq_length=128 \ 9 | --max_predictions_per_seq=20 \ 10 | --masked_lm_prob=0.15 \ 11 | --random_seed=12345 \ 12 | --dupe_factor=5 |& tee ./data/cn_relations_2.out 13 | 14 | python create_pretraining_data.py \ 15 | --input_file=./data/rw_corpus_1.0_1.0_2_10_3.txt \ 16 | --output_file=./data/rw_corpus_1.0_1.0_2_10_cn_relations_3.tfrecord \ 17 | --vocab_file=/work/anlausch/uncased_L-12_H-768_A-12/vocab_cn_relations_2.txt \ 18 | --do_lower_case=True \ 19 | --max_seq_length=128 \ 20 | --max_predictions_per_seq=20 \ 21 | --masked_lm_prob=0.15 \ 22 | --random_seed=12345 \ 23 | --dupe_factor=5 |& tee ./data/cn_relations_3.out 24 | 25 | python create_pretraining_data.py \ 26 | --input_file=./data/rw_corpus_1.0_1.0_2_10_nl.txt \ 27 | --output_file=./data/rw_corpus_1.0_1.0_2_10_cn_relations_nl.tfrecord \ 28 | --vocab_file=/work/anlausch/uncased_L-12_H-768_A-12/vocab.txt \ 29 | --do_lower_case=True \ 30 | --max_seq_length=128 \ 31 | --max_predictions_per_seq=20 \ 32 | --masked_lm_prob=0.15 \ 33 | --random_seed=12345 \ 34 | --dupe_factor=5 |& tee ./data/cn_relations_nl.out 35 | -------------------------------------------------------------------------------- /archive/poc_create_pretraining_data_wo_nsp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #--input_file=./data/omcs-sentences-more-filtered.txt \ 3 | #--output_file=./data/omcs-sentences-more-filtered.tfrecord \ 4 | 5 | python create_pretraining_data_wo_nsp.py \ 6 | --input_file=./data/omcs-sentences-free-filtered-3.txt \ 7 | --output_file=./data/omcs-sentences-free-filtered-wo-nsp.tfrecord \ 8 | --vocab_file=/c/Users/anlausch/Downloads/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/vocab.txt \ 9 | --do_lower_case=True \ 10 | --max_seq_length=128 \ 11 | --max_predictions_per_seq=20 \ 12 | --masked_lm_prob=0.15 \ 13 | --random_seed=12345 \ 14 | --dupe_factor=5 15 | -------------------------------------------------------------------------------- 
/archive/poc_finetuning.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #export CUDA_VISIBLE_DEVICES=1 3 | BERT_DIR="/home/Anne/uncased_L-12_H-768_A-12" 4 | BERT_CONFIG=$BERT_DIR/bert_config.json 5 | VOCAB_DIR=$BERT_DIR/vocab.txt 6 | PATH_SUFFIX="/sentences/free-wo-nsp" 7 | BERT_EXTENDED_DIR="/home/Anne/ConceptBERT/output/pretraining${PATH_SUFFIX}" 8 | OUTPUT_DIR="/home/Anne/ConceptBERT/output/finetuning${PATH_SUFFIX}" 9 | GLUE_DIR="/home/Anne/ConceptBERT/data/glue_data" 10 | S3_PATH="~/anne/output/finetuning${PATH_SUFFIX}" 11 | 12 | for STEP in "25000" "50000" "75000" "100000"; do 13 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 14 | for task_name in "CoLA" "MRPC" "RTE" "SST2" "QNLIV2" ; do 15 | echo $task_name 16 | echo $CHECKPOINT 17 | 18 | GLUE_DATA="$GLUE_DIR/$task_name" 19 | 20 | python run_classifier.py \ 21 | --task_name=$task_name \ 22 | --do_train=true \ 23 | --do_eval=true \ 24 | --do_early_stopping=false \ 25 | --data_dir=$GLUE_DATA \ 26 | --vocab_file=$VOCAB_DIR \ 27 | --bert_config_file=$BERT_CONFIG \ 28 | --init_checkpoint=$CHECKPOINT\ 29 | --max_seq_length=128 \ 30 | --train_batch_size="[16]" \ 31 | --learning_rate="[2e-5, 3e-5]" \ 32 | --num_train_epochs="[3,4]" \ 33 | --original_model=True \ 34 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 35 | 36 | # Copy the data to s3 37 | for dir in ${OUTPUT_DIR}/${STEP}/*/; do 38 | #for dir in /home/Anne/ConceptBERT/output/finetuning/sentences/free-wo-nsp/25000/CoLA*; do 39 | echo "DIR ${dir}" 40 | for filename in ${dir}*; do 41 | echo "FILENAME ${filename}" 42 | 43 | #IFS='/' # hyphen (-) is set as delimiter 44 | #declare -a PARTS 45 | #read -ra PARTS <<< ${FILE} # str is read into an array as tokens separated by IFS 46 | #echo "PARTS ${PARTS}" 47 | FILE=${filename##*/} 48 | echo ${FILE} 49 | temp=${filename%/*} 50 | SUBDIR=${temp##*/} 51 | echo ${SUBDIR} 52 | 53 | #S3="${S3_PATH}/${STEP}/${PARTS[${#PARTS[@]}-2]}/${PARTS[${#PARTS[@]}-1]}" 54 | S3=${S3_PATH}/${STEP}/${SUBDIR}/${FILE} 55 | #S3="${S3_PATH}/${STEP}/${filename}" 56 | echo "S3 ${S3}" 57 | waws --uploadS3 -b wluper-retrograph -f "${filename}" -l "${S3}" 58 | done 59 | done 60 | #waws --uploadS3 -b wluper-retrograph -f $OUTPUT_DIR/${STEP}/${task_name}/ -l $S3_PATH/${STEP}/${task_name}/ 61 | rm -r $OUTPUT_DIR/${STEP}/${task_name}* 62 | done 63 | 64 | 65 | 66 | for task_name in "STSB" ; do 67 | echo $task_name 68 | export GLUE_DATA="$GLUE_DIR/$task_name" 69 | 70 | python run_regression.py \ 71 | --task_name=$task_name \ 72 | --do_train=true \ 73 | --do_eval=true \ 74 | --do_early_stopping=false \ 75 | --data_dir=$GLUE_DATA \ 76 | --vocab_file=$VOCAB_DIR \ 77 | --bert_config_file=$BERT_CONFIG \ 78 | --init_checkpoint=$CHECKPOINT\ 79 | --max_seq_length=128 \ 80 | --train_batch_size="[16]" \ 81 | --learning_rate="[2e-5, 3e-5]" \ 82 | --num_train_epochs="[3,4]" \ 83 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 84 | 85 | # Copy the data to s3 86 | for dir in ${OUTPUT_DIR}/${STEP}/*/; do 87 | #for dir in /home/Anne/ConceptBERT/output/finetuning/sentences/free-wo-nsp/25000/CoLA*; do 88 | echo "DIR ${dir}" 89 | for filename in ${dir}*; do 90 | echo "FILENAME ${filename}" 91 | 92 | #IFS='/' # hyphen (-) is set as delimiter 93 | #declare -a PARTS 94 | #read -ra PARTS <<< ${FILE} # str is read into an array as tokens separated by IFS 95 | #echo "PARTS ${PARTS}" 96 | FILE=${filename##*/} 97 | echo ${FILE} 98 | 
temp=${filename%/*} 99 | SUBDIR=${temp##*/} 100 | echo ${SUBDIR} 101 | 102 | #S3="${S3_PATH}/${STEP}/${PARTS[${#PARTS[@]}-2]}/${PARTS[${#PARTS[@]}-1]}" 103 | S3=${S3_PATH}/${STEP}/${SUBDIR}/${FILE} 104 | #S3="${S3_PATH}/${STEP}/${filename}" 105 | echo "S3 ${S3}" 106 | waws --uploadS3 -b wluper-retrograph -f "${filename}" -l "${S3}" 107 | done 108 | done 109 | #waws --uploadS3 -b wluper-retrograph -f $OUTPUT_DIR/${STEP}/${task_name}/ -l $S3_PATH/${STEP}/${task_name}/ 110 | rm -r $OUTPUT_DIR/${STEP}/${task_name}* 111 | done 112 | done -------------------------------------------------------------------------------- /archive/poc_finetuning_adapter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | BERT_CONFIG=$BERT_DIR/bert_config.json 6 | VOCAB_DIR=$BERT_DIR/vocab.txt 7 | 8 | PATH_SUFFIX="/omcs/free-wo-nsp-adapter" 9 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}" 10 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}" 11 | 12 | for STEP in "25000" "50000" "75000" "100000"; do 13 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 14 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2" 15 | echo $task_name 16 | echo $CHECKPOINT 17 | 18 | GLUE_DATA="$GLUE_DIR/$task_name" 19 | 20 | python run_classifier_adapter.py \ 21 | --task_name=$task_name \ 22 | --do_train=true \ 23 | --do_eval=true \ 24 | --do_early_stopping=false \ 25 | --data_dir=$GLUE_DATA \ 26 | --vocab_file=$VOCAB_DIR \ 27 | --bert_config_file=$BERT_CONFIG \ 28 | --init_checkpoint=$CHECKPOINT\ 29 | --max_seq_length=128 \ 30 | --train_batch_size="[16]" \ 31 | --learning_rate="[2e-5, 3e-5]" \ 32 | --num_train_epochs="[3,4]" \ 33 | --original_model=True \ 34 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 35 | done 36 | 37 | for task_name in "STSB" ; do 38 | echo $task_name 39 | export GLUE_DATA="$GLUE_DIR/$task_name" 40 | 41 | python run_regression_adapter.py \ 42 | --task_name=$task_name \ 43 | --do_train=true \ 44 | --do_eval=true \ 45 | --do_early_stopping=false \ 46 | --data_dir=$GLUE_DATA \ 47 | --vocab_file=$VOCAB_DIR \ 48 | --bert_config_file=$BERT_CONFIG \ 49 | --init_checkpoint=$CHECKPOINT\ 50 | --max_seq_length=128 \ 51 | --train_batch_size="[16]" \ 52 | --learning_rate="[2e-5, 3e-5]" \ 53 | --num_train_epochs="[3,4]" \ 54 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 55 | done 56 | done 57 | 58 | OUTPUT_SUFFIX=_tune_all 59 | ### the second finetuning variant 60 | for STEP in "25000" "50000" "75000" "100000"; do 61 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 62 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2" 63 | echo $task_name 64 | echo $CHECKPOINT 65 | 66 | GLUE_DATA="$GLUE_DIR/$task_name" 67 | 68 | python run_classifier_adapter_tune_all.py \ 69 | --task_name=$task_name \ 70 | --do_train=true \ 71 | --do_eval=true \ 72 | --do_early_stopping=false \ 73 | --data_dir=$GLUE_DATA \ 74 | --vocab_file=$VOCAB_DIR \ 75 | --bert_config_file=$BERT_CONFIG \ 76 | --init_checkpoint=$CHECKPOINT\ 77 | --max_seq_length=128 \ 78 | --train_batch_size="[16]" \ 79 | --learning_rate="[2e-5, 3e-5]" \ 80 | --num_train_epochs="[3,4]" \ 81 | --original_model=True \ 82 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 83 | done 84 | 85 | for task_name 
in "STSB" ; do 86 | echo $task_name 87 | export GLUE_DATA="$GLUE_DIR/$task_name" 88 | 89 | python run_regression_adapter_tune_all.py \ 90 | --task_name=$task_name \ 91 | --do_train=true \ 92 | --do_eval=true \ 93 | --do_early_stopping=false \ 94 | --data_dir=$GLUE_DATA \ 95 | --vocab_file=$VOCAB_DIR \ 96 | --bert_config_file=$BERT_CONFIG \ 97 | --init_checkpoint=$CHECKPOINT\ 98 | --max_seq_length=128 \ 99 | --train_batch_size="[16]" \ 100 | --learning_rate="[2e-5, 3e-5]" \ 101 | --num_train_epochs="[3,4]" \ 102 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 103 | done 104 | done 105 | 106 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter" 107 | for STEP in "25000" "50000" "75000" "100000"; do 108 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 109 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2" 110 | echo $task_name 111 | echo $CHECKPOINT 112 | 113 | GLUE_DATA="$GLUE_DIR/$task_name" 114 | 115 | python run_classifier_adapter.py \ 116 | --task_name=$task_name \ 117 | --do_train=true \ 118 | --do_eval=true \ 119 | --do_early_stopping=false \ 120 | --data_dir=$GLUE_DATA \ 121 | --vocab_file=$VOCAB_DIR \ 122 | --bert_config_file=$BERT_CONFIG \ 123 | --init_checkpoint=$CHECKPOINT\ 124 | --max_seq_length=128 \ 125 | --train_batch_size="[16]" \ 126 | --learning_rate="[2e-5, 3e-5]" \ 127 | --num_train_epochs="[3,4]" \ 128 | --original_model=True \ 129 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 130 | done 131 | 132 | for task_name in "STSB" ; do 133 | echo $task_name 134 | export GLUE_DATA="$GLUE_DIR/$task_name" 135 | 136 | python run_regression_adapter.py \ 137 | --task_name=$task_name \ 138 | --do_train=true \ 139 | --do_eval=true \ 140 | --do_early_stopping=false \ 141 | --data_dir=$GLUE_DATA \ 142 | --vocab_file=$VOCAB_DIR \ 143 | --bert_config_file=$BERT_CONFIG \ 144 | --init_checkpoint=$CHECKPOINT\ 145 | --max_seq_length=128 \ 146 | --train_batch_size="[16]" \ 147 | --learning_rate="[2e-5, 3e-5]" \ 148 | --num_train_epochs="[3,4]" \ 149 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 150 | done 151 | done 152 | 153 | OUTPUT_SUFFIX=_tune_all 154 | ### the second finetuning variant 155 | for STEP in "25000" "50000" "75000" "100000"; do 156 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 157 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2" 158 | echo $task_name 159 | echo $CHECKPOINT 160 | 161 | GLUE_DATA="$GLUE_DIR/$task_name" 162 | 163 | python run_classifier_adapter_tune_all.py \ 164 | --task_name=$task_name \ 165 | --do_train=true \ 166 | --do_eval=true \ 167 | --do_early_stopping=false \ 168 | --data_dir=$GLUE_DATA \ 169 | --vocab_file=$VOCAB_DIR \ 170 | --bert_config_file=$BERT_CONFIG \ 171 | --init_checkpoint=$CHECKPOINT\ 172 | --max_seq_length=128 \ 173 | --train_batch_size="[16]" \ 174 | --learning_rate="[2e-5, 3e-5]" \ 175 | --num_train_epochs="[3,4]" \ 176 | --original_model=True \ 177 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 178 | done 179 | 180 | for task_name in "STSB" ; do 181 | echo $task_name 182 | export GLUE_DATA="$GLUE_DIR/$task_name" 183 | 184 | python run_regression_adapter_tune_all.py \ 185 | --task_name=$task_name \ 186 | --do_train=true \ 187 | --do_eval=true \ 188 | --do_early_stopping=false \ 189 | --data_dir=$GLUE_DATA \ 190 | --vocab_file=$VOCAB_DIR \ 191 | 
--bert_config_file=$BERT_CONFIG \ 192 | --init_checkpoint=$CHECKPOINT\ 193 | --max_seq_length=128 \ 194 | --train_batch_size="[16]" \ 195 | --learning_rate="[2e-5, 3e-5]" \ 196 | --num_train_epochs="[3,4]" \ 197 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 198 | done 199 | done 200 | -------------------------------------------------------------------------------- /archive/poc_finetuning_adapter_longer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Step 1: 4 | # run_classifier_adapter_tune_all.py 5 | # 6 | # 7 | # Needs to load the adapter model. 8 | # Here it is probably recommended to use the original optimiser, as it also optimises BERT. 9 | 10 | 11 | export CUDA_VISIBLE_DEVICES=8 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | VOCAB_DIR=$BERT_DIR/vocab.txt 16 | 17 | BERT_EXTENDED_DIR="data/output_pretrain_adapter" 18 | OUTPUT_DIR="data/output_model_finetunning" 19 | 20 | GLUE_DIR='data/GLUE' 21 | 22 | OUTPUT_SUFFIX=_tune_all 23 | ### the second finetuning variant 24 | for STEP in "98000" "99000"; do 25 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 26 | for task_name in "QNLI" "QQP" "MNLI"; do 27 | echo $task_name 28 | echo $CHECKPOINT 29 | 30 | GLUE_DATA="$GLUE_DIR/$task_name" 31 | 32 | python run_classifier_adapter_tune_all.py \ 33 | --task_name=$task_name \ 34 | --do_train=true \ 35 | --do_eval=true \ 36 | --do_early_stopping=false \ 37 | --data_dir=$GLUE_DATA \ 38 | --vocab_file=$VOCAB_DIR \ 39 | --bert_config_file=$BERT_CONFIG \ 40 | --init_checkpoint=$CHECKPOINT\ 41 | --max_seq_length=128 \ 42 | --train_batch_size="[16]" \ 43 | --learning_rate="[2e-5, 3e-5]" \ 44 | --num_train_epochs="[3,4]" \ 45 | --original_model=True \ 46 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 47 | done 48 | done 49 | -------------------------------------------------------------------------------- /archive/poc_finetuning_adapter_longer_2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | BERT_CONFIG=$BERT_DIR/bert_config.json 6 | VOCAB_DIR=$BERT_DIR/vocab.txt 7 | 8 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter" 9 | OUTPUT_SUFFIX=_tune_all 10 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}" 11 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}" 12 | 13 | ### the second finetuning variant 14 | for STEP in "25000" "100000"; do 15 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 16 | for task_name in "QNLIV2" "QQP" "MNLI"; do 17 | echo $task_name 18 | echo $CHECKPOINT 19 | 20 | GLUE_DATA="$GLUE_DIR/$task_name" 21 | 22 | python run_classifier_adapter_tune_all.py \ 23 | --task_name=$task_name \ 24 | --do_train=true \ 25 | --do_eval=true \ 26 | --do_early_stopping=false \ 27 | --data_dir=$GLUE_DATA \ 28 | --vocab_file=$VOCAB_DIR \ 29 | --bert_config_file=$BERT_CONFIG \ 30 | --init_checkpoint=$CHECKPOINT\ 31 | --max_seq_length=128 \ 32 | --train_batch_size="[16]" \ 33 | --learning_rate="[2e-5, 3e-5]" \ 34 | --num_train_epochs="[3,4]" \ 35 | --original_model=True \ 36 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 37 | done 38 | done 39 |
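A minimal sketch of how the sweep logs above could be inspected afterwards: each run tees its console output to ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out, so the final metric per (checkpoint, task) pair can be grepped out of those files. The "eval_accuracy = ..." line format is an assumption about what run_classifier_adapter_tune_all.py prints (BERT-style eval output), not something guaranteed by the scripts above; adjust the pattern to the actual log contents.

# Illustrative only -- not part of the archived runs; assumes BERT-style eval lines in the logs.
for log in "${OUTPUT_DIR}${OUTPUT_SUFFIX}"/*/*.out; do
  last_acc=$(grep -oE 'eval_accuracy = [0-9.]+' "$log" | tail -n 1)
  echo "${log}: ${last_acc:-no eval_accuracy line found}"
done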
-------------------------------------------------------------------------------- /archive/poc_finetuning_adapter_quick_insight.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=3 4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | BERT_CONFIG=$BERT_DIR/bert_config.json 6 | VOCAB_DIR=$BERT_DIR/vocab.txt 7 | 8 | 9 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter" 10 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}" 11 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}" 12 | 13 | OUTPUT_SUFFIX=_tune_all_quick_insight 14 | ### the second finetuning variant 15 | for STEP in "25000" "50000" "75000" "100000"; do 16 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 17 | for task_name in "CoLA" "MRPC" "RTE"; do #"QNLIV2" 18 | echo $task_name 19 | echo $CHECKPOINT 20 | 21 | GLUE_DATA="$GLUE_DIR/$task_name" 22 | 23 | python run_classifier_adapter_tune_all.py \ 24 | --task_name=$task_name \ 25 | --do_train=true \ 26 | --do_eval=true \ 27 | --do_early_stopping=false \ 28 | --data_dir=$GLUE_DATA \ 29 | --vocab_file=$VOCAB_DIR \ 30 | --bert_config_file=$BERT_CONFIG \ 31 | --init_checkpoint=$CHECKPOINT\ 32 | --max_seq_length=128 \ 33 | --train_batch_size="[16]" \ 34 | --learning_rate="[2e-5, 3e-5]" \ 35 | --num_train_epochs="[3,4]" \ 36 | --original_model=True \ 37 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 38 | done 39 | 40 | for task_name in "STSB" ; do 41 | echo $task_name 42 | export GLUE_DATA="$GLUE_DIR/$task_name" 43 | 44 | python run_regression_adapter_tune_all.py \ 45 | --task_name=$task_name \ 46 | --do_train=true \ 47 | --do_eval=true \ 48 | --do_early_stopping=false \ 49 | --data_dir=$GLUE_DATA \ 50 | --vocab_file=$VOCAB_DIR \ 51 | --bert_config_file=$BERT_CONFIG \ 52 | --init_checkpoint=$CHECKPOINT\ 53 | --max_seq_length=128 \ 54 | --train_batch_size="[16]" \ 55 | --learning_rate="[2e-5, 3e-5]" \ 56 | --num_train_epochs="[3,4]" \ 57 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 58 | done 59 | done 60 | -------------------------------------------------------------------------------- /archive/poc_finetuning_adapter_sst2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1 4 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | BERT_CONFIG=$BERT_DIR/bert_config.json 6 | VOCAB_DIR=$BERT_DIR/vocab.txt 7 | 8 | 9 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl-adapter" 10 | OUTPUT_SUFFIX=_tune_all_quick_insight 11 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}" 12 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}" 13 | 14 | ### the second finetuning variant 15 | for STEP in "25000" "100000"; do 16 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 17 | for task_name in "SST2"; do 18 | echo $task_name 19 | echo $CHECKPOINT 20 | 21 | GLUE_DATA="$GLUE_DIR/$task_name" 22 | 23 | python run_classifier_adapter_tune_all.py \ 24 | --task_name=$task_name \ 25 | --do_train=true \ 26 | --do_eval=true \ 27 | --do_early_stopping=false \ 28 | --data_dir=$GLUE_DATA \ 29 | --vocab_file=$VOCAB_DIR \ 30 | --bert_config_file=$BERT_CONFIG \ 31 | --init_checkpoint=$CHECKPOINT\ 32 | --max_seq_length=128 \ 33 | --train_batch_size="[16]" \ 34 | --learning_rate="[2e-5, 
3e-5]" \ 35 | --num_train_epochs="[3,4]" \ 36 | --original_model=True \ 37 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} |& tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 38 | done 39 | 40 | done 41 | -------------------------------------------------------------------------------- /archive/poc_finetuning_dws.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 4 | BERT_CONFIG=$BERT_DIR/bert_config.json 5 | VOCAB_DIR=$BERT_DIR/vocab.txt 6 | #PATH_SUFFIX="/omcs/free-wo-nsp" 7 | PATH_SUFFIX="/omcs/free-wo-nsp-no-warmup" 8 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}" 9 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}" 10 | 11 | for STEP in "25000" "50000" "75000" "100000"; do 12 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 13 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2" 14 | echo $task_name 15 | echo $CHECKPOINT 16 | 17 | GLUE_DATA="$GLUE_DIR/$task_name" 18 | 19 | python run_classifier.py \ 20 | --task_name=$task_name \ 21 | --do_train=true \ 22 | --do_eval=true \ 23 | --do_early_stopping=false \ 24 | --data_dir=$GLUE_DATA \ 25 | --vocab_file=$VOCAB_DIR \ 26 | --bert_config_file=$BERT_CONFIG \ 27 | --init_checkpoint=$CHECKPOINT\ 28 | --max_seq_length=128 \ 29 | --train_batch_size="[16]" \ 30 | --learning_rate="[2e-5, 3e-5]" \ 31 | --num_train_epochs="[3,4]" \ 32 | --original_model=True \ 33 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 34 | done 35 | 36 | for task_name in "STSB" ; do 37 | echo $task_name 38 | export GLUE_DATA="$GLUE_DIR/$task_name" 39 | 40 | python run_regression.py \ 41 | --task_name=$task_name \ 42 | --do_train=true \ 43 | --do_eval=true \ 44 | --do_early_stopping=false \ 45 | --data_dir=$GLUE_DATA \ 46 | --vocab_file=$VOCAB_DIR \ 47 | --bert_config_file=$BERT_CONFIG \ 48 | --init_checkpoint=$CHECKPOINT\ 49 | --max_seq_length=128 \ 50 | --train_batch_size="[16]" \ 51 | --learning_rate="[2e-5, 3e-5]" \ 52 | --num_train_epochs="[3,4]" \ 53 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 54 | done 55 | done -------------------------------------------------------------------------------- /archive/poc_finetuning_rw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export CUDA_VISIBLE_DEVICES=3 3 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 4 | BERT_CONFIG=$BERT_DIR/bert_config.json 5 | VOCAB_DIR=$BERT_DIR/vocab.txt 6 | PATH_SUFFIX="/rw/1.0_1.0_2_10/nl" 7 | BERT_EXTENDED_DIR="/work/anlausch/ConceptBERT/output/pretraining${PATH_SUFFIX}" 8 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/finetuning${PATH_SUFFIX}" 9 | 10 | for STEP in "25000" "50000" "75000" "100000"; do 11 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 12 | for task_name in "CoLA" "MRPC" "RTE" "SST2"; do #"QNLIV2" 13 | echo $task_name 14 | echo $CHECKPOINT 15 | 16 | GLUE_DATA="$GLUE_DIR/$task_name" 17 | 18 | python run_classifier.py \ 19 | --task_name=$task_name \ 20 | --do_train=true \ 21 | --do_eval=true \ 22 | --do_early_stopping=false \ 23 | --data_dir=$GLUE_DATA \ 24 | --vocab_file=$VOCAB_DIR \ 25 | --bert_config_file=$BERT_CONFIG \ 26 | --init_checkpoint=$CHECKPOINT\ 27 | --max_seq_length=128 \ 28 | --train_batch_size="[16]" \ 29 | --learning_rate="[2e-5, 3e-5]" \ 30 | 
--num_train_epochs="[3,4]" \ 31 | --original_model=True \ 32 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 33 | done 34 | 35 | for task_name in "STSB" ; do 36 | echo $task_name 37 | export GLUE_DATA="$GLUE_DIR/$task_name" 38 | 39 | python run_regression.py \ 40 | --task_name=$task_name \ 41 | --do_train=true \ 42 | --do_eval=true \ 43 | --do_early_stopping=false \ 44 | --data_dir=$GLUE_DATA \ 45 | --vocab_file=$VOCAB_DIR \ 46 | --bert_config_file=$BERT_CONFIG \ 47 | --init_checkpoint=$CHECKPOINT\ 48 | --max_seq_length=128 \ 49 | --train_batch_size="[16]" \ 50 | --learning_rate="[2e-5, 3e-5]" \ 51 | --num_train_epochs="[3,4]" \ 52 | --output_dir=$OUTPUT_DIR/${STEP}/${task_name} |& tee $OUTPUT_DIR/${STEP}/${task_name}.out 53 | done 54 | done -------------------------------------------------------------------------------- /archive/poc_pretraining.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "script started" 3 | 4 | INPUT_FILE="/home/Anne/ConceptBERT/data/omcs-sentences-free-filtered-wo-nsp.tfrecord" 5 | OUTPUT_DIR="/home/Anne/ConceptBERT/output/pretraining/sentences/free-wo-nsp/" 6 | NUM_TRAIN_STEPS=100000 7 | BERT_DIR="/home/Anne/uncased_L-12_H-768_A-12" 8 | BERT_CONFIG=$BERT_DIR/bert_config.json 9 | # TODO: Here is an error!!! We should run this again and change run_pretraining to run_pretraining_wo_nsp 10 | # 11 | python run_pretraining.py \ 12 | --input_file=$INPUT_FILE \ 13 | --output_dir=$OUTPUT_DIR \ 14 | --do_train=True \ 15 | --do_eval=True \ 16 | --bert_config_file=$BERT_CONFIG \ 17 | --train_batch_size=16 \ 18 | --eval_batch_size=8 \ 19 | --max_seq_length=128 \ 20 | --max_predictions_per_seq=20 \ 21 | --num_train_steps=$NUM_TRAIN_STEPS \ 22 | --num_warmup_steps=1000 \ 23 | --learning_rate=1e-4 \ 24 | --max_eval_steps=1000 \ 25 | --save_checkpoints_steps=25000 \ 26 | --init_checkpoint=$BERT_DIR/bert_model.ckpt -------------------------------------------------------------------------------- /archive/poc_pretraining_dws.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "script started" 3 | echo "No warmup here" 4 | export CUDA_VISIBLE_DEVICES=3 5 | 6 | INPUT_FILE="/work/anlausch/ConceptBERT/data/omcs-sentences-free-filtered-wo-nsp.tfrecord" 7 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/pretraining/omcs/free-wo-nsp-no-warmup/" 8 | NUM_TRAIN_STEPS=100000 9 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 10 | BERT_CONFIG=$BERT_DIR/bert_config.json 11 | 12 | # TODO: Here is an error!!! 
We should run this again and change run_pretraining to run_pretraining_wo_nsp 13 | # 14 | python run_pretraining.py \ 15 | --input_file=$INPUT_FILE \ 16 | --output_dir=$OUTPUT_DIR \ 17 | --do_train=True \ 18 | --do_eval=True \ 19 | --bert_config_file=$BERT_CONFIG \ 20 | --train_batch_size=16 \ 21 | --eval_batch_size=8 \ 22 | --max_seq_length=128 \ 23 | --max_predictions_per_seq=20 \ 24 | --num_train_steps=$NUM_TRAIN_STEPS \ 25 | --num_warmup_steps=0 \ 26 | --learning_rate=1e-4 \ 27 | --max_eval_steps=1000 \ 28 | --save_checkpoints_steps=25000 \ 29 | --init_checkpoint=$BERT_DIR/bert_model.ckpt -------------------------------------------------------------------------------- /archive/poc_pretraining_rw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "script started" 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | INPUT_FILE="/work/anlausch/ConceptBERT/data/rw_corpus_1.0_1.0_2_10_cn_relations_nl.tfrecord" 6 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/pretraining/rw/1.0_1.0_2_10/nl/" 7 | NUM_TRAIN_STEPS=100000 8 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 9 | BERT_CONFIG=$BERT_DIR/bert_config.json 10 | 11 | python run_pretraining.py \ 12 | --input_file=$INPUT_FILE \ 13 | --output_dir=$OUTPUT_DIR \ 14 | --do_train=True \ 15 | --do_eval=True \ 16 | --bert_config_file=$BERT_CONFIG \ 17 | --train_batch_size=16 \ 18 | --eval_batch_size=8 \ 19 | --max_seq_length=128 \ 20 | --max_predictions_per_seq=20 \ 21 | --num_train_steps=$NUM_TRAIN_STEPS \ 22 | --num_warmup_steps=10000 \ 23 | --learning_rate=1e-4 \ 24 | --max_eval_steps=1000 \ 25 | --save_checkpoints_steps=25000 \ 26 | --init_checkpoint=$BERT_DIR/bert_model.ckpt 27 | 28 | 29 | INPUT_FILE="/work/anlausch/ConceptBERT/data/rw_corpus_1.0_1.0_2_10_cn_relations_2.tfrecord" 30 | OUTPUT_DIR="/work/anlausch/ConceptBERT/output/pretraining/rw/1.0_1.0_2_10/cn_relations/" 31 | NUM_TRAIN_STEPS=100000 32 | BERT_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 33 | BERT_CONFIG=$BERT_DIR/bert_config_cn_relations.json 34 | 35 | python run_pretraining.py \ 36 | --input_file=$INPUT_FILE \ 37 | --output_dir=$OUTPUT_DIR \ 38 | --do_train=True \ 39 | --do_eval=True \ 40 | --bert_config_file=$BERT_CONFIG \ 41 | --train_batch_size=16 \ 42 | --eval_batch_size=8 \ 43 | --max_seq_length=128 \ 44 | --max_predictions_per_seq=20 \ 45 | --num_train_steps=$NUM_TRAIN_STEPS \ 46 | --num_warmup_steps=10000 \ 47 | --learning_rate=1e-4 \ 48 | --max_eval_steps=1000 \ 49 | --save_checkpoints_steps=25000 \ 50 | --init_checkpoint=$BERT_DIR/bert_model.ckpt -------------------------------------------------------------------------------- /archive/prediction_diagnostic.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # general config 3 | export CUDA_VISIBLE_DEVICES=; 4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt 6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json 7 | GLUE_DATA="$GLUE_DIR" 8 | STEP_NUMBER=25000 9 | 10 | # root dir of your checkpoints 11 | # ROOT="/work/anlausch/ConceptBERT/output/finetuning/omcs/free-wo-nsp-adapter_tune_all/${STEP_NUMBER}/" 12 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/" 13 | 14 | # this is the mnli model which was best on the matched dataset 15 | for config in "MNLI_16_2e-05_3/model.ckpt-73631","diagnostic"; do 16 | IFS="," 17 | set -- $config 18 | echo $1 and $2 19 | TASK=$2 20 | 21 | # location of the checkpoint 
which was best on dev 22 | TRAINED_CLASSIFIER=${ROOT}${1} 23 | OUTPUT_DIR=${ROOT}predictions/${TASK} 24 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 25 | python run_classifier_adapter_tune_all.py \ 26 | --task_name=${TASK} \ 27 | --do_predict=true \ 28 | --do_train=false \ 29 | --do_eval=false \ 30 | --data_dir=$GLUE_DIR/${TASK} \ 31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 32 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 33 | --init_checkpoint=$TRAINED_CLASSIFIER \ 34 | --do_early_stopping=false \ 35 | --max_seq_length=128 \ 36 | --original_model=True \ 37 | --matched=False \ 38 | --output_dir=${OUTPUT_DIR} 39 | 40 | # this is a parser I wrote which should output the predictions in the glue platform format 41 | python parse_predictions.py \ 42 | --task=${TASK} \ 43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 45 | done -------------------------------------------------------------------------------- /archive/predictions_rw_100000.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # general config 3 | export CUDA_VISIBLE_DEVICES=; 4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt 6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json 7 | GLUE_DATA="$GLUE_DIR" 8 | STEP_NUMBER=100000 9 | 10 | # root dir of your checkpoints 11 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/${STEP_NUMBER}/" 12 | 13 | # this is a tuple of trained model and task, you can add more tuples 14 | # todo: sst2 missing here 15 | for config in "CoLA_16_2e-05_4/model.ckpt-2137","CoLA" "MRPC_16_2e-05_3/model.ckpt-687","MRPC" "RTE_16_3e-05_4/model.ckpt-622","RTE"; do 16 | IFS="," 17 | set -- $config 18 | echo $1 and $2 19 | TASK=$2 20 | 21 | # location of the checkpoint which was best on dev 22 | TRAINED_CLASSIFIER=${ROOT}${1} 23 | OUTPUT_DIR=${ROOT}predictions/${TASK} 24 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 25 | python run_classifier.py \ 26 | --task_name=${TASK} \ 27 | --do_predict=true \ 28 | --do_train=false \ 29 | --do_eval=false \ 30 | --data_dir=$GLUE_DIR/${TASK} \ 31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 32 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 33 | --init_checkpoint=$TRAINED_CLASSIFIER \ 34 | --do_early_stopping=false \ 35 | --max_seq_length=128 \ 36 | --original_model=True \ 37 | --matched=False \ 38 | --output_dir=${OUTPUT_DIR} 39 | 40 | # this is a parser I wrote which should output the predictions in the glue platform format 41 | python parse_predictions.py \ 42 | --task=${TASK} \ 43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 45 | done 46 | 47 | for config in "STSB_16_2e-05_4/model.ckpt-1437","STSB"; do 48 | IFS="," 49 | set -- $config 50 | echo $1 and $2 51 | TASK=$2 52 | 53 | # location of the checkpoint which was best on dev 54 | TRAINED_CLASSIFIER=${ROOT}${1} 55 | OUTPUT_DIR=${ROOT}predictions/${TASK} 56 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 57 | python run_regression.py \ 58 | --task_name=${TASK} \ 59 | --do_predict=true \ 60 | --do_train=false \ 61 | --do_eval=false \ 62 | --data_dir=$GLUE_DIR/${TASK} \ 63 | 
--vocab_file=$BERT_BASE_DIR/vocab.txt \ 64 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 65 | --init_checkpoint=$TRAINED_CLASSIFIER \ 66 | --do_early_stopping=false \ 67 | --max_seq_length=128 \ 68 | --original_model=True \ 69 | --matched=False \ 70 | --output_dir=${OUTPUT_DIR} 71 | 72 | # this is a parser I wrote which should output the predictions in the glue platform format 73 | python parse_predictions.py \ 74 | --task=${TASK} \ 75 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 76 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 77 | done 78 | 79 | 80 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/" 81 | 82 | # TODO: MNLI is missing here 83 | # this is a tuple of trained model and task, you can add more tuples 84 | for config in "QNLIV2_16_3e-05_3/model.ckpt-19639","QNLIV2" "QQP_16_3e-05_4/model.ckpt-90962","QQP"; do 85 | IFS="," 86 | set -- $config 87 | echo $1 and $2 88 | TASK=$2 89 | 90 | # location of the checkpoint which was best on dev 91 | TRAINED_CLASSIFIER=${ROOT}${1} 92 | OUTPUT_DIR=${ROOT}predictions/${TASK} 93 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 94 | python run_classifier.py \ 95 | --task_name=${TASK} \ 96 | --do_predict=true \ 97 | --do_train=false \ 98 | --do_eval=false \ 99 | --data_dir=$GLUE_DIR/${TASK} \ 100 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 101 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 102 | --init_checkpoint=$TRAINED_CLASSIFIER \ 103 | --do_early_stopping=false \ 104 | --max_seq_length=128 \ 105 | --original_model=True \ 106 | --matched=False \ 107 | --output_dir=${OUTPUT_DIR} 108 | 109 | # this is a parser I wrote which should output the predictions in the glue platform format 110 | python parse_predictions.py \ 111 | --task=${TASK} \ 112 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 113 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 114 | done 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /archive/predictions_rw_100000_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # general config 3 | export CUDA_VISIBLE_DEVICES=1; 4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt 6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json 7 | GLUE_DATA="$GLUE_DIR" 8 | STEP_NUMBER=100000 9 | 10 | # root dir of your checkpoints 11 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/${STEP_NUMBER}/" 12 | 13 | # this is a tuple of trained model and task, you can add more tuples 14 | 15 | for config in "SST2_16_3e-05_3/model.ckpt-12627","SST2" "CoLA_16_2e-05_4/model.ckpt-2137","CoLA" "MRPC_16_2e-05_3/model.ckpt-687","MRPC" "RTE_16_3e-05_4/model.ckpt-622","RTE"; do 16 | IFS="," 17 | set -- $config 18 | echo $1 and $2 19 | TASK=$2 20 | 21 | # location of the checkpoint which was best on dev 22 | TRAINED_CLASSIFIER=${ROOT}${1} 23 | OUTPUT_DIR=${ROOT}predictions/${TASK} 24 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 25 | python run_classifier_adapter_tune_all.py \ 26 | --task_name=${TASK} \ 27 | --do_predict=true \ 28 | --do_train=false \ 29 | --do_eval=false \ 30 | --data_dir=$GLUE_DIR/${TASK} \ 31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 32 | 
--bert_config_file=$BERT_BASE_DIR/bert_config.json \ 33 | --init_checkpoint=$TRAINED_CLASSIFIER \ 34 | --do_early_stopping=false \ 35 | --max_seq_length=128 \ 36 | --original_model=True \ 37 | --matched=True \ 38 | --output_dir=${OUTPUT_DIR} 39 | 40 | # this is a parser I wrote which should output the predictions in the glue platform format 41 | python parse_predictions.py \ 42 | --task=${TASK} \ 43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 45 | done 46 | 47 | for config in "STSB_16_2e-05_4/model.ckpt-1437","STSB"; do 48 | IFS="," 49 | set -- $config 50 | echo $1 and $2 51 | TASK=$2 52 | 53 | # location of the checkpoint which was best on dev 54 | TRAINED_CLASSIFIER=${ROOT}${1} 55 | OUTPUT_DIR=${ROOT}predictions/${TASK} 56 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 57 | python run_regression_adapter_tune_all.py \ 58 | --task_name=${TASK} \ 59 | --do_predict=true \ 60 | --do_train=false \ 61 | --do_eval=false \ 62 | --data_dir=$GLUE_DIR/${TASK} \ 63 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 64 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 65 | --init_checkpoint=$TRAINED_CLASSIFIER \ 66 | --do_early_stopping=false \ 67 | --max_seq_length=128 \ 68 | --original_model=True \ 69 | --matched=True \ 70 | --output_dir=${OUTPUT_DIR} 71 | 72 | # this is a parser I wrote which should output the predictions in the glue platform format 73 | python parse_predictions.py \ 74 | --task=${TASK} \ 75 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 76 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 77 | done 78 | 79 | 80 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/" 81 | 82 | 83 | # this is a tuple of trained model and task, you can add more tuples 84 | for config in "QQP_16_3e-05_4/model.ckpt-90962","QQP" "MNLI_16_3e-05_4/model.ckpt-98175","MNLI" "QNLIV2_16_3e-05_3/model.ckpt-19639","QNLIV2"; do 85 | IFS="," 86 | set -- $config 87 | echo $1 and $2 88 | TASK=$2 89 | 90 | # location of the checkpoint which was best on dev 91 | TRAINED_CLASSIFIER=${ROOT}${1} 92 | OUTPUT_DIR=${ROOT}predictions/${TASK} 93 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 94 | python run_classifier_adapter_tune_all.py \ 95 | --task_name=${TASK} \ 96 | --do_predict=true \ 97 | --do_train=false \ 98 | --do_eval=false \ 99 | --data_dir=$GLUE_DIR/${TASK} \ 100 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 101 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 102 | --init_checkpoint=$TRAINED_CLASSIFIER \ 103 | --do_early_stopping=false \ 104 | --max_seq_length=128 \ 105 | --original_model=True \ 106 | --matched=True \ 107 | --output_dir=${OUTPUT_DIR} 108 | 109 | # this is a parser I wrote which should output the predictions in the glue platform format 110 | python parse_predictions.py \ 111 | --task=${TASK} \ 112 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 113 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 114 | done 115 | 116 | 117 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/" 118 | 119 | 120 | # this is a tuple of trained model and task, you can add more tuples 121 | # TODO: Do another dev set evaluation 122 | for config in "MNLI_16_3e-05_4/model.ckpt-98175","MNLI"; do 123 | IFS="," 124 | set -- $config 125 | echo $1 and $2 126 | TASK=$2 
127 | 128 | # location of the checkpoint which was best on dev 129 | TRAINED_CLASSIFIER=${ROOT}${1} 130 | OUTPUT_DIR=${ROOT}predictions/${TASK}-mm 131 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 132 | python run_classifier_adapter_tune_all.py \ 133 | --task_name=${TASK} \ 134 | --do_predict=true \ 135 | --do_train=false \ 136 | --do_eval=false \ 137 | --data_dir=$GLUE_DIR/${TASK} \ 138 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 139 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 140 | --init_checkpoint=$TRAINED_CLASSIFIER \ 141 | --do_early_stopping=false \ 142 | --max_seq_length=128 \ 143 | --original_model=True \ 144 | --matched=False \ 145 | --output_dir=${OUTPUT_DIR} 146 | 147 | # this is a parser I wrote which should output the predictions in the glue platform format 148 | python parse_predictions.py \ 149 | --task=${TASK} \ 150 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 151 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 152 | done 153 | 154 | # this is the mnli model which was best on the matched dataset 155 | for config in "MNLI_16_3e-05_4/model.ckpt-98175","diagnostic"; do 156 | IFS="," 157 | set -- $config 158 | echo $1 and $2 159 | TASK=$2 160 | 161 | # location of the checkpoint which was best on dev 162 | TRAINED_CLASSIFIER=${ROOT}${1} 163 | OUTPUT_DIR=${ROOT}predictions/${TASK} 164 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 165 | python run_classifier_adapter_tune_all.py \ 166 | --task_name=${TASK} \ 167 | --do_predict=true \ 168 | --do_train=false \ 169 | --do_eval=false \ 170 | --data_dir=$GLUE_DIR/${TASK} \ 171 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 172 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 173 | --init_checkpoint=$TRAINED_CLASSIFIER \ 174 | --do_early_stopping=false \ 175 | --max_seq_length=128 \ 176 | --original_model=True \ 177 | --matched=True \ 178 | --output_dir=${OUTPUT_DIR} 179 | 180 | # this is a parser I wrote which should output the predictions in the glue platform format 181 | python parse_predictions.py \ 182 | --task=${TASK} \ 183 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 184 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 185 | done 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /archive/predictions_rw_25000_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # general config 3 | export CUDA_VISIBLE_DEVICES=1; 4 | BERT_BASE_DIR="/work/anlausch/uncased_L-12_H-768_A-12" 5 | VOCAB_DIR=$BERT_BASE_DIR/vocab.txt 6 | BERT_CONFIG=$BERT_BASE_DIR/bert_config.json 7 | GLUE_DATA="$GLUE_DIR" 8 | STEP_NUMBER=25000 9 | 10 | # root dir of your checkpoints 11 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/${STEP_NUMBER}/" 12 | 13 | # this is a tuple of trained model and task, you can add more tuples 14 | 15 | for config in "SST2_16_2e-05_3/model.ckpt-12627","SST2" "CoLA_16_2e-05_4/model.ckpt-2137","CoLA" "MRPC_16_2e-05_4/model.ckpt-917","MRPC" "RTE_16_3e-05_4/model.ckpt-622","RTE"; do 16 | IFS="," 17 | set -- $config 18 | echo $1 and $2 19 | TASK=$2 20 | 21 | # location of the checkpoint which was best on dev 22 | TRAINED_CLASSIFIER=${ROOT}${1} 23 | OUTPUT_DIR=${ROOT}predictions/${TASK} 24 | # the actual prediction -- it is important to specify the checkpoint and to 
set train and eval to false but predict to true 25 | python run_classifier_adapter_tune_all.py \ 26 | --task_name=${TASK} \ 27 | --do_predict=true \ 28 | --do_train=false \ 29 | --do_eval=false \ 30 | --data_dir=$GLUE_DIR/${TASK} \ 31 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 32 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 33 | --init_checkpoint=$TRAINED_CLASSIFIER \ 34 | --do_early_stopping=false \ 35 | --max_seq_length=128 \ 36 | --original_model=True \ 37 | --matched=True \ 38 | --output_dir=${OUTPUT_DIR} 39 | 40 | # this is a parser I wrote which should output the predictions in the glue platform format 41 | python parse_predictions.py \ 42 | --task=${TASK} \ 43 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 44 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 45 | done 46 | 47 | for config in "STSB_16_3e-05_4/model.ckpt-1437","STSB"; do 48 | IFS="," 49 | set -- $config 50 | echo $1 and $2 51 | TASK=$2 52 | 53 | # location of the checkpoint which was best on dev 54 | TRAINED_CLASSIFIER=${ROOT}${1} 55 | OUTPUT_DIR=${ROOT}predictions/${TASK} 56 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 57 | python run_regression_adapter_tune_all.py \ 58 | --task_name=${TASK} \ 59 | --do_predict=true \ 60 | --do_train=false \ 61 | --do_eval=false \ 62 | --data_dir=$GLUE_DIR/${TASK} \ 63 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 64 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 65 | --init_checkpoint=$TRAINED_CLASSIFIER \ 66 | --do_early_stopping=false \ 67 | --max_seq_length=128 \ 68 | --original_model=True \ 69 | --matched=True \ 70 | --output_dir=${OUTPUT_DIR} 71 | 72 | # this is a parser I wrote which should output the predictions in the glue platform format 73 | python parse_predictions.py \ 74 | --task=${TASK} \ 75 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 76 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 77 | done 78 | 79 | 80 | ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/" 81 | 82 | 83 | # this is a tuple of trained model and task, you can add more tuples 84 | for config in "QQP_16_2e-05_3/model.ckpt-68221","QQP" "MNLI_16_2e-05_3/model.ckpt-73631","MNLI" "QNLIV2_16_2e-05_3/model.ckpt-19639","QNLIV2"; do 85 | IFS="," 86 | set -- $config 87 | echo $1 and $2 88 | TASK=$2 89 | 90 | # location of the checkpoint which was best on dev 91 | TRAINED_CLASSIFIER=${ROOT}${1} 92 | OUTPUT_DIR=${ROOT}predictions/${TASK} 93 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 94 | python run_classifier_adapter_tune_all.py \ 95 | --task_name=${TASK} \ 96 | --do_predict=true \ 97 | --do_train=false \ 98 | --do_eval=false \ 99 | --data_dir=$GLUE_DIR/${TASK} \ 100 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 101 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 102 | --init_checkpoint=$TRAINED_CLASSIFIER \ 103 | --do_early_stopping=false \ 104 | --max_seq_length=128 \ 105 | --original_model=True \ 106 | --matched=True \ 107 | --output_dir=${OUTPUT_DIR} 108 | 109 | # this is a parser I wrote which should output the predictions in the glue platform format 110 | python parse_predictions.py \ 111 | --task=${TASK} \ 112 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 113 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 114 | done 115 | 116 | 117 | 
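# Note on the config tuples used throughout this script: each entry such as
# "CoLA_16_2e-05_4/model.ckpt-2137","CoLA" is one comma-separated word, and the
# IFS="," / set -- $config pair splits it into positional parameters, so that
# $1 holds the checkpoint path and $2 the task name. Minimal illustration (not executed here):
#   config="CoLA_16_2e-05_4/model.ckpt-2137,CoLA"
#   IFS=","
#   set -- $config
#   echo "checkpoint=$1 task=$2"   # -> checkpoint=CoLA_16_2e-05_4/model.ckpt-2137 task=CoLA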
ROOT="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/${STEP_NUMBER}/" 118 | 119 | 120 | # this is a tuple of trained model and task, you can add more tuples 121 | for config in "MNLI_16_2e-05_3/model.ckpt-73631","MNLI"; do 122 | IFS="," 123 | set -- $config 124 | echo $1 and $2 125 | TASK=$2 126 | 127 | # location of the checkpoint which was best on dev 128 | TRAINED_CLASSIFIER=${ROOT}${1} 129 | OUTPUT_DIR=${ROOT}predictions/${TASK}-mm 130 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 131 | python run_classifier_adapter_tune_all.py \ 132 | --task_name=${TASK} \ 133 | --do_predict=true \ 134 | --do_train=false \ 135 | --do_eval=false \ 136 | --data_dir=$GLUE_DIR/${TASK} \ 137 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 138 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 139 | --init_checkpoint=$TRAINED_CLASSIFIER \ 140 | --do_early_stopping=false \ 141 | --max_seq_length=128 \ 142 | --original_model=True \ 143 | --matched=False \ 144 | --output_dir=${OUTPUT_DIR} 145 | 146 | # this is a parser I wrote which should output the predictions in the glue platform format 147 | python parse_predictions.py \ 148 | --task=${TASK} \ 149 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 150 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 151 | done 152 | 153 | # this is the mnli model which was best on the matched dataset 154 | for config in "MNLI_16_2e-05_3/model.ckpt-73631","diagnostic"; do 155 | IFS="," 156 | set -- $config 157 | echo $1 and $2 158 | TASK=$2 159 | 160 | # location of the checkpoint which was best on dev 161 | TRAINED_CLASSIFIER=${ROOT}${1} 162 | OUTPUT_DIR=${ROOT}predictions/${TASK} 163 | # the actual prediction -- it is important to specify the checkpoint and to set train and eval to false but predict to true 164 | python run_classifier_adapter_tune_all.py \ 165 | --task_name=${TASK} \ 166 | --do_predict=true \ 167 | --do_train=false \ 168 | --do_eval=false \ 169 | --data_dir=$GLUE_DIR/${TASK} \ 170 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 171 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 172 | --init_checkpoint=$TRAINED_CLASSIFIER \ 173 | --do_early_stopping=false \ 174 | --max_seq_length=128 \ 175 | --original_model=True \ 176 | --matched=True \ 177 | --output_dir=${OUTPUT_DIR} 178 | 179 | # this is a parser I wrote which should output the predictions in the glue platform format 180 | python parse_predictions.py \ 181 | --task=${TASK} \ 182 | --input_path="${OUTPUT_DIR}_32_5e-05_3.0/test_results.tsv" \ 183 | --output_path_root="${OUTPUT_DIR}_32_5e-05_3.0" 184 | done 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /copa_1_download_copa.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | waws --downloadS3 -f copa_en.zip -b wluper-retrograph 3 | mkdir -p data/COPA 4 | unzip copa_en.zip 5 | mv test_gold.jsonl data/COPA 6 | mv train.en.jsonl data/COPA 7 | mv val.en.jsonl data/COPA 8 | mv copa_en.zip data 9 | -------------------------------------------------------------------------------- /copa_2_finetune_adapter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Step 1: 4 | # run_copa_adapter.py 5 | # 6 | # 7 | # Needs to load the adapter model. 8 | # Here it is probably recommended to use the original optimiser, as it also optimises BERT. 9 | TRAINING_UTILITY=training_utility 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | BERT_VOCAB=$BERT_DIR/vocab.txt 16 | 17 | TASKNAME='COPA' 18 | DATA_DIR=data/$TASKNAME 19 | 20 | LEARNING_RATE=2e-5 21 | EPOCHS=3.0 22 | VARIANT=A 23 | 24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT 25 | STEP="150000" 26 | 27 | PRETRAINED_NAME="RW30" 28 | BERT_EXTENDED_DIR="models/1.0_1.0_5_30_full_assertions_nl" 29 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter" 30 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 31 | 32 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/${PRETRAINED_NAME}/${STEP}/${EXPERIMENT_NAME}" 33 | 34 | 35 | python3.6 $TRAINING_UTILITY/run_copa_adapter.py \ 36 | --do_train=true \ 37 | --do_eval=true \ 38 | --data_dir=$DATA_DIR \ 39 | --vocab_file=$BERT_VOCAB \ 40 | --bert_config_file=$BERT_CONFIG \ 41 | --init_checkpoint=$CHECKPOINT \ 42 | --max_seq_length=128 \ 43 | --train_batch_size=8 \ 44 | --learning_rate=$LEARNING_RATE \ 45 | --num_train_epochs=$EPOCHS \ 46 | --variant=$VARIANT \ 47 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out 48 | -------------------------------------------------------------------------------- /copa_2_finetune_bert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Step 1: 4 | # run_copa.py 5 | # 6 | # 7 | # Loads the plain BERT base checkpoint (no adapter). 8 | # Here it is probably recommended to use the original optimiser, as it optimises BERT. 9 | TRAINING_UTILITY=training_utility 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | BERT_VOCAB=$BERT_DIR/vocab.txt 16 | 17 | TASKNAME='COPA' 18 | DATA_DIR=data/$TASKNAME 19 | 20 | LEARNING_RATE=2e-5 21 | EPOCHS=3.0 22 | VARIANT=A 23 | 24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT 25 | 26 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter" 27 | # CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 28 | 29 | BERT_EXTENDED_DIR=$BERT_DIR 30 | CHECKPOINT=${BERT_EXTENDED_DIR}/bert_model.ckpt 31 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/BERT_BASE/${EXPERIMENT_NAME}" 32 | 33 | 34 | python3.6 $TRAINING_UTILITY/run_copa.py \ 35 | --do_train=true \ 36 | --do_eval=true \ 37 | --data_dir=$DATA_DIR \ 38 | --vocab_file=$BERT_VOCAB \ 39 | --bert_config_file=$BERT_CONFIG \ 40 | --init_checkpoint=$CHECKPOINT \ 41 | --max_seq_length=128 \ 42 | --train_batch_size=8 \ 43 | --learning_rate=$LEARNING_RATE \ 44 | --num_train_epochs=$EPOCHS \ 45 | --variant=$VARIANT \ 46 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out 47 | -------------------------------------------------------------------------------- /csqa_1_download_commonsenseqa.sh:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python3.6 download_utility/download_commonsenseqa.py 4 | -------------------------------------------------------------------------------- /csqa_2_finetune_adapter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Step 1: 4 | # run_commonsenseqa_adapter.py 5 | # 6 | # 7 | # Needs to load the adapter model. 8 | # Here it is probably recommended to use the original optimiser, as it also optimises BERT. 9 | TRAINING_UTILITY=training_utility 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | BERT_VOCAB=$BERT_DIR/vocab.txt 16 | 17 | BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter" 18 | OUTPUT_DIR="models/output_model_finetunning" 19 | OUTPUT_SUFFIX=_tune_all 20 | 21 | TASKNAME='COMMONSENSEQA' 22 | DATA_DIR=data/$TASKNAME 23 | 24 | SPLIT="rand" 25 | 26 | STEP="25000" 27 | 28 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 29 | 30 | python3.6 $TRAINING_UTILITY/run_commonsenseqa_adapter.py \ 31 | --split=$SPLIT \ 32 | --do_train=true \ 33 | --do_eval=true \ 34 | --data_dir=$DATA_DIR \ 35 | --vocab_file=$BERT_VOCAB \ 36 | --bert_config_file=$BERT_CONFIG \ 37 | --init_checkpoint=$CHECKPOINT \ 38 | --max_seq_length=128 \ 39 | --train_batch_size=8 \ 40 | --learning_rate=2e-5 \ 41 | --num_train_epochs=3.0 \ 42 | --output_dir=$OUTPUT_DIR/$TASKNAME/ | tee $OUTPUT_DIR/$TASKNAME.out 43 | -------------------------------------------------------------------------------- /csqa_3_eval_adapter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Step 1: 4 | # run_commonsenseqa_adapter.py (evaluation only) 5 | # 6 | # 7 | # Loads the adapter checkpoint fine-tuned in the previous step. 8 | # Here it is probably recommended to use the original optimiser, as it also optimises BERT. 9 | TRAINING_UTILITY=training_utility 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | BERT_VOCAB=$BERT_DIR/vocab.txt 16 | 17 | BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter" 18 | OUTPUT_DIR="models/output_model_finetunning" 19 | OUTPUT_SUFFIX=_tune_all 20 | 21 | TASKNAME='COMMONSENSEQA' 22 | DATA_DIR=data/$TASKNAME 23 | 24 | SPLIT="rand" 25 | 26 | STEP="25000" 27 | 28 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 29 | 30 | TRAINED_MODEL=$OUTPUT_DIR/$TASKNAME/model.ckpt-3000 31 | 32 | python3.6 $TRAINING_UTILITY/run_commonsenseqa_adapter.py \ 33 | --split=$SPLIT \ 34 | --do_train=false \ 35 | --do_eval=true \ 36 | --data_dir=$DATA_DIR \ 37 | --vocab_file=$BERT_VOCAB \ 38 | --bert_config_file=$BERT_CONFIG \ 39 | --init_checkpoint=$TRAINED_MODEL \ 40 | --max_seq_length=128 \ 41 | --train_batch_size=8 \ 42 | --learning_rate=2e-5 \ 43 | --num_train_epochs=3.0 \ 44 | --output_dir=$OUTPUT_DIR/$TASKNAME/ | tee $OUTPUT_DIR/$TASKNAME.out 45 | -------------------------------------------------------------------------------- /download_utility/download_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Wluper Ltd. Team, Nikolai Rozanov. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # ############################################################################## 18 | # Import 19 | ############################################################################## 20 | 21 | # Native 22 | import urllib.request 23 | import os 24 | import zipfile 25 | 26 | # Packages 27 | import shutil 28 | 29 | # Local 30 | 31 | 32 | # ############################################################################# 33 | # Code 34 | ############################################################################## 35 | BERT_TO_URL_MAPPING = { 36 | "BERT_LARGE_UNCASED_WHOLEWORD" : "https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip", 37 | "BERT_LARGE_CASED_WHOLEWORD" : "https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip", 38 | 39 | "BERT_LARGE_UNCASED" : "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip", 40 | "BERT_LARGE_CASED" : "https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip", 41 | 42 | "BERT_BASE_UNCASED" : "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip", 43 | "BERT_BASE_CASED" : "https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip", 44 | 45 | "BERT_BASE_CASED_MULTI" : "https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip",#re 46 | "BERT_BASE_UNCASED_MULTI" : "https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip", 47 | 48 | "BERT_BASE_CHINESE" : "https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip" 49 | } 50 | 51 | 52 | def download_bert_zip(target_file_name:str, which_bert:str="BERT_BASE_CASED"): 53 | """ 54 | Downloads the officially pre-trained model from google. 55 | File is a zip and contains: 56 | 1. A TensorFlow checkpoint (bert_model.ckpt) containing the pre-trained weights (which is actually 3 files). 57 | 2. A vocab file (vocab.txt) to map WordPiece to word id. 58 | 3. A config file (bert_config.json) which specifies the hyperparameters of the model. 59 | 60 | Part Reference: 61 | https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3 62 | """ 63 | try: 64 | url = BERT_TO_URL_MAPPING[which_bert] 65 | except KeyError: 66 | print("Seems like this BERT model doesn't exist. Please specify a possible option.") 67 | exit() 68 | os.makedirs(os.path.dirname(target_file_name),exist_ok=True) #creates path if not in existence. 69 | with urllib.request.urlopen(url) as response, open(target_file_name, 'wb') as out_file: 70 | print(f"Downloading: {which_bert}. Target_file: {target_file_name}\nThis may take some time.") 71 | shutil.copyfileobj(response, out_file) 72 | print("Finished the Download.") 73 | 74 | 75 | def unzip_bert(path_to_zip_file: str, target_folder_name: str): 76 | """ 77 | unzips the bert and places the content into target_folder_name. 
78 | 79 | Part Reference: 80 | https://stackoverflow.com/questions/3451111/unzipping-files-in-python 81 | """ 82 | print(f"Unzipping Bert zip {path_to_zip_file}.") 83 | with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref: 84 | zip_ref.extractall(target_folder_name) 85 | print("Finished Unzipping.") 86 | print(f"Moving Content to: {target_folder_name}") 87 | _move_unzipped_content(target_folder_name) 88 | print("Finished Moving. Finished Process.") 89 | 90 | 91 | def _move_unzipped_content(target_folder_name:str): 92 | """ Helper function to move content for function unzip_bert. (has assumptions)""" 93 | bert_data = os.listdir(target_folder_name)[0] 94 | final_bert_data_path = os.path.join(target_folder_name, bert_data) 95 | for file in os.listdir(final_bert_data_path): 96 | shutil.move(os.path.join(final_bert_data_path,file), target_folder_name) 97 | os.rmdir(final_bert_data_path) 98 | 99 | # ############################################################################# 100 | # MAIN 101 | ############################################################################## 102 | if __name__=="__main__": 103 | which_bert = "BERT_BASE_UNCASED" 104 | target_file_name = os.path.join("models","bert_pretrained.zip") 105 | target_folder_name = os.path.join("models",which_bert) 106 | download_bert_zip(target_file_name=target_file_name, which_bert=which_bert) 107 | unzip_bert(target_file_name, target_folder_name) 108 | -------------------------------------------------------------------------------- /download_utility/download_commonsenseqa.py: -------------------------------------------------------------------------------- 1 | ''' Script for downloading all CommonsenseQA data. 2 | Author: Nikolai Rozanov 3 | ''' 4 | 5 | import os 6 | import sys 7 | import shutil 8 | import argparse 9 | import tempfile 10 | import urllib.request 11 | 12 | 13 | LINKS = [ 14 | "https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl", 15 | "https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl", 16 | "https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl" 17 | ] 18 | 19 | def download_and_extract(link, data_dir): 20 | """ downloads and moves. """ 21 | print("Downloading and extracting %s..." % link) 22 | data_file = get_name_from_link(link) 23 | urllib.request.urlretrieve(link,data_file) 24 | shutil.move(data_file,os.path.join(data_dir,data_file)) 25 | print("\tCompleted!") 26 | 27 | def get_name_from_link(link): 28 | """ returns name from link. """ 29 | name = link.split("/")[-1] 30 | return name 31 | 32 | def make_dir(directory_path, directory_name): 33 | """ Makes a directory if it doesn't exist. """ 34 | directory = os.path.join(directory_path, directory_name) 35 | if not os.path.exists(directory): 36 | os.makedirs(directory) 37 | 38 | def main(): 39 | DATA="data" 40 | TARGET_FOLDER="COMMONSENSEQA" 41 | data_dir = os.path.join(DATA, TARGET_FOLDER) 42 | 43 | make_dir(DATA, TARGET_FOLDER) 44 | for link in LINKS: 45 | download_and_extract(link, data_dir) 46 | 47 | 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /download_utility/download_glue.py: -------------------------------------------------------------------------------- 1 | ''' Script for downloading all GLUE data. 2 | Note: for legal reasons, we are unable to host MRPC. 
3 | You can either use the version hosted by the SentEval team, which is already tokenized, 4 | or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually. 5 | For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example). 6 | You should then rename and place specific files in a folder (see below for an example). 7 | mkdir MRPC 8 | cabextract MSRParaphraseCorpus.msi -d MRPC 9 | cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt 10 | cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt 11 | rm MRPC/_* 12 | rm MSRParaphraseCorpus.msi 13 | 1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. 14 | 2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! 15 | 16 | Part Source: 17 | https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e 18 | 19 | Instructions: 20 | python3 download_glue.py --data_dir data/GLUE --tasks all 21 | ''' 22 | 23 | import os 24 | import sys 25 | import shutil 26 | import argparse 27 | import tempfile 28 | import urllib.request 29 | import zipfile 30 | 31 | TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS-B", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic", "COPA"] 32 | TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip', 33 | "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip', 34 | "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc', 35 | "QQP":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip', 36 | "STS-B":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip', 37 | "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip', 38 | "SNLI":'https://dl.fbaipublicfiles.com/glue/data/SNLI.zip', 39 | "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip', 40 | "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip', 41 | "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip', 42 | "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv', 43 | "COPA": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip"} 44 | 45 | MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt' 46 | MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt' 47 | 48 | def download_and_extract(task, data_dir): 49 | print("Downloading and extracting %s..." 
% task) 50 | data_file = "%s.zip" % task 51 | urllib.request.urlretrieve(TASK2PATH[task], data_file) 52 | with zipfile.ZipFile(data_file) as zip_ref: 53 | zip_ref.extractall(data_dir) 54 | os.remove(data_file) 55 | print("\tCompleted!") 56 | 57 | def format_mrpc(data_dir, path_to_data): 58 | print("Processing MRPC...") 59 | mrpc_dir = os.path.join(data_dir, "MRPC") 60 | if not os.path.isdir(mrpc_dir): 61 | os.mkdir(mrpc_dir) 62 | if path_to_data: 63 | mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt") 64 | mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt") 65 | else: 66 | print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN) 67 | mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt") 68 | mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt") 69 | urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file) 70 | urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file) 71 | assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file 72 | assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file 73 | urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv")) 74 | 75 | dev_ids = [] 76 | with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh: 77 | for row in ids_fh: 78 | dev_ids.append(row.strip().split('\t')) 79 | 80 | with open(mrpc_train_file, encoding="utf8") as data_fh, \ 81 | open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \ 82 | open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh: 83 | header = data_fh.readline() 84 | train_fh.write(header) 85 | dev_fh.write(header) 86 | for row in data_fh: 87 | label, id1, id2, s1, s2 = row.strip().split('\t') 88 | if [id1, id2] in dev_ids: 89 | dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) 90 | else: 91 | train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) 92 | 93 | with open(mrpc_test_file, encoding="utf8") as data_fh, \ 94 | open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh: 95 | header = data_fh.readline() 96 | test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") 97 | for idx, row in enumerate(data_fh): 98 | label, id1, id2, s1, s2 = row.strip().split('\t') 99 | test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2)) 100 | print("\tCompleted!") 101 | 102 | def download_diagnostic(data_dir): 103 | print("Downloading and extracting diagnostic...") 104 | if not os.path.isdir(os.path.join(data_dir, "diagnostic")): 105 | os.mkdir(os.path.join(data_dir, "diagnostic")) 106 | data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv") 107 | urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file) 108 | print("\tCompleted!") 109 | return 110 | 111 | def get_tasks(task_names): 112 | task_names = task_names.split(',') 113 | if "all" in task_names: 114 | tasks = TASKS 115 | else: 116 | tasks = [] 117 | for task_name in task_names: 118 | assert task_name in TASKS, "Task %s not found!" 
% task_name 119 | tasks.append(task_name) 120 | return tasks 121 | 122 | def main(arguments): 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') 125 | parser.add_argument('--tasks', help='tasks to download data for as a comma separated string', 126 | type=str, default='all') 127 | parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt', 128 | type=str, default='') 129 | args = parser.parse_args(arguments) 130 | 131 | if not os.path.isdir(args.data_dir): 132 | os.mkdir(args.data_dir) 133 | tasks = get_tasks(args.tasks) 134 | 135 | for task in tasks: 136 | if task == 'MRPC': 137 | format_mrpc(args.data_dir, args.path_to_mrpc) 138 | elif task == 'diagnostic': 139 | download_diagnostic(args.data_dir) 140 | else: 141 | download_and_extract(task, args.data_dir) 142 | 143 | 144 | if __name__ == '__main__': 145 | sys.exit(main(sys.argv[1:])) 146 | -------------------------------------------------------------------------------- /download_utility/download_relations.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the script for downloading ConceptNet relations from S3. 3 | Make sure waws is installed and configured: 4 | 5 | pip3 install waws 6 | waws --configure 7 | 8 | To run: python3 download_relations.py 9 | To download all (31): -r all (default) 10 | To download specific relations, provide a comma-separated list of relations: -r isA,formOf 11 | To specify/create a local download directory, use: -d directory_name. 12 | """ 13 | 14 | import os 15 | import sys 16 | import argparse 17 | import waws 18 | 19 | RELATIONS = ['relatedTo', 'formOf', 'isA', 'partOf', 'hasA', 'usedFor', 'capableOf', 20 | 'atLocation', 'causes', 'hasSubevent', 'hasFirstSubevent', 'hasLastSubevent', 21 | 'hasPrerequisite', 'hasProperty', 'motivatedByGoal', 'obstructedBy', 'desires', 22 | 'createdBy', 'synonyms', 'antonyms', 'distinctFrom', 'derivedFrom', 'symbolOf', 23 | 'definedAs', 'mannerOf', 'locatedNear', 'hasContext', 'similarTo', 'causesDesire', 24 | 'madeOf', 'receivesAction'] 25 | 26 | s3 = waws.BucketManager() 27 | 28 | def download(relation, data_dir): 29 | print("Downloading and extracting %s..." % relation) 30 | data_file = "cn_%s.txt" % relation 31 | 32 | s3.download_file( 33 | file_name=data_file, 34 | local_path=data_dir, 35 | remote_path="", 36 | bucket_name="wluper-retrograph" 37 | ) 38 | 39 | print("\tDone!") 40 | 41 | def get_relations(relation_names): 42 | relation_names = relation_names.split(',') 43 | if "all" in relation_names: 44 | relations = RELATIONS 45 | else: 46 | relations = [] 47 | for rel_name in relation_names: 48 | assert rel_name in RELATIONS, "Relation %s not found!" 
% rel_name 49 | relations.append(rel_name) 50 | return relations 51 | 52 | def main(arguments): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('-d', '--data_dir', help='directory to save data to', type=str, default='./') 55 | parser.add_argument('-r', '--relations', help='relations to download as a comma separated string', 56 | type=str, default='all') 57 | args = parser.parse_args(arguments) 58 | 59 | if not os.path.isdir(args.data_dir): 60 | os.mkdir(args.data_dir) 61 | relations = get_relations(args.relations) 62 | 63 | for rel in relations: 64 | download(rel, args.data_dir) 65 | 66 | 67 | if __name__ == '__main__': 68 | sys.exit(main(sys.argv[1:])) 69 | -------------------------------------------------------------------------------- /glue_1_download_glue.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOWNLOAD_UTILITY_SCRIPTS=download_utility 4 | 5 | 6 | DIR_SAVE_RELATIONS='relations/' 7 | mkdir -p $DIR_SAVE_RELATIONS 8 | 9 | # DOWNLOAD RELATIONS 10 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_relations.py --data_dir $DIR_SAVE_RELATIONS --relations all 11 | 12 | mkdir -p 'data/GLUE' 13 | mkdir -p 'models/BERT_BASE_UNCASED' 14 | 15 | # DOWNLOAD BERT 16 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_bert.py 17 | 18 | # DOWNLOAD GLUE 19 | python3.6 $DOWNLOAD_UTILITY_SCRIPTS/download_glue.py --data_dir data/GLUE --tasks all 20 | -------------------------------------------------------------------------------- /glue_2_finetune_adapter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #Step1: 4 | #run_classifier_adapter_tune_all.py -> 5 | # 6 | # 7 | #Need to load the Adapter Model 8 | #Here it is probably recommended to use the orginal optimiser as it optimises BERT 9 | TRAINING_UTILITY=training_utility 10 | 11 | export CUDA_VISIBLE_DEVICES=8 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | VOCAB_DIR=$BERT_DIR/vocab.txt 16 | 17 | BERT_EXTENDED_DIR="models/output_pretrain_adapter" 18 | OUTPUT_DIR="models/output_model_finetunning" 19 | OUTPUT_SUFFIX=_tune_all 20 | 21 | GLUE_DIR='data/GLUE' 22 | 23 | ### the second finetuning variant 24 | for STEP in "0" "99000"; do 25 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 26 | for task_name in "QNLI" "QQP" "MNLI"; do 27 | echo $task_name 28 | echo $CHECKPOINT 29 | 30 | GLUE_DATA="$GLUE_DIR/$task_name" 31 | 32 | python3.6 $TRAINING_UTILITY/run_classifier_adapter_tune_all.py \ 33 | --task_name=$task_name \ 34 | --do_train=true \ 35 | --do_eval=true \ 36 | --do_early_stopping=false \ 37 | --data_dir=$GLUE_DATA \ 38 | --vocab_file=$VOCAB_DIR \ 39 | --bert_config_file=$BERT_CONFIG \ 40 | --init_checkpoint=$CHECKPOINT\ 41 | --max_seq_length=128 \ 42 | --train_batch_size="[16]" \ 43 | --learning_rate="[2e-5, 3e-5]" \ 44 | --num_train_epochs="[3,4]" \ 45 | --original_model=True \ 46 | --output_dir=${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name} | tee ${OUTPUT_DIR}${OUTPUT_SUFFIX}/${STEP}/${task_name}.out 47 | done 48 | done 49 | -------------------------------------------------------------------------------- /images/Retrograph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Wluper/Retrograph/d275e45c9127e645e4f02f32f42a62c2636f6c3a/images/Retrograph.png -------------------------------------------------------------------------------- /randomwalks_utility/create_corpora_from_random_walks.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | import codecs 3 | from tqdm import tqdm 4 | from concurrent.futures import ProcessPoolExecutor, as_completed 5 | 6 | def load_walks(path="./randomwalks/random_walk_1.0_1.0_2_10.p"): 7 | return pickle.load(open(path, "rb")) 8 | 9 | 10 | def create_relationship_token(text): 11 | # NOTE: relationship tokens are currently passed through unchanged; the bracketed variant is kept, commented out, in the return below 12 | return text #"<" + "".join(text.split(" ")) + ">" 13 | 14 | def process_walks(walks): 15 | text = "" 16 | for walk in walks: 17 | previous_token = "" 18 | for i, token in enumerate(walk): 19 | # every first token is a node and every second is a relationship 20 | # we don't need to capitalize anything, as we are working with the uncased BERT anyway 21 | if (i % 2 == 0 and previous_token != "" and i != 0 and i != 2) or (i == 3 and previous_token != ""): 22 | # we have reached the end of a valid sentence sequence, so we put a period 23 | if i == 3: 24 | text = text[:-1] + ".\n" 25 | else: 26 | text = text + token + ".\n" 27 | if i != len(walk) - 1 and i == 3: 28 | # if the walk is not finished yet, we duplicate the token 29 | text = text + previous_token + " " + create_relationship_token(token) + " " 30 | elif i != len(walk) - 1: 31 | # if the walk is not finished yet, we duplicate the token 32 | text = text + token + " " 33 | else: 34 | # otherwise we can put a new line to mark the end of a document 35 | text = text + "\n\n" 36 | elif i % 2 == 0: 37 | text = text + token + " " 38 | else: # odd positions hold relationship tokens 39 | text = text + create_relationship_token(token) + " " 40 | previous_token = token 41 | 42 | return text 43 | 44 | def chunks(lst, n): 45 | """Yield successive n-sized chunks from lst.""" 46 | for i in range(0, len(lst), n): 47 | yield lst[i:i + n] 48 | 49 | def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpus_", output_path_suffix=""): 50 | # how do we actually want to generate the corpus? 51 | # one option is to always duplicate the node in the middle... 52 | # also Goran says that we want to keep the relations as separate tokens in the vocab. I do not necessarily agree with this, but we try. 53 | # What is one document? Is it always one walk? Maybe yes...
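# --- Editor's illustrative aside (not part of the original file) ---
# To make the branching in process_walks() above concrete, here is a traced,
# doctest-style example. The walk itself is hypothetical; real walks come from
# randomwalks_utility/random_walks.py and alternate node, relation, node, ...
#
#   >>> process_walks([["dog", "is a", "mammal", "is a", "animal"]])
#   'dog is a mammal.\nmammal is a animal.\n\n\n'
#
# Each (node, relation, node) triple becomes one sentence, the middle node is
# duplicated so that it both ends one sentence and starts the next, and the
# trailing blank line marks the end of the "document" for that walk.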
54 | output_path = output_path_prefix + output_path_suffix + ".txt" 55 | text = "" 56 | print('size of walks', len(walks)) 57 | print('processing RWs...') 58 | 59 | workers = 10 60 | splits = 1000 61 | text = "" 62 | with ProcessPoolExecutor(max_workers=workers) as executor: 63 | futures = {} 64 | for i, ws in enumerate(chunks(walks, splits)): 65 | job = executor.submit(process_walks, ws) 66 | futures[job] = i 67 | 68 | for job in tqdm(as_completed(futures)): 69 | t = job.result() 70 | text += t 71 | r = futures[job] 72 | del futures[job] 73 | 74 | 75 | with codecs.open(output_path, "w", "utf8") as out: 76 | out.write(text) 77 | 78 | 79 | def main(): 80 | in_prefix = "randomwalks/random_walk_" 81 | in_suffix = "1.0_1.0_2_15" 82 | walks = load_walks(in_prefix + in_suffix + ".p") 83 | generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "_nl") 84 | 85 | 86 | if __name__=="__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /randomwalks_utility/preprocess_cn.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | """ 4 | As we got the relations to consider from olga, we don't need to do this anymore 5 | """ 6 | # def filter_assertions(path="./relations/assertions.csv"): 7 | # assertions = [] 8 | # with codecs.open(path, "r", "utf8") as f: 9 | # reader = csv.DictReader(f, dialect=csv.excel_tab, fieldnames=["URI", "relation", "node_a", "node_b", "info"]) 10 | # for i,row in enumerate(reader): 11 | # node_a = row["node_a"].split("/c/en/") 12 | # node_b = row["node_b"].split("/c/en/") 13 | # if len(node_a) > 1 and len(node_b) > 1: 14 | # # these should be nodes in english 15 | # node_a = node_a[1].split("/")[-1].replace("_", "-") 16 | # node_b = node_b[1].split("/")[-1].replace("_", "-") 17 | # print(node_a) 18 | # print(node_b) 19 | 20 | """ 21 | Based on the relations from olga 22 | """ 23 | def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.txt", "./relations/cn_isA.txt", "./relations/cn_mannerOf.txt","./relations/cn_synonyms.txt"], output_path="./randomwalks/cn_assertions_filtered.tsv"): 24 | # we ideally want to have a "natural language representation" of the relations 25 | # TODO: keep in mind that antonymy and synonymy are bidirectional relationships, so maybe we want to account for this, i.e., by creating the corresponding pairs in the opposite direction or so 26 | # TODO: As an alternative of random walks, we can also just use the natural language representation of the relationships 27 | relation_dict = { 28 | "antonyms": "is an antonym of", 29 | "isA": "is a", 30 | "mannerOf": "is a manner of", 31 | "synonyms": "is a synonym of" 32 | } 33 | all_assertions = [] 34 | for path in paths: 35 | relation = path.split("cn_")[1].split(".txt")[0] 36 | nl_relation = relation_dict[relation] 37 | with codecs.open(path, "r", "utf8") as f: 38 | for line in f.readlines(): 39 | word_a, word_b = line.strip().split("\t") 40 | full_assertion = [word_a, word_b, nl_relation] 41 | all_assertions.append(full_assertion) 42 | # TODO: here is an attempt to account for bidirectionality; Does it make sense? 
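# --- Editor's illustrative aside (not part of the original file) ---
# What the bidirectionality handling below produces, using hypothetical word
# pairs (the real pairs come from the relations/cn_*.txt files):
#
#   cn_antonyms.txt line "hot<TAB>cold"  -> two rows in cn_assertions_filtered.tsv:
#       hot<TAB>cold<TAB>is an antonym of
#       cold<TAB>hot<TAB>is an antonym of
#   cn_isA.txt line "dog<TAB>animal"     -> one row only:
#       dog<TAB>animal<TAB>is a
#
# Only "antonyms" and "synonyms" are mirrored; "isA" and "mannerOf" stay directed.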
43 | if relation == "antonyms" or relation == "synonyms": 44 | full_assertion_b = [word_b, word_a, nl_relation] 45 | all_assertions.append(full_assertion_b) 46 | # In total, we have 293105 assertions 47 | print("In total, we have %d assertions" % len(all_assertions)) 48 | with codecs.open(output_path, "w", "utf8") as out: 49 | for assertion in all_assertions: 50 | out.write(assertion[0] + "\t" + assertion[1] + "\t" + assertion[2] + "\n") 51 | 52 | 53 | 54 | def main(): 55 | create_joined_assertions_for_random_walks() 56 | #profile_data() 57 | #filter_assertions() 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /randomwalks_utility/random_walks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import random 4 | import pickle 5 | 6 | def read_graph(path="./randomwalks/cn_assertions_filtered.tsv"): 7 | ''' 8 | Reads the input network in networkx. 9 | ''' 10 | 11 | G = nx.read_edgelist(path, nodetype=str, data=(('edge_type', str),), create_using=nx.DiGraph(), delimiter="\t") 12 | for edge in G.edges(): 13 | G[edge[0]][edge[1]]['weight'] = 1 14 | return G 15 | 16 | 17 | class Graph(): 18 | def __init__(self, nx_G, is_directed, p, q): 19 | self.G = nx_G 20 | self.is_directed = is_directed 21 | self.p = p 22 | self.q = q 23 | 24 | def node2vec_walk(self, walk_length, start_node): 25 | ''' 26 | Simulate a random walk starting from start node. 27 | ''' 28 | G = self.G 29 | alias_nodes = self.alias_nodes 30 | alias_edges = self.alias_edges 31 | 32 | walk = [start_node] 33 | 34 | while len(walk) < walk_length: 35 | cur = walk[-1] 36 | cur_nbrs = sorted(G.neighbors(cur)) 37 | if len(cur_nbrs) > 0: 38 | if len(walk) == 1: 39 | # TODO: This is Anne's main change to the code, the rest is original node2vec code 40 | # NEW 41 | n = cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])] 42 | walk.append(G.get_edge_data(cur, n)["edge_type"]) 43 | walk.append(n) 44 | 45 | #walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 46 | else: 47 | #prev = walk[-2] 48 | prev = walk[-3] 49 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 50 | alias_edges[(prev, cur)][1])] 51 | ## new 52 | walk.append(G.get_edge_data(cur, next)["edge_type"]) 53 | #### 54 | walk.append(next) 55 | else: 56 | break 57 | 58 | return walk 59 | 60 | def simulate_walks(self, num_walks, walk_length): 61 | ''' 62 | Repeatedly simulate random walks from each node. 63 | ''' 64 | G = self.G 65 | walks = [] 66 | nodes = list(G.nodes()) 67 | print('Walk iteration:') 68 | 69 | for walk_iter in range(num_walks): 70 | print(str(walk_iter + 1), '/', str(num_walks)) 71 | 72 | random.shuffle(nodes) 73 | for node in nodes: 74 | walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) 75 | 76 | return walks 77 | 78 | def get_alias_edge(self, src, dst): 79 | ''' 80 | Get the alias edge setup lists for a given edge. 
81 | ''' 82 | G = self.G 83 | p = self.p 84 | q = self.q 85 | 86 | unnormalized_probs = [] 87 | for dst_nbr in sorted(G.neighbors(dst)): 88 | if dst_nbr == src: 89 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) 90 | elif G.has_edge(dst_nbr, src): 91 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 92 | else: 93 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) 94 | norm_const = sum(unnormalized_probs) 95 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 96 | 97 | return alias_setup(normalized_probs) 98 | 99 | def preprocess_transition_probs(self): 100 | ''' 101 | Preprocessing of transition probabilities for guiding the random walks. 102 | ''' 103 | G = self.G 104 | is_directed = self.is_directed 105 | 106 | alias_nodes = {} 107 | for node in G.nodes(): 108 | unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))] 109 | norm_const = sum(unnormalized_probs) 110 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 111 | alias_nodes[node] = alias_setup(normalized_probs) 112 | 113 | alias_edges = {} 114 | triads = {} 115 | 116 | if is_directed: 117 | for edge in G.edges(): 118 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 119 | else: 120 | for edge in G.edges(): 121 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 122 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 123 | 124 | self.alias_nodes = alias_nodes 125 | self.alias_edges = alias_edges 126 | 127 | return 128 | 129 | 130 | def alias_setup(probs): 131 | ''' 132 | Compute utility lists for non-uniform sampling from discrete distributions. 133 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ 134 | for details 135 | ''' 136 | K = len(probs) 137 | q = np.zeros(K) 138 | J = np.zeros(K, dtype=np.int) 139 | 140 | smaller = [] 141 | larger = [] 142 | for kk, prob in enumerate(probs): 143 | q[kk] = K * prob 144 | if q[kk] < 1.0: 145 | smaller.append(kk) 146 | else: 147 | larger.append(kk) 148 | 149 | while len(smaller) > 0 and len(larger) > 0: 150 | small = smaller.pop() 151 | large = larger.pop() 152 | 153 | J[small] = large 154 | q[large] = q[large] + q[small] - 1.0 155 | if q[large] < 1.0: 156 | smaller.append(large) 157 | else: 158 | larger.append(large) 159 | 160 | return J, q 161 | 162 | 163 | def alias_draw(J, q): 164 | ''' 165 | Draw sample from a non-uniform discrete distribution using alias sampling. 166 | ''' 167 | K = len(J) 168 | 169 | kk = int(np.floor(np.random.rand() * K)) 170 | if np.random.rand() < q[kk]: 171 | return kk 172 | else: 173 | return J[kk] 174 | 175 | 176 | """ 177 | parser.add_argument('--walk-length', type=int, default=80, 178 | help='Length of walk per source. Default is 80.') 179 | 180 | parser.add_argument('--num-walks', type=int, default=10, 181 | help='Number of walks per source. Default is 10.') 182 | 183 | parser.add_argument('--workers', type=int, default=8, 184 | help='Number of parallel workers. Default is 8.') 185 | 186 | parser.add_argument('--p', type=float, default=1, 187 | help='Return hyperparameter. Default is 1.') 188 | 189 | parser.add_argument('--q', type=float, default=1, 190 | help='Inout hyperparameter. 
Default is 1.') 191 | 192 | """ 193 | 194 | def generate_random_walks_from_assertions(): 195 | p = 1.0 # return hyperparameter 196 | q = 1.0 # inout hyperparameter 197 | is_directed = True # whether the graph is directed 198 | num_walks = 2 # number of wandom walks per source def. 10 199 | walk_length = 15 # length of walk per source def. 80 200 | 201 | nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv") 202 | G = Graph(nx_G, is_directed, p, q) 203 | G.preprocess_transition_probs() 204 | walks = G.simulate_walks(num_walks, walk_length) 205 | filename = "./randomwalks/random_walk_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p" 206 | with open(filename, 'wb') as handle: 207 | pickle.dump(walks, handle) 208 | print(len(walks)) 209 | 210 | 211 | def analyze_graph(): 212 | nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv") 213 | print("%d nodes in the graph" % nx_G.number_of_nodes()) 214 | print("%d edges in the graph" % nx_G.number_of_edges()) 215 | print("%f density of graph" % nx.density(nx_G)) 216 | #print("%f density of graph" % nx.number_of_selfloops(nx_G)) 217 | print("%s" % nx.info(nx_G)) 218 | print("%f avg in-degree" % float(float(sum(nx_G.in_degree().values()))/float(len(nx_G.in_degree().values())))) 219 | print("%f min in-degree" % float(float(min(nx_G.in_degree().values())))) 220 | print("%f max in-degree" % float(float(max(nx_G.in_degree().values())))) 221 | print("%f std in-degree" % float(float(np.std(np.array([float(v) for v in nx_G.in_degree().values()], dtype=np.float))))) 222 | print("%f avg in-degree" % float(float(np.average(np.array([float(v) for v in nx_G.in_degree().values()], dtype=np.float))))) 223 | 224 | print("%f avg out-degree" % float(float(sum(nx_G.out_degree().values()))/float(len(nx_G.out_degree().values())))) 225 | print("%f min out-degree" % float(float(min(nx_G.out_degree().values())))) 226 | print("%f max out-degree" % float(float(max(nx_G.out_degree().values())))) 227 | print("%f std out-degree" % float(float(np.std(np.array([float(v) for v in nx_G.out_degree().values()], dtype=np.float))))) 228 | print("%f avg out-degree" % float(float(np.average(np.array([float(v) for v in nx_G.out_degree().values()], dtype=np.float))))) 229 | 230 | 231 | comps_strong = list(nx.strongly_connected_component_subgraphs(nx_G)) 232 | print("%d num strongly connected components" % len(comps_strong)) 233 | comps_weak = list(nx.weakly_connected_component_subgraphs(nx_G)) 234 | print("%d num weakly connected components" % len(comps_weak)) 235 | diameters=[] 236 | for c in comps_strong: 237 | diameters.append(nx.diameter(c)) 238 | print("Avg diameter %f for strongly connected components" % float(sum(diameters)/len(diameters))) 239 | print("Max diameter %f for strongly connected components" % max(diameters)) 240 | print("Min diameter %f for strongly connected components" % min(diameters)) 241 | print("%f std diameter" % float(float(np.std(np.array(diameters, dtype=np.float))))) 242 | print("%f avg diameter" % float(float(np.average(np.array(diameters, dtype=np.float))))) 243 | 244 | 245 | def load_random_walk(p): 246 | walk = pickle.load(open(p, 'rb')) 247 | return walk 248 | 249 | 250 | def main(): 251 | generate_random_walks_from_assertions() 252 | #analyze_graph() 253 | # load_random_walk(p="./randomwalks/random_walk_1.0_1.0_2_10.p") 254 | 255 | if __name__=="__main__": 256 | main() 257 | -------------------------------------------------------------------------------- /results_utility/fetcher.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | import csv 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | import pandas as pd 8 | 9 | def fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/", subdir="free-wo-nsp/"):#subdir="base_16_longer/"):# ## 10 | """ 11 | :param base_path: 12 | :param subdir: 13 | :return: 14 | >>> fetch_results() 15 | """ 16 | path = base_path + subdir 17 | result_dict = {} 18 | for root, dirs, files in os.walk(path): 19 | for f in files: 20 | if f == "eval_results.txt": 21 | id = root.split("/")[-1] 22 | task = "_".join(id.split("_")[:1]) 23 | hyperparams = "_".join(id.split("_")[1:]) 24 | train_step = root.split("/")[-2] 25 | if train_step in result_dict: 26 | train_step_dict = result_dict[train_step] 27 | else: 28 | train_step_dict = {} 29 | with codecs.open(os.path.join(root, f), "r", "utf8") as file: 30 | file_dict = {} 31 | for line in file.readlines(): 32 | key = line.split(" = ")[0] 33 | try: 34 | value = float(line.split(" = ")[1].strip()) 35 | file_dict[key] = value 36 | except Exception as e: 37 | print(e) 38 | if task not in train_step_dict: 39 | train_step_dict[task] = {} 40 | train_step_dict[task][hyperparams] = file_dict 41 | result_dict[train_step] = train_step_dict 42 | filtered_list = [] 43 | for train_step, train_step_dict in result_dict.items(): 44 | for task, task_dict in train_step_dict.items(): 45 | if task in ["MRPC", "RTE", "MNLI", "QQP", "SST2", "QNLI", "QNLIV2"]: 46 | measure = "eval_accuracy" 47 | elif task in ["CoLA"]: 48 | measure = "mcc" 49 | elif task in ["STSB"]: 50 | measure = "spearman" 51 | else: 52 | print("Task name not in list: %s", task) 53 | if len(task_dict) < 4: 54 | print("Task %s result dict for train step %s has not all hyperparam results" % (task, train_step)) 55 | break 56 | else: 57 | best_config = "" 58 | best_result = 0.0 59 | for i, (config, result) in enumerate(task_dict.items()): 60 | if result[measure] >= best_result: 61 | best_config = config 62 | best_result = result[measure] 63 | filtered_list.append({"train_step": train_step, "task": task, "hyperparams": best_config, "score": best_result}) 64 | return filtered_list 65 | 66 | 67 | def output_results_as_csv(filtered_list, output_path="./../finetuning/poc_over_time/wn_binary.csv"): 68 | csv_keys = list(filtered_list[0].keys()) 69 | with open(output_path, 'w') as output_file: 70 | dict_writer = csv.DictWriter(output_file, csv_keys) 71 | dict_writer.writeheader() 72 | dict_writer.writerows(filtered_list) 73 | 74 | def plot_task(task="CoLA", output_path="./../finetuning/poc_over_time/cola2.pdf"): 75 | """ 76 | :param output_path: 77 | :param task: 78 | :return: 79 | >>> plot_task() 80 | """ 81 | filtered_list_wn = fetch_results(subdir="wn_binary") + fetch_results(subdir="wn_binary_16_longer") 82 | # kick out ill stsb 83 | for d in filtered_list_wn: 84 | if d["train_step"] != "stsb_first": 85 | d["train_step"] = int(d["train_step"])/2 86 | d["model"] = "informed" 87 | filtered_list_base = fetch_results(subdir="base_16") + fetch_results(subdir="base_16_longer") 88 | for d in filtered_list_base: 89 | d["train_step"] = int(d["train_step"]) 90 | d["model"] = "base" 91 | filtered_list_wn = [d for d in filtered_list_wn if d["train_step"] != "stsb_first" and d["task"]==task and d["train_step"] == 1000000] 92 | filtered_list_base = [d for d in filtered_list_base if d["task"] == task and d["train_step"] == 1000000] 93 | filtered_list_wn = 
sorted(filtered_list_wn, key=lambda k: k['train_step']) 94 | filtered_list_base = sorted(filtered_list_base, key=lambda k: k['train_step']) 95 | 96 | # aligned_wn = [] 97 | # aligned_base = [] 98 | # ind = [] 99 | # for d_base in filtered_list_base: 100 | # for d_wn in filtered_list_wn: 101 | # if int(d_base["train_step"])*2 == int(d_wn["train_step"]): 102 | # aligned_base.append(d_base["score"]) 103 | # aligned_wn.append(d_wn["score"]) 104 | # ind.append(d_base["train_step"]) 105 | # break 106 | all = filtered_list_wn + filtered_list_base 107 | df = pd.DataFrame(all) 108 | 109 | sns.set() 110 | 111 | with sns.plotting_context("paper"): 112 | #ind = lm_steps # the x locations for the groups 113 | 114 | fig, ax = plt.subplots() 115 | 116 | sns.lineplot(x="train_step", y="score", hue="model", style="model", data=df) 117 | #plt.title(task) 118 | 119 | ax.set(xlabel='Language Modeling Steps', ylabel='Accuracy') 120 | ax.yaxis.grid(True, linestyle="dotted") 121 | ax.xaxis.grid(True, linestyle="dotted") 122 | 123 | fig.savefig(output_path) 124 | #plt.show() 125 | print("Done") 126 | 127 | 128 | def main(): 129 | #filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/", subdir="nl-adapter/") 130 | filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/", 131 | subdir="nl-adapter_tune_all") 132 | output_results_as_csv(filtered_list, 133 | output_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all/results_filtered.csv") 134 | filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/", 135 | subdir="nl-adapter_tune_all_quick_insight") 136 | output_results_as_csv(filtered_list, 137 | output_path="/work/anlausch/ConceptBERT/output/finetuning/rw/1.0_1.0_2_10/nl-adapter_tune_all_quick_insight/results_filtered.csv") 138 | filtered_list = fetch_results(base_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/", subdir="free-wo-nsp-adapter_tune_all/") 139 | output_results_as_csv(filtered_list, output_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/free-wo-nsp-adapter_tune_all/results_filtered.csv") 140 | filtered_list = fetch_results(base_path="/work/anlausch/replant/bert/finetuning/poc_over_time/", subdir="wn_binary_16_longer/") 141 | output_results_as_csv(filtered_list, output_path="/work/anlausch/replant/bert/finetuning/poc_over_time/wn_binary_16_longer/results_filtered.csv") 142 | filtered_list = fetch_results(base_path="/work/anlausch/replant/bert/finetuning/poc_over_time/", subdir="base_16_longer/") 143 | output_results_as_csv(filtered_list, output_path="/work/anlausch/replant/bert/finetuning/poc_over_time/base_16_longer/results_filtered.csv") 144 | #output_results_as_csv(filtered_list, output_path="/work/anlausch/ConceptBERT/output/finetuning/omcs/free-wo-nsp-adapter_tune_all/results_filtered.csv") 145 | 146 | if __name__=="__main__": 147 | main() -------------------------------------------------------------------------------- /results_utility/parse_predictions.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import numpy as np 3 | import os 4 | import argparse 5 | 6 | def parse_predictions(input_path, output_path, task="STSB"): 7 | """ 8 | :param input_path: 9 | :param output_path: 10 | :param task: 11 | :return: 12 | >>> parse_predictions("/work/anlausch/replant/bert/predictions/wn_binary/mnli_neu_32_5e-05_3.0/test_results.tsv", 
"/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/MNLI-mm-neu.tsv", task="MNLI") 13 | """ 14 | if task != "STSB": 15 | import run_classifier 16 | else: 17 | import run_regression 18 | predicted_labels = [] 19 | if task == "MRPC": 20 | #ids = MrpcProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/MRPC") 21 | labels = run_classifier.MrpcProcessor().get_labels() 22 | if task == "RTE": 23 | labels = run_classifier.RTEProcessor().get_labels() 24 | if task == "QNLI": 25 | labels = run_classifier.QNLIProcessor().get_labels() 26 | if task == "QNLIV2": 27 | labels = run_classifier.QNLIProcessor().get_labels() 28 | if task == "MNLI": 29 | labels = run_classifier.MnliProcessor().get_labels() 30 | if task == "SST2": 31 | labels = run_classifier.SST2Processor().get_labels() 32 | if task == "CoLA": 33 | labels = run_classifier.ColaProcessor().get_labels() 34 | if task == "QQP": 35 | labels = run_classifier.QQPProcessor().get_labels() 36 | if task == "diagnostic": 37 | labels = run_classifier.DiagnosticProcessor().get_labels() 38 | with codecs.open(input_path, "r", "utf8") as f_in: 39 | for line in f_in.readlines(): 40 | predictions = np.array(line.split("\t"), dtype=np.float32) 41 | if task != "STSB": 42 | predicted_index = np.argmax(predictions) 43 | predicted_labels.append(labels[predicted_index]) 44 | else: 45 | predicted_labels.append(predictions[0]) 46 | f_in.close() 47 | with codecs.open(output_path, "w", "utf8") as f_out: 48 | f_out.write("index\tprediction\n") 49 | for i, prediction in enumerate(predicted_labels): 50 | f_out.write(str(i) + "\t" + str(prediction) + "\n") 51 | f_out.close() 52 | 53 | 54 | def write_fake_predictions(output_path, task="MRPC"): 55 | """ 56 | :param input_path: 57 | :param output_path: 58 | :param task: 59 | :return: 60 | >>> write_fake_predictions("/work/anlausch/replant/bert/predictions/base_32_5e-05_3.0/copy_for_submission/fakes/STS-B.tsv", task="STSB") 61 | """ 62 | if task != "STSB": 63 | import run_classifier 64 | else: 65 | import run_regression 66 | if task == "MNLI": 67 | test_examples = run_classifier.MnliProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task, False) 68 | labels = run_classifier.MnliProcessor().get_labels() 69 | elif task == "QQP": 70 | test_examples = run_classifier.QQPProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task) 71 | labels = run_classifier.QQPProcessor().get_labels() 72 | elif task == "WNLI": 73 | test_examples = run_classifier.WNLIProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task) 74 | labels = run_classifier.WNLIProcessor().get_labels() 75 | elif task == "CoLA": 76 | test_examples = run_classifier.ColaProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task) 77 | labels = run_classifier.ColaProcessor().get_labels() 78 | elif task == "STSB": 79 | test_examples = run_regression.STSBProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task) 80 | elif task == "diagnostic": 81 | test_examples = run_classifier.DiagnosticProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/" + task) 82 | labels = run_classifier.DiagnosticProcessor().get_labels() 83 | with codecs.open(output_path, "w", "utf8") as f_out: 84 | f_out.write("index\tprediction\n") 85 | if task != "STSB": 86 | for i, data in enumerate(test_examples): 87 | f_out.write(str(i) + "\t" + str(labels[0]) + "\n") 88 | else: 89 | for i, data in enumerate(test_examples): 90 | f_out.write(str(i) + "\t" + str(2.5) + "\n") 91 | f_out.close() 92 | 93 | 94 | 95 | def main(): 96 | parser = 
argparse.ArgumentParser(description="Running prediction parser") 97 | parser.add_argument("--task", type=str, default=None, 98 | help="Input path in case train and dev are in a single file", required=True) 99 | parser.add_argument("--input_path", type=str, default="/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/test_results.tsv", 100 | help="Input path in case train and dev are in a single file", required=False) 101 | parser.add_argument("--output_path_root", type=str, default="/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/", 102 | help="Input path in case train and dev are in a single file", required=False) 103 | 104 | args = parser.parse_args() 105 | task = args.task 106 | input_path = args.input_path 107 | root = args.output_path_root 108 | output_path = root + str(task) + ".tsv" 109 | parse_predictions(input_path, output_path, task) 110 | 111 | if __name__ == "__main__": 112 | main() -------------------------------------------------------------------------------- /retrograph/__init__.py: -------------------------------------------------------------------------------- 1 | ##################################################### 2 | # coding=utf-8 3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #################################################### 17 | 18 | 19 | #################################################### 20 | # IMPORT STATEMENTS 21 | #################################################### 22 | 23 | # >>>>>> Native Imports <<<<<<< 24 | 25 | # >>>>>> Package Imports <<<<<<< 26 | 27 | # >>>>>> Local Imports <<<<<<< 28 | 29 | 30 | #################################################### 31 | # CODE 32 | #################################################### 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | #################################################### 41 | # MAIN 42 | #################################################### 43 | 44 | 45 | # EOF 46 | -------------------------------------------------------------------------------- /retrograph/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | ##################################################### 2 | # coding=utf-8 3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | #################################################### 17 | 18 | 19 | #################################################### 20 | # IMPORT STATEMENTS 21 | #################################################### 22 | 23 | # >>>>>> Native Imports <<<<<<< 24 | 25 | # >>>>>> Package Imports <<<<<<< 26 | 27 | # >>>>>> Local Imports <<<<<<< 28 | 29 | 30 | #################################################### 31 | # CODE 32 | #################################################### 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | #################################################### 41 | # MAIN 42 | #################################################### 43 | 44 | 45 | # EOF 46 | -------------------------------------------------------------------------------- /retrograph/modeling/metrics_extension.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.eager import context 3 | from tensorflow.python.framework import dtypes 4 | from tensorflow.python.framework import ops 5 | from tensorflow.python.framework import sparse_tensor 6 | from tensorflow.python.ops import array_ops 7 | from tensorflow.python.ops import check_ops 8 | from tensorflow.python.ops import confusion_matrix 9 | from tensorflow.python.ops import control_flow_ops 10 | from tensorflow.python.ops import math_ops 11 | from tensorflow.python.ops import nn 12 | from tensorflow.python.ops import sets 13 | from tensorflow.python.ops import sparse_ops 14 | from tensorflow.python.ops import state_ops 15 | from tensorflow.python.ops import variable_scope 16 | from tensorflow.python.ops import weights_broadcast_ops 17 | from tensorflow.python.platform import tf_logging as logging 18 | from tensorflow.python.training import distribution_strategy_context 19 | from tensorflow.python.util.deprecation import deprecated 20 | from tensorflow.python.util.tf_export import tf_export 21 | 22 | def mcc(labels, 23 | predictions, 24 | weights=None, 25 | metrics_collections=None, 26 | updates_collections=None, 27 | name=None): 28 | 29 | if context.executing_eagerly(): 30 | raise RuntimeError('mcc is not ' 31 | 'supported when eager execution is enabled.') 32 | 33 | with tf.variable_scope(name, 'mcc',(predictions, labels, weights)): 34 | 35 | predictions, labels, weights = _remove_squeezable_dimensions( 36 | predictions=math_ops.cast(predictions, dtype=dtypes.bool), 37 | labels=math_ops.cast(labels, dtype=dtypes.bool), 38 | weights=weights) 39 | 40 | true_p, true_positives_update_op = tf.metrics.true_positives( 41 | labels, 42 | predictions, 43 | weights, 44 | metrics_collections=None, 45 | updates_collections=None, 46 | name=None) 47 | false_p, false_positives_update_op = tf.metrics.false_positives( 48 | labels, 49 | predictions, 50 | weights, 51 | metrics_collections=None, 52 | updates_collections=None, 53 | name=None) 54 | true_n, true_negatives_update_op = tf.metrics.true_negatives( 55 | labels, 56 | predictions, 57 | weights, 58 | metrics_collections=None, 59 | updates_collections=None, 60 | name=None) 61 | false_n, false_negatives_update_op = tf.metrics.false_negatives( 62 | labels, 63 | predictions, 64 | weights, 65 | metrics_collections=None, 66 | updates_collections=None, 67 | name=None) 68 | 69 | def compute_mcc(tp, fp, tn, fn, name): 70 | return tf.math.divide( 71 | tf.math.subtract( 72 | tf.math.multiply(tp,tn), 73 | tf.math.multiply(fp,fn)) 74 | ,tf.sqrt( 75 | tf.math.multiply( 76 | tf.math.multiply(tf.math.add(tp,fp),tf.math.add(tp,fn)), 77 | 
tf.math.multiply(tf.math.add(tn,fp),tf.math.add(tn,fn)))), name=name) 78 | 79 | def once_across_towers(_, true_p, false_p, true_n, false_n): 80 | return compute_mcc(true_p, false_p, true_n, false_n, 'value') 81 | 82 | mcc = _aggregate_across_towers(metrics_collections, once_across_towers, 83 | true_p, false_p, true_n, false_n) 84 | 85 | update_op = compute_mcc(true_positives_update_op, 86 | false_positives_update_op, true_negatives_update_op, false_negatives_update_op, 'update_op') 87 | if updates_collections: 88 | ops.add_to_collections(updates_collections, update_op) 89 | 90 | return mcc, update_op 91 | 92 | 93 | def _remove_squeezable_dimensions(predictions, labels, weights): 94 | """Squeeze or expand last dim if needed. 95 | 96 | Squeezes last dim of `predictions` or `labels` if their rank differs by 1 97 | (using confusion_matrix.remove_squeezable_dimensions). 98 | Squeezes or expands last dim of `weights` if its rank differs by 1 from the 99 | new rank of `predictions`. 100 | 101 | If `weights` is scalar, it is kept scalar. 102 | 103 | This will use static shape if available. Otherwise, it will add graph 104 | operations, which could result in a performance hit. 105 | 106 | Args: 107 | predictions: Predicted values, a `Tensor` of arbitrary dimensions. 108 | labels: Optional label `Tensor` whose dimensions match `predictions`. 109 | weights: Optional weight scalar or `Tensor` whose dimensions match 110 | `predictions`. 111 | 112 | Returns: 113 | Tuple of `predictions`, `labels` and `weights`. Each of them possibly has 114 | the last dimension squeezed, `weights` could be extended by one dimension. 115 | """ 116 | predictions = ops.convert_to_tensor(predictions) 117 | if labels is not None: 118 | labels, predictions = confusion_matrix.remove_squeezable_dimensions( 119 | labels, predictions) 120 | predictions.get_shape().assert_is_compatible_with(labels.get_shape()) 121 | 122 | if weights is None: 123 | return predictions, labels, None 124 | 125 | weights = ops.convert_to_tensor(weights) 126 | weights_shape = weights.get_shape() 127 | weights_rank = weights_shape.ndims 128 | if weights_rank == 0: 129 | return predictions, labels, weights 130 | 131 | predictions_shape = predictions.get_shape() 132 | predictions_rank = predictions_shape.ndims 133 | if (predictions_rank is not None) and (weights_rank is not None): 134 | # Use static rank. 135 | if weights_rank - predictions_rank == 1: 136 | weights = array_ops.squeeze(weights, [-1]) 137 | elif predictions_rank - weights_rank == 1: 138 | weights = array_ops.expand_dims(weights, [-1]) 139 | else: 140 | # Use dynamic rank. 141 | weights_rank_tensor = array_ops.rank(weights) 142 | rank_diff = weights_rank_tensor - array_ops.rank(predictions) 143 | 144 | def _maybe_expand_weights(): 145 | return control_flow_ops.cond( 146 | math_ops.equal(rank_diff, -1), 147 | lambda: array_ops.expand_dims(weights, [-1]), lambda: weights) 148 | 149 | # Don't attempt squeeze if it will fail based on static check. 150 | if ((weights_rank is not None) and 151 | (not weights_shape.dims[-1].is_compatible_with(1))): 152 | maybe_squeeze_weights = lambda: weights 153 | else: 154 | maybe_squeeze_weights = lambda: array_ops.squeeze(weights, [-1]) 155 | 156 | def _maybe_adjust_weights(): 157 | return control_flow_ops.cond( 158 | math_ops.equal(rank_diff, 1), maybe_squeeze_weights, 159 | _maybe_expand_weights) 160 | 161 | # If weights are scalar, do nothing. Otherwise, try to add or remove a 162 | # dimension to match predictions. 
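# --- Editor's illustrative aside (not part of the original file) ---
# Concrete shapes for the weight adjustment performed here, using a hypothetical
# batch size of 8 (the same logic applies in the static- and dynamic-rank branches):
#   predictions: [8],    weights: [8, 1]  ->  rank_diff == 1   ->  weights squeezed to [8]
#   predictions: [8, 1], weights: [8]     ->  rank_diff == -1  ->  weights expanded to [8, 1]
#   scalar weights are left untouched (static rank 0 returns earlier; dynamic
#   rank 0 is caught by the cond just below)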
163 | weights = control_flow_ops.cond( 164 | math_ops.equal(weights_rank_tensor, 0), lambda: weights, 165 | _maybe_adjust_weights) 166 | return predictions, labels, weights 167 | 168 | def _aggregate_across_towers(metrics_collections, metric_value_fn, *args): 169 | """Aggregate metric value across towers.""" 170 | def fn(distribution, *a): 171 | """Call `metric_value_fn` in the correct control flow context.""" 172 | if hasattr(distribution, '_outer_control_flow_context'): 173 | # If there was an outer context captured before this method was called, 174 | # then we enter that context to create the metric value op. If the 175 | # caputred context is `None`, ops.control_dependencies(None) gives the 176 | # desired behavior. Else we use `Enter` and `Exit` to enter and exit the 177 | # captured context. 178 | # This special handling is needed because sometimes the metric is created 179 | # inside a while_loop (and perhaps a TPU rewrite context). But we don't 180 | # want the value op to be evaluated every step or on the TPU. So we 181 | # create it outside so that it can be evaluated at the end on the host, 182 | # once the update ops have been evaluted. 183 | 184 | # pylint: disable=protected-access 185 | if distribution._outer_control_flow_context is None: 186 | with ops.control_dependencies(None): 187 | metric_value = metric_value_fn(distribution, *a) 188 | else: 189 | distribution._outer_control_flow_context.Enter() 190 | metric_value = metric_value_fn(distribution, *a) 191 | distribution._outer_control_flow_context.Exit() 192 | # pylint: enable=protected-access 193 | else: 194 | metric_value = metric_value_fn(distribution, *a) 195 | if metrics_collections: 196 | ops.add_to_collections(metrics_collections, metric_value) 197 | return metric_value 198 | 199 | return distribution_strategy_context.get_tower_context().merge_call(fn, *args) 200 | -------------------------------------------------------------------------------- /retrograph/modeling/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. 
I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | def create_optimizer_multitask(standard_loss, wn_loss, selected_task_id, wn_upper_bound, init_lr, num_train_steps, num_warmup_steps, use_tpu): 85 | """Creates an optimizer training op.""" 86 | global_step = tf.train.get_or_create_global_step() 87 | 88 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 89 | 90 | # Implements linear decay of the learning rate. 91 | learning_rate = tf.train.polynomial_decay( 92 | learning_rate, 93 | global_step, 94 | num_train_steps, 95 | end_learning_rate=0.0, 96 | power=1.0, 97 | cycle=False) 98 | 99 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 100 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 101 | if num_warmup_steps: 102 | global_steps_int = tf.cast(global_step, tf.int32) 103 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 104 | 105 | global_steps_float = tf.cast(global_steps_int, tf.float32) 106 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 107 | 108 | warmup_percent_done = global_steps_float / warmup_steps_float 109 | warmup_learning_rate = init_lr * warmup_percent_done 110 | 111 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 112 | learning_rate = ( 113 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 114 | 115 | # It is recommended that you use this optimizer for fine tuning, since this 116 | # is how the model was trained (note that the Adam m/v variables are NOT 117 | # loaded from init_checkpoint.) 
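# --- Editor's illustrative aside (not part of the original file) ---
# How exclude_from_weight_decay affects the optimizer constructed below:
# _do_use_weight_decay() runs a regex search of each pattern against the variable
# name, so (using typical TF-BERT variable names purely as an example)
#   "bert/encoder/layer_0/attention/output/LayerNorm/gamma"  -> matches "LayerNorm" -> no weight decay
#   "bert/encoder/layer_0/attention/output/dense/bias"       -> matches "bias"      -> no weight decay
#   "bert/encoder/layer_0/attention/output/dense/kernel"     -> no match            -> decayed at rate 0.01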
118 | optimizer = AdamWeightDecayOptimizer( 119 | learning_rate=learning_rate, 120 | weight_decay_rate=0.01, 121 | beta_1=0.9, 122 | beta_2=0.999, 123 | epsilon=1e-6, 124 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 125 | 126 | if use_tpu: 127 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 128 | 129 | multitask_optimizer = tf.contrib.opt.MultitaskOptimizerWrapper(optimizer) 130 | tvars = tf.trainable_variables() 131 | 132 | standard_grads = tf.gradients(standard_loss, tvars) 133 | wn_grads = tf.gradients(wn_loss, tvars) 134 | 135 | # This is how the model was pre-trained. 136 | (standard_grads, _) = tf.clip_by_global_norm(standard_grads, clip_norm=1.0) 137 | (wn_grads, _) = tf.clip_by_global_norm(wn_grads, clip_norm=1.0) 138 | 139 | train_op_standard = multitask_optimizer.apply_gradients( 140 | zip(standard_grads, tvars), global_step=global_step) 141 | train_op_wn = multitask_optimizer.apply_gradients( 142 | zip(wn_grads, tvars), global_step=global_step) 143 | 144 | new_global_step = global_step + 1 145 | 146 | train_op_standard = tf.group(train_op_standard, [global_step.assign(new_global_step)]) 147 | train_op_wn = tf.group(train_op_wn, [global_step.assign(new_global_step)]) 148 | 149 | # TODO: Check this 150 | #wn_step = tf.Variable(name='wn_step', trainable=False, dtype=tf.int32, initial_value=tf.constant(0)) 151 | #bert_step = tf.Variable(name='bert_step', trainable=False, dtype=tf.int32, initial_value=tf.constant(0)) 152 | 153 | #(increment_wn, increment_bert) = tf.case( 154 | # [(tf.less(selected_task_id, wn_upper_bound), lambda: (tf.constant(1), tf.constant(0)))], 155 | # default=lambda: (tf.constant(0), tf.constant(1)), 156 | # exclusive=True) 157 | 158 | #bert_step = tf.assign_add(bert_step, increment_bert) 159 | #wn_step = tf.assign_add(wn_step, increment_wn) 160 | #tf.summary.scalar(name='selected_task_id', tensor=selected_task_id) 161 | #tf.summary.scalar(name='wn_step', tensor=wn_step) 162 | #tf.summary.scalar(name='bert_step', tensor=bert_step) 163 | #tf.summary.scalar('gs', global_step) 164 | 165 | #train_op = tf.cond(tf.less(selected_task_id, wn_upper_bound), lambda: train_op_wn, lambda: train_op_standard, name="multitask_train") 166 | 167 | train_op = tf.case([(tf.less(selected_task_id, wn_upper_bound), lambda: train_op_wn)], default=lambda: train_op_standard, exclusive=True) 168 | 169 | return train_op 170 | 171 | 172 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 173 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 174 | 175 | def __init__(self, 176 | learning_rate, 177 | weight_decay_rate=0.0, 178 | beta_1=0.9, 179 | beta_2=0.999, 180 | epsilon=1e-6, 181 | exclude_from_weight_decay=None, 182 | name="AdamWeightDecayOptimizer"): 183 | """Constructs a AdamWeightDecayOptimizer.""" 184 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 185 | 186 | self.learning_rate = learning_rate 187 | self.weight_decay_rate = weight_decay_rate 188 | self.beta_1 = beta_1 189 | self.beta_2 = beta_2 190 | self.epsilon = epsilon 191 | self.exclude_from_weight_decay = exclude_from_weight_decay 192 | 193 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 194 | """See base class.""" 195 | assignments = [] 196 | for (grad, param) in grads_and_vars: 197 | if grad is None or param is None: 198 | continue 199 | 200 | param_name = self._get_variable_name(param.name) 201 | 202 | # TODO: Check this carefully, because I added the variable reuse 203 | with tf.variable_scope("adam", reuse=tf.AUTO_REUSE): 204 | 
m = tf.get_variable( 205 | name=param_name + "/adam_m", 206 | shape=param.shape.as_list(), 207 | dtype=tf.float32, 208 | trainable=False, 209 | initializer=tf.zeros_initializer()) 210 | v = tf.get_variable( 211 | name=param_name + "/adam_v", 212 | shape=param.shape.as_list(), 213 | dtype=tf.float32, 214 | trainable=False, 215 | initializer=tf.zeros_initializer()) 216 | 217 | # Standard Adam update. 218 | next_m = ( 219 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 220 | next_v = ( 221 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 222 | tf.square(grad))) 223 | 224 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 225 | 226 | # Just adding the square of the weights to the loss function is *not* 227 | # the correct way of using L2 regularization/weight decay with Adam, 228 | # since that will interact with the m and v parameters in strange ways. 229 | # 230 | # Instead we want ot decay the weights in a manner that doesn't interact 231 | # with the m/v parameters. This is equivalent to adding the square 232 | # of the weights to the loss with plain (non-momentum) SGD. 233 | if self._do_use_weight_decay(param_name): 234 | update += self.weight_decay_rate * param 235 | 236 | update_with_lr = self.learning_rate * update 237 | 238 | next_param = param - update_with_lr 239 | 240 | assignments.extend( 241 | [param.assign(next_param), 242 | m.assign(next_m), 243 | v.assign(next_v)]) 244 | return tf.group(*assignments, name=name) 245 | 246 | def _do_use_weight_decay(self, param_name): 247 | """Whether to use L2 weight decay for `param_name`.""" 248 | if not self.weight_decay_rate: 249 | return False 250 | if self.exclude_from_weight_decay: 251 | for r in self.exclude_from_weight_decay: 252 | if re.search(r, param_name) is not None: 253 | return False 254 | return True 255 | 256 | def _get_variable_name(self, param_name): 257 | """Get the variable name from the tensor name.""" 258 | m = re.match("^(.*):\\d+$", param_name) 259 | if m is not None: 260 | param_name = m.group(1) 261 | return param_name 262 | -------------------------------------------------------------------------------- /retrograph/modeling/optimization_adapter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 
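# --- Editor's illustrative aside (not part of the original file) ---
# Worked example of the schedule built here together with the warmup block a few
# lines below, with hypothetical settings init_lr=2e-5, num_warmup_steps=1000,
# num_train_steps=10000:
#   step   100: warmup        -> lr = 2e-5 * 100/1000           = 2e-6
#   step  1000: warmup ends   -> lr = 2e-5 * (1 - 1000/10000)   = 1.8e-5
#   step  5000: linear decay  -> lr = 2e-5 * (1 - 5000/10000)   = 1e-5
#   step 10000: end           -> lr = 0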
32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | adapter_weight_decay_rate=0.01, 63 | beta_1=0.9, 64 | beta_2=0.999, 65 | epsilon=1e-6, 66 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 67 | 68 | if use_tpu: 69 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 70 | 71 | tvars = [] 72 | for collection in ["adapters", "layer_norm", "head"]: 73 | tvars += tf.get_collection(collection) 74 | grads = tf.gradients(loss, tvars) 75 | 76 | # This is how the model was pre-trained. 77 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 78 | 79 | train_op = optimizer.apply_gradients( 80 | zip(grads, tvars), global_step=global_step) 81 | 82 | # Normally the global step update is done inside of `apply_gradients`. 83 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 84 | # a different optimizer, you should probably take this line out. 
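# --- Editor's illustrative aside (not part of the original file) ---
# Note on the tvars selection above: unlike optimization.py, which optimizes
# tf.trainable_variables(), this adapter variant only collects variables that were
# registered in the "adapters", "layer_norm" and "head" graph collections, so the
# pre-trained BERT weights receive no gradient updates during adapter training.
# A minimal sketch of how such a registration presumably looks on the modeling
# side (the actual code lives in modeling_adapter.py, which is not shown here;
# names and shapes are hypothetical):
#
#   w_down = tf.get_variable("adapter_down/kernel", [768, 64])
#   tf.add_to_collection("adapters", w_down)   # picked up by create_optimizer()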
85 | new_global_step = global_step + 1 86 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 87 | return train_op 88 | 89 | 90 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 91 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 92 | 93 | def __init__(self, 94 | learning_rate, 95 | weight_decay_rate=0.0, 96 | adapter_weight_decay_rate=0.0, 97 | beta_1=0.9, 98 | beta_2=0.999, 99 | epsilon=1e-6, 100 | exclude_from_weight_decay=None, 101 | name="AdamWeightDecayOptimizer"): 102 | """Constructs a AdamWeightDecayOptimizer.""" 103 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 104 | 105 | self.learning_rate = learning_rate 106 | self.weight_decay_rate = weight_decay_rate 107 | self.adapter_weight_decay_rate = adapter_weight_decay_rate 108 | self.beta_1 = beta_1 109 | self.beta_2 = beta_2 110 | self.epsilon = epsilon 111 | self.exclude_from_weight_decay = exclude_from_weight_decay 112 | self._adapter_variable_names = { 113 | self._get_variable_name(v.name) for v in tf.get_collection("adapters") 114 | } 115 | 116 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 117 | """See base class.""" 118 | assignments = [] 119 | for (grad, param) in grads_and_vars: 120 | if grad is None or param is None: 121 | continue 122 | 123 | param_name = self._get_variable_name(param.name) 124 | 125 | m = tf.get_variable( 126 | name=param_name + "/adam_m", 127 | shape=param.shape.as_list(), 128 | dtype=tf.float32, 129 | trainable=False, 130 | initializer=tf.zeros_initializer()) 131 | v = tf.get_variable( 132 | name=param_name + "/adam_v", 133 | shape=param.shape.as_list(), 134 | dtype=tf.float32, 135 | trainable=False, 136 | initializer=tf.zeros_initializer()) 137 | 138 | # Standard Adam update. 139 | next_m = ( 140 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 141 | next_v = ( 142 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 143 | tf.square(grad))) 144 | 145 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 146 | 147 | # Just adding the square of the weights to the loss function is *not* 148 | # the correct way of using L2 regularization/weight decay with Adam, 149 | # since that will interact with the m and v parameters in strange ways. 150 | # 151 | # Instead we want ot decay the weights in a manner that doesn't interact 152 | # with the m/v parameters. This is equivalent to adding the square 153 | # of the weights to the loss with plain (non-momentum) SGD. 
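# --- Editor's illustrative aside (not part of the original file) ---
# The decoupled weight decay applied below, written out as the full update rule
# (as in the original BERT optimizer, no bias correction is applied to the
# freshly updated moments m and v):
#
#   update = m / (sqrt(v) + epsilon) + lambda * param
#   param  = param - learning_rate * update
#
# where lambda is adapter_weight_decay_rate for variables registered in the
# "adapters" collection, weight_decay_rate for everything else, and is skipped
# entirely for names matching exclude_from_weight_decay.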
154 | if self._do_use_weight_decay(param_name): 155 | if param_name in self._adapter_variable_names: 156 | update += self.adapter_weight_decay_rate * param 157 | else: 158 | update += self.weight_decay_rate * param 159 | 160 | update_with_lr = self.learning_rate * update 161 | 162 | next_param = param - update_with_lr 163 | 164 | assignments.extend( 165 | [param.assign(next_param), 166 | m.assign(next_m), 167 | v.assign(next_v)]) 168 | return tf.group(*assignments, name=name) 169 | 170 | def _do_use_weight_decay(self, param_name): 171 | """Whether to use L2 weight decay for `param_name`.""" 172 | if param_name in self._adapter_variable_names: 173 | if not self.adapter_weight_decay_rate: 174 | return False 175 | else: 176 | if not self.weight_decay_rate: 177 | return False 178 | 179 | if self.exclude_from_weight_decay: 180 | for r in self.exclude_from_weight_decay: 181 | if re.search(r, param_name) is not None: 182 | return False 183 | 184 | return True 185 | 186 | def _get_variable_name(self, param_name): 187 | """Get the variable name from the tensor name.""" 188 | m = re.match("^(.*):\\d+$", param_name) 189 | if m is not None: 190 | param_name = m.group(1) 191 | return param_name -------------------------------------------------------------------------------- /retrograph/modeling/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | import codecs 26 | 27 | def write_vocab_from_fasttext(path_in, path_out): 28 | """Converts fasttext vectors into simple vocab file 29 | >>> write_vocab_from_fasttext("./../data/fasttext/wiki-news-300d-1M.vec", "./../data/vocab_word_level.txt") 30 | """ 31 | with codecs.open(path_in, "r", "utf8") as f_in: 32 | with codecs.open(path_out, "w", "utf8") as f_out: 33 | for i, line in enumerate(f_in.readlines()): 34 | if i== 0: 35 | print(line) 36 | elif i <= 200000: 37 | word = line.split(' ')[0] 38 | print(word) 39 | f_out.write(word) 40 | f_out.write("\n") 41 | f_out.close() 42 | f_in.close() 43 | 44 | 45 | def convert_to_unicode(text): 46 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 47 | if six.PY3: 48 | if isinstance(text, str): 49 | return text 50 | elif isinstance(text, bytes): 51 | return text.decode("utf-8", "ignore") 52 | else: 53 | raise ValueError("Unsupported string type: %s" % (type(text))) 54 | elif six.PY2: 55 | if isinstance(text, str): 56 | return text.decode("utf-8", "ignore") 57 | elif isinstance(text, unicode): 58 | return text 59 | else: 60 | raise ValueError("Unsupported string type: %s" % (type(text))) 61 | else: 62 | raise ValueError("Not running on Python2 or Python 3?") 63 | 64 | 65 | def printable_text(text): 66 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 67 | 68 | # These functions want `str` for both Python2 and Python3, but in one case 69 | # it's a Unicode string and in the other it's a byte string. 70 | if six.PY3: 71 | if isinstance(text, str): 72 | return text 73 | elif isinstance(text, bytes): 74 | return text.decode("utf-8", "ignore") 75 | else: 76 | raise ValueError("Unsupported string type: %s" % (type(text))) 77 | elif six.PY2: 78 | if isinstance(text, str): 79 | return text 80 | elif isinstance(text, unicode): 81 | return text.encode("utf-8") 82 | else: 83 | raise ValueError("Unsupported string type: %s" % (type(text))) 84 | else: 85 | raise ValueError("Not running on Python2 or Python 3?") 86 | 87 | 88 | def load_vocab(vocab_file): 89 | """Loads a vocabulary file into a dictionary.""" 90 | vocab = collections.OrderedDict() 91 | index = 0 92 | with tf.gfile.GFile(vocab_file, "r") as reader: 93 | while True: 94 | token = convert_to_unicode(reader.readline()) 95 | if not token: 96 | break 97 | token = token.strip() 98 | vocab[token] = index 99 | index += 1 100 | return vocab 101 | 102 | 103 | def convert_by_vocab(vocab, items): 104 | """Converts a sequence of [tokens|ids] using the vocab.""" 105 | output = [] 106 | for item in items: 107 | if item in vocab: 108 | output.append(vocab[item]) 109 | else: 110 | return [] 111 | return output 112 | 113 | 114 | def convert_tokens_to_ids(vocab, tokens): 115 | return convert_by_vocab(vocab, tokens) 116 | 117 | 118 | def convert_ids_to_tokens(inv_vocab, ids): 119 | return convert_by_vocab(inv_vocab, ids) 120 | 121 | 122 | def whitespace_tokenize(text): 123 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 124 | text = text.strip() 125 | if not text: 126 | return [] 127 | tokens = text.split() 128 | return tokens 129 | 130 | 131 | class FullTokenizer(object): 132 | """Runs end-to-end tokenziation.""" 133 | 134 | def __init__(self, vocab_file, do_lower_case=True): 135 | 
self.vocab = load_vocab(vocab_file) 136 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 137 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 138 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 139 | 140 | def tokenize(self, text): 141 | split_tokens = [] 142 | for token in self.basic_tokenizer.tokenize(text): 143 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 144 | split_tokens.append(sub_token) 145 | 146 | return split_tokens 147 | 148 | def convert_tokens_to_ids(self, tokens): 149 | return convert_by_vocab(self.vocab, tokens) 150 | 151 | def convert_ids_to_tokens(self, ids): 152 | return convert_by_vocab(self.inv_vocab, ids) 153 | 154 | 155 | class BasicTokenizer(object): 156 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 157 | 158 | def __init__(self, do_lower_case=True): 159 | """Constructs a BasicTokenizer. 160 | 161 | Args: 162 | do_lower_case: Whether to lower case the input. 163 | """ 164 | self.do_lower_case = do_lower_case 165 | 166 | def tokenize(self, text): 167 | """Tokenizes a piece of text.""" 168 | text = convert_to_unicode(text) 169 | text = self._clean_text(text) 170 | 171 | # This was added on November 1st, 2018 for the multilingual and Chinese 172 | # models. This is also applied to the English models now, but it doesn't 173 | # matter since the English models were not trained on any Chinese data 174 | # and generally don't have any Chinese data in them (there are Chinese 175 | # characters in the vocabulary because Wikipedia does have some Chinese 176 | # words in the English Wikipedia.). 177 | text = self._tokenize_chinese_chars(text) 178 | 179 | orig_tokens = whitespace_tokenize(text) 180 | split_tokens = [] 181 | for token in orig_tokens: 182 | if self.do_lower_case: 183 | token = token.lower() 184 | token = self._run_strip_accents(token) 185 | split_tokens.extend(self._run_split_on_punc(token)) 186 | 187 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 188 | return output_tokens 189 | 190 | def _run_strip_accents(self, text): 191 | """Strips accents from a piece of text.""" 192 | text = unicodedata.normalize("NFD", text) 193 | output = [] 194 | for char in text: 195 | cat = unicodedata.category(char) 196 | if cat == "Mn": 197 | continue 198 | output.append(char) 199 | return "".join(output) 200 | 201 | def _run_split_on_punc(self, text): 202 | """Splits punctuation on a piece of text.""" 203 | chars = list(text) 204 | i = 0 205 | start_new_word = True 206 | output = [] 207 | while i < len(chars): 208 | char = chars[i] 209 | if _is_punctuation(char): 210 | output.append([char]) 211 | start_new_word = True 212 | else: 213 | if start_new_word: 214 | output.append([]) 215 | start_new_word = False 216 | output[-1].append(char) 217 | i += 1 218 | 219 | return ["".join(x) for x in output] 220 | 221 | def _tokenize_chinese_chars(self, text): 222 | """Adds whitespace around any CJK character.""" 223 | output = [] 224 | for char in text: 225 | cp = ord(char) 226 | if self._is_chinese_char(cp): 227 | output.append(" ") 228 | output.append(char) 229 | output.append(" ") 230 | else: 231 | output.append(char) 232 | return "".join(output) 233 | 234 | def _is_chinese_char(self, cp): 235 | """Checks whether CP is the codepoint of a CJK character.""" 236 | # This defines a "chinese character" as anything in the CJK Unicode block: 237 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 238 | # 239 | # Note that the CJK Unicode block is NOT all Japanese and 
Korean characters, 240 | # despite its name. The modern Korean Hangul alphabet is a different block, 241 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 242 | # space-separated words, so they are not treated specially and handled 243 | # like the all of the other languages. 244 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 245 | (cp >= 0x3400 and cp <= 0x4DBF) or # 246 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 247 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 248 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 249 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 250 | (cp >= 0xF900 and cp <= 0xFAFF) or # 251 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 252 | return True 253 | 254 | return False 255 | 256 | def _clean_text(self, text): 257 | """Performs invalid character removal and whitespace cleanup on text.""" 258 | output = [] 259 | for char in text: 260 | cp = ord(char) 261 | if cp == 0 or cp == 0xfffd or _is_control(char): 262 | continue 263 | if _is_whitespace(char): 264 | output.append(" ") 265 | else: 266 | output.append(char) 267 | return "".join(output) 268 | 269 | 270 | class WordpieceTokenizer(object): 271 | """Runs WordPiece tokenziation.""" 272 | 273 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 274 | self.vocab = vocab 275 | self.unk_token = unk_token 276 | self.max_input_chars_per_word = max_input_chars_per_word 277 | 278 | def tokenize(self, text): 279 | """Tokenizes a piece of text into its word pieces. 280 | 281 | This uses a greedy longest-match-first algorithm to perform tokenization 282 | using the given vocabulary. 283 | 284 | For example: 285 | input = "unaffable" 286 | output = ["un", "##aff", "##able"] 287 | 288 | Args: 289 | text: A single token or whitespace separated tokens. This should have 290 | already been passed through `BasicTokenizer. 291 | 292 | Returns: 293 | A list of wordpiece tokens. 294 | """ 295 | 296 | text = convert_to_unicode(text) 297 | 298 | output_tokens = [] 299 | for token in whitespace_tokenize(text): 300 | chars = list(token) 301 | if len(chars) > self.max_input_chars_per_word: 302 | output_tokens.append(self.unk_token) 303 | continue 304 | 305 | is_bad = False 306 | start = 0 307 | sub_tokens = [] 308 | while start < len(chars): 309 | end = len(chars) 310 | cur_substr = None 311 | while start < end: 312 | substr = "".join(chars[start:end]) 313 | if start > 0: 314 | substr = "##" + substr 315 | if substr in self.vocab: 316 | cur_substr = substr 317 | break 318 | end -= 1 319 | if cur_substr is None: 320 | is_bad = True 321 | break 322 | sub_tokens.append(cur_substr) 323 | start = end 324 | 325 | if is_bad: 326 | output_tokens.append(self.unk_token) 327 | else: 328 | output_tokens.extend(sub_tokens) 329 | return output_tokens 330 | 331 | 332 | def _is_whitespace(char): 333 | """Checks whether `chars` is a whitespace character.""" 334 | # \t, \n, and \r are technically contorl characters but we treat them 335 | # as whitespace since they are generally considered as such. 336 | if char == " " or char == "\t" or char == "\n" or char == "\r": 337 | return True 338 | cat = unicodedata.category(char) 339 | if cat == "Zs": 340 | return True 341 | return False 342 | 343 | 344 | def _is_control(char): 345 | """Checks whether `chars` is a control character.""" 346 | # These are technically control characters but we count them as whitespace 347 | # characters. 
348 | if char == "\t" or char == "\n" or char == "\r": 349 | return False 350 | cat = unicodedata.category(char) 351 | if cat.startswith("C"): 352 | return True 353 | return False 354 | 355 | 356 | def _is_punctuation(char): 357 | """Checks whether `chars` is a punctuation character.""" 358 | cp = ord(char) 359 | # We treat all non-letter/number ASCII as punctuation. 360 | # Characters such as "^", "$", and "`" are not in the Unicode 361 | # Punctuation class but we treat them as punctuation anyways, for 362 | # consistency. 363 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 364 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 365 | return True 366 | cat = unicodedata.category(char) 367 | if cat.startswith("P"): 368 | return True 369 | return False 370 | -------------------------------------------------------------------------------- /retrograph/training/__init__.py: -------------------------------------------------------------------------------- 1 | ##################################################### 2 | # coding=utf-8 3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #################################################### 17 | 18 | 19 | #################################################### 20 | # IMPORT STATEMENTS 21 | #################################################### 22 | 23 | # >>>>>> Native Imports <<<<<<< 24 | 25 | # >>>>>> Package Imports <<<<<<< 26 | 27 | # >>>>>> Local Imports <<<<<<< 28 | 29 | 30 | #################################################### 31 | # CODE 32 | #################################################### 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | #################################################### 41 | # MAIN 42 | #################################################### 43 | 44 | 45 | # EOF 46 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | ##################################################### 2 | # coding=utf-8 3 | # Copyright 2019 Anne Lauscher, Nikolai Rozanov, Olga Majewska, Leonardo Ribeiro, Goran Glavas 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | #################################################### 17 | 18 | from setuptools import setup, find_packages 19 | 20 | setup( 21 | # 22 | # SETUP 23 | # 24 | name ='retrograph', 25 | version ='0.0.0.1', 26 | 27 | description ='Retrograph', 28 | url ='https://github.com/ai-nikolai/Retrograph', 29 | author ='Anne Lauscher, Nikolai Rozanov', 30 | author_email ='nikolai@wluper.com', 31 | license ='Apache 2.0', 32 | # 33 | # Actual packages, data and scripts 34 | # 35 | packages = find_packages(), 36 | 37 | scripts =[], 38 | # 39 | # Requirements 40 | # 41 | install_requires=[], 42 | ) 43 | -------------------------------------------------------------------------------- /siqa_1_download_siqa.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | waws --downloadS3 -f socialIQa_v1.4.zip -b wluper-retrograph 3 | mkdir data/SIQA 4 | unzip socialIQa_v1.4.zip 5 | mv socialIQa_v1.4_dev.jsonl data/SIQA 6 | mv socialIQa_v1.4_trn.jsonl data/SIQA 7 | mv socialIQa_v1.4_tst.jsonl data/SIQA 8 | mv socialIQa_v1.4.zip data 9 | -------------------------------------------------------------------------------- /siqa_2_finetune_adapters.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #Step1: 4 | #run_classifier_adapter_tune_all.py -> 5 | # 6 | # 7 | #Need to load the Adapter Model 8 | #Here it is probably recommended to use the orginal optimiser as it optimises BERT 9 | TRAINING_UTILITY=training_utility 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | BERT_VOCAB=$BERT_DIR/vocab.txt 16 | 17 | TASKNAME='SIQA' 18 | DATA_DIR=data/$TASKNAME 19 | 20 | LEARNING_RATE=2e-5 21 | EPOCHS=3.0 22 | VARIANT=A 23 | 24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT 25 | STEP="150000" 26 | 27 | PRETRAINED_NAME="RW30" 28 | BERT_EXTENDED_DIR="models/1.0_1.0_5_30_full_assertions_nl" 29 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter" 30 | CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 31 | 32 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/${PRETRAINED_NAME}/${STEP}/${EXPERIMENT_NAME}" 33 | 34 | 35 | python3.6 $TRAINING_UTILITY/run_copa_adapter.py \ 36 | --do_train=true \ 37 | --do_eval=true \ 38 | --data_dir=$DATA_DIR \ 39 | --vocab_file=$BERT_VOCAB \ 40 | --bert_config_file=$BERT_CONFIG \ 41 | --init_checkpoint=$CHECKPOINT \ 42 | --max_seq_length=128 \ 43 | --train_batch_size=8 \ 44 | --learning_rate=$LEARNING_RATE \ 45 | --num_train_epochs=$EPOCHS \ 46 | --variant=$VARIANT \ 47 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out 48 | -------------------------------------------------------------------------------- /siqa_2_finetune_bert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #Step1: 4 | #run_classifier_adapter_tune_all.py -> 5 | # 6 | # 7 | #Need to load the Adapter Model 8 | #Here it is probably recommended to use the orginal optimiser as it optimises BERT 9 | TRAINING_UTILITY=training_utility 10 | 11 | export CUDA_VISIBLE_DEVICES=7 12 | 13 | BERT_DIR="models/BERT_BASE_UNCASED" 14 | BERT_CONFIG=$BERT_DIR/bert_config.json 15 | BERT_VOCAB=$BERT_DIR/vocab.txt 16 | 17 | TASKNAME='SIQA' 18 | DATA_DIR=data/$TASKNAME 19 | 20 | LEARNING_RATE=1e-5 21 | EPOCHS=2.0 22 | VARIANT=A 23 | 24 | EXPERIMENT_NAME=$LEARNING_RATE.$EPOCHS$VARIANT 25 | 26 | # BERT_EXTENDED_DIR="models/omcs_pretraining_free_wo_nsp_adapter" 27 | # 
CHECKPOINT=${BERT_EXTENDED_DIR}/model.ckpt-${STEP} 28 | 29 | BERT_EXTENDED_DIR=$BERT_DIR 30 | CHECKPOINT=${BERT_EXTENDED_DIR}/bert_model.ckpt 31 | OUTPUT_DIR="models/output_model_finetunning/${TASKNAME}/BERT_BASE/${EXPERIMENT_NAME}" 32 | 33 | 34 | python3.6 $TRAINING_UTILITY/run_siqa.py \ 35 | --do_train=true \ 36 | --do_eval=true \ 37 | --do_predict=true \ 38 | --data_dir=$DATA_DIR \ 39 | --vocab_file=$BERT_VOCAB \ 40 | --bert_config_file=$BERT_CONFIG \ 41 | --init_checkpoint=$CHECKPOINT \ 42 | --max_seq_length=128 \ 43 | --train_batch_size=8 \ 44 | --learning_rate=$LEARNING_RATE \ 45 | --num_train_epochs=$EPOCHS \ 46 | --variant=$VARIANT \ 47 | --output_dir=$OUTPUT_DIR/ | tee $OUTPUT_DIR.out 48 | -------------------------------------------------------------------------------- /siqa_calc_acc_testset.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import jsonlines 3 | import numpy as np 4 | 5 | file_dataset = list(jsonlines.open(sys.argv[1])) 6 | 7 | file_testresults = open(sys.argv[2], 'r').readlines() 8 | 9 | assert len(file_dataset) == len(file_testresults) 10 | 11 | print("Number of datapoints:", len(file_dataset)) 12 | 13 | acc = 0 14 | for f_d, f_t in zip(file_dataset, file_testresults): 15 | if int(f_d['label']) == int(f_t.split(',')[1]): 16 | acc += 1 17 | 18 | print("acc:", acc / len(file_dataset)) 19 | -------------------------------------------------------------------------------- /training_utility/copa_preprocessor.py: -------------------------------------------------------------------------------- 1 | # Nikolai Rozanov 2 | from retrograph.modeling import tokenization 3 | import tensorflow as tf 4 | import os 5 | import json 6 | import numpy as np 7 | 8 | class InputExample(object): 9 | """A single multiple choice question.""" 10 | 11 | def __init__( 12 | self, 13 | qid, 14 | question, 15 | answers, 16 | label): 17 | """Construct an instance.""" 18 | self.qid = qid 19 | self.question = question 20 | self.answers = answers 21 | self.label = label 22 | 23 | 24 | class DataProcessor(object): 25 | """Base class for data converters for sequence classification data sets.""" 26 | 27 | def get_train_examples(self, data_dir): 28 | """Gets a collection of `InputExample`s for the train set.""" 29 | raise NotImplementedError() 30 | 31 | def get_dev_examples(self, data_dir): 32 | """Gets a collection of `InputExample`s for the dev set.""" 33 | raise NotImplementedError() 34 | 35 | def get_test_examples(self, data_dir): 36 | """Gets a collection of `InputExample`s for prediction.""" 37 | raise NotImplementedError() 38 | 39 | def get_labels(self): 40 | """Gets the list of labels for this data set.""" 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def _read_json(cls, input_file): 45 | """Reads a JSON file.""" 46 | with tf.gfile.Open(input_file, "r") as f: 47 | return json.load(f) 48 | 49 | @classmethod 50 | def _read_jsonl(cls, input_file): 51 | """Reads a JSON Lines file.""" 52 | with tf.gfile.Open(input_file, "r") as f: 53 | return [json.loads(ln) for ln in f] 54 | 55 | 56 | class COPAProcessor(DataProcessor): 57 | """Processor for the CommonsenseQA data set.""" 58 | 59 | LABELS = [0, 1] 60 | 61 | TRAIN_FILE_NAME = 'train.en.jsonl' 62 | DEV_FILE_NAME = 'val.en.jsonl' 63 | TEST_FILE_NAME = 'test_gold.jsonl' 64 | 65 | def __init__(self, variant="A"): 66 | """ There are four variants: 67 | Variant A: PREMISE [SEP] The cause/result was that ANSWER1 [SEP] The cause/result was that ANSWER2 68 | Variant B: PREMISE [SEP] What was 
the cause/result of ANSWER1 [SEP] What was the cause/result of ANSWER2 69 | Variant C: What was the cause/result of PREMISE [SEP] ANSWER1 [SEP] ANSWER2 70 | Variant D: PREMISE [SEP] ANSWER1 [SEP] ANSWER2 71 | 72 | """ 73 | self.variant = variant 74 | 75 | 76 | def get_train_examples(self, data_dir): 77 | train_file_name = self.TRAIN_FILE_NAME 78 | 79 | return self._create_examples( 80 | self._read_jsonl(os.path.join(data_dir, train_file_name)), 81 | 'train') 82 | 83 | def get_dev_examples(self, data_dir): 84 | dev_file_name = self.DEV_FILE_NAME 85 | 86 | return self._create_examples( 87 | self._read_jsonl(os.path.join(data_dir, dev_file_name)), 88 | 'dev') 89 | 90 | def get_test_examples(self, data_dir): 91 | test_file_name = self.TEST_FILE_NAME 92 | 93 | return self._create_examples( 94 | self._read_jsonl(os.path.join(data_dir, test_file_name)), 95 | 'test') 96 | 97 | def get_labels(self): 98 | return [0, 1] 99 | 100 | def _create_examples(self,lines, set_type): 101 | """ Calls one of the variants""" 102 | if self.variant=="A": 103 | return self._create_examples_variant_A(lines, set_type) 104 | elif self.variant=="B": 105 | return self._create_examples_variant_B(lines, set_type) 106 | elif self.variant=="C": 107 | return self._create_examples_variant_C(lines, set_type) 108 | elif self.variant=="D": 109 | return self._create_examples_variant_D(lines, set_type) 110 | else: 111 | raise Exception("NO SUCH VARIAN FOR COPA PREPROCESSING") 112 | 113 | 114 | ## VARIANT_A Premise [SEP] STATMENT_Answer [SEP] ST Answer 115 | def _create_examples_variant_A(self, lines, set_type): 116 | examples = [] 117 | for line in lines: 118 | qid = line['idx'] 119 | premise = tokenization.convert_to_unicode(line['premise']) 120 | 121 | question = "The cause was that " if line["question"]=="cause" else "The result was that " 122 | answers = np.array([ 123 | tokenization.convert_to_unicode(question + line["choice1"]), 124 | tokenization.convert_to_unicode(question + line["choice2"]) 125 | ]) 126 | 127 | # the test set has no answer key so use '0' as a dummy label 128 | label = line.get('label', 0) 129 | 130 | examples.append( 131 | InputExample( 132 | qid=qid, 133 | question=premise, 134 | answers=answers, 135 | label=label)) 136 | 137 | return examples 138 | 139 | ## VARIANT_B Premise [SEP] WH-Question_Answer [SEP] WH_Q Answer 140 | def _create_examples_variant_B(self, lines, set_type): 141 | examples = [] 142 | for line in lines: 143 | qid = line['idx'] 144 | question = "What was the cause of " if line["question"]=="cause" else "What was the result of" 145 | premise = tokenization.convert_to_unicode(line['premise']) 146 | 147 | answers = np.array([ 148 | tokenization.convert_to_unicode(question + line["choice1"]), 149 | tokenization.convert_to_unicode(question + line["choice2"]) 150 | ]) 151 | 152 | # the test set has no answer key so use '0' as a dummy label 153 | label = line.get('label', 0) 154 | 155 | examples.append( 156 | InputExample( 157 | qid=qid, 158 | question=premise, 159 | answers=answers, 160 | label=label)) 161 | 162 | return examples 163 | 164 | 165 | ## VARIANT_C WH-Question_Premise [SEP] Answer [SEP] Answer 166 | def _create_examples_variant_C(self, lines, set_type): 167 | examples = [] 168 | for line in lines: 169 | qid = line['idx'] 170 | question = "What was the cause of " if line["question"]=="cause" else "What was the result of" 171 | premise = tokenization.convert_to_unicode(question + line['premise']) 172 | 173 | answers = np.array([ 174 | tokenization.convert_to_unicode(line["choice1"]), 
175 | tokenization.convert_to_unicode(line["choice2"]) 176 | ]) 177 | 178 | # the test set has no answer key so use '0' as a dummy label 179 | label = line.get('label', 0) 180 | 181 | examples.append( 182 | InputExample( 183 | qid=qid, 184 | question=premise, 185 | answers=answers, 186 | label=label)) 187 | 188 | return examples 189 | 190 | 191 | ## VARIANT_D Premise [SEP] Answer [SEP] Answer 192 | def _create_examples_variant_D(self, lines, set_type): 193 | examples = [] 194 | for line in lines: 195 | qid = line['idx'] 196 | 197 | premise = tokenization.convert_to_unicode(line['premise']) 198 | 199 | answers = np.array([ 200 | tokenization.convert_to_unicode(line["choice1"]), 201 | tokenization.convert_to_unicode(line["choice2"]) 202 | ]) 203 | 204 | # the test set has no answer key so use '0' as a dummy label 205 | label = line.get('label', 0) 206 | 207 | examples.append( 208 | InputExample( 209 | qid=qid, 210 | question=premise, 211 | answers=answers, 212 | label=label)) 213 | 214 | return examples 215 | -------------------------------------------------------------------------------- /training_utility/siqa_preprocessor.py: -------------------------------------------------------------------------------- 1 | # Nikolai Rozanov 2 | from retrograph.modeling import tokenization 3 | import tensorflow as tf 4 | import os 5 | import json 6 | import numpy as np 7 | 8 | class InputExample(object): 9 | """A single multiple choice question.""" 10 | 11 | def __init__( 12 | self, 13 | qid, 14 | question, 15 | answers, 16 | label): 17 | """Construct an instance.""" 18 | self.qid = qid 19 | self.question = question 20 | self.answers = answers 21 | self.label = label 22 | 23 | 24 | class DataProcessor(object): 25 | """Base class for data converters for sequence classification data sets.""" 26 | 27 | def get_train_examples(self, data_dir): 28 | """Gets a collection of `InputExample`s for the train set.""" 29 | raise NotImplementedError() 30 | 31 | def get_dev_examples(self, data_dir): 32 | """Gets a collection of `InputExample`s for the dev set.""" 33 | raise NotImplementedError() 34 | 35 | def get_test_examples(self, data_dir): 36 | """Gets a collection of `InputExample`s for prediction.""" 37 | raise NotImplementedError() 38 | 39 | def get_labels(self): 40 | """Gets the list of labels for this data set.""" 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def _read_json(cls, input_file): 45 | """Reads a JSON file.""" 46 | with tf.gfile.Open(input_file, "r") as f: 47 | return json.load(f) 48 | 49 | @classmethod 50 | def _read_jsonl(cls, input_file): 51 | """Reads a JSON Lines file.""" 52 | with tf.gfile.Open(input_file, "r") as f: 53 | return [json.loads(ln) for ln in f] 54 | 55 | 56 | class SIQAProcessor(DataProcessor): 57 | """Processor for the CommonsenseQA data set.""" 58 | 59 | LABELS = [0, 1, 2] 60 | 61 | TRAIN_FILE_NAME = 'socialIQa_v1.4_trn.jsonl' 62 | DEV_FILE_NAME = 'socialIQa_v1.4_dev.jsonl' 63 | TEST_FILE_NAME = 'socialIQa_v1.4_tst.jsonl' 64 | 65 | def __init__(self, variant="A"): 66 | """ There are four variants: 67 | Variant A: PREMISE [SEP] The cause/result was that ANSWER1 [SEP] The cause/result was that ANSWER2 68 | Variant B: PREMISE [SEP] What was the cause/result of ANSWER1 [SEP] What was the cause/result of ANSWER2 69 | Variant C: What was the cause/result of PREMISE [SEP] ANSWER1 [SEP] ANSWER2 70 | Variant D: PREMISE [SEP] ANSWER1 [SEP] ANSWER2 71 | 72 | """ 73 | self.variant = variant 74 | 75 | 76 | def get_train_examples(self, data_dir): 77 | train_file_name = 
self.TRAIN_FILE_NAME 78 | 79 | return self._create_examples( 80 | self._read_jsonl(os.path.join(data_dir, train_file_name)), 81 | 'train') 82 | 83 | def get_dev_examples(self, data_dir): 84 | dev_file_name = self.DEV_FILE_NAME 85 | 86 | return self._create_examples( 87 | self._read_jsonl(os.path.join(data_dir, dev_file_name)), 88 | 'dev') 89 | 90 | def get_test_examples(self, data_dir): 91 | test_file_name = self.TEST_FILE_NAME 92 | 93 | return self._create_examples( 94 | self._read_jsonl(os.path.join(data_dir, test_file_name)), 95 | 'test') 96 | 97 | def get_labels(self): 98 | return [0, 1, 2] 99 | 100 | def _create_examples(self,lines, set_type): 101 | """ Calls one of the variants""" 102 | if self.variant=="A": 103 | return self._create_examples_variant_A(lines, set_type) 104 | elif self.variant=="B": 105 | return self._create_examples_variant_B(lines, set_type) 106 | elif self.variant=="C": 107 | return self._create_examples_variant_C(lines, set_type) 108 | elif self.variant=="D": 109 | return self._create_examples_variant_D(lines, set_type) 110 | else: 111 | raise Exception("NO SUCH VARIAN FOR COPA PREPROCESSING") 112 | 113 | 114 | ## VARIANT_A Premise [SEP] STATMENT_Answer [SEP] ST Answer 115 | def _create_examples_variant_A(self, lines, set_type): 116 | examples = [] 117 | for line in lines: 118 | qid = line['idx'] 119 | premise = tokenization.convert_to_unicode(line['premise']) 120 | 121 | question = line["question"] 122 | answers = np.array([ 123 | tokenization.convert_to_unicode(question + line["choice1"]), 124 | tokenization.convert_to_unicode(question + line["choice2"]), 125 | tokenization.convert_to_unicode(question + line["choice3"]) 126 | ]) 127 | 128 | # the test set has no answer key so use '0' as a dummy label 129 | label = line.get('label', 0) 130 | 131 | examples.append( 132 | InputExample( 133 | qid=qid, 134 | question=premise, 135 | answers=answers, 136 | label=label)) 137 | 138 | return examples 139 | 140 | ## VARIANT_B Premise [SEP] WH-Question_Answer [SEP] WH_Q Answer 141 | def _create_examples_variant_B(self, lines, set_type): 142 | examples = [] 143 | for line in lines: 144 | qid = line['idx'] 145 | question = line["question"] 146 | premise = tokenization.convert_to_unicode(line['premise']) 147 | 148 | answers = np.array([ 149 | tokenization.convert_to_unicode(question + line["choice1"]), 150 | tokenization.convert_to_unicode(question + line["choice2"]), 151 | tokenization.convert_to_unicode(question + line["choice3"]) 152 | ]) 153 | 154 | # the test set has no answer key so use '0' as a dummy label 155 | label = line.get('label', 0) 156 | 157 | examples.append( 158 | InputExample( 159 | qid=qid, 160 | question=premise, 161 | answers=answers, 162 | label=label)) 163 | 164 | return examples 165 | 166 | 167 | ## VARIANT_C WH-Question_Premise [SEP] Answer [SEP] Answer 168 | def _create_examples_variant_C(self, lines, set_type): 169 | examples = [] 170 | for line in lines: 171 | qid = line['idx'] 172 | question = line["question"] 173 | premise = tokenization.convert_to_unicode(question + line['premise']) 174 | 175 | answers = np.array([ 176 | tokenization.convert_to_unicode(line["choice1"]), 177 | tokenization.convert_to_unicode(line["choice2"]), 178 | tokenization.convert_to_unicode(line["choice3"]) 179 | ]) 180 | 181 | # the test set has no answer key so use '0' as a dummy label 182 | label = line.get('label', 0) 183 | 184 | examples.append( 185 | InputExample( 186 | qid=qid, 187 | question=premise, 188 | answers=answers, 189 | label=label)) 190 | 191 | return 
examples 192 | 193 | 194 | ## Premise WH-Question [SEP] Answer [SEP] Answer 195 | def _create_examples_variant_D(self, lines, set_type): 196 | examples = [] 197 | for line in lines: 198 | qid = line['idx'] 199 | question = line["question"] 200 | premise = tokenization.convert_to_unicode(line['premise'] + question) 201 | 202 | answers = np.array([ 203 | tokenization.convert_to_unicode(line["choice1"]), 204 | tokenization.convert_to_unicode(line["choice2"]), 205 | tokenization.convert_to_unicode(line["choice3"]) 206 | ]) 207 | 208 | # the test set has no answer key so use '0' as a dummy label 209 | label = line.get('label', 0) 210 | 211 | examples.append( 212 | InputExample( 213 | qid=qid, 214 | question=premise, 215 | answers=answers, 216 | label=label)) 217 | 218 | return examples 219 | -------------------------------------------------------------------------------- /utility/ec2.py: -------------------------------------------------------------------------------- 1 | import waws 2 | 3 | inst = waws.InstanceManager() 4 | 5 | #inst.upload_to_EC2(folder_file_name=".", instance="sunshine-1") 6 | #inst.upload_to_EC2(folder_file_name="./modeling.py", optional_remote_path="./ConceptBERT/", instance="sunshine-1") 7 | #inst.upload_to_EC2(folder_file_name="./data/glue_data/", optional_remote_path="./ConceptBERT/data/", instance="sunshine-1") 8 | inst.upload_to_EC2(folder_file_name="./poc_finetuning.sh", optional_remote_path="./ConceptBERT/", instance="sunshine-1") 9 | inst.upload_to_EC2(folder_file_name="./run_regression.py", optional_remote_path="./ConceptBERT/", instance="sunshine-1") 10 | inst.upload_to_EC2(folder_file_name="./run_classifier.py", optional_remote_path="./ConceptBERT/", instance="sunshine-1") 11 | inst.upload_to_EC2(folder_file_name="./poc_bash_test.sh", optional_remote_path="./ConceptBERT/", instance="sunshine-1") 12 | #inst.upload_to_EC2(folder_file_name="/c/Users/anlausch/Downloads/uncased_L-12_H-768_A-12/", instance="sunshine-1") 13 | 14 | #inst.download_from_EC2(folder_file_name="CODE_FOLDER", local_path="./training", optional_remote_path="EXPERIMENT2", instance="sunshine-1") -------------------------------------------------------------------------------- /utility/ec2_download.py: -------------------------------------------------------------------------------- 1 | import waws 2 | 3 | inst = waws.InstanceManager() 4 | 5 | inst.download_from_EC2(folder_file_name="~/ConceptBERT/output/pretraining/sentences/free-wo-nsp", local_path="/c/Users/anlausch/Downloads/omcs", instance="sunshine-1") 6 | -------------------------------------------------------------------------------- /utility/s3_download.py: -------------------------------------------------------------------------------- 1 | # Download files 2 | s3.download_file( 3 | file_name="test.txt", 4 | local_path="some/local/path", 5 | remote_path="SOME/S3/PATH", 6 | bucket_name="some_bucket_name" 7 | ) -------------------------------------------------------------------------------- /utility/s3_upload.py: -------------------------------------------------------------------------------- 1 | import waws 2 | import os 3 | 4 | s3 = waws.BucketManager() 5 | 6 | import os 7 | 8 | path = os.getcwd() 9 | 10 | files = [] 11 | # r=root, d=directories, f = files 12 | for r, d, f in os.walk(path): 13 | for file in f: 14 | #if ".iml" not in file and ".xml" not in file: 15 | # Upload files 16 | s3.upload_file( 17 | file_name=file, 18 | local_path=r, 19 | remote_path="~/retrograph" 20 | ) 21 | 22 | 23 | 
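# Note: `os.walk(path)` starts from the current working directory, so every
# file underneath it (including any local models/ or data/ directories) is
# uploaded to the "~/retrograph" remote path; re-enable the commented-out
# extension check above to skip IDE project files before uploading.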
-------------------------------------------------------------------------------- /utility/upload_s3.sh: -------------------------------------------------------------------------------- 1 | waws --uploadS3 -b wluper-retrograph -f all -l "./../ConceptBERT/" --------------------------------------------------------------------------------