├── requirements.txt ├── src ├── Data_generation │ ├── filter_CWWV.py │ ├── generate_from_CWWV.py │ └── generate_from_ATOMIC.py ├── Training │ ├── AFLite │ │ ├── custimized_models.py │ │ ├── run_AFLite.py │ │ └── run_roberta_classification.py │ ├── data_utils.py │ ├── MLM │ │ ├── run_lm_gpt2.py │ │ └── run_mlm_roberta.py │ ├── run_pretrain_gpt2.py │ └── run_pretrain.py └── Evaluation │ ├── evaluate_GPT2.py │ └── evaluate_RoBERTa.py ├── .gitignore ├── README.md └── LICENSE /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.5.1 2 | transformers==3.0.2 3 | overrides==3.0.0 4 | ftfy==5.6 5 | nltk==3.4.5 6 | sentence-transformers==0.3.4 7 | tensorboard==2.0 8 | wordfreq==2.3.2 9 | -------------------------------------------------------------------------------- /src/Data_generation/filter_CWWV.py: -------------------------------------------------------------------------------- 1 | from wordfreq import word_frequency 2 | import json 3 | from tqdm import tqdm 4 | import argparse 5 | import random 6 | import os 7 | random.seed(1) 8 | threshold=1e-06 9 | 10 | def write_data(data, dest): 11 | with open(dest, 'w') as w: 12 | for x in data: 13 | w.write(json.dumps(x) + '\n') 14 | 15 | def get_answer(data): 16 | answers={} 17 | for choice in data['question']['choices']: 18 | answers[choice['label']]=choice['text'] 19 | return answers[data['answerKey']] 20 | 21 | 22 | if __name__=="__main__": 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--input_file", type=str, default=None, required=True, 26 | help="Input file with artificial QA data") 27 | parser.add_argument('--do_split', action="store_true", help="Further split training set into subsets for AFLite") 28 | args = parser.parse_args() 29 | 30 | common_concepts_omcs=[] 31 | with open(args.input_file, 'r') as f: 32 | for line in tqdm(f, total=500000): 33 | qdata=json.loads(line) 34 | head_label=qdata['question']['head'] 35 | source=qdata['question']['source'] 36 | answer=get_answer(qdata) 37 | is_concept=source=='omcs' or (head_label.islower() and answer.islower()) 38 | is_common=(word_frequency(head_label, 'en')>=threshold and word_frequency(answer, 'en')>=threshold) 39 | if is_concept and is_common and source == 'omcs': 40 | common_concepts_omcs.append(qdata) 41 | 42 | print('common concepts omcs', len(common_concepts_omcs)) 43 | random.shuffle(common_concepts_omcs) 44 | train_set = common_concepts_omcs[:int(len(common_concepts_omcs)*0.95)] 45 | dev_set = common_concepts_omcs[int(len(common_concepts_omcs)*0.95):] 46 | basename = os.path.basename(args.input_file) 47 | write_data(train_set, args.input_file.replace(basename, 'train_'+basename)) 48 | write_data(dev_set, args.input_file.replace(basename, 'dev_'+basename)) 49 | if args.do_split: 50 | assert 'random' in args.input_file 51 | print ('splitting train into subsets, which can be used for AFLite (only valid for random strategy)') 52 | train_set_1 = train_set[:int(len(train_set)*0.01)] 53 | train_set_4 = train_set[int(len(train_set)*0.01):int(len(train_set)*0.05)] 54 | train_set_95 = train_set[int(len(train_set)*0.05):] 55 | write_data(train_set_1, 'train_1%_'+args.input_file) 56 | write_data(train_set_4, 'train_4%_'+args.input_file) 57 | write_data(train_set_95, 'train_95%_'+args.input_file) 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 
| __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /src/Training/AFLite/custimized_models.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | import logging 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn import CrossEntropyLoss 8 | 9 | from transformers import BertPreTrainedModel, RobertaConfig, RobertaModel 10 | 11 | class RobertaForMultipleChoice(BertPreTrainedModel): 12 | config_class = RobertaConfig 13 | base_model_prefix = "roberta" 14 | 15 | def __init__(self, config): 16 | super().__init__(config) 17 | 18 | self.roberta = RobertaModel(config) 19 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 20 | self.classifier = nn.Linear(config.hidden_size, 1) 21 | 22 | self.init_weights() 23 | 24 | def forward( 25 | self, 26 | input_ids=None, 27 | token_type_ids=None, 28 | attention_mask=None, 29 | labels=None, 30 | position_ids=None, 31 | head_mask=None, 32 | inputs_embeds=None, 33 | output_attentions=None, 34 | output_hidden_states=None, 35 | ): 36 | num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] 37 | 38 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None 39 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 40 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 41 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 42 | flat_inputs_embeds = ( 43 | inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) 44 | if inputs_embeds is not None 45 | else None 46 | ) 47 | 48 | outputs = self.roberta( 49 | flat_input_ids, 50 | position_ids=flat_position_ids, 51 | token_type_ids=flat_token_type_ids, 52 | attention_mask=flat_attention_mask, 53 | head_mask=head_mask, 54 | inputs_embeds=flat_inputs_embeds, 55 | output_attentions=output_attentions, 56 | output_hidden_states=output_hidden_states, 57 | ) 58 | pooled_output = outputs[1] 59 | 60 | pooled_output = self.dropout(pooled_output) 61 | logits = self.classifier(pooled_output) 62 | reshaped_logits = logits.view(-1, num_choices) 63 | 64 | outputs = (reshaped_logits,pooled_output,) + outputs[2:] # add hidden states and attention if they are here 65 | 66 | if labels is not None: 67 | loss_fct = CrossEntropyLoss() 68 | loss = loss_fct(reshaped_logits, labels) 69 | outputs = (loss,) + outputs 70 | 71 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions) 72 | -------------------------------------------------------------------------------- /src/Training/AFLite/run_AFLite.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import numpy as np 
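# Overview of the AFLite (lightweight adversarial filtering) procedure implemented below:
#   1. Load precomputed per-choice embeddings (<file>_features) and labels (<file>_labels)
#      for the train and dev files.
#   2. While more than `target_size` (20% of the original) training samples remain:
#      run 64 iterations; each one shuffles the data, trains a linear probe
#      (1024 -> 1 score per choice) on a `target_size` slice, and records for every
#      held-out sample whether the probe classified it correctly. A sample's
#      predictability is #correct / #times evaluated; up to the top 2% most
#      predictable samples with predictability > 0.75 are dropped from train
#      (dev is filtered analogously). Stop early if fewer than the cutoff were removed.
#   3. Write the surviving samples to <file>_adv-filter.jsonl next to the input files.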
4 | import torch 5 | from collections import Counter 6 | from tqdm import tqdm 7 | import argparse 8 | correct_count = Counter() 9 | chosen_count = Counter() 10 | dev_correct_count = Counter() 11 | dev_chosen_count = Counter() 12 | 13 | def set_seed(seed): 14 | random.seed(seed) 15 | np.random.seed(seed) 16 | torch.manual_seed(seed) 17 | 18 | def read_data(filename): 19 | data = [] 20 | with open(filename, 'r') as f: 21 | for line in f: 22 | data.append(json.loads(line)) 23 | return data 24 | 25 | def write_data(filename, data): 26 | with open(filename, 'w') as fout: 27 | for sample in data: 28 | fout.write(json.dumps(sample)) 29 | fout.write('\n') 30 | 31 | def predict(model, features, labels): 32 | n_samples, num_cand, feat_dim = features.shape 33 | if len(features) > 500000: 34 | logits = [] 35 | batch_size = int(len(features)/10) 36 | for b in range(0, len(features), batch_size): 37 | batch_logits = model(features[b:b+batch_size].cuda()) 38 | logits.append(batch_logits.squeeze(2).detach().cpu()) 39 | logits = torch.cat(logits, dim=0).numpy() 40 | else: 41 | features = features.cuda() 42 | logits = model(features) 43 | logits = logits.squeeze(2).detach().cpu().numpy() 44 | preds = np.argmax(logits, axis=1) 45 | acc = (preds == labels).mean() 46 | return preds == labels 47 | 48 | def train_classifier(features, labels): 49 | model = torch.nn.Linear(1024, 1) 50 | model.to('cuda') 51 | optimizer = torch.optim.Adam(model.parameters()) 52 | loss_fct = torch.nn.CrossEntropyLoss() 53 | features = features.cuda() 54 | labels = torch.tensor(labels, dtype=torch.long).cuda() 55 | batch_size = int(len(features)/10) 56 | for i in range(3): 57 | 58 | for b in range(0, len(features), batch_size): 59 | logits = model(features[b:b+batch_size]) 60 | loss = loss_fct(logits.squeeze(2), labels[b:b+batch_size]) 61 | loss.backward() 62 | optimizer.step() 63 | model.zero_grad() 64 | return model 65 | 66 | def run_iteration(features, labels, sample_ids, test_features, test_labels, test_sample_ids, target_size): 67 | global correct_count, chosen_count, dev_correct_count, dev_chosen_count 68 | idx = [_ for _ in range(len(features))] 69 | random.shuffle(idx) 70 | features = features[idx] 71 | labels = labels[idx] 72 | sample_ids = [sample_ids[i] for i in idx] 73 | train_size = target_size 74 | train_feat = features[:train_size] 75 | dev_feat = features[train_size:] 76 | train_labels = labels[:train_size] 77 | dev_labels = labels[train_size:] 78 | train_sample_ids = sample_ids[:train_size] 79 | dev_sample_ids = sample_ids[train_size:] 80 | model = train_classifier(train_feat, train_labels) 81 | preds = predict(model, dev_feat, dev_labels) 82 | chosen_count.update(dev_sample_ids) 83 | correct_ids = [dev_sample_ids[sid] for sid in range(len(dev_sample_ids)) if preds[sid]] 84 | correct_count.update(correct_ids) 85 | test_preds = predict(model, test_features, test_labels) 86 | dev_chosen_count.update(test_sample_ids) 87 | test_correct_ids = [test_sample_ids[sid] for sid in range(len(test_sample_ids)) if test_preds[sid]] 88 | dev_correct_count.update(test_correct_ids) 89 | 90 | def main(): 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--train_file", default=None, type=str, required=True, help="train file") 93 | parser.add_argument("--dev_file", default=None, type=str, required=True, help="dev file") 94 | args = parser.parse_args() 95 | set_seed(1) 96 | data = read_data(args.train_file) 97 | dev_data = read_data(args.dev_file) 98 | print (len(data), len(dev_data)) 99 | features = 
torch.load(args.train_file.replace('.jsonl', '_features')) 100 | torch_labels = torch.load(args.train_file.replace('.jsonl', '_labels')) 101 | dev_features = torch.load(args.dev_file.replace('.jsonl', '_features')) 102 | dev_torch_labels = torch.load(args.dev_file.replace('.jsonl', '_labels')) 103 | print (features.shape, dev_features.shape) 104 | if 'correct' in data[0]: 105 | labels = [sample['correct'] for sample in data] 106 | dev_labels = [sample['correct'] for sample in dev_data] 107 | else: 108 | mapping = {'A':0, 'B':1, 'C':2} 109 | labels = [mapping[sample['answerKey']] for sample in data] 110 | dev_labels = [mapping[sample['answerKey']] for sample in dev_data] 111 | print (torch_labels.shape, dev_torch_labels.shape) 112 | print (np.array(labels).shape, np.array(dev_labels).shape) 113 | assert all(np.array(labels) == torch_labels) 114 | assert all(np.array(dev_labels) == dev_torch_labels) 115 | sample_ids = [sample['id'] for sample in data] 116 | dev_sample_ids = [sample['id'] for sample in dev_data] 117 | labels = np.array(labels) 118 | dev_labels = np.array(dev_labels) 119 | target_size = int(len(features)*0.2) 120 | cutoff_size = int(len(features)*0.02) 121 | dev_cutoff_size = int(len(dev_features)*0.02) 122 | print ('target size', target_size) 123 | global correct_count, chosen_count, dev_correct_count, dev_chosen_count 124 | while len(features) > target_size: 125 | correct_count = Counter() 126 | chosen_count = Counter() 127 | dev_correct_count = Counter() 128 | dev_chosen_count = Counter() 129 | for i in tqdm(range(64)): 130 | run_iteration(features, labels, sample_ids, dev_features, dev_labels, dev_sample_ids, target_size) 131 | for k, v in correct_count.items(): 132 | correct_count[k] = float(v)/chosen_count[k] 133 | for k, v in dev_correct_count.items(): 134 | dev_correct_count[k] = float(v)/dev_chosen_count[k] 135 | sorted_correct_count = sorted(correct_count.items(), key=lambda x: x[1], reverse=True) 136 | sorted_dev_correct_count = sorted(dev_correct_count.items(), key=lambda x: x[1], reverse=True) 137 | easy_train = [s[0] for s in sorted_correct_count[:cutoff_size] if s[1] > 0.75] 138 | easy_dev = [s[0] for s in sorted_dev_correct_count[:dev_cutoff_size] if s[1] > 0.75] 139 | 140 | kept_idx = [sid for sid in range(len(sample_ids)) if sample_ids[sid] not in easy_train] 141 | newly_removed = len(features) - len(kept_idx) 142 | features = features[kept_idx] 143 | labels = labels[kept_idx] 144 | sample_ids = [sample_ids[ki] for ki in kept_idx] 145 | dev_kept_ids = [sid for sid in range(len(dev_sample_ids)) if dev_sample_ids[sid] not in easy_dev] 146 | dev_features = dev_features[dev_kept_ids] 147 | dev_labels = dev_labels[dev_kept_ids] 148 | dev_sample_ids = [dev_sample_ids[ki] for ki in dev_kept_ids] 149 | print ('now keeping train', len(kept_idx), 'dev', len(dev_kept_ids)) 150 | if newly_removed < cutoff_size: 151 | break 152 | print ('finally keeping train', len(sample_ids), 'dev', len(dev_sample_ids)) 153 | kept = Counter(sample_ids) 154 | kept_data = [sample for sample in data if sample['id'] in kept] 155 | dev_kept = Counter(dev_sample_ids) 156 | dev_kept_data = [sample for sample in dev_data if sample['id'] in dev_kept] 157 | write_data(args.train_file.replace('.jsonl', '_adv-filter.jsonl'), kept_data) 158 | write_data(args.dev_file.replace('.jsonl', '_adv-filter.jsonl'), dev_kept_data) 159 | 160 | if __name__ == "__main__": 161 | main() -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Knowledge-driven Data Construction for Zero-shot Evaluation in Commonsense Question Answering
2 | This repository contains the code for the paper "Knowledge-driven Data Construction for Zero-shot Evaluation in Commonsense Question Answering" (AAAI 2021). See the full paper [here](https://arxiv.org/abs/2011.03863).
3 | 
4 | Note that our evaluation code is adapted from the [self-talk repo](https://github.com/vered1986/self_talk).
5 | 
6 | ## Environments
7 | This code has been tested with Python 3.7.6, PyTorch 1.5.1, and Transformers 3.0.2. You can install the required packages with
8 | ```
9 | pip install -r requirements.txt
10 | ```
11 | 
12 | ## Data generation
13 | Our synthetic QA sets can be downloaded from [here](https://drive.google.com/file/d/1qp2Exh88m1LT8iyDvt8TOAXhGdHQhP2B/view?usp=sharing); uncompress it and place it in the HyKAS-CSKG root directory.
14 | 
15 | If you would like to generate data from scratch, first `cd` to the `src/Data_generation` directory.
16 | 
17 | For the **ATOMIC** synthetic sets, download ATOMIC from the [official website](https://homes.cs.washington.edu/~msap/atomic/) and uncompress it.
18 | Then run
19 | ```
20 | python generate_from_ATOMIC.py --train_KG atomic/v4_atomic_trn.csv --dev_KG atomic/v4_atomic_dev.csv --strategy random --out_dir ../../data/ATOMIC
21 | ```
22 | 
23 | For **CWWV**, download `cskg_connected.tsv` from [here](https://drive.google.com/file/d/11TiW3pAHnt6l8yuIWpowzOMuM8fq7ff6/view?usp=sharing) and `cache.pkl` from [here](https://drive.google.com/file/d/19tcSaKi-Efz8IH-HX0oBkYtalnqOseZj/view?usp=sharing), then run:
24 | ```
25 | python generate_from_CWWV.py --cskg_file cskg_connected.tsv --lex_cache cache.pkl --out_dir ../../data/CWWV --strategy random
26 | python filter_CWWV.py --input_file ../../data/CWWV/random.jsonl
27 | ```
28 | 
29 | ## Pretraining on Synthetic QA sets
30 | We provide the following pretrained models:
31 | LM | KG | Download
32 | ---|---|---
33 | RoBERTa-Large | ATOMIC | [Download](https://drive.google.com/file/d/1oTYV5YZRlXtMSZW9_pTjyMn6o8yrPU2N/view?usp=sharing)
34 | RoBERTa-Large | CWWV | [Download](https://drive.google.com/file/d/1Ot-x3WJoFWYUTyyDSMeG2CrKDmCTggxM/view?usp=sharing)
35 | RoBERTa-Large | CSKG | [Download](https://drive.google.com/file/d/1nfWtIfrQk4REp7oGUyyn1ShT7aEvMI9E/view?usp=sharing)
36 | GPT2-Large | ATOMIC | [Download](https://drive.google.com/file/d/1lENyTTBogmRIK_M7cu_uxeD8AiWBo7Ko/view?usp=sharing)
37 | GPT2-Large | CWWV | [Download](https://drive.google.com/file/d/1dnqdW-5d6tULZfDaejViVrjuNx-nY8sP/view?usp=sharing)
38 | GPT2-Large | CSKG | [Download](https://drive.google.com/file/d/1VUBAxtyKElmbNTxSkIdPjR88PkEjbc-2/view?usp=sharing)
39 | 
40 | If you would like to train models from scratch, you can use the following commands under `src/Training`.
41 | 
42 | For RoBERTa:
43 | ```
44 | CUDA_VISIBLE_DEVICES=0 python run_pretrain.py --model_type roberta-mlm --model_name_or_path roberta-large --task_name cskg --output_dir ../../out_dir --max_sequence_per_time 200 \
45 | --train_file ../../data/ATOMIC/train_random.jsonl --second_train_file ../../data/CWWV/train_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --second_dev_file \
46 | ../../data/CWWV/dev_random.jsonl --max_seq_length 128 --max_words_to_mask 6 --do_train --do_eval --per_gpu_train_batch_size 2 --gradient_accumulation_steps 16 \
47 | --learning_rate 1e-5 --num_train_epochs 1 --warmup_proportion 0.05 --evaluate_during_training --per_gpu_eval_batch_size 8 --save_steps 6500 --margin 1.0
48 | ```
49 | For GPT2:
50 | ```
51 | CUDA_VISIBLE_DEVICES=0 python run_pretrain_gpt2.py --model_type gpt2 --model_name_or_path gpt2-large --task_name cskg --output_dir ../../out_dir --train_file ../../data/ATOMIC/train_random.jsonl \
52 | --second_train_file ../../data/CWWV/train_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --second_dev_file ../../data/CWWV/dev_random.jsonl \
53 | --max_seq_length 128 --do_train --do_eval --per_gpu_train_batch_size 2 --gradient_accumulation_steps 16 --learning_rate 1e-5 --num_train_epochs 1 --warmup_proportion 0.05 \
54 | --evaluate_during_training --per_gpu_eval_batch_size 8 --save_steps 6500 --margin 1.0
55 | ```
56 | 
57 | ## Evaluation
58 | For LM baselines, `cd` to the `src/Evaluation` directory and run
59 | ```
60 | python evaluate_RoBERTa.py --lm roberta-large --dataset_file DATA_FILE --out_dir ../../results --device 1 --reader TASK_NAME
61 | python evaluate_GPT2.py --lm gpt2-large --dataset_file DATA_FILE --out_dir ../../results --device 1 --reader TASK_NAME
62 | ```
63 | For pretrained models, simply point the `--lm` flag to your model directory, for example:
64 | ```
65 | python evaluate_RoBERTa.py --lm ../../models/roberta_cskg --dataset_file ../../tasks/commonsenseqa_dev.jsonl --out_dir ../../results --device 1 --reader commonsenseqa
66 | python evaluate_GPT2.py --lm ../../models/gpt2_cskg --dataset_file ../../tasks/commonsenseqa_dev.jsonl --out_dir ../../results --device 1 --reader commonsenseqa
67 | ```
68 | 
69 | ## MLM ablation
70 | To run the MLM pretraining experiments (comparison of training regimes), `cd` to `src/Training/MLM` and run
71 | ```
72 | CUDA_VISIBLE_DEVICES=0 python run_mlm_roberta.py --model_type roberta-mlm --model_name_or_path roberta-large --task_name atomicmlm --output_dir ../../out_dir --train_file \
73 | ../../data/ATOMIC/train_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --mlm_probability 0.5 --max_seq_length 128 --max_words_to_mask 6 --max_sequence_per_time 200 \
74 | --do_train --do_eval --per_gpu_train_batch_size 8 --gradient_accumulation_steps 4 --learning_rate 1e-5 --num_train_epochs 3 --warmup_proportion 0.05 --evaluate_during_training \
75 | --per_gpu_eval_batch_size 8 --save_steps 5000
76 | ```
77 | Then follow the same evaluation commands as above to evaluate the models.
78 | 
79 | ## AFLite
80 | To generate adversarially filtered datasets using the AFLite algorithm, first run the data generation code with the `--do_split` flag
81 | ```
82 | python generate_from_ATOMIC.py --train_KG atomic/v4_atomic_trn.csv --dev_KG atomic/v4_atomic_dev.csv --strategy random --out_dir ../../data/ATOMIC --do_split
83 | ```
84 | This will split the training set into 3 subsets. We can then train a feature function; `cd` to the `src/Training/AFLite` directory and run
85 | ```
86 | CUDA_VISIBLE_DEVICES=0 python run_roberta_classification.py --model_type roberta-mc --model_name_or_path roberta-large --task_name cwwv --output_dir ../../out_dir --train_file \
87 | ../../data/ATOMIC/train_4%_random.jsonl --dev_file ../../data/ATOMIC/train_1%_random.jsonl --max_seq_length 128 --per_gpu_eval_batch_size 16 --do_train --do_eval \
88 | --evaluate_during_training --per_gpu_train_batch_size 4 --gradient_accumulation_steps 8 --learning_rate 1e-5 --num_train_epochs 3 --warmup_proportion 0.05 --save_steps 150
89 | ```
90 | Then we compute the embeddings for the remaining 95% of the train set and for the dev set
91 | ```
92 | CUDA_VISIBLE_DEVICES=0 python run_roberta_classification.py --model_type roberta-mc --model_name_or_path roberta-large --task_name cwwv
--output_dir ../../out_dir --train_file \ 93 | ../../data/ATOMIC/train_4%_random.jsonl --dev_file ../../data/ATOMIC/train_95%_random.jsonl --max_seq_length 128 --per_gpu_eval_batch_size 16 --do_eval 94 | CUDA_VISIBLE_DEVICES=0 python run_roberta_classification.py --model_type roberta-mc --model_name_or_path roberta-large --task_name cwwv --output_dir ../../out_dir --train_file \ 95 | ../../data/ATOMIC/train_4%_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --max_seq_length 128 --per_gpu_eval_batch_size 16 --do_eval 96 | ``` 97 | To run AFLite 98 | ``` 99 | python run_AFLite.py --train_file ../../data/ATOMIC/train_95%_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl 100 | ``` 101 | This will produce the AFLite filtered output files at the same location as input files, which can be used for pretraining the models. 102 | 103 | ## Cite 104 | ``` 105 | @misc{ma2020knowledgedriven, 106 | title={Knowledge-driven Data Construction for Zero-shot Evaluation in Commonsense Question Answering}, 107 | author={Kaixin Ma and Filip Ilievski and Jonathan Francis and Yonatan Bisk and Eric Nyberg and Alessandro Oltramari}, 108 | year={2020}, 109 | eprint={2011.03863}, 110 | archivePrefix={arXiv}, 111 | primaryClass={cs.CL} 112 | } 113 | ``` 114 | -------------------------------------------------------------------------------- /src/Training/data_utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from tqdm import tqdm 4 | import json 5 | import re 6 | import ftfy 7 | import random 8 | from collections import Counter 9 | import unicodedata 10 | import string 11 | import nltk 12 | from nltk.corpus import stopwords 13 | skip_words = set(stopwords.words('english')) 14 | skip_words.add('\'s') 15 | skip_words.add('.') 16 | skip_words.add(',') 17 | PERSON_NAMES = ['Alex', 'Ash', 'Aspen', 'Bali', 'Berkeley', 'Cameron', 'Chris', 'Cody', 'Dana', 'Drew', 'Emory', 'Flynn', 'Gale', 'Jamie', 'Jesse', 18 | 'Kai', 'Kendall', 'Kyle', 'Lee', 'Logan', 'Max', 'Morgan', 'Nico', 'Paris', 'Pat', 'Quinn', 'Ray', 'Robin', 'Rowan', 'Rudy', 'Sam', 'Skylar', 'Sydney', 19 | 'Taylor', 'Tracy', 'West', 'Wynne'] 20 | logger = logging.getLogger(__name__) 21 | 22 | def accuracy(out, labels): 23 | return {'acc': (out == labels).mean()} 24 | 25 | def handle_words(span, tokenizer, keywords=None, is_start=False): 26 | inputs = [] 27 | labels = [] 28 | words = nltk.word_tokenize(span) 29 | for w_i, w in enumerate(words): 30 | if (w_i == 0 and is_start) or w == '.' 
or w == ',' or w.startswith('\''): 31 | w_bpes = tokenizer.tokenize(w) 32 | else: 33 | w_bpes = tokenizer.tokenize(w, add_prefix_space=True) 34 | inputs.extend(w_bpes) 35 | if keywords != None: 36 | if w in keywords: 37 | labels.extend(w_bpes) 38 | else: 39 | labels.extend([-100]*len(w_bpes)) 40 | else: 41 | if w not in PERSON_NAMES and w not in skip_words and w.lower() not in skip_words: 42 | labels.extend(w_bpes) 43 | else: 44 | labels.extend([-100]*len(w_bpes)) 45 | return inputs, labels 46 | 47 | def handle_underscores(suffix, tokenizer, keywords=None, prefix=False): 48 | inputs = [] 49 | labels = [] 50 | if '_' in suffix: 51 | suffix_parts = [i.strip() for i in suffix.split('___')] 52 | for i, part in enumerate(suffix_parts): 53 | if part: 54 | tmp_inputs, tmp_labels = handle_words(part, tokenizer, keywords=keywords, is_start=(i==0 and prefix)) 55 | inputs += tmp_inputs 56 | labels += tmp_labels 57 | 58 | if i != len(suffix_parts) - 1 and suffix_parts[i+1]: 59 | inputs.append(tokenizer.mask_token) 60 | labels.append(-100) 61 | else: 62 | inputs.append(tokenizer.mask_token) 63 | labels.append(-100) 64 | else: 65 | inputs, labels = handle_words(suffix, tokenizer, keywords=keywords, is_start=prefix) 66 | return inputs, labels 67 | 68 | def convert_examples_to_features(examples, tokenizer, max_length=512): 69 | data = [] 70 | for example in examples: 71 | inputs, labels = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 72 | choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']] 73 | input_ids = [inputs+cand[0] for cand in choices] 74 | input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids] 75 | label_ids = [labels+cand[1] for cand in choices] 76 | label_ids = [[t if t == -100 else input_ids[i][t_i] for t_i, t in enumerate(cand)] for i, cand in enumerate(label_ids)] 77 | label_ids = [[-100]+cand+[-100] for cand in label_ids] 78 | input_ids = [tokenizer.prepare_for_model(cand, max_length=max_length, truncation=True)['input_ids'] for cand in input_ids] 79 | data.append([input_ids, label_ids, example['correct']]) 80 | return data 81 | 82 | class ATOMICMLMProcessor(object): 83 | def __init__(self, args): 84 | self.D = [] 85 | self.filelist = [args.train_file, args.dev_file] 86 | 87 | def get_train_examples(self): 88 | self.load_data(self.filelist[0]) 89 | return self.D 90 | 91 | def get_dev_examples(self): 92 | data = [] 93 | with open(self.filelist[1], 'r') as f: 94 | for row in tqdm(f): 95 | sample = json.loads(row) 96 | data.append(sample) 97 | print (len(data)) 98 | return data 99 | 100 | def load_data(self, filename): 101 | with open(filename, "r") as f: 102 | for row in tqdm(f): 103 | sample = json.loads(row) 104 | self.D.append({'id':sample['id'], 'context':sample['context'], 'ending':sample['candidates'][sample['correct']], 'keywords': sample['keywords']}) 105 | print (len(self.D)) 106 | 107 | class ATOMICProcessor(object): 108 | def __init__(self, args): 109 | print ('loading from %s %s' % (args.train_file, args.dev_file)) 110 | self.filelist = [args.train_file, args.dev_file] 111 | self.D = [[], []] 112 | 113 | def get_train_examples(self): 114 | self.load_data(self.filelist[0], 0) 115 | return self.D[0] 116 | 117 | def get_dev_examples(self): 118 | self.load_data(self.filelist[1], 1) 119 | return self.D[1] 120 | 121 | def load_data(self, filename, sid): 122 | with open(filename, "r") as f: 123 | for row in tqdm(f): 124 | sample = json.loads(row) 125 | self.D[sid].append(sample) 126 | print 
(len(self.D[sid])) 127 | 128 | class CWWVProcessor(object): 129 | def __init__(self, args): 130 | self.answerKey_mapping = {'A':0, 'B':1, 'C':2} 131 | self.D = [[], []] 132 | if args.task_name == 'cskg': 133 | print ('loading from %s %s' % (args.second_train_file, args.second_dev_file)) 134 | self.filelist = [args.second_train_file, args.second_dev_file] 135 | else: 136 | print ('loading from %s %s' % (args.train_file, args.dev_file)) 137 | self.filelist = [args.train_file, args.dev_file] 138 | 139 | def get_train_examples(self): 140 | self.load_data(self.filelist[0], 0) 141 | return self.D[0] 142 | 143 | def get_dev_examples(self): 144 | self.load_data(self.filelist[1], 1) 145 | return self.D[1] 146 | 147 | def load_data(self, filename, sid): 148 | skipped = 0 149 | with open(filename, "r") as f: 150 | for row in tqdm(f): 151 | sample = json.loads(row) 152 | context = sample['question']['stem'] 153 | if context.endswith('.'): 154 | context = context[:-1] 155 | if not context.endswith('[MASK]'): 156 | skipped += 1 157 | context_parts = context.split('[MASK]') 158 | context = context_parts[0].strip() 159 | candidates = [c['text']+context_parts[1]+'.' for c in sample['question']['choices']] 160 | else: 161 | context = context[:-7] 162 | candidates = [c['text']+'.' for c in sample['question']['choices']] 163 | label = self.answerKey_mapping[sample['answerKey']] 164 | keywords = nltk.word_tokenize(sample['question']['head']) 165 | keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words] 166 | self.D[sid].append({'id':sample['id'], 'context':context, 'correct':label, 'candidates':candidates, 'keywords':keywords}) 167 | print (len(self.D[sid]), skipped) 168 | 169 | class CWWVMLMProcessor(object): 170 | def __init__(self, args): 171 | self.answerKey_mapping = {'A':0, 'B':1, 'C':2} 172 | self.D = [] 173 | self.filelist = [args.train_file, args.dev_file] 174 | self.args = args 175 | 176 | def get_train_examples(self): 177 | self.load_data(self.filelist[0]) 178 | return self.D 179 | 180 | def get_dev_examples(self): 181 | processor = CSKGProcessor(self.args) 182 | return processor.get_dev_examples() 183 | 184 | def load_data(self, filename): 185 | skipped = 0 186 | with open(filename, "r") as f: 187 | for row in tqdm(f): 188 | sample = json.loads(row) 189 | context = sample['question']['stem'] 190 | if context.endswith('.'): 191 | context = context[:-1] 192 | assert context.endswith('[MASK]') 193 | context = context[:-7] 194 | candidates = [c['text']+'.' 
for c in sample['question']['choices']] 195 | label = self.answerKey_mapping[sample['answerKey']] 196 | keywords = nltk.word_tokenize(sample['question']['head']) 197 | keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words] 198 | self.D.append({'id':sample['id'], 'context':context, 'ending':candidates[label], 'keywords':keywords}) 199 | print (len(self.D)) 200 | 201 | class CSKGProcessor(object): 202 | def __init__(self, args): 203 | # CWWV set always uses second train/dev file params 204 | self.atomicprocessor = ATOMICProcessor(args) 205 | self.cwwvprocessor = CWWVProcessor(args) 206 | 207 | def get_train_examples(self): 208 | cwwv_questions = self.cwwvprocessor.get_train_examples() 209 | atomic_questions = self.atomicprocessor.get_train_examples() 210 | return cwwv_questions+atomic_questions 211 | 212 | def get_dev_examples(self): 213 | cwwv_questions = self.cwwvprocessor.get_dev_examples() 214 | atomic_questions = self.atomicprocessor.get_dev_examples() 215 | return cwwv_questions+atomic_questions 216 | 217 | myprocessors = { 218 | "atomic": ATOMICProcessor, 219 | "cwwv": CWWVProcessor, 220 | "atomicmlm": ATOMICMLMProcessor, 221 | "cwwvmlm": CWWVMLMProcessor, 222 | "cskg": CSKGProcessor 223 | } 224 | -------------------------------------------------------------------------------- /src/Data_generation/generate_from_CWWV.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | import argparse 3 | import sys 4 | import random 5 | import json 6 | from tqdm import tqdm 7 | import pickle as pkl 8 | from string import Template 9 | import numpy as np 10 | from sentence_transformers import SentenceTransformer, util 11 | from os import path 12 | random.seed(1) 13 | num_distractors=2 14 | 15 | good_relations=['/r/Causes', '/r/UsedFor', '/r/CapableOf', '/r/CausesDesire', '/r/IsA', '/r/SymbolOf', '/r/MadeOf', '/r/LocatedNear', '/r/Desires', '/r/AtLocation', '/r/HasProperty', '/r/PartOf', '/r/HasFirstSubevent', '/r/HasLastSubevent'] 16 | 17 | q_sources=set(['CN', 'WD', 'WN']) 18 | dist_only_sources=set(['VG']) 19 | 20 | def format_question(q, a, distractors, q_id, head_label, template, source, rel): 21 | q_entry={} 22 | q_entry['id']=q_id 23 | q_entry['question']={'stem': q} 24 | 25 | answer_key=random.choice(["A", "B", "C"]) 26 | q_entry["answerKey"]=answer_key 27 | if answer_key=="A": 28 | correct_option={"text": a, "label": "A"} 29 | dist1={"text": distractors[0], "label": "B"} 30 | dist2={"text": distractors[1], "label": "C"} 31 | options=[correct_option, dist1, dist2] 32 | elif answer_key=="B": 33 | correct_option={"text": a, "label": "B"} 34 | dist1={"text": distractors[0], "label": "A"} 35 | dist2={"text": distractors[1], "label": "C"} 36 | options=[dist1, correct_option, dist2] 37 | elif answer_key=="C": 38 | correct_option={"text": a, "label": "C"} 39 | dist1={"text": distractors[0], "label": "A"} 40 | dist2={"text": distractors[1], "label": "B"} 41 | options=[dist1, dist2, correct_option] 42 | q_entry["question"]["choices"]=options 43 | q_entry["question"]["head"]=head_label 44 | q_entry["question"]["source"]=source 45 | q_entry["question"]["template"]=template 46 | q_entry["question"]["relation"]=rel 47 | return q_entry 48 | 49 | def select_distractors_noaf(data, head_label, heads, correct_answer, rel): 50 | """Distractors without AFiltering""" 51 | 52 | negatives = [] 53 | 54 | answer_heads=set(head_label.split()) 55 | 56 | candidates=random.choices(list(data), 
k=num_distractors*100) 57 | 58 | for neg in candidates: 59 | distractor_heads=heads[(neg, rel)] 60 | if neg not in negatives and neg!=correct_answer and neg not in correct_answer and correct_answer not in neg and not (distractor_heads & answer_heads): 61 | negatives.append(neg) 62 | if len(negatives)>=num_distractors: 63 | return negatives, -1 64 | print('Not enough') 65 | return None, -1 66 | 67 | def select_distractors_af(data, head_label, heads, correct_answer, rel, question, embeddings, sentence2id, sentences, q_or_a='a'): 68 | """Distractors with AF""" 69 | high_prob = 0.6 70 | low_prob = 0.5 71 | step=0.05 72 | limit_dists=10 73 | if q_or_a=='q': 74 | downsample_size=num_distractors*400 75 | else: 76 | downsample_size=num_distractors*100 77 | 78 | negatives = [] 79 | 80 | answer_heads=set(head_label.split()) 81 | 82 | candidates=random.choices(list(data), k=downsample_size) 83 | 84 | distractors_indices = [sentence2id[sent] for sent in candidates] # todo! 85 | if q_or_a=='a': 86 | compare_index=sentence2id[correct_answer] # todo! 87 | else: # q 88 | compare_index=sentence2id[question_to_sentence(question)] 89 | dist_mapping = {i:val for i, val in enumerate(distractors_indices)} 90 | distractor_emb = embeddings[distractors_indices] 91 | correct_emb = embeddings[compare_index] 92 | cos_scores = util.pytorch_cos_sim(correct_emb, distractor_emb)[0] 93 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 94 | while len(midpoint) < limit_dists: 95 | low_prob -= step 96 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 97 | 98 | x=0 99 | while len(negatives) < num_distractors: 100 | if x>=10: 101 | negatives=None 102 | print('Not enough') 103 | break 104 | sample_idx = random.choice(midpoint) 105 | neg = sentences[dist_mapping[sample_idx.item()]] 106 | distractor_heads=heads[(neg, rel)] 107 | if neg not in negatives and neg!=correct_answer and neg not in correct_answer and correct_answer not in neg and not (distractor_heads & answer_heads): 108 | negatives.append(neg) 109 | x+=1 110 | return negatives, low_prob 111 | 112 | def construct_from_template(h, r): 113 | t={ 114 | "/r/Causes": "$node1 can cause [MASK]", 115 | "/r/UsedFor": "$node1 can be used for [MASK]", 116 | "/r/CapableOf": "$node1 is capable of [MASK]", 117 | "/r/CausesDesire": "$node1 causes desire for [MASK]", 118 | "/r/IsA": "$node1 is a [MASK]", 119 | "/r/SymbolOf": "$node1 is a symbol of [MASK]", 120 | "/r/MadeOf": "$node1 can be made of [MASK]", 121 | "/r/LocatedNear": "$node1 is often located near [MASK]", 122 | "/r/Desires": "$node1 desires [MASK]", 123 | "/r/AtLocation": "$node1 can be found at [MASK]", 124 | "/r/HasProperty": "$node1 has property [MASK]", 125 | "/r/PartOf": "$node1 is part of [MASK]", 126 | "/r/HasFirstSubevent": "$node1 starts by [MASK]", 127 | "/r/HasLastSubevent": "$node1 ends by [MASK]" 128 | } 129 | if r in t.keys(): 130 | temp=Template(t[r]) 131 | question=temp.substitute(node1=h) 132 | template=temp.substitute(node1='{}').replace('[MASK]', '{}') 133 | return question, template 134 | else: 135 | print('ERROR') 136 | 137 | def generate_questions(qa_pairs, rel_tails, answer_heads, output_file, embeddings, sentence2id, sentences, strategy, limit=1000): 138 | q_id=0 139 | all_rels=[] 140 | all_min_probs=[] 141 | with open(output_file, 'w') as w: 142 | for pair, qa_data in tqdm(qa_pairs.items(), total=len(qa_pairs)): 143 | node1, rel=pair 144 | n1_labels=qa_data[0][-1] 145 | for qa in qa_data: 146 | 
q,a, n1_labels, template, head_label, sent_source,distractor_only =qa 147 | if distractor_only or a in head_label: 148 | continue 149 | q_or_a = None 150 | if args.strategy == 'adv-answer': 151 | q_or_a = 'a' 152 | elif args.strategy == 'adv-question': 153 | q_or_a = 'q' 154 | if q_or_a != None: 155 | distractors, min_prob=select_distractors_af(rel_tails[rel], 156 | head_label, 157 | answer_heads, 158 | a, 159 | rel, 160 | q, 161 | embeddings, 162 | sentence2id, 163 | sentences, 164 | q_or_a) 165 | else: 166 | distractors, min_prob=select_distractors_noaf(rel_tails[rel], 167 | head_label, 168 | answer_heads, 169 | a, 170 | rel) 171 | if distractors: 172 | all_min_probs.append(min_prob) 173 | q_entry=format_question(q, a, distractors, q_id, head_label, template, sent_source, rel) 174 | q_id+=1 175 | w.write(json.dumps(q_entry) + '\n') 176 | all_rels.append(rel) 177 | print(Counter(all_rels)) 178 | print(Counter(all_min_probs)) 179 | 180 | def get_labels(data): 181 | if '|' in data: 182 | return data.split('|') 183 | else: 184 | return [data] 185 | 186 | def question_to_sentence(q): 187 | return q.replace('[MASK]', '').strip() 188 | 189 | def make_masked_question(s): 190 | node1_start=s.find('[[') 191 | node1_end=s.find(']]') 192 | node1_label=s[node1_start+2:node1_end] 193 | 194 | node2_start=s.rfind('[[') 195 | node2_end=s.rfind(']]') 196 | new_s=s[:node2_start].replace('[[', '').replace(']]', '') + '[MASK]' + s[node2_end+2:] 197 | 198 | template=s[:node1_start] + '{}' + s[node1_end+2:node2_start] + '{}' + s[node2_end+2:] 199 | 200 | return new_s, node1_label, template 201 | 202 | def make_masked_question_from_lex(sentence, head, tail): 203 | question=sentence.replace(tail, '[MASK]') 204 | template=sentence.replace(head, '{}').replace(tail, '{}') 205 | return question, template 206 | 207 | def token_overlap(x, y): 208 | return bool(set(x.split()) & set(y.split())) 209 | 210 | def build_embeddings(sentences, out_dir, model_name='roberta-large-nli-stsb-mean-tokens'): 211 | emb_file = os.path.join(args.out_dir, 'cwwv_emb.pkl') 212 | if path.exists(emb_file): 213 | print ('embeddings already exists, skip computation') 214 | with open(emb_file, 'rb') as f: 215 | data=pkl.load(f) 216 | embeddings=data['embeddings'] 217 | sentences=data['sentences'] 218 | return embeddings 219 | model = SentenceTransformer(model_name) 220 | embeddings = model.encode(sentences, show_progress_bar=True, device=0, num_workers=4) 221 | with open(emb_file, "wb") as fout: 222 | pkl.dump({'sentences': sentences, 'embeddings': embeddings}, fout, protocol=pkl.HIGHEST_PROTOCOL) 223 | return embeddings 224 | 225 | def create_indices(cskg_file, lex_cache): 226 | qa_pairs=defaultdict(list) 227 | 228 | rel_tails=defaultdict(set) 229 | answer_heads=defaultdict(set) 230 | 231 | all_tails=set() 232 | 233 | q_sents=set() 234 | 235 | with open(cskg_file, 'r') as f: 236 | header=next(f) 237 | for line in f: 238 | fields=line.split('\t') 239 | 240 | # extract existing info 241 | node1=fields[1] 242 | rel=fields[2] 243 | node2=fields[3] 244 | pair=(node1, rel) 245 | node1_labels=get_labels(fields[4]) 246 | #head_tokens=set() 247 | #for n1_label in node1_labels: 248 | # head_tokens |= set(n1_label.split()) 249 | node2_labels=get_labels(fields[5]) 250 | edge_id=fields[0] 251 | source=fields[8] 252 | sentence=fields[9].strip() 253 | 254 | if '|' in source: 255 | source=set(source.split('|')) 256 | else: 257 | source=set([source]) 258 | 259 | if rel not in good_relations or (len(source & (q_sources|dist_only_sources))==0): continue 260 | 
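        # Index structures filled in below and used later when sampling distractors:
        #   rel_tails[rel]            - every tail label seen with this relation,
        #                               i.e. the distractor candidate pool per relation.
        #   answer_heads[(tail, rel)] - head labels observed for this (tail, relation) pair;
        #                               candidates whose recorded heads share tokens with the
        #                               current question head are rejected as distractors.
        #   all_tails / q_sents       - answer texts and question sentences; these are the
        #                               texts that get embedded for the adv-* strategies.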
261 | for answer in node2_labels: 262 | rel_tails[rel].add(answer) 263 | answer_heads[(answer, rel)] |= set(node1_labels) 264 | all_tails.add(answer) 265 | 266 | distractor_only=True 267 | for s in source: 268 | if s in q_sources: 269 | distractor_only=False 270 | if sentence: 271 | question, head_label, template = make_masked_question(sentence) 272 | for answer in node2_labels: 273 | if not token_overlap(head_label, answer): 274 | qa_pairs[pair].append((question, answer, node1_labels, template, head_label, 'omcs', distractor_only)) 275 | q_sents.add(question_to_sentence(question)) 276 | elif lex_cache: 277 | for n1_label in node1_labels: 278 | for answer in node2_labels: 279 | triple=(n1_label, rel, answer) 280 | if triple in lex_cache.keys() and not token_overlap(n1_label, answer): 281 | sentence=lex_cache[triple] 282 | question, template = make_masked_question_from_lex(sentence, n1_label, answer) 283 | if '[MASK]' not in question or template.split().count('{}')!=2 or question.split().count('[MASK]')==1: 284 | question, template = construct_from_template(n1_label, rel) 285 | qa_pairs[pair].append((question, answer, node1_labels, template, n1_label, 'lex', distractor_only)) 286 | q_sents.add(question_to_sentence(question)) 287 | elif not token_overlap(n1_label, answer): 288 | question, template = construct_from_template(n1_label, rel) 289 | qa_pairs[pair].append((question, answer, node1_labels, template, n1_label, 'lex', distractor_only)) 290 | q_sents.add(question_to_sentence(question)) 291 | return qa_pairs, all_tails, rel_tails, answer_heads, list(q_sents) 292 | 293 | if __name__ == '__main__': 294 | parser = argparse.ArgumentParser() 295 | parser.add_argument("--cskg_file", type=str, default=None, required=True, 296 | help="CSKG graph TSV file") 297 | parser.add_argument("--out_dir", type=str, default=None, required=True, 298 | help="Output directory") 299 | parser.add_argument("--limit", type=int, default=1000000000, 300 | help="Limit of CSKG rows to process") 301 | parser.add_argument('--lex_cache', type=str, default='../cache.pkl', 302 | help="Pickle file that contains the cache of the lexicalization.") 303 | parser.add_argument("--strategy", default='random', type=str, required=False, choices=['random', 'adv-answer', 'adv-question'], help="which data generation strategy to use") 304 | args = parser.parse_args() 305 | 306 | lex_cache=None 307 | lex_cache=pkl.load(open(args.lex_cache, 'rb')) 308 | qa_pairs, all_tails, rel_tails, answer_heads, q_sentences=create_indices(args.cskg_file, lex_cache) 309 | print('Collecting sentences') 310 | sentences=list(all_tails) + q_sentences 311 | print(len(sentences), 'sentences', len(all_tails), 'answers', len(qa_pairs.keys()), 'qa pairs') 312 | if args.strategy == 'adv-answer' or args.strategy == 'adv-question': 313 | print ('Using %s strategy' % args.strategy) 314 | print('Computing embeddings') 315 | embeddings=build_embeddings(sentences, args.out_dir) 316 | print(len(embeddings), 'embeddings') 317 | else: 318 | embeddings = None 319 | sentence2id={word:i for i, word in enumerate(sentences)} 320 | output_file = path.join(args.out_dir, args.strategy+'.jsonl') 321 | generate_questions(qa_pairs, rel_tails, answer_heads, output_file, embeddings, sentence2id, sentences, args.strategy, args.limit) 322 | 323 | 324 | -------------------------------------------------------------------------------- /src/Evaluation/evaluate_GPT2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import 
json 4 | import tqdm 5 | import torch 6 | import logging 7 | import argparse 8 | import numpy as np 9 | 10 | from overrides import overrides 11 | from torch.nn import CrossEntropyLoss 12 | from transformers import AutoTokenizer, AutoModelWithLMHead 13 | 14 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 15 | datefmt='%m/%d/%Y %H:%M:%S', 16 | level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class InstanceReader(object): 21 | def to_uniform_fields(self, fields): 22 | pass 23 | 24 | def fields_to_instance(self, fields): 25 | pass 26 | 27 | class PiqaInstanceReader(InstanceReader): 28 | """ 29 | Reads the PIQA dataset into a unified format with context, question, label, and choices. 30 | """ 31 | @overrides 32 | def to_uniform_fields(self, fields): 33 | context = "" 34 | question = fields["goal"] 35 | label = fields.get('label', None) 36 | choices = [fields["sol1"], fields["sol2"]] 37 | return context, question, label, choices 38 | 39 | @overrides 40 | def fields_to_instance(self, fields): 41 | context, question, label, choices = self.to_uniform_fields(fields) 42 | context_with_choices = [f"{question} {choice[0].lower() + choice[1:]}" for choice in choices] 43 | return context, question, label, choices, context_with_choices 44 | 45 | 46 | class SocialIQAInstanceReader(InstanceReader): 47 | """ 48 | Reads the SocialIQa dataset into a unified format with context, question, label, and choices. 49 | """ 50 | def __init__(self): 51 | super(SocialIQAInstanceReader).__init__() 52 | self.QUESTION_TO_ANSWER_PREFIX = { 53 | "What will (.*) want to do next?": r"As a result, [SUBJ] wanted to", 54 | "What will (.*) want to do after?": r"As a result, [SUBJ] wanted to", 55 | "How would (.*) feel afterwards?": r"As a result, [SUBJ] felt", 56 | "How would (.*) feel as a result?": r"As a result, [SUBJ] felt", 57 | "What will (.*) do next?": r"[SUBJ] then", 58 | "How would (.*) feel after?": r"[SUBJ] then", 59 | "How would you describe (.*)?": r"[SUBJ] is seen as", 60 | "What kind of person is (.*)?": r"[SUBJ] is seen as", 61 | "How would you describe (.*) as a person?": r"[SUBJ] is seen as", 62 | "Why did (.*) do that?": r"Before, [SUBJ] wanted", 63 | "Why did (.*) do this?": r"Before, [SUBJ] wanted", 64 | "Why did (.*) want to do this?": r"Before, [SUBJ] wanted", 65 | "What does (.*) need to do beforehand?": r"Before, [SUBJ] needed to", 66 | "What does (.*) need to do before?": r"Before, [SUBJ] needed to", 67 | "What does (.*) need to do before this?": r"Before, [SUBJ] needed to", 68 | "What did (.*) need to do before this?": r"Before, [SUBJ] needed to", 69 | "What will happen to (.*)?": r"[SUBJ] then", 70 | "What will happen to (.*) next?": r"[SUBJ] then" 71 | } 72 | 73 | @overrides 74 | def to_uniform_fields(self, fields): 75 | context = fields['context'] 76 | if not context.endswith("."): 77 | context += "." 78 | 79 | question = fields['question'] 80 | label = fields['correct'] 81 | choices = [fields['answerA'], fields['answerB'], fields['answerC']] 82 | choices = [c + "." 
if not c.endswith(".") else c for c in choices] 83 | label = ord(label) - 65 84 | return context, question, label, choices 85 | 86 | @overrides 87 | def fields_to_instance(self, fields): 88 | context, question, label, choices = self.to_uniform_fields(fields) 89 | 90 | answer_prefix = "" 91 | for template, ans_prefix in self.QUESTION_TO_ANSWER_PREFIX.items(): 92 | m = re.match(template, question) 93 | if m is not None: 94 | subj = m.group(1) 95 | if subj.endswith('?'): 96 | subj = subj[:-1] 97 | answer_prefix = ans_prefix.replace("[SUBJ]", subj) 98 | break 99 | 100 | if answer_prefix == "": 101 | answer_prefix = question.replace("?", "is") 102 | 103 | choices = [ 104 | " ".join((answer_prefix, choice[0].lower() + choice[1:])).replace( 105 | "?", "").replace("wanted to wanted to", "wanted to").replace( 106 | "needed to needed to", "needed to").replace("to to", "to") for choice in choices] 107 | 108 | context_with_choices = [f"{context} {choice}" for choice in choices] 109 | return context, question, label, choices, context_with_choices 110 | 111 | class ATOMICInstanceReader(InstanceReader): 112 | """ 113 | Reads the ATOMIC dataset into a unified format with context, question, label, and choices. 114 | """ 115 | @overrides 116 | def to_uniform_fields(self, fields): 117 | question = fields['context'] 118 | label = fields['correct'] 119 | choices = [fields['candidates'][0], fields['candidates'][1], fields['candidates'][2]] 120 | return '', question, label, choices 121 | 122 | @overrides 123 | def fields_to_instance(self, fields): 124 | context, question, label, choices = self.to_uniform_fields(fields) 125 | context_with_choices = [f"{question} {choice}" for choice in choices] 126 | return context, question, label, choices, context_with_choices 127 | 128 | class CWWVInstanceReader(InstanceReader): 129 | """ 130 | Reads the CWWV dataset into a unified format with context, question, label, and choices. 131 | """ 132 | @overrides 133 | def to_uniform_fields(self, fields): 134 | question = fields['question']['stem'] 135 | if question.endswith('.'): 136 | question = question[:-1] 137 | if not question.endswith('[MASK]'): 138 | print ('should not happen') 139 | exit(0) 140 | question = question[:-7] 141 | label = ['A','B','C'].index(fields['answerKey']) 142 | choices = [fields['question']['choices'][0]['text']+'.', fields['question']['choices'][1]['text']+'.', fields['question']['choices'][2]['text']+'.'] 143 | return '', question, label, choices 144 | 145 | @overrides 146 | def fields_to_instance(self, fields): 147 | context, question, label, choices = self.to_uniform_fields(fields) 148 | context_with_choices = [f"{question} {choice}" for choice in choices] 149 | return context, question, label, choices, context_with_choices 150 | 151 | class WinograndeInstanceReader(InstanceReader): 152 | """ 153 | Reads the WinoGrande dataset into a unified format with context, question, label, and choices. 154 | """ 155 | @overrides 156 | def to_uniform_fields(self, fields): 157 | context = fields['sentence'] 158 | if not context.endswith("."): 159 | context += "." 
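        # WinoGrande stores the answer as "1"/"2"; it is converted to a 0-based
        # label below, and fields_to_instance later substitutes each option into
        # the "_" blank of the sentence.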
160 | 161 | label = fields['answer'] 162 | choices = [fields['option1'], fields['option2']] 163 | label = int(label) - 1 164 | question = '' 165 | return context, question, label, choices 166 | 167 | @overrides 168 | def fields_to_instance(self, fields): 169 | context, question, label, choices = self.to_uniform_fields(fields) 170 | context_with_choices = [context.replace("_", choice) for choice in choices] 171 | return context, question, label, choices, context_with_choices 172 | 173 | 174 | class CommonsenseqaInstanceReader(InstanceReader): 175 | """ 176 | Reads the CommonsenseQA dataset into a unified format with context, question, label, and choices. 177 | """ 178 | @overrides 179 | def to_uniform_fields(self, fields): 180 | context = '' 181 | 182 | question = 'Q: ' + fields['question']['stem'] 183 | label = ['A','B','C','D','E'].index(fields['answerKey']) if "answerKey" in fields else None 184 | choices = ['A: '+ c['text'] for c in fields['question']['choices']] 185 | return context, question, label, choices 186 | 187 | @overrides 188 | def fields_to_instance(self, fields): 189 | context, question, label, choices = self.to_uniform_fields(fields) 190 | context_with_choices = [f"{question} {choice[0].lower() + choice[1:]}" for choice in choices] 191 | return context, question, label, choices, context_with_choices 192 | 193 | class ANLIInstanceReader(InstanceReader): 194 | """ 195 | Reads the aNLI dataset into a unified format with context, question, label, and choices. 196 | """ 197 | @overrides 198 | def to_uniform_fields(self, fields): 199 | label = ['A','B'].index(fields['answerKey']) if "answerKey" in fields else None 200 | choices = [c['statement'] for c in fields['statements']] 201 | return label, choices 202 | 203 | @overrides 204 | def fields_to_instance(self, fields): 205 | label, choices = self.to_uniform_fields(fields) 206 | return None, None, label, None, choices 207 | 208 | INSTANCE_READERS = {"socialiqa": SocialIQAInstanceReader, 209 | "winogrande": WinograndeInstanceReader, 210 | "piqa": PiqaInstanceReader, 211 | "commonsenseqa":CommonsenseqaInstanceReader, 212 | "anli": ANLIInstanceReader, 213 | "atomic": ATOMICInstanceReader, 214 | 'cwwv': CWWVInstanceReader} 215 | 216 | 217 | def main(): 218 | parser = argparse.ArgumentParser() 219 | parser.add_argument("--lm", default="gpt2-large", type=str, required=False, help="language model to use") 220 | parser.add_argument("--dataset_file", default=None, type=str, required=True, help="Jsonl file") 221 | parser.add_argument("--out_dir", default=None, type=str, required=True, help="Out directory for the predictions") 222 | parser.add_argument("--device", default=-1, type=int, required=False, help="GPU device") 223 | parser.add_argument("--cache_dir", default=None, type=str, required=False, help="where the model is cached") 224 | parser.add_argument("--reader", default=None, type=str, required=True, help="which reader to use") 225 | args = parser.parse_args() 226 | logger.info(args) 227 | 228 | task = args.reader 229 | if args.lm != 'gpt2-large': 230 | model_path = ['gpt2']+args.lm.split('/')[-1:]+[task] 231 | model_path = '_'.join([m for m in model_path if m != '']) 232 | out_dir = os.path.join(args.out_dir, model_path) 233 | else: 234 | out_dir = os.path.join(args.out_dir, 'gpt2_'+task) 235 | if os.path.exists(out_dir) and os.listdir(out_dir): 236 | raise ValueError("Output directory ({}) already exists and is not empty.".format(out_dir)) 237 | if not os.path.exists(out_dir): 238 | os.makedirs(out_dir) 239 | # Load the language 
model 240 | device = torch.device(f'cuda:{args.device}') if args.device >= 0 else torch.device("cpu") 241 | model, tokenizer = init_model(args.lm, device, args.cache_dir) 242 | 243 | # Load the dataset 244 | instance_reader = INSTANCE_READERS[args.reader]() 245 | 246 | out_file = os.path.join(out_dir, "predictions.jsonl") 247 | log_file = os.path.join(out_dir, 'results.txt') 248 | gold = [] 249 | predictions = [] 250 | results = [] 251 | pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0 252 | sample_id = 0 253 | # Predict instances 254 | with open(out_file, "w") as f_out: 255 | with open(args.dataset_file) as f_in: 256 | for line in tqdm.tqdm(f_in): 257 | fields = json.loads(line.strip()) 258 | context, question, label, choices, context_with_choices = \ 259 | instance_reader.fields_to_instance(fields) 260 | if sample_id == 0: 261 | results.append(json.dumps(context_with_choices)) 262 | gold.append(label) 263 | # Tokenize and pad 264 | tokenized = [tokenizer.encode(text) for text in context_with_choices] 265 | max_length = max([len(text) for text in tokenized]) 266 | att_mask = torch.zeros((len(tokenized), max_length)).to(device) 267 | for i in range(len(tokenized)): 268 | att_mask[i][:len(tokenized[i])] = 1 269 | tokenized = [text + [pad_token_id] * (max_length - len(text)) for text in tokenized] 270 | tokenized = torch.tensor(tokenized).long().to(device) 271 | prediction = int(np.argmin(get_lm_score(model, tokenized, pad_token_id, att_mask))) 272 | fields["prediction"] = prediction 273 | predictions.append(prediction) 274 | f_out.write(json.dumps(fields) + "\n") 275 | sample_id += 1 276 | 277 | # Don't report accuracy if we don't have the labels 278 | if None not in gold: 279 | accuracy = (np.array(gold)==np.array(predictions)).mean() 280 | print(f"Accuracy: {accuracy:.3f}") 281 | results.append(f"Accuracy : {accuracy:.3f}") 282 | with open(log_file, 'w') as fout: 283 | for line in results: 284 | fout.write(line + '\n') 285 | 286 | 287 | def get_lm_score(model, batch, pad_token_id, att_mask): 288 | """ 289 | Get the cross entropy loss of the texts in batch using the langage model 290 | """ 291 | # Batch: [num_choices, max_length] 292 | with torch.no_grad(): 293 | num_choices, max_length = batch.shape 294 | shift_labels = batch[..., 1:].contiguous().view(-1) 295 | lm_logits = model(batch, attention_mask=att_mask)[0] 296 | shift_logits = lm_logits[..., :-1, :].contiguous() 297 | shift_logits = shift_logits.view(-1, shift_logits.size(-1)) 298 | loss_fct = CrossEntropyLoss(reduction="none", ignore_index=pad_token_id) 299 | loss = loss_fct(shift_logits, shift_labels) 300 | loss = loss.view(num_choices, -1).sum(1).cpu().numpy() 301 | valid_tokens = (batch!=pad_token_id).long().sum(1).cpu().numpy() 302 | loss /= valid_tokens 303 | return loss 304 | 305 | 306 | def init_model(model_name: str, 307 | device: torch.device, cache_dir): 308 | """ 309 | Initialize a pre-trained LM 310 | :param model_name: from MODEL_CLASSES 311 | :param device: CUDA / CPU device 312 | :return: the model and tokenizer 313 | """ 314 | logger.info(f'Initializing {model_name}') 315 | tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) 316 | model = AutoModelWithLMHead.from_pretrained(model_name, cache_dir=cache_dir) 317 | model.to(device) 318 | model.eval() 319 | return model, tokenizer 320 | 321 | 322 | if __name__ == '__main__': 323 | main() 324 | -------------------------------------------------------------------------------- /src/Evaluation/evaluate_RoBERTa.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import tqdm 5 | import torch 6 | import logging 7 | import argparse 8 | import numpy as np 9 | from overrides import overrides 10 | from torch.nn import CrossEntropyLoss 11 | from transformers import RobertaTokenizer, RobertaForMaskedLM 12 | 13 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 14 | datefmt='%m/%d/%Y %H:%M:%S', 15 | level=logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | MAX_SEQUENCE_PER_TIME = 80 18 | 19 | class InstanceReader(object): 20 | def to_uniform_fields(self, fields): 21 | pass 22 | 23 | def fields_to_instance(self, fields): 24 | pass 25 | 26 | class PiqaInstanceReader(InstanceReader): 27 | """ 28 | Reads the PIQA dataset into a unified format with context, question, label, and choices. 29 | """ 30 | @overrides 31 | def to_uniform_fields(self, fields): 32 | context = "" 33 | question = fields["goal"] 34 | label = fields.get('label', None) 35 | choices = [fields["sol1"][0].lower()+fields["sol1"][1:], fields["sol2"][0].lower()+fields["sol2"][1:]] 36 | return context, question, label, choices 37 | 38 | @overrides 39 | def fields_to_instance(self, fields): 40 | context, question, label, choices = self.to_uniform_fields(fields) 41 | return context, question, label, choices 42 | 43 | 44 | class SocialIQAInstanceReader(InstanceReader): 45 | """ 46 | Reads the SocialIQa dataset into a unified format with context, question, label, and choices. 47 | """ 48 | def __init__(self): 49 | super(SocialIQAInstanceReader).__init__() 50 | self.QUESTION_TO_ANSWER_PREFIX = { 51 | "What will (.*) want to do next?": r"As a result, [SUBJ] wanted to", 52 | "What will (.*) want to do after?": r"As a result, [SUBJ] wanted to", 53 | "How would (.*) feel afterwards?": r"As a result, [SUBJ] felt", 54 | "How would (.*) feel as a result?": r"As a result, [SUBJ] felt", 55 | "What will (.*) do next?": r"[SUBJ] then", 56 | "How would (.*) feel after?": r"[SUBJ] then", 57 | "How would you describe (.*)?": r"[SUBJ] is seen as", 58 | "What kind of person is (.*)?": r"[SUBJ] is seen as", 59 | "How would you describe (.*) as a person?": r"[SUBJ] is seen as", 60 | "Why did (.*) do that?": r"Before, [SUBJ] wanted", 61 | "Why did (.*) do this?": r"Before, [SUBJ] wanted", 62 | "Why did (.*) want to do this?": r"Before, [SUBJ] wanted", 63 | "What does (.*) need to do beforehand?": r"Before, [SUBJ] needed to", 64 | "What does (.*) need to do before?": r"Before, [SUBJ] needed to", 65 | "What does (.*) need to do before this?": r"Before, [SUBJ] needed to", 66 | "What did (.*) need to do before this?": r"Before, [SUBJ] needed to", 67 | "What will happen to (.*)?": r"[SUBJ] then", 68 | "What will happen to (.*) next?": r"[SUBJ] then" 69 | } 70 | 71 | @overrides 72 | def to_uniform_fields(self, fields): 73 | context = fields['context'] 74 | if not context.endswith("."): 75 | context += "." 76 | 77 | question = fields['question'] 78 | label = fields['correct'] 79 | choices = [fields['answerA'], fields['answerB'], fields['answerC']] 80 | choices = [c + "." 
if not c.endswith(".") else c for c in choices] 81 | label = ord(label) - 65 82 | return context, question, label, choices 83 | 84 | def convert_choice(self, choice, answer_prefix): 85 | if answer_prefix.endswith('wanted to') and choice.startswith('wanted to'): 86 | choice = choice[9:].strip() 87 | if answer_prefix.endswith('needed to') and choice.startswith('needed to'): 88 | choice = choice[9:].strip() 89 | if answer_prefix.endswith('to') and choice.startswith('to'): 90 | choice = choice[2:].strip() 91 | choice = choice[0].lower() + choice[1:] 92 | return choice 93 | 94 | @overrides 95 | def fields_to_instance(self, fields): 96 | context, question, label, choices = self.to_uniform_fields(fields) 97 | 98 | answer_prefix = "" 99 | for template, ans_prefix in self.QUESTION_TO_ANSWER_PREFIX.items(): 100 | m = re.match(template, question) 101 | if m is not None: 102 | subj = m.group(1) 103 | if subj.endswith('?'): 104 | subj = subj[:-1] 105 | answer_prefix = ans_prefix.replace("[SUBJ]", subj) 106 | break 107 | 108 | if answer_prefix == "": 109 | answer_prefix = question.replace("?", "is") 110 | 111 | question = context + ' ' + answer_prefix 112 | choices = [self.convert_choice(choice, answer_prefix) for choice in choices] 113 | 114 | return context, question, label, choices 115 | 116 | class ATOMICInstanceReader(InstanceReader): 117 | """ 118 | Reads the ATOMIC dataset into a unified format with context, question, label, and choices. 119 | """ 120 | @overrides 121 | def to_uniform_fields(self, fields): 122 | question = fields['context'] 123 | label = fields['correct'] 124 | choices = [fields['candidates'][0], fields['candidates'][1], fields['candidates'][2]] 125 | return '', question, label, choices 126 | 127 | @overrides 128 | def fields_to_instance(self, fields): 129 | context, question, label, choices = self.to_uniform_fields(fields) 130 | #print (question, choices) 131 | return context, question, label, choices 132 | 133 | class CWWVInstanceReader(InstanceReader): 134 | """ 135 | Reads the CWWV dataset into a unified format with context, question, label, and choices. 136 | """ 137 | @overrides 138 | def to_uniform_fields(self, fields): 139 | question = fields['question']['stem'] 140 | if question.endswith('.'): 141 | question = question[:-1] 142 | if not question.endswith('[MASK]'): 143 | print ('should not happen') 144 | exit(0) 145 | question = question[:-7] 146 | label = ['A','B','C'].index(fields['answerKey']) 147 | choices = [fields['question']['choices'][0]['text']+'.', fields['question']['choices'][1]['text']+'.', fields['question']['choices'][2]['text']+'.'] 148 | return '', question, label, choices 149 | 150 | @overrides 151 | def fields_to_instance(self, fields): 152 | context, question, label, choices = self.to_uniform_fields(fields) 153 | return context, question, label, choices 154 | 155 | class WinograndeInstanceReader(InstanceReader): 156 | """ 157 | Reads the WinoGrande dataset into a unified format with context, question, label, and choices. 158 | """ 159 | @overrides 160 | def to_uniform_fields(self, fields): 161 | context = fields['sentence'] 162 | if not context.endswith("."): 163 | context += "." 
164 | context = context.split('_') 165 | label = fields['answer'] 166 | choices = [fields['option1']+context[1], fields['option2']+context[1]] 167 | label = int(label) - 1 168 | question = context[0].strip() 169 | return context, question, label, choices 170 | 171 | @overrides 172 | def fields_to_instance(self, fields): 173 | context, question, label, choices = self.to_uniform_fields(fields) 174 | return context, question, label, choices 175 | 176 | 177 | class CommonsenseqaInstanceReader(InstanceReader): 178 | """ 179 | Reads the CommonsenseQA dataset into a unified format with context, question, label, and choices. 180 | """ 181 | @overrides 182 | def to_uniform_fields(self, fields): 183 | context = '' 184 | question = 'Q: '+ fields['question']['stem'] 185 | label = ['A','B','C','D','E'].index(fields['answerKey']) if "answerKey" in fields else None 186 | choices = ['A: '+c['text'][0].lower()+c['text'][1:] for c in fields['question']['choices']] 187 | return context, question, label, choices 188 | 189 | @overrides 190 | def fields_to_instance(self, fields): 191 | context, question, label, choices = self.to_uniform_fields(fields) 192 | return context, question, label, choices 193 | 194 | class ANLIInstanceReader(InstanceReader): 195 | """ 196 | Reads the aNLI dataset into a unified format with context, question, label, and choices. 197 | """ 198 | @overrides 199 | def to_uniform_fields(self, fields): 200 | context = '' 201 | question = fields['context'] 202 | label = ['A','B'].index(fields['answerKey']) if "answerKey" in fields else None 203 | choices = [c['text']+' '+fields['question']['stem'] for c in fields['question']['choices']] 204 | return context, question, label, choices 205 | 206 | @overrides 207 | def fields_to_instance(self, fields): 208 | context, question, label, choices = self.to_uniform_fields(fields) 209 | return context, question, label, choices 210 | 211 | INSTANCE_READERS = {"socialiqa": SocialIQAInstanceReader, 212 | "winogrande": WinograndeInstanceReader, 213 | "piqa": PiqaInstanceReader, 214 | "commonsenseqa":CommonsenseqaInstanceReader, 215 | "anli": ANLIInstanceReader, 216 | 'atomic': ATOMICInstanceReader, 217 | 'cwwv': CWWVInstanceReader} 218 | 219 | def token_wise_scoring(sequences, label_ids, attention_mask, tokenizer, device, model): 220 | choice_loss = [0 for i in range(len(sequences))] 221 | for i in range(len(sequences)): 222 | tmp_seq_list = [] 223 | tmp_label_list = [] 224 | tmp_attention_mask = [] 225 | curr_label_ids = label_ids[i] 226 | for j, t in enumerate(curr_label_ids): 227 | if t == -100: 228 | continue 229 | tmp_seq = torch.tensor(sequences[i][:j]+[tokenizer.mask_token_id]+sequences[i][j+1:]).long().to(device) 230 | tmp_label = torch.tensor([-100]*j+sequences[i][j:j+1]+[-100]*(len(sequences[i])-j-1)).long().to(device) 231 | tmp_seq_list.append(tmp_seq) 232 | tmp_label_list.append(tmp_label) 233 | tmp_attention_mask.append(torch.tensor(attention_mask[i]).long().to(device)) 234 | tmp_seq_list = torch.stack(tmp_seq_list) 235 | tmp_label_list = torch.stack(tmp_label_list) 236 | tmp_attention_mask = torch.stack(tmp_attention_mask) 237 | if len(tmp_seq_list) < MAX_SEQUENCE_PER_TIME: 238 | loss = get_lm_score(model, tmp_seq_list, tmp_label_list, tmp_attention_mask) 239 | else: 240 | loss = [] 241 | for chunk in range(0, len(tmp_seq_list), MAX_SEQUENCE_PER_TIME): 242 | loss.append(get_lm_score(model, tmp_seq_list[chunk:chunk+MAX_SEQUENCE_PER_TIME], tmp_label_list[chunk:chunk+MAX_SEQUENCE_PER_TIME], tmp_attention_mask[chunk:chunk+MAX_SEQUENCE_PER_TIME])) 
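# Note on the chunking above: token_wise_scoring builds one masked copy of the
# sequence for every position whose label is not -100, so a long sequence can
# expand into many forward passes; once that count reaches
# MAX_SEQUENCE_PER_TIME (80), the copies are scored in chunks to bound GPU
# memory, and the per-chunk losses are concatenated below before averaging.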
243 | loss = np.concatenate(loss) 244 | choice_loss[i] = sum(loss)/len(loss) 245 | prediction = choice_loss.index(min(choice_loss)) 246 | return prediction 247 | 248 | def prepare_input(sequences, label_ids, pad_token_id): 249 | max_length = max([len(text) for text in sequences]) 250 | attention_mask = np.zeros((len(sequences), max_length)) 251 | for i in range(len(sequences)): 252 | attention_mask[i][:len(sequences[i])] = 1 253 | sequences = [text + [pad_token_id] * (max_length - len(text)) for text in sequences] 254 | label_ids = [text + [-100] * (max_length - len(text)) for text in label_ids] 255 | return sequences, label_ids, attention_mask 256 | 257 | def score_task(question, choices, tokenizer, device, model): 258 | pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0 259 | question_ids = tokenizer.encode(question) 260 | choice_ids = [tokenizer.encode(choice, add_prefix_space=True)[1:-1] for choice in choices] 261 | sequences = [question_ids[:-1] + choice_ids[i] +[tokenizer.sep_token_id] for i in range(len(choice_ids))] 262 | label_ids = [[-100]+text[1:-1]+[-100] for text in sequences] 263 | sequences, label_ids, attention_mask = prepare_input(sequences, label_ids, pad_token_id) 264 | prediction = token_wise_scoring(sequences, label_ids, attention_mask, tokenizer, device, model) 265 | return prediction 266 | 267 | def main(): 268 | parser = argparse.ArgumentParser() 269 | parser.add_argument("--lm", default="roberta-large", type=str, required=False, help="language model to use") 270 | parser.add_argument("--dataset_file", default=None, type=str, required=True, help="Jsonl file") 271 | parser.add_argument("--out_dir", default=None, type=str, required=True, help="Out directory for the predictions") 272 | parser.add_argument("--device", default=-1, type=int, required=False, help="GPU device") 273 | parser.add_argument("--cache_dir", default=None, type=str, required=False, help="where the model is cached") 274 | parser.add_argument("--reader", default=None, type=str, required=True, help="which reader to use") 275 | args = parser.parse_args() 276 | logger.info(args) 277 | task = args.reader 278 | if args.lm != 'roberta-large': 279 | model_path = ['roberta']+args.lm.split('/')[-1:]+[task] 280 | model_path = '_'.join([m for m in model_path if m != '']) 281 | out_dir = os.path.join(args.out_dir, model_path) 282 | else: 283 | out_dir = os.path.join(args.out_dir, 'roberta_'+task) 284 | if os.path.exists(out_dir) and os.listdir(out_dir): 285 | raise ValueError("Output directory ({}) already exists and is not empty.".format(out_dir)) 286 | if not os.path.exists(out_dir): 287 | os.makedirs(out_dir) 288 | out_file = os.path.join(out_dir, 'predictions.jsonl') 289 | log_file = os.path.join(out_dir, 'results.txt') 290 | 291 | # Load the language model 292 | device = torch.device(f'cuda:{args.device}') if args.device >= 0 else torch.device("cpu") 293 | model, tokenizer = init_model(args.lm, device, args.cache_dir) 294 | 295 | # Load the dataset 296 | instance_reader = INSTANCE_READERS[args.reader]() 297 | 298 | gold = [] 299 | predictions = [] 300 | results = [] 301 | pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0 302 | print ('currently evaluating the task', task) 303 | # Predict instances 304 | sample_id = 0 305 | with open(out_file, "w") as f_out: 306 | with open(args.dataset_file) as f_in: 307 | for line in tqdm.tqdm(f_in): 308 | fields = json.loads(line.strip()) 309 | context, question, label, choices = \ 310 | 
instance_reader.fields_to_instance(fields) 311 | gold.append(label) 312 | if sample_id == 0: 313 | results.append(json.dumps(context)) 314 | results.append(json.dumps(question)) 315 | results.append(json.dumps(choices)) 316 | prediction = score_task(question, choices, tokenizer, device, model) 317 | fields["prediction"] = prediction 318 | predictions.append(prediction) 319 | f_out.write(json.dumps(fields) + "\n") 320 | sample_id += 1 321 | # Don't report accuracy if we don't have the labels 322 | if None not in gold: 323 | accuracy = (np.array(gold)==np.array(predictions)).mean() 324 | print(f"Accuracy: {accuracy:.3f}") 325 | results.append(f"Accuracy: {accuracy:.3f}") 326 | with open(log_file, 'w') as fout: 327 | for line in results: 328 | fout.write(line + '\n') 329 | 330 | def get_lm_score(model, batch, label_ids, attention_mask): 331 | """ 332 | Get the cross entropy loss of the texts in batch using the langage model 333 | """ 334 | # Batch: [num_choices, max_length] 335 | with torch.no_grad(): 336 | num_choices, max_length = batch.shape 337 | label_ids = label_ids.view(-1) 338 | lm_logits = model(batch, attention_mask=attention_mask)[0] 339 | lm_logits = lm_logits.view(-1, lm_logits.size(-1)) 340 | loss_fct = CrossEntropyLoss(reduction="none") 341 | loss = loss_fct(lm_logits, label_ids) 342 | loss = loss.view(num_choices, -1).sum(1).cpu().numpy() 343 | return loss 344 | 345 | 346 | def init_model(model_name: str, 347 | device: torch.device, cache_dir): 348 | """ 349 | Initialize a pre-trained LM 350 | :param model_name: from MODEL_CLASSES 351 | :param device: CUDA / CPU device 352 | :return: the model and tokenizer 353 | """ 354 | logger.info(f'Initializing {model_name}') 355 | tokenizer = RobertaTokenizer.from_pretrained(model_name, cache_dir=cache_dir) 356 | model = RobertaForMaskedLM.from_pretrained(model_name, cache_dir=cache_dir) 357 | model.to(device) 358 | model.eval() 359 | return model, tokenizer 360 | 361 | if __name__ == '__main__': 362 | main() 363 | -------------------------------------------------------------------------------- /src/Training/MLM/run_lm_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
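For reference, the RoBERTa evaluator above does not fine-tune the model either: score_task concatenates the question with each candidate, and token_wise_scoring masks one token at a time and scores it with the MLM head, so every candidate gets an average masked-token cross-entropy (a pseudo-log-likelihood) and the lowest-loss candidate is predicted. Below is a minimal sketch of that idea, assuming a stock roberta-large checkpoint; the batching, padding, and chunking of the real script are omitted, and the example question/choices are invented.

# Sketch: masked pseudo-likelihood scoring with RoBERTa's MLM head.
import torch
from torch.nn import CrossEntropyLoss
from transformers import RobertaTokenizer, RobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForMaskedLM.from_pretrained("roberta-large").eval()

def pseudo_loss(question, choice):
    ids = tokenizer.encode(question + " " + choice)
    losses = []
    for pos in range(1, len(ids) - 1):  # mask every token except <s> and </s>
        masked = list(ids)
        masked[pos] = tokenizer.mask_token_id
        with torch.no_grad():
            logits = model(torch.tensor([masked]))[0]
        loss = CrossEntropyLoss()(logits[0, pos].unsqueeze(0),
                                  torch.tensor([ids[pos]]))
        losses.append(loss.item())
    return sum(losses) / len(losses)

question = "Alex went camping. Before, Alex needed to"
choices = ["pack a tent.", "eat the tent."]
prediction = min(range(len(choices)), key=lambda i: pseudo_loss(question, choices[i]))

Scoring one masked position at a time is what makes a bidirectional MLM usable as a zero-shot scorer, at the cost of one forward pass per token, which is why the script batches those masked copies and splits them into chunks.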
16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | from tqdm import tqdm, trange 32 | import sys 33 | sys.path.append('../') 34 | sys.path.append('.') 35 | from transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 36 | from transformers import AdamW, get_linear_schedule_with_warmup 37 | from data_utils import myprocessors, handle_underscores 38 | import json 39 | from collections import Counter 40 | logger = logging.getLogger(__name__) 41 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 42 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 43 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 44 | 45 | MODEL_CLASSES = { 46 | 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 47 | } 48 | 49 | class MyDataset(torch.utils.data.Dataset): 50 | def __init__(self, data, mask_token): 51 | self.data = data 52 | self.mask_token = mask_token 53 | 54 | def __len__(self): 55 | return len(self.data) 56 | 57 | def __getitem__(self, idx): 58 | sample = self.data[idx] 59 | return sample, self.mask_token 60 | 61 | def mCollateFn(batch): 62 | batch_input_ids = [] 63 | batch_input_mask = [] 64 | batch_label_ids = [] 65 | mask_token = batch[0][1] 66 | max_len = max([len(f[0]) for f in batch]) 67 | for f in batch: 68 | input_ids = np.full(max_len, mask_token) 69 | input_ids[:len(f[0])] = f[0] 70 | labels = np.array([-100 if f[0][i] == mask_token else f[0][i] for i in range(len(f[0]))]+[-100]*(max_len-len(f[0]))) 71 | mask = np.zeros(max_len) 72 | mask[:len(f[0])] = 1 73 | batch_input_ids.append(input_ids) 74 | batch_input_mask.append(mask) 75 | batch_label_ids.append(labels) 76 | 77 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 78 | batch_input_mask = torch.tensor(batch_input_mask, dtype=torch.long) 79 | batch_label_ids = torch.tensor(batch_label_ids, dtype=torch.long) 80 | return batch_input_ids, batch_input_mask, batch_label_ids 81 | 82 | def convert_examples_to_features(examples, tokenizer, max_length=512): 83 | data = [] 84 | for example in examples: 85 | inputs, _ = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 86 | t_inputs, _ = handle_underscores(example['ending'], tokenizer) 87 | input_ids = tokenizer.convert_tokens_to_ids(inputs+t_inputs) 88 | data.append(input_ids) 89 | return data 90 | 91 | def set_seed(args): 92 | random.seed(args.seed) 93 | np.random.seed(args.seed) 94 | torch.manual_seed(args.seed) 95 | if args.n_gpu > 0: 96 | torch.cuda.manual_seed_all(args.seed) 97 | 98 | def count_parameters(model): 99 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 100 | 101 | def train(args, train_dataset, model, tokenizer): 102 | """ Train the model """ 103 | if args.local_rank in [-1, 0]: 104 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 105 | 106 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 107 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 108 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, 
collate_fn=mCollateFn) 109 | 110 | if args.max_steps > 0: 111 | t_total = args.max_steps 112 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 113 | else: 114 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 115 | 116 | # Prepare optimizer and schedule (linear warmup and decay) 117 | no_decay = ['bias', 'LayerNorm.weight'] 118 | optimizer_grouped_parameters = [ 119 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 120 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 121 | ] 122 | 123 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 124 | logger.info("warm up steps = %d", warmup_steps) 125 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 126 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 127 | 128 | if args.fp16: 129 | try: 130 | from apex import amp 131 | except ImportError: 132 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 133 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 134 | 135 | # multi-gpu training (should be after apex fp16 initialization) 136 | if args.n_gpu > 1: 137 | model = torch.nn.DataParallel(model) 138 | 139 | # Distributed training (should be after apex fp16 initialization) 140 | if args.local_rank != -1: 141 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 142 | output_device=args.local_rank, 143 | find_unused_parameters=True) 144 | # Train! 145 | logger.info("***** Running training *****") 146 | logger.info(" Num examples = %d", len(train_dataset)) 147 | logger.info(" Num Epochs = %d", args.num_train_epochs) 148 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 149 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", 150 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 151 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 152 | logger.info(" Total optimization steps = %d", t_total) 153 | 154 | global_step = 0 155 | tr_loss, logging_loss = 0.0, 0.0 156 | model.zero_grad() 157 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 158 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 159 | for _ in train_iterator: 160 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 161 | for step, batch in enumerate(epoch_iterator): 162 | model.train() 163 | inputs = {'input_ids': batch[0].cuda(), 164 | 'attention_mask': batch[1].cuda(), 165 | 'labels': batch[2].cuda()} 166 | outputs = model(**inputs) 167 | loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) 168 | 169 | if args.n_gpu > 1: 170 | loss = loss.mean() # mean() to average on multi-gpu parallel training 171 | if args.gradient_accumulation_steps > 1: 172 | loss = loss / args.gradient_accumulation_steps 173 | 174 | if args.fp16: 175 | with amp.scale_loss(loss, optimizer) as scaled_loss: 176 | scaled_loss.backward() 177 | else: 178 | loss.backward() 179 | 180 | tr_loss += loss.item() 181 | if (step + 1) % args.gradient_accumulation_steps == 0: 182 | optimizer.step() 183 | scheduler.step() # Update learning rate schedule 184 | model.zero_grad() 185 | global_step += 1 186 | 187 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 188 | # Log metrics 189 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 190 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 191 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 192 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 193 | logging_loss = tr_loss 194 | 195 | 196 | if args.max_steps > 0 and global_step > args.max_steps: 197 | epoch_iterator.close() 198 | break 199 | if args.max_steps > 0 and global_step > args.max_steps: 200 | train_iterator.close() 201 | break 202 | if args.local_rank == -1: # Only evaluate when single GPU otherwise metrics may not average well 203 | # Save model checkpoint 204 | output_dir = os.path.join(args.output_dir, 'epoch%s' % _) 205 | if not os.path.exists(output_dir): 206 | os.makedirs(output_dir) 207 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 208 | model_to_save.save_pretrained(output_dir) 209 | tokenizer.save_pretrained(output_dir) 210 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 211 | logger.info("Saving model checkpoint to %s", output_dir) 212 | if args.local_rank in [-1, 0]: 213 | tb_writer.close() 214 | return global_step, tr_loss / global_step 215 | 216 | 217 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 218 | if args.local_rank not in [-1, 0] and not evaluate: 219 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 220 | 221 | processor = myprocessors[task](args) 222 | examples = processor.get_train_examples() 223 | features = 
convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 224 | if args.local_rank == 0 and not evaluate: 225 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 226 | return MyDataset(features, tokenizer.mask_token_id) 227 | 228 | def main(): 229 | parser = argparse.ArgumentParser() 230 | 231 | ## Required parameters 232 | parser.add_argument("--train_file", default=None, type=str, required=True, 233 | help="The train file name") 234 | parser.add_argument("--dev_file", default=None, type=str, required=True, 235 | help="The dev file name") 236 | parser.add_argument("--model_type", default=None, type=str, required=True, 237 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 238 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 239 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 240 | parser.add_argument("--config_name", default="", type=str, 241 | help="Pretrained config name or path if not the same as model_name") 242 | parser.add_argument("--tokenizer_name", default="", type=str, 243 | help="Pretrained tokenizer name or path if not the same as model_name") 244 | parser.add_argument("--cache_dir", default="", type=str, 245 | help="Where do you want to store the pre-trained models downloaded from s3") 246 | parser.add_argument("--task_name", default=None, type=str, required=True, 247 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 248 | parser.add_argument("--output_dir", default=None, type=str, required=True, 249 | help="The output directory where the model predictions and checkpoints will be written.") 250 | 251 | ## Other parameters 252 | parser.add_argument("--max_seq_length", default=128, type=int, 253 | help="The maximum total input sequence length after tokenization. Sequences longer " 254 | "than this will be truncated, sequences shorter will be padded.") 255 | parser.add_argument("--do_train", action='store_true', 256 | help="Whether to run training.") 257 | parser.add_argument("--do_eval", action='store_true', 258 | help="Whether to run eval on the dev set.") 259 | parser.add_argument("--do_lower_case", action='store_true', 260 | help="Set this flag if you are using an uncased model.") 261 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 262 | help="Batch size per GPU/CPU for training.") 263 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 264 | help="Batch size per GPU/CPU for evaluation.") 265 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 266 | help="Number of updates steps to accumulate before performing a backward/update pass.") 267 | parser.add_argument("--learning_rate", default=1e-5, type=float, 268 | help="The initial learning rate for Adam.") 269 | parser.add_argument("--weight_decay", default=0.01, type=float, 270 | help="Weight deay if we apply some.") 271 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 272 | help="Epsilon for Adam optimizer.") 273 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 274 | help="Max gradient norm.") 275 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 276 | help="Total number of training epochs to perform.") 277 | parser.add_argument("--max_steps", default=-1, type=int, 278 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 279 | parser.add_argument("--warmup_steps", default=0, type=int, 280 | help="Linear warmup over warmup_steps.") 281 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 282 | help="Linear warmup over warmup proportion.") 283 | parser.add_argument('--logging_steps', type=int, default=50, 284 | help="Log every X updates steps.") 285 | parser.add_argument('--save_steps', type=int, default=50, 286 | help="Save checkpoint every X updates steps.") 287 | parser.add_argument("--no_cuda", action='store_true', 288 | help="Avoid using CUDA when available") 289 | parser.add_argument('--overwrite_output_dir', action='store_true', 290 | help="Overwrite the content of the output directory") 291 | parser.add_argument('--seed', type=int, default=2555, 292 | help="random seed for initialization") 293 | parser.add_argument('--fp16', action='store_true', 294 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 295 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 296 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 297 | "See details at https://nvidia.github.io/apex/amp.html") 298 | parser.add_argument("--local_rank", type=int, default=-1, 299 | help="For distributed training: local_rank") 300 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 301 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 302 | args = parser.parse_args() 303 | 304 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir: 305 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 306 | if not os.path.exists(args.output_dir): 307 | os.makedirs(args.output_dir) 308 | 309 | # Setup CUDA, GPU & distributed training 310 | if args.local_rank == -1 or args.no_cuda: 311 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 312 | args.n_gpu = torch.cuda.device_count() 313 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 314 | torch.cuda.set_device(args.local_rank) 315 | device = torch.device("cuda", args.local_rank) 316 | torch.distributed.init_process_group(backend='nccl') 317 | args.n_gpu = 1 318 | args.device = device 319 | 320 | if args.do_train: 321 | for handler in logging.root.handlers[:]: 322 | logging.root.removeHandler(handler) 323 | # Setup logging 324 | if args.do_train: 325 | log_file = os.path.join(args.output_dir, 'train.log') 326 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 327 | datefmt = '%m/%d/%Y %H:%M:%S', 328 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 329 | filename=log_file) 330 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 331 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 332 | os.system("cp run_lm_gpt2.py %s" % os.path.join(args.output_dir, 'run_lm_gpt2.py')) 333 | os.system("cp ../data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 334 | 335 | # Set seed 336 | set_seed(args) 337 | args.task_name = args.task_name.lower() 338 | if args.task_name not in myprocessors: 339 | raise ValueError("Task not found: %s" % (args.task_name)) 340 | 341 | args.model_type = args.model_type.lower() 342 | config_class, model_class, 
tokenizer_class = MODEL_CLASSES[args.model_type] 343 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 344 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 345 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 346 | 347 | count = count_parameters(model) 348 | print (count) 349 | special_tokens_dict = {'mask_token': ''} 350 | num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) 351 | model.resize_token_embeddings(len(tokenizer)) 352 | 353 | if args.local_rank == 0: 354 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 355 | 356 | model.to(args.device) 357 | 358 | logger.info("Training/evaluation parameters %s", args) 359 | if args.do_train: 360 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 361 | global_step, tr_loss = train(args, train_dataset, model, tokenizer) 362 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 363 | # Evaluation 364 | results = {} 365 | 366 | return results 367 | 368 | if __name__ == "__main__": 369 | main() -------------------------------------------------------------------------------- /src/Data_generation/generate_from_ATOMIC.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from tqdm import tqdm 4 | import json 5 | import re 6 | import ftfy 7 | import random 8 | from collections import Counter, defaultdict 9 | import nltk 10 | from nltk.corpus import stopwords 11 | skip_words = set(stopwords.words('english')) 12 | skip_words.add('\'s') 13 | skip_words.add('.') 14 | skip_words.add(',') 15 | import sys 16 | sys.path.append('../') 17 | sys.path.append('.') 18 | import os 19 | import argparse 20 | from Training.data_utils import PERSON_NAMES 21 | from sentence_transformers import SentenceTransformer, util 22 | import pickle 23 | import numpy as np 24 | import torch 25 | 26 | def text_standardize(text): 27 | """ 28 | Borrowed from COMET repo 29 | """ 30 | text = text.replace('—', '-') 31 | text = text.replace('–', '-') 32 | text = text.replace('―', '-') 33 | text = text.replace('…', '...') 34 | text = text.replace('´', "'") 35 | text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) 36 | text = re.sub(r'\s*\n\s*', ' \n ', text) 37 | text = re.sub(r'[^\S\n]+', ' ', text) 38 | return text.strip() 39 | 40 | def overlap_exist(tail, keywords): 41 | tail = nltk.word_tokenize(tail.lower()) 42 | if len(set(tail).intersection(keywords)) > 0: 43 | return True 44 | else: 45 | return False 46 | 47 | def write_data(filename, data): 48 | with open(filename, 'w') as fout: 49 | for sample in data: 50 | fout.write(json.dumps(sample)) 51 | fout.write('\n') 52 | 53 | def read_data(filename): 54 | data = [] 55 | with open(filename, 'r') as f: 56 | for line in f: 57 | data.append(json.loads(line)) 58 | return data 59 | 60 | class ATOMICProcessor(object): 61 | def __init__(self, args): 62 | self.mapping = { 63 | 'xAttr' : '. PersonX is seen as', 64 | 'xIntent' : '. Before, PersonX wanted', 65 | 'xNeed' : '. Before, PersonX needed to', 66 | 'xReact': '. 
As a result, PersonX felt', 67 | 'xWant': '. As a result, PersonX wanted to', 68 | 'xEffect': '. PersonX then', 69 | 'oReact': '. As a result, others felt', 70 | 'oWant': '. As a result, others wanted to', 71 | 'oEffect': '. Others then' 72 | } 73 | self.xset = ['PersonX', 'Personx', 'personX', 'personx', 'Person X', 'Person x', 'person X', 'person x'] 74 | self.yset = ['PersonY', 'Persony', 'personY', 'persony', 'Person Y', 'Person y', 'person Y', 'person y'] 75 | self.zset = ['PersonZ', 'Personz', 'personZ', 'personz', 'Person Z', 'Person z', 'person Z', 'person z'] 76 | self.xset1 = [' X ', ' x ', ' X\'', ' x\'', ' X.', ' x.'] 77 | self.yset1 = [' Y ', ' y ', ' Y\'', ' y\'', ' Y.', ' y.'] 78 | self.zset1 = [' Z ', ' z ', ' Z\'', ' z\'', ' Z.', ' z.'] 79 | self.answerKey_mapping = {} 80 | self.D = [[], []] 81 | self.labels=[] 82 | self.filelist = [args.train_KG, args.dev_KG] 83 | self.tail_keywords = defaultdict(set) 84 | self.adv = False 85 | 86 | def get_person_set(self, context): 87 | person_set = [] 88 | if any([x in context for x in self.xset+self.xset1]): 89 | person_set += self.xset+self.xset1 90 | if any([y in context for y in self.yset+self.yset1]): 91 | person_set += self.yset+self.yset1 92 | if any([z in context for z in self.zset+self.zset1]): 93 | person_set += self.zset+self.zset1 94 | return person_set 95 | 96 | def find_underscore_length(self, seq): 97 | start = "_" 98 | while start in seq: 99 | start += "_" 100 | return start[:-1] 101 | 102 | def fill_names(self, sent, names): 103 | for x in self.xset: 104 | sent = sent.replace(x, names[0]) 105 | for x in self.xset1: 106 | sent = sent.replace(x, x[0]+names[0]+x[-1]) 107 | for y in self.yset: 108 | sent = sent.replace(y, names[1]) 109 | for y in self.yset1: 110 | sent = sent.replace(y, y[0]+names[0]+y[-1]) 111 | for z in self.zset: 112 | sent = sent.replace(z, names[2]) 113 | for z in self.zset1: 114 | sent = sent.replace(z, z[0]+names[0]+z[-1]) 115 | return sent 116 | 117 | def fix_templates(self, context, tail): 118 | if context.endswith('wanted to') and tail.startswith('wanted to'): 119 | tail = tail[9:].strip() 120 | if context.endswith('needed to') and tail.startswith('needed to'): 121 | tail = tail[9:].strip() 122 | if context.endswith('to') and tail.startswith('to'): 123 | tail = tail[2:].strip() 124 | if len(tail) != 0: 125 | tail = tail[0].lower()+tail[1:] 126 | if not tail.endswith('.'): 127 | tail += '.' 
128 | return tail 129 | 130 | def negative_sample(self, prefix, dim, correct_ones, data, person_set, question, correct_answer): 131 | negatives = [] 132 | while len(negatives) < 2: 133 | sample = random.choice(data) 134 | if len(sample[1][dim]) == 0: 135 | continue 136 | neg = random.choice(sample[1][dim]) 137 | if len(set(prefix).intersection(self.tail_keywords[(neg, dim)])) != 0: 138 | continue 139 | if neg in correct_ones: 140 | continue 141 | if neg in negatives: 142 | continue 143 | if neg[:-1] in correct_answer[:-1].split() or correct_answer[:-1] in neg[:-1].split(): 144 | continue 145 | if len(person_set) < len(self.xset+self.xset1)*2 and any([y in neg for y in self.yset+self.yset1]): 146 | continue 147 | if len(person_set) < len(self.xset+self.xset1)*3 and any([z in neg for z in self.zset+self.zset1]): 148 | continue 149 | negatives.append(neg) 150 | return negatives 151 | 152 | def create_dataset(self, data): 153 | generated_data = [] 154 | count = 0 155 | for sample in tqdm(data): 156 | for k, v in sample[1].items(): 157 | if len(v) != 0: 158 | context = text_standardize(ftfy.fix_text(sample[0])) 159 | person_set = self.get_person_set(context) 160 | question = self.mapping[k] 161 | for vv in v: 162 | correct_answer = vv 163 | if overlap_exist(correct_answer, sample[-1]): 164 | continue 165 | negative_answers = self.negative_sample(sample[-1], k, v, data, person_set, context+question, correct_answer) 166 | if negative_answers == None: 167 | continue 168 | names = random.sample(PERSON_NAMES, 3) 169 | new_context = self.fill_names(context+question, names) 170 | correct_answer = self.fill_names(correct_answer, names) 171 | negative_answers = [self.fill_names(neg, names) for neg in negative_answers] 172 | candidates = negative_answers+[correct_answer] 173 | random.shuffle(candidates) 174 | label = candidates.index(correct_answer) 175 | count += 1 176 | generated_data.append({'id':str(count), 'dim':k, 'context':new_context, 'correct':label, 'candidates':candidates, 'keywords': sample[-1]}) 177 | return generated_data 178 | 179 | def get_train_examples(self): 180 | self.load_data(self.filelist[0], 0) 181 | return self.create_dataset(self.D[0]) 182 | 183 | def get_dev_examples(self): 184 | self.load_data(self.filelist[1], 1) 185 | return self.create_dataset(self.D[1]) 186 | 187 | def load_data(self, filename, sid): 188 | skipped = 0 189 | previous = 'random stuff' 190 | prefix = 'random stuff' 191 | cache = None 192 | with open(filename, "r") as f: 193 | csvreader = csv.reader(f) 194 | fields = next(csvreader) 195 | for row in tqdm(csvreader): 196 | if row[0] != previous: 197 | if cache != None: 198 | self.D[sid].append([previous, cache, prefix]) 199 | previous = row[0] 200 | cache = {k:[] for k, v in self.mapping.items()} 201 | row[1:-1] = [json.loads(e) for e in row[1:-1]] 202 | prefix = row[-2] 203 | for i, attr in enumerate(row[1:-2]): 204 | for ending in attr: 205 | ending = ending.lower() 206 | ending = self.fix_templates(self.mapping[fields[i+1]], text_standardize(ftfy.fix_text(ending))) 207 | if '_' in ending: 208 | tok = self.find_underscore_length(ending) 209 | ending = ending.replace(tok, "___") 210 | if ending != 'none.' 
and len(ending) > 0 and ending not in cache[fields[i+1]]: 211 | self.tail_keywords[(ending, fields[i+1])] |= set(prefix) 212 | cache[fields[i+1]].append(ending) 213 | if cache != None: 214 | self.D[sid].append([previous, cache, prefix]) 215 | print (len(self.D[sid])) 216 | 217 | class ATOMICAdvAnswerProcessor(ATOMICProcessor): 218 | def __init__(self, args): 219 | super(ATOMICAdvAnswerProcessor, self).__init__(args) 220 | with open(os.path.join(args.out_dir, 'atomic_tails.pkl'), "rb") as fin: 221 | d = pickle.load(fin) 222 | self.tail_index = d['sentences'] 223 | self.reverse_tail_index = {v:k for k, v in self.tail_index.items()} 224 | self.embeddings = d['embeddings'] 225 | self.lower_bounds = Counter() 226 | self.high_prob = 0.4 227 | self.low_prob = 0.3 228 | self.patience = 10 229 | self.step_size = 0.05 230 | self.downsample_size = 50 231 | self.adv = True 232 | 233 | def negative_sample(self, prefix, dim, correct_ones, data, person_set, question, correct_answer): 234 | negatives = [] 235 | curr_data = random.choices(data, k=self.downsample_size) 236 | distractors = list(set([neg for sample in curr_data for neg in sample[1][dim]])) 237 | distractors = [neg for neg in distractors if len(set(prefix).intersection(self.tail_keywords[(neg, dim)])) == 0] 238 | distractors_mapping = {i:self.tail_index[neg] for i, neg in enumerate(distractors)} 239 | distractors_indices = list(distractors_mapping.values()) 240 | distractor_emb = self.embeddings[distractors_indices] 241 | correct_emb = self.embeddings[self.tail_index[correct_answer]] 242 | cos_scores = util.pytorch_cos_sim(correct_emb, distractor_emb)[0] 243 | high_prob = self.high_prob 244 | low_prob = self.low_prob 245 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 246 | midinf = 0 247 | while len(midpoint) < self.patience and midinf < self.patience: 248 | midinf += 1 249 | low_prob -= self.step_size 250 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 251 | if len(midpoint) == 0: 252 | print ('empty') 253 | return None 254 | infinite = 0 255 | while len(negatives) < 2 and infinite < self.patience: 256 | infinite += 1 257 | sample_idx = random.choice(midpoint) 258 | neg = self.reverse_tail_index[distractors_mapping[sample_idx.item()]] 259 | if neg in correct_ones: 260 | continue 261 | if neg in negatives: 262 | continue 263 | if neg[:-1] in correct_answer[:-1].split() or correct_answer[:-1] in neg[:-1].split(): 264 | continue 265 | if len(person_set) < len(self.xset+self.xset1)*2 and any([y in neg for y in self.yset+self.yset1]): 266 | continue 267 | if len(person_set) < len(self.xset+self.xset1)*3 and any([z in neg for z in self.zset+self.zset1]): 268 | continue 269 | negatives.append(neg) 270 | self.lower_bounds[low_prob] += 1 271 | if len(negatives) < 2: 272 | return None 273 | return negatives 274 | 275 | class ATOMICAdvQuestionProcessor(ATOMICProcessor): 276 | def __init__(self, args): 277 | super(ATOMICAdvQuestionProcessor, self).__init__(args) 278 | with open(os.path.join(args.out_dir, 'atomic_tails.pkl'), "rb") as fin: 279 | d = pickle.load(fin) 280 | self.tail_index = d['sentences'] 281 | self.reverse_tail_index = {v:k for k, v in self.tail_index.items()} 282 | self.tail_embeddings = d['embeddings'] 283 | with open(os.path.join(args.out_dir, 'atomic_heads.pkl'), "rb") as fin: 284 | d = pickle.load(fin) 285 | self.head_index = d['sentences'] 286 | self.revers_head_index = {v:k for k, v in self.head_index.items()} 287 | 
self.head_embeddings = d['embeddings'] 288 | self.lower_bounds = Counter() 289 | self.high_prob = 0.4 290 | self.low_prob = 0.3 291 | self.patience = 10 292 | self.step_size = 0.05 293 | self.downsample_size = 200 294 | self.adv = True 295 | 296 | def negative_sample(self, prefix, dim, correct_ones, data, person_set, question, correct_answer): 297 | negatives = [] 298 | curr_data = random.choices(data, k=self.downsample_size) 299 | distractors = list(set([neg for sample in curr_data for neg in sample[1][dim]])) 300 | distractors = [neg for neg in distractors if len(set(prefix).intersection(self.tail_keywords[(neg, dim)])) == 0] 301 | distractors_mapping = {i:self.tail_index[neg] for i, neg in enumerate(distractors)} 302 | distractors_indices = list(distractors_mapping.values()) 303 | distractor_emb = self.tail_embeddings[distractors_indices] 304 | question_emb = self.head_embeddings[self.head_index[question]] 305 | cos_scores = util.pytorch_cos_sim(question_emb, distractor_emb)[0] 306 | high_prob = self.high_prob 307 | low_prob = self.low_prob 308 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 309 | midinf = 0 310 | while len(midpoint) < self.patience and midinf < self.patience: 311 | midinf += 1 312 | low_prob -= self.step_size 313 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 314 | if len(midpoint) == 0: 315 | print ('empty') 316 | return None 317 | infinite = 0 318 | while len(negatives) < 2 and infinite < self.patience: 319 | infinite += 1 320 | sample_idx = random.choice(midpoint) 321 | neg = self.reverse_tail_index[distractors_mapping[sample_idx.item()]] 322 | if neg in correct_ones: 323 | continue 324 | if neg in negatives: 325 | continue 326 | if neg[:-1] in correct_answer[:-1].split() or correct_answer[:-1] in neg[:-1].split(): 327 | continue 328 | if len(person_set) < len(self.xset+self.xset1)*2 and any([y in neg for y in self.yset+self.yset1]): 329 | continue 330 | if len(person_set) < len(self.xset+self.xset1)*3 and any([z in neg for z in self.zset+self.zset1]): 331 | continue 332 | negatives.append(neg) 333 | self.lower_bounds[low_prob] += 1 334 | if len(negatives) < 2: 335 | return None 336 | return negatives 337 | 338 | def build_embeddings_answers(args): 339 | if os.path.exists(os.path.join(args.out_dir, 'atomic_tails.pkl')): 340 | print ('tail embeddings already exist, skip computation') 341 | return 342 | processor = ATOMICProcessor(args) 343 | model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens') 344 | all_tails = {} 345 | files = [args.train_KG, args.dev_KG] 346 | for file in files: 347 | with open(file, 'r') as f: 348 | csvreader = csv.reader(f) 349 | fields = next(csvreader) 350 | for row in tqdm(csvreader): 351 | row[1:-1] = [json.loads(e) for e in row[1:-1]] 352 | for i, attr in enumerate(row[1:-2]): 353 | for ending in attr: 354 | ending = ending.lower() 355 | if ending != 'none': 356 | tail = text_standardize(ftfy.fix_text(ending)) 357 | tail = processor.fix_templates(processor.mapping[fields[i+1]], tail) 358 | if '_' in tail: 359 | tok = processor.find_underscore_length(tail) 360 | tail = tail.replace(tok, "___") 361 | if tail not in all_tails: 362 | all_tails[tail] = len(all_tails) 363 | print (len(all_tails)) 364 | corpus = [k for k, v in all_tails.items()] 365 | embeddings = model.encode(corpus, show_progress_bar=True, device=0, num_workers=4) 366 | print (len(embeddings), embeddings.shape) 367 | with open(os.path.join(args.out_dir, 
'atomic_tails.pkl'), "wb") as fOut: 368 | pickle.dump({'sentences': all_tails, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL) 369 | 370 | def build_embeddings_question(args): 371 | if os.path.exists(os.path.join(args.out_dir, 'atomic_heads.pkl')): 372 | print ('head embeddings already exist, skip computation') 373 | return 374 | processor = ATOMICProcessor(args) 375 | model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens') 376 | all_heads = {} 377 | files = [args.train_KG, args.dev_KG] 378 | previous = 'random stuff' 379 | for file in files: 380 | with open(file, 'r') as f: 381 | csvreader = csv.reader(f) 382 | fields = next(csvreader) 383 | for row in tqdm(csvreader): 384 | row[1:-1] = [json.loads(e) for e in row[1:-1]] 385 | if row[0] != previous: 386 | previous = row[0] 387 | head = text_standardize(ftfy.fix_text(row[0])) 388 | for i, attr in enumerate(row[1:-2]): 389 | rel = processor.mapping[fields[i+1]] 390 | question = head + rel 391 | if question not in all_heads: 392 | all_heads[question] = len(all_heads) 393 | 394 | print (len(all_heads)) 395 | corpus = list(all_heads.keys()) 396 | embeddings1 = model.encode(corpus[:100000], show_progress_bar=True, device=0, num_workers=4) 397 | embeddings2 = model.encode(corpus[100000:], show_progress_bar=True, device=0, num_workers=4) 398 | embeddings = np.concatenate([embeddings1, embeddings2], axis=0) 399 | print (len(embeddings), embeddings.shape) 400 | with open(os.path.join(args.out_dir, 'atomic_heads.pkl'), "wb") as fOut: 401 | pickle.dump({'sentences': all_heads, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL) 402 | 403 | if __name__ == '__main__': 404 | parser = argparse.ArgumentParser() 405 | parser.add_argument("--train_KG", default=None, type=str, required=True, help="ATOMIC train file") 406 | parser.add_argument("--dev_KG", default=None, type=str, required=True, help="ATOMIC dev file") 407 | parser.add_argument("--strategy", default='random', type=str, required=False, choices=['random', 'adv-answer', 'adv-question'], help="which data generation strategy to use") 408 | parser.add_argument("--out_dir", default=None, type=str, required=True, help="Output dir") 409 | parser.add_argument('--do_split', action="store_true", help="Further split training set into subsets for AFLite") 410 | args = parser.parse_args() 411 | random.seed(1) 412 | np.random.seed(1) 413 | if args.strategy == 'random': 414 | processor = ATOMICProcessor(args) 415 | elif args.strategy == 'adv-answer': 416 | print ('Using adv-answer strategy') 417 | build_embeddings_answers(args) 418 | processor = ATOMICAdvAnswerProcessor(args) 419 | elif args.strategy == 'adv-question': 420 | print ('Using adv-question strategy') 421 | build_embeddings_answers(args) 422 | build_embeddings_question(args) 423 | processor = ATOMICAdvQuestionProcessor(args) 424 | else: 425 | print ('strategy not recognized') 426 | exit(0) 427 | dev_examples = processor.get_dev_examples() 428 | write_data(os.path.join(args.out_dir, 'dev_'+args.strategy+'.jsonl'), dev_examples) 429 | train_examples = processor.get_train_examples() 430 | write_data(os.path.join(args.out_dir, 'train_'+args.strategy+'.jsonl'), train_examples) 431 | if args.do_split: 432 | assert args.strategy == 'random' 433 | random.shuffle(train_examples) 434 | print ('splitting train into subsets, which can be used for AFLite (only valid for random strategy)') 435 | train_examples_1 = train_examples[:int(len(train_examples)*0.01)] 436 | train_examples_4 = 
train_examples[int(len(train_examples)*0.01):int(len(train_examples)*0.05)] 437 | train_examples_95 = train_examples[int(len(train_examples)*0.05):] 438 | write_data(os.path.join(args.out_dir, 'train_1%_'+args.strategy+'.jsonl'), train_examples_1) 439 | write_data(os.path.join(args.out_dir, 'train_4%_'+args.strategy+'.jsonl'), train_examples_4) 440 | write_data(os.path.join(args.out_dir, 'train_95%_'+args.strategy+'.jsonl'), train_examples_95) 441 | 442 | 443 | 444 | 445 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. 
To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. 
If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. 
Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /src/Training/MLM/run_mlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | from tqdm import tqdm, trange 32 | import sys 33 | sys.path.append('../') 34 | sys.path.append('.') 35 | from transformers import (WEIGHTS_NAME, RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 36 | from transformers import AdamW, get_linear_schedule_with_warmup 37 | from data_utils import myprocessors, handle_underscores 38 | from run_pretrain import convert_examples_to_features, MyDataset 39 | from run_pretrain import evaluate as evaluate_func 40 | import json 41 | from collections import Counter 42 | logger = logging.getLogger(__name__) 43 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 44 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 45 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 46 | 47 | MODEL_CLASSES = { 48 | 'roberta-mlm': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 49 | } 50 | 51 | class MLMDataset(torch.utils.data.Dataset): 52 | def __init__(self, data): 53 | self.data = data 54 | 55 | def __len__(self): 56 | return len(self.data) 57 | 58 | def __getitem__(self, idx): 59 | sample = self.data[idx] 60 | return sample 61 | 62 | def mask_tokens(batch_inputs, batch_labels, tokenizer, mlm_probability): 63 | """ 64 | Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 65 | """ 66 | if tokenizer.mask_token is None: 67 | raise ValueError( 68 | "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." 
69 | ) 70 | probability_matrix = torch.full(batch_labels.shape, mlm_probability) 71 | invalid_tokens_mask = [[t==-100 for t in val] for val in batch_labels.tolist()] 72 | probability_matrix.masked_fill_(torch.tensor(invalid_tokens_mask, dtype=torch.bool), value=0.0) 73 | masked_indices = torch.bernoulli(probability_matrix).bool() 74 | batch_labels[~masked_indices] = -100 # We only compute loss on masked tokens 75 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 76 | indices_replaced = torch.bernoulli(torch.full(batch_labels.shape, 0.8)).bool() & masked_indices 77 | batch_inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) 78 | 79 | # 10% of the time, we replace masked input tokens with random word 80 | indices_random = torch.bernoulli(torch.full(batch_labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 81 | random_words = torch.randint(len(tokenizer), batch_labels.shape, dtype=torch.long) 82 | batch_inputs[indices_random] = random_words[indices_random] 83 | 84 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 85 | return batch_inputs, batch_labels 86 | 87 | def mCollateFn(batch): 88 | batch_input_ids = [] 89 | batch_input_mask = [] 90 | batch_label_ids = [] 91 | max_len = max([len(f[0]) for f in batch]) 92 | for f in batch: 93 | input_ids = np.ones(max_len) 94 | input_ids[:len(f[0])] = f[0] 95 | labels = np.full(max_len, -100) 96 | labels[:len(f[1])] = f[1] 97 | mask = np.zeros(max_len) 98 | mask[:len(f[0])] = 1 99 | batch_input_ids.append(input_ids) 100 | batch_input_mask.append(mask) 101 | batch_label_ids.append(labels) 102 | 103 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 104 | batch_input_mask = torch.tensor(batch_input_mask, dtype=torch.long) 105 | batch_label_ids = torch.tensor(batch_label_ids, dtype=torch.long) 106 | return batch_input_ids, batch_input_mask, batch_label_ids 107 | 108 | def convert_examples_to_features_mlm(examples, tokenizer, max_length=512): 109 | data = [] 110 | valid_tokens = 0 111 | total_tokens = 0 112 | for example in examples: 113 | inputs, labels = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 114 | t_inputs, t_labels = handle_underscores(example['ending'], tokenizer) 115 | input_ids = tokenizer.convert_tokens_to_ids(inputs+t_inputs) 116 | label_ids = [t if t == -100 else input_ids[t_i] for t_i, t in enumerate(labels+t_labels)] 117 | valid_tokens += len([t for t in label_ids if t != -100]) 118 | total_tokens += len(label_ids) 119 | input_ids = tokenizer.prepare_for_model(input_ids, max_length=max_length, truncation=True)['input_ids'] 120 | label_ids = [-100] + label_ids + [-100] 121 | data.append([input_ids, label_ids]) 122 | #print (valid_tokens, total_tokens) 123 | return data 124 | 125 | def set_seed(args): 126 | random.seed(args.seed) 127 | np.random.seed(args.seed) 128 | torch.manual_seed(args.seed) 129 | if args.n_gpu > 0: 130 | torch.cuda.manual_seed_all(args.seed) 131 | 132 | def count_parameters(model): 133 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 134 | 135 | def train(args, train_dataset, model, tokenizer, eval_dataset): 136 | """ Train the model """ 137 | if args.local_rank in [-1, 0]: 138 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 139 | 140 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 141 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else 
DistributedSampler(train_dataset) 142 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 143 | 144 | if args.max_steps > 0: 145 | t_total = args.max_steps 146 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 147 | else: 148 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 149 | 150 | # Prepare optimizer and schedule (linear warmup and decay) 151 | no_decay = ['bias', 'LayerNorm.weight'] 152 | optimizer_grouped_parameters = [ 153 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 154 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 155 | ] 156 | 157 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 158 | logger.info("warm up steps = %d", warmup_steps) 159 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 160 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 161 | 162 | if args.fp16: 163 | try: 164 | from apex import amp 165 | except ImportError: 166 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 167 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 168 | 169 | # multi-gpu training (should be after apex fp16 initialization) 170 | if args.n_gpu > 1: 171 | model = torch.nn.DataParallel(model) 172 | 173 | # Distributed training (should be after apex fp16 initialization) 174 | if args.local_rank != -1: 175 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 176 | output_device=args.local_rank, 177 | find_unused_parameters=True) 178 | # Train! 179 | logger.info("***** Running training *****") 180 | logger.info(" Num examples = %d", len(train_dataset)) 181 | logger.info(" Num Epochs = %d", args.num_train_epochs) 182 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 183 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", 184 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 185 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 186 | logger.info(" Total optimization steps = %d", t_total) 187 | 188 | global_step = 0 189 | tr_loss, logging_loss = 0.0, 0.0 190 | model.zero_grad() 191 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 192 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 193 | curr_best = 0.0 194 | for _ in train_iterator: 195 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 196 | for step, batch in enumerate(epoch_iterator): 197 | model.train() 198 | input_ids, mlm_labels = mask_tokens(batch[0], batch[2], tokenizer, args.mlm_probability) 199 | inputs = {'input_ids': input_ids.cuda(), 200 | 'attention_mask': batch[1].cuda(), 201 | 'masked_lm_labels': mlm_labels.cuda()} 202 | outputs = model(**inputs) 203 | loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) 204 | 205 | if args.n_gpu > 1: 206 | loss = loss.mean() # mean() to average on multi-gpu parallel training 207 | if args.gradient_accumulation_steps > 1: 208 | loss = loss / args.gradient_accumulation_steps 209 | 210 | if args.fp16: 211 | with amp.scale_loss(loss, optimizer) as scaled_loss: 212 | scaled_loss.backward() 213 | else: 214 | loss.backward() 215 | 216 | tr_loss += loss.item() 217 | if (step + 1) % args.gradient_accumulation_steps == 0: 218 | optimizer.step() 219 | scheduler.step() # Update learning rate schedule 220 | model.zero_grad() 221 | global_step += 1 222 | 223 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 224 | # Log metrics 225 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 226 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 227 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 228 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 229 | logging_loss = tr_loss 230 | 231 | if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0: 232 | results = evaluate_func(args, model, tokenizer, eval_dataset) 233 | for key, value in results.items(): 234 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 235 | if results['acc'] > curr_best: 236 | curr_best = results['acc'] 237 | # Save model checkpoint 238 | output_dir = args.output_dir 239 | if not os.path.exists(output_dir): 240 | os.makedirs(output_dir) 241 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 242 | model_to_save.save_pretrained(output_dir) 243 | tokenizer.save_pretrained(output_dir) 244 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 245 | logger.info("Saving model checkpoint to %s", output_dir) 246 | 247 | 248 | if args.max_steps > 0 and global_step > args.max_steps: 249 | epoch_iterator.close() 250 | break 251 | if args.max_steps > 0 and global_step > args.max_steps: 252 | train_iterator.close() 253 | break 254 | results = evaluate_func(args, model, tokenizer, eval_dataset) 255 | for key, value in results.items(): 256 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 
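# Same best-checkpoint bookkeeping as the save_steps-gated block above: the
# saved model is only overwritten when this evaluation improves dev accuracy.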
257 | if results['acc'] > curr_best: 258 | curr_best = results['acc'] 259 | # Save model checkpoint 260 | output_dir = args.output_dir 261 | if not os.path.exists(output_dir): 262 | os.makedirs(output_dir) 263 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 264 | model_to_save.save_pretrained(output_dir) 265 | tokenizer.save_pretrained(output_dir) 266 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 267 | logger.info("Saving model checkpoint to %s", output_dir) 268 | if args.local_rank in [-1, 0]: 269 | tb_writer.close() 270 | return global_step, tr_loss / global_step 271 | 272 | 273 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 274 | if args.local_rank not in [-1, 0] and not evaluate: 275 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 276 | 277 | processor = myprocessors[task](args) 278 | examples = processor.get_dev_examples() if evaluate else processor.get_train_examples() 279 | feature_func = convert_examples_to_features if evaluate else convert_examples_to_features_mlm 280 | features = feature_func(examples, tokenizer, max_length=args.max_seq_length) 281 | if args.local_rank == 0 and not evaluate: 282 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 283 | if evaluate: 284 | return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask) 285 | else: 286 | return MLMDataset(features) 287 | 288 | def main(): 289 | parser = argparse.ArgumentParser() 290 | 291 | ## Required parameters 292 | parser.add_argument("--train_file", default=None, type=str, required=True, 293 | help="The train file name") 294 | parser.add_argument("--dev_file", default=None, type=str, required=True, 295 | help="The dev file name") 296 | parser.add_argument("--model_type", default=None, type=str, required=True, 297 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 298 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 299 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 300 | parser.add_argument("--config_name", default="", type=str, 301 | help="Pretrained config name or path if not the same as model_name") 302 | parser.add_argument("--tokenizer_name", default="", type=str, 303 | help="Pretrained tokenizer name or path if not the same as model_name") 304 | parser.add_argument("--cache_dir", default="", type=str, 305 | help="Where do you want to store the pre-trained models downloaded from s3") 306 | parser.add_argument("--task_name", default=None, type=str, required=True, 307 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 308 | parser.add_argument("--output_dir", default=None, type=str, required=True, 309 | help="The output directory where the model predictions and checkpoints will be written.") 310 | 311 | ## Other parameters 312 | parser.add_argument("--mlm_probability", default=0.5, type=float, 313 | help="token masking probability, should be 0.5 for ATOMIC and 0.3 for CSKG") 314 | parser.add_argument("--max_seq_length", default=128, type=int, 315 | help="The maximum total input sequence length after tokenization. 
Sequences longer " 316 | "than this will be truncated, sequences shorter will be padded.") 317 | parser.add_argument("--max_words_to_mask", default=6, type=int, 318 | help="The maximum number of tokens to mask when computing scores") 319 | parser.add_argument("--max_sequence_per_time", default=80, type=int, 320 | help="The maximum number of sequences to feed into the model") 321 | parser.add_argument("--do_train", action='store_true', 322 | help="Whether to run training.") 323 | parser.add_argument("--do_eval", action='store_true', 324 | help="Whether to run eval on the dev set.") 325 | parser.add_argument("--evaluate_during_training", action='store_true', 326 | help="Run evaluation during training at each logging step.") 327 | parser.add_argument("--do_lower_case", action='store_true', 328 | help="Set this flag if you are using an uncased model.") 329 | 330 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 331 | help="Batch size per GPU/CPU for training.") 332 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 333 | help="Batch size per GPU/CPU for evaluation.") 334 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 335 | help="Number of updates steps to accumulate before performing a backward/update pass.") 336 | parser.add_argument("--learning_rate", default=1e-5, type=float, 337 | help="The initial learning rate for Adam.") 338 | parser.add_argument("--weight_decay", default=0.01, type=float, 339 | help="Weight deay if we apply some.") 340 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 341 | help="Epsilon for Adam optimizer.") 342 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 343 | help="Max gradient norm.") 344 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 345 | help="Total number of training epochs to perform.") 346 | parser.add_argument("--max_steps", default=-1, type=int, 347 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 348 | parser.add_argument("--warmup_steps", default=0, type=int, 349 | help="Linear warmup over warmup_steps.") 350 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 351 | help="Linear warmup over warmup proportion.") 352 | parser.add_argument('--logging_steps', type=int, default=50, 353 | help="Log every X updates steps.") 354 | parser.add_argument('--save_steps', type=int, default=50, 355 | help="Save checkpoint every X updates steps.") 356 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 357 | help="The file where prediction logits will be written") 358 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 359 | help="The file where eval results will be written") 360 | parser.add_argument("--no_cuda", action='store_true', 361 | help="Avoid using CUDA when available") 362 | parser.add_argument('--overwrite_output_dir', action='store_true', 363 | help="Overwrite the content of the output directory") 364 | parser.add_argument('--seed', type=int, default=2555, 365 | help="random seed for initialization") 366 | parser.add_argument('--fp16', action='store_true', 367 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 368 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 369 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
370 | "See details at https://nvidia.github.io/apex/amp.html") 371 | parser.add_argument("--local_rank", type=int, default=-1, 372 | help="For distributed training: local_rank") 373 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 374 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 375 | args = parser.parse_args() 376 | 377 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir: 378 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 379 | if not os.path.exists(args.output_dir): 380 | os.makedirs(args.output_dir) 381 | 382 | # Setup CUDA, GPU & distributed training 383 | if args.local_rank == -1 or args.no_cuda: 384 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 385 | args.n_gpu = torch.cuda.device_count() 386 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 387 | torch.cuda.set_device(args.local_rank) 388 | device = torch.device("cuda", args.local_rank) 389 | torch.distributed.init_process_group(backend='nccl') 390 | args.n_gpu = 1 391 | args.device = device 392 | 393 | if args.do_train: 394 | for handler in logging.root.handlers[:]: 395 | logging.root.removeHandler(handler) 396 | # Setup logging 397 | if args.do_train: 398 | log_file = os.path.join(args.output_dir, 'train.log') 399 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 400 | datefmt = '%m/%d/%Y %H:%M:%S', 401 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 402 | filename=log_file) 403 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 404 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 405 | os.system("cp run_mlm_roberta.py %s" % os.path.join(args.output_dir, 'run_mlm_roberta.py')) 406 | os.system("cp ../data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 407 | 408 | # Set seed 409 | set_seed(args) 410 | args.task_name = args.task_name.lower() 411 | if args.task_name not in myprocessors: 412 | raise ValueError("Task not found: %s" % (args.task_name)) 413 | 414 | args.model_type = args.model_type.lower() 415 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 416 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 417 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 418 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 419 | 420 | count = count_parameters(model) 421 | print (count) 422 | 423 | if args.local_rank == 0: 424 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 425 | 426 | model.to(args.device) 427 | 428 | logger.info("Training/evaluation parameters %s", args) 429 | 430 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 431 | if args.do_train: 432 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 433 | global_step, tr_loss = train(args, 
train_dataset, model, tokenizer, eval_dataset) 434 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 435 | # Evaluation 436 | results = {} 437 | return results 438 | 439 | if __name__ == "__main__": 440 | main() -------------------------------------------------------------------------------- /src/Training/AFLite/run_roberta_classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | from tqdm import tqdm, trange 32 | import sys 33 | sys.path.append('../') 34 | sys.path.append('.') 35 | from transformers import (WEIGHTS_NAME, RobertaConfig, RobertaTokenizer) 36 | from transformers import AdamW, get_linear_schedule_with_warmup 37 | from data_utils import accuracy, myprocessors, convert_examples_to_features 38 | import json 39 | from custimized_models import RobertaForMultipleChoice 40 | logger = logging.getLogger(__name__) 41 | 42 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 43 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 44 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 45 | 46 | MODEL_CLASSES = { 47 | 'roberta-mc': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer) 48 | } 49 | 50 | class MyDataset(torch.utils.data.Dataset): 51 | 52 | def __init__(self, data, pad_token, mask_token): 53 | self.data = data 54 | self.pad_token = pad_token 55 | self.mask_token = mask_token 56 | 57 | def __len__(self): 58 | return len(self.data) 59 | 60 | def __getitem__(self, idx): 61 | sample = self.data[idx] 62 | return sample, self.pad_token, self.mask_token 63 | 64 | def mCollateFn(batch): 65 | batch_input_ids = [] 66 | batch_input_mask = [] 67 | batch_label_ids = [] 68 | features = [b[0] for b in batch] 69 | pad_token = batch[0][1] 70 | mask_token = batch[0][2] 71 | max_len = max([len(cand) for f in features for cand in f[0]]) 72 | for f in features: 73 | batch_input_ids.append([]) 74 | batch_input_mask.append([]) 75 | batch_label_ids.append(f[2]) 76 | for i in range(len(f[0])): 77 | sequence = f[0][i] + [pad_token]*(max_len-len(f[0][i])) 78 | att_mask = [1]*len(f[0][i]) + [0]*(max_len-len(f[0][i])) 79 | batch_input_ids[-1].append(sequence) 80 | batch_input_mask[-1].append(att_mask) 81 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 82 | batch_input_mask = 
torch.tensor(batch_input_mask, dtype=torch.long) 83 | return batch_input_ids, batch_input_mask, torch.tensor(batch_label_ids, dtype=torch.long) 84 | 85 | def set_seed(args): 86 | random.seed(args.seed) 87 | np.random.seed(args.seed) 88 | torch.manual_seed(args.seed) 89 | if args.n_gpu > 0: 90 | torch.cuda.manual_seed_all(args.seed) 91 | 92 | def count_parameters(model): 93 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 94 | 95 | def train(args, train_dataset, model, tokenizer, eval_dataset): 96 | """ Train the model """ 97 | if args.local_rank in [-1, 0]: 98 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 99 | 100 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 101 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 102 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 103 | 104 | if args.max_steps > 0: 105 | t_total = args.max_steps 106 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 107 | else: 108 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 109 | 110 | # Prepare optimizer and schedule (linear warmup and decay) 111 | no_decay = ['bias', 'LayerNorm.weight'] 112 | optimizer_grouped_parameters = [ 113 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 114 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 115 | ] 116 | 117 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 118 | logger.info("warm up steps = %d", warmup_steps) 119 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 120 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 121 | 122 | if args.fp16: 123 | try: 124 | from apex import amp 125 | except ImportError: 126 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 127 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 128 | 129 | # multi-gpu training (should be after apex fp16 initialization) 130 | if args.n_gpu > 1: 131 | model = torch.nn.DataParallel(model) 132 | 133 | # Distributed training (should be after apex fp16 initialization) 134 | if args.local_rank != -1: 135 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 136 | output_device=args.local_rank, 137 | find_unused_parameters=True) 138 | # Train! 139 | logger.info("***** Running training *****") 140 | logger.info(" Num examples = %d", len(train_dataset)) 141 | logger.info(" Num Epochs = %d", args.num_train_epochs) 142 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 143 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", 144 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 145 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 146 | logger.info(" Total optimization steps = %d", t_total) 147 | 148 | global_step = 0 149 | tr_loss, logging_loss = 0.0, 0.0 150 | model.zero_grad() 151 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 152 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 153 | curr_best = 0.0 154 | for _ in train_iterator: 155 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 156 | for step, batch in enumerate(epoch_iterator): 157 | model.train() 158 | inputs = {'input_ids': batch[0].cuda(), 159 | 'attention_mask': batch[1].cuda(), 160 | 'labels': batch[2].cuda()} 161 | outputs = model(**inputs) 162 | loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) 163 | 164 | if args.n_gpu > 1: 165 | loss = loss.mean() # mean() to average on multi-gpu parallel training 166 | if args.gradient_accumulation_steps > 1: 167 | loss = loss / args.gradient_accumulation_steps 168 | 169 | if args.fp16: 170 | with amp.scale_loss(loss, optimizer) as scaled_loss: 171 | scaled_loss.backward() 172 | else: 173 | loss.backward() 174 | 175 | tr_loss += loss.item() 176 | if (step + 1) % args.gradient_accumulation_steps == 0: 177 | optimizer.step() 178 | scheduler.step() # Update learning rate schedule 179 | model.zero_grad() 180 | global_step += 1 181 | 182 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 183 | # Log metrics 184 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 185 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 186 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 187 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 188 | logging_loss = tr_loss 189 | 190 | if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0: 191 | results = evaluate(args, model, tokenizer, eval_dataset) 192 | for key, value in results.items(): 193 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 194 | if results['acc'] > curr_best: 195 | curr_best = results['acc'] 196 | # Save model checkpoint 197 | output_dir = args.output_dir 198 | if not os.path.exists(output_dir): 199 | os.makedirs(output_dir) 200 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 201 | model_to_save.save_pretrained(output_dir) 202 | tokenizer.save_pretrained(output_dir) 203 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 204 | logger.info("Saving model checkpoint to %s", output_dir) 205 | 206 | 207 | if args.max_steps > 0 and global_step > args.max_steps: 208 | epoch_iterator.close() 209 | break 210 | if args.max_steps > 0 and global_step > args.max_steps: 211 | train_iterator.close() 212 | break 213 | results = evaluate(args, model, tokenizer, eval_dataset) 214 | for key, value in results.items(): 215 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 216 | if results['acc'] > curr_best: 217 | curr_best = results['acc'] 218 | # Save model checkpoint 219 | output_dir 
= args.output_dir 220 | if not os.path.exists(output_dir): 221 | os.makedirs(output_dir) 222 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 223 | model_to_save.save_pretrained(output_dir) 224 | tokenizer.save_pretrained(output_dir) 225 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 226 | logger.info("Saving model checkpoint to %s", output_dir) 227 | if args.local_rank in [-1, 0]: 228 | tb_writer.close() 229 | return global_step, tr_loss / global_step 230 | 231 | def save_logits(logits_all, filename): 232 | with open(filename, "w") as f: 233 | for i in range(len(logits_all)): 234 | for j in range(len(logits_all[i])): 235 | f.write(str(logits_all[i][j])) 236 | if j == len(logits_all[i])-1: 237 | f.write("\n") 238 | else: 239 | f.write(" ") 240 | 241 | def evaluate(args, model, tokenizer, eval_dataset): 242 | results = {} 243 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 244 | os.makedirs(args.output_dir) 245 | 246 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 247 | # Note that DistributedSampler samples randomly 248 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 249 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=mCollateFn) 250 | 251 | # Eval! 252 | logger.info("***** Running evaluation *****") 253 | logger.info(" Num examples = %d", len(eval_dataset)) 254 | logger.info(" Batch size = %d", args.eval_batch_size) 255 | preds = None 256 | out_label_ids = None 257 | features = [] 258 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 259 | model.eval() 260 | with torch.no_grad(): 261 | inputs = {'input_ids': batch[0].cuda(), 262 | 'attention_mask': batch[1].cuda(), 263 | 'labels': batch[2].cuda()} 264 | outputs = model(**inputs) 265 | loss, logits = outputs[:2] 266 | batch_features = outputs[2].view(batch[0].shape[0], batch[0].shape[1], -1).detach().cpu() 267 | features.append(batch_features) 268 | if preds is None: 269 | preds = logits.detach().cpu().numpy() 270 | out_label_ids = inputs['labels'].detach().cpu().numpy() 271 | else: 272 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 273 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 274 | features = torch.cat(features, dim=0) 275 | print (features.shape, out_label_ids.shape) 276 | torch.save(features, args.dev_file.replace('.jsonl', '_features')) 277 | torch.save(out_label_ids, args.dev_file.replace('.jsonl', '_labels')) 278 | save_logits(preds, os.path.join(args.output_dir, args.logits_file)) 279 | preds = np.argmax(preds, axis=1) 280 | result = accuracy(preds, out_label_ids) 281 | results.update(result) 282 | output_eval_file = os.path.join(args.output_dir, args.results_file) 283 | with open(output_eval_file, "w") as writer: 284 | logger.info("***** Eval results *****") 285 | for key in sorted(result.keys()): 286 | logger.info(" %s = %s", key, str(result[key])) 287 | writer.write("%s = %s\n" % (key, str(result[key]))) 288 | return results 289 | 290 | def write_data(filename, data): 291 | with open(filename, 'w') as fout: 292 | for sample in data: 293 | fout.write(json.dumps(sample)) 294 | fout.write('\n') 295 | 296 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 297 | if args.local_rank not in [-1, 0] and not evaluate: 298 | torch.distributed.barrier() # Make sure only the first process 
in distributed training process the dataset, and the others will use the cache 299 | processor = myprocessors[task](args) 300 | examples = processor.get_dev_examples() if evaluate else processor.get_train_examples() 301 | features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 302 | if args.local_rank == 0 and not evaluate: 303 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 304 | return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id) 305 | 306 | def main(): 307 | parser = argparse.ArgumentParser() 308 | 309 | ## Required parameters 310 | parser.add_argument("--train_file", default=None, type=str, required=True, 311 | help="The train file name") 312 | parser.add_argument("--dev_file", default=None, type=str, required=True, 313 | help="The dev file name") 314 | parser.add_argument("--model_type", default=None, type=str, required=True, 315 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 316 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 317 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 318 | parser.add_argument("--config_name", default="", type=str, 319 | help="Pretrained config name or path if not the same as model_name") 320 | parser.add_argument("--tokenizer_name", default="", type=str, 321 | help="Pretrained tokenizer name or path if not the same as model_name") 322 | parser.add_argument("--cache_dir", default="", type=str, 323 | help="Where do you want to store the pre-trained models downloaded from s3") 324 | parser.add_argument("--task_name", default=None, type=str, required=True, 325 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 326 | parser.add_argument("--output_dir", default=None, type=str, required=True, 327 | help="The output directory where the model predictions and checkpoints will be written.") 328 | 329 | ## Other parameters 330 | parser.add_argument("--max_seq_length", default=128, type=int, 331 | help="The maximum total input sequence length after tokenization. 
Sequences longer " 332 | "than this will be truncated, sequences shorter will be padded.") 333 | parser.add_argument("--do_train", action='store_true', 334 | help="Whether to run training.") 335 | parser.add_argument("--do_eval", action='store_true', 336 | help="Whether to run eval on the dev set.") 337 | parser.add_argument("--evaluate_during_training", action='store_true', 338 | help="Run evaluation during training at each logging step.") 339 | parser.add_argument("--do_lower_case", action='store_true', 340 | help="Set this flag if you are using an uncased model.") 341 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 342 | help="Batch size per GPU/CPU for training.") 343 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 344 | help="Batch size per GPU/CPU for evaluation.") 345 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 346 | help="Number of updates steps to accumulate before performing a backward/update pass.") 347 | parser.add_argument("--learning_rate", default=1e-5, type=float, 348 | help="The initial learning rate for Adam.") 349 | parser.add_argument("--weight_decay", default=0.01, type=float, 350 | help="Weight deay if we apply some.") 351 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 352 | help="Epsilon for Adam optimizer.") 353 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 354 | help="Max gradient norm.") 355 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 356 | help="Total number of training epochs to perform.") 357 | parser.add_argument("--max_steps", default=-1, type=int, 358 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 359 | parser.add_argument("--warmup_steps", default=0, type=int, 360 | help="Linear warmup over warmup_steps.") 361 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 362 | help="Linear warmup over warmup proportion.") 363 | parser.add_argument('--logging_steps', type=int, default=50, 364 | help="Log every X updates steps.") 365 | parser.add_argument('--save_steps', type=int, default=50, 366 | help="Save checkpoint every X updates steps.") 367 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 368 | help="The file where prediction logits will be written") 369 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 370 | help="The file where eval results will be written") 371 | parser.add_argument("--no_cuda", action='store_true', 372 | help="Avoid using CUDA when available") 373 | parser.add_argument('--overwrite_output_dir', action='store_true', 374 | help="Overwrite the content of the output directory") 375 | parser.add_argument('--seed', type=int, default=2555, 376 | help="random seed for initialization") 377 | parser.add_argument('--fp16', action='store_true', 378 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 379 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 380 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
381 | "See details at https://nvidia.github.io/apex/amp.html") 382 | parser.add_argument("--local_rank", type=int, default=-1, 383 | help="For distributed training: local_rank") 384 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 385 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 386 | args = parser.parse_args() 387 | 388 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir and args.do_train: 389 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 390 | if not os.path.exists(args.output_dir): 391 | os.makedirs(args.output_dir) 392 | 393 | # Setup CUDA, GPU & distributed training 394 | if args.local_rank == -1 or args.no_cuda: 395 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 396 | args.n_gpu = torch.cuda.device_count() 397 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 398 | torch.cuda.set_device(args.local_rank) 399 | device = torch.device("cuda", args.local_rank) 400 | torch.distributed.init_process_group(backend='nccl') 401 | args.n_gpu = 1 402 | args.device = device 403 | 404 | if args.do_train: 405 | for handler in logging.root.handlers[:]: 406 | logging.root.removeHandler(handler) 407 | # Setup logging 408 | if args.do_train: 409 | log_file = os.path.join(args.output_dir, 'train.log') 410 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 411 | datefmt = '%m/%d/%Y %H:%M:%S', 412 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 413 | filename=log_file) 414 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 415 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 416 | os.system("cp run_roberta_classification.py %s" % os.path.join(args.output_dir, 'run_roberta_classification.py')) 417 | os.system("cp ../data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 418 | 419 | # Set seed 420 | set_seed(args) 421 | args.task_name = args.task_name.lower() 422 | if args.task_name not in myprocessors: 423 | raise ValueError("Task not found: %s" % (args.task_name)) 424 | 425 | args.model_type = args.model_type.lower() 426 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 427 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 428 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 429 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 430 | 431 | count = count_parameters(model) 432 | print (count) 433 | 434 | if args.local_rank == 0: 435 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 436 | 437 | model.to(args.device) 438 | 439 | logger.info("Training/evaluation parameters %s", args) 440 | 441 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 442 | 443 | if args.do_train: 444 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 
evaluate=False) 445 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset) 446 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 447 | # Evaluation 448 | results = {} 449 | if args.do_eval: 450 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 451 | model = model_class.from_pretrained(args.output_dir) 452 | model.eval() 453 | model.to(args.device) 454 | result = evaluate(args, model, tokenizer, eval_dataset) 455 | return results 456 | 457 | if __name__ == "__main__": 458 | main() -------------------------------------------------------------------------------- /src/Training/run_pretrain_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | 32 | from tqdm import tqdm, trange 33 | from transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 34 | from transformers import AdamW, get_linear_schedule_with_warmup 35 | from data_utils import accuracy, myprocessors, handle_underscores 36 | import json 37 | from collections import Counter 38 | logger = logging.getLogger(__name__) 39 | 40 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 41 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 42 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 43 | 44 | MODEL_CLASSES = { 45 | 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 46 | } 47 | 48 | class MyDataset(torch.utils.data.Dataset): 49 | 50 | def __init__(self, data, mask_token): 51 | self.data = data 52 | self.mask_token = mask_token 53 | 54 | def __len__(self): 55 | return len(self.data) 56 | 57 | def __getitem__(self, idx): 58 | sample = self.data[idx] 59 | return sample, self.mask_token 60 | 61 | 62 | def convert_examples_to_features(examples, tokenizer, max_length=512): 63 | data = [] 64 | for example in examples: 65 | inputs, _ = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 66 | choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']] 67 | input_ids = [inputs+cand[0] for cand in choices] 68 | input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids] 69 | data.append([input_ids, input_ids, example['correct']]) 70 | return data 71 | 72 | def mCollateFn(batch): 73 | 
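# (Descriptive note, added for readability:) collate helper for the GPT-2
# pretraining objective. It pads every candidate sequence in the batch to the
# longest candidate length using the added mask token id, builds matching
# attention masks and label sequences, and returns tensors shaped
# [batch, num_candidates, seq_len] plus the gold candidate index per example.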
batch_input_ids = [] 74 | batch_input_mask = [] 75 | batch_input_labels =[] 76 | batch_label_ids = [] 77 | features = [b[0] for b in batch] 78 | mask_token = batch[0][1] 79 | max_len = max([len(cand) for f in features for cand in f[0]]) 80 | for f in features: 81 | batch_input_ids.append([]) 82 | batch_input_mask.append([]) 83 | batch_input_labels.append([]) 84 | batch_label_ids.append(f[2]) 85 | for i in range(len(f[0])): 86 | sequence = f[0][i] + [mask_token]*(max_len-len(f[0][i])) 87 | att_mask = [1]*len(f[0][i]) + [0]*(max_len-len(f[0][i])) 88 | label_sequence = f[1][i]+[mask_token]*(max_len-len(f[1][i])) 89 | batch_input_ids[-1].append(sequence) 90 | batch_input_mask[-1].append(att_mask) 91 | batch_input_labels[-1].append(label_sequence) 92 | 93 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 94 | batch_input_mask = torch.tensor(batch_input_mask, dtype=torch.long) 95 | batch_input_labels = torch.tensor(batch_input_labels, dtype=torch.long) 96 | batch_label_ids = torch.tensor(batch_label_ids, dtype=torch.long) 97 | return batch_input_ids, batch_input_mask, batch_input_labels, batch_label_ids 98 | 99 | def set_seed(args): 100 | random.seed(args.seed) 101 | np.random.seed(args.seed) 102 | torch.manual_seed(args.seed) 103 | if args.n_gpu > 0: 104 | torch.cuda.manual_seed_all(args.seed) 105 | 106 | def count_parameters(model): 107 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 108 | 109 | def train(args, train_dataset, model, tokenizer, eval_dataset): 110 | """ Train the model """ 111 | if args.local_rank in [-1, 0]: 112 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 113 | 114 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 115 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 116 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 117 | 118 | if args.max_steps > 0: 119 | t_total = args.max_steps 120 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 121 | else: 122 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 123 | 124 | # Prepare optimizer and schedule (linear warmup and decay) 125 | no_decay = ['bias', 'LayerNorm.weight'] 126 | optimizer_grouped_parameters = [ 127 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 128 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 129 | ] 130 | 131 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 132 | logger.info("warm up steps = %d", warmup_steps) 133 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 134 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 135 | 136 | if args.fp16: 137 | try: 138 | from apex import amp 139 | except ImportError: 140 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 141 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 142 | 143 | # multi-gpu training (should be after apex fp16 initialization) 144 | if args.n_gpu > 1: 145 | model = torch.nn.DataParallel(model) 146 | 147 | # 
Distributed training (should be after apex fp16 initialization) 148 | if args.local_rank != -1: 149 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 150 | output_device=args.local_rank, 151 | find_unused_parameters=True) 152 | # Train! 153 | logger.info("***** Running training *****") 154 | logger.info(" Num examples = %d", len(train_dataset)) 155 | logger.info(" Num Epochs = %d", args.num_train_epochs) 156 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 157 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 158 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 159 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 160 | logger.info(" Total optimization steps = %d", t_total) 161 | 162 | global_step = 0 163 | tr_loss, logging_loss = 0.0, 0.0 164 | model.zero_grad() 165 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 166 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 167 | curr_best = 0.0 168 | CE = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=tokenizer.mask_token_id) 169 | loss_fct = torch.nn.MultiMarginLoss(margin=args.margin) 170 | for _ in train_iterator: 171 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 172 | for step, batch in enumerate(epoch_iterator): 173 | model.train() 174 | b_size, num_cand, seq_len = batch[0].shape 175 | input_ids = batch[0].view(-1, seq_len).cuda() 176 | attention_mask = batch[1].view(-1, seq_len).cuda() 177 | input_labels = batch[2].view(-1, seq_len).cuda() 178 | shift_labels = input_labels[..., 1:].contiguous().view(-1) 179 | inputs = {'input_ids': input_ids, 180 | 'attention_mask': attention_mask} 181 | outputs = model(**inputs) 182 | shift_logits = outputs[0][..., :-1, :].contiguous().view(-1, outputs[0].size(-1)) 183 | ce_loss = CE(shift_logits, shift_labels) 184 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 185 | valid_tokens = (input_ids != tokenizer.mask_token_id).long().sum(1) 186 | ce_loss /= valid_tokens 187 | ce_loss = -ce_loss.view(b_size, num_cand) 188 | loss = loss_fct(ce_loss, batch[3].cuda()) 189 | 190 | if args.n_gpu > 1: 191 | loss = loss.mean() # mean() to average on multi-gpu parallel training 192 | if args.gradient_accumulation_steps > 1: 193 | loss = loss / args.gradient_accumulation_steps 194 | 195 | if args.fp16: 196 | with amp.scale_loss(loss, optimizer) as scaled_loss: 197 | scaled_loss.backward() 198 | else: 199 | loss.backward() 200 | 201 | tr_loss += loss.item() 202 | if (step + 1) % args.gradient_accumulation_steps == 0: 203 | optimizer.step() 204 | scheduler.step() # Update learning rate schedule 205 | model.zero_grad() 206 | global_step += 1 207 | 208 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 209 | # Log metrics 210 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 211 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 212 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 213 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 214 | logging_loss = tr_loss 215 | 216 | if args.local_rank == -1 and 
args.evaluate_during_training and global_step % args.save_steps == 0: 217 | results = evaluate(args, model, tokenizer, eval_dataset) 218 | for key, value in results.items(): 219 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 220 | if results['acc'] > curr_best: 221 | curr_best = results['acc'] 222 | # Save model checkpoint 223 | output_dir = args.output_dir 224 | if not os.path.exists(output_dir): 225 | os.makedirs(output_dir) 226 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 227 | model_to_save.save_pretrained(output_dir) 228 | tokenizer.save_pretrained(output_dir) 229 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 230 | logger.info("Saving model checkpoint to %s", output_dir) 231 | 232 | 233 | if args.max_steps > 0 and global_step > args.max_steps: 234 | epoch_iterator.close() 235 | break 236 | if args.max_steps > 0 and global_step > args.max_steps: 237 | train_iterator.close() 238 | break 239 | 240 | results = evaluate(args, model, tokenizer, eval_dataset) 241 | for key, value in results.items(): 242 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 243 | if results['acc'] > curr_best: 244 | curr_best = results['acc'] 245 | # Save model checkpoint 246 | output_dir = args.output_dir 247 | if not os.path.exists(output_dir): 248 | os.makedirs(output_dir) 249 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 250 | model_to_save.save_pretrained(output_dir) 251 | tokenizer.save_pretrained(output_dir) 252 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 253 | logger.info("Saving model checkpoint to %s", output_dir) 254 | if args.local_rank in [-1, 0]: 255 | tb_writer.close() 256 | return global_step, tr_loss / global_step 257 | 258 | def save_logits(logits_all, filename): 259 | with open(filename, "w") as f: 260 | for i in range(len(logits_all)): 261 | for j in range(len(logits_all[i])): 262 | f.write(str(logits_all[i][j])) 263 | if j == len(logits_all[i])-1: 264 | f.write("\n") 265 | else: 266 | f.write(" ") 267 | 268 | def evaluate(args, model, tokenizer, eval_dataset): 269 | results = {} 270 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 271 | os.makedirs(args.output_dir) 272 | 273 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 274 | # Note that DistributedSampler samples randomly 275 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 276 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=mCollateFn) 277 | 278 | # Eval! 
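# (Descriptive note, added for readability:) scoring mirrors training — each
# candidate continuation gets its summed token cross-entropy (pad positions
# carry the added mask token id and are skipped via ignore_index), the sum is
# divided by the number of real tokens and negated so that a higher score means
# a more likely candidate; argmax over candidates gives the prediction.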
279 | logger.info("***** Running evaluation *****") 280 | logger.info(" Num examples = %d", len(eval_dataset)) 281 | logger.info(" Batch size = %d", args.eval_batch_size) 282 | CE = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=tokenizer.mask_token_id) 283 | preds = [] 284 | out_label_ids = [] 285 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 286 | model.eval() 287 | with torch.no_grad(): 288 | b_size, num_cand, seq_len = batch[0].shape 289 | input_ids = batch[0].view(-1, seq_len).cuda() 290 | attention_mask = batch[1].view(-1, seq_len).cuda() 291 | input_labels = batch[2].view(-1, seq_len).cuda() 292 | shift_labels = input_labels[..., 1:].contiguous().view(-1) 293 | inputs = {'input_ids': input_ids, 294 | 'attention_mask': attention_mask} 295 | outputs = model(**inputs) 296 | shift_logits = outputs[0][..., :-1, :].contiguous().view(-1, outputs[0].size(-1)) 297 | ce_loss = CE(shift_logits, shift_labels) 298 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 299 | valid_tokens = (input_ids != tokenizer.mask_token_id).long().sum(1) 300 | ce_loss /= valid_tokens 301 | ce_loss = -ce_loss.view(b_size, num_cand) 302 | 303 | preds.append(ce_loss) 304 | out_label_ids.append(batch[3].numpy()) 305 | preds = torch.cat(preds, dim=0).cpu().numpy() 306 | save_logits(preds.tolist(), os.path.join(args.output_dir, args.logits_file)) 307 | preds = np.argmax(preds, axis=1) 308 | result = accuracy(preds, np.concatenate(out_label_ids)) 309 | results.update(result) 310 | output_eval_file = os.path.join(args.output_dir, args.results_file) 311 | with open(output_eval_file, "w") as writer: 312 | logger.info("***** Eval results *****") 313 | for key in sorted(result.keys()): 314 | logger.info(" %s = %s", key, str(result[key])) 315 | writer.write("%s = %s\n" % (key, str(result[key]))) 316 | return results 317 | 318 | def write_data(filename, data): 319 | with open(filename, 'w') as fout: 320 | for sample in data: 321 | fout.write(json.dumps(sample)) 322 | fout.write('\n') 323 | 324 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 325 | if args.local_rank not in [-1, 0] and not evaluate: 326 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 327 | 328 | processor = myprocessors[task](args) 329 | cached_features_file = os.path.join(args.output_dir, 'cached_{}_{}_{}_{}'.format( 330 | 'dev', 331 | str(args.model_type), 332 | str(args.max_seq_length), 333 | str(task))) 334 | if evaluate and os.path.exists(cached_features_file): 335 | features = torch.load(cached_features_file) 336 | else: 337 | examples = processor.get_dev_examples() if evaluate else processor.get_train_examples() 338 | features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 339 | if evaluate: 340 | torch.save(features, cached_features_file) 341 | if args.local_rank == 0 and not evaluate: 342 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 343 | return MyDataset(features, tokenizer.mask_token_id) 344 | 345 | def main(): 346 | parser = argparse.ArgumentParser() 347 | 348 | ## Required parameters 349 | parser.add_argument("--train_file", default=None, type=str, required=True, 350 | help="The train file name") 351 | parser.add_argument("--dev_file", default=None, type=str, required=True, 352 | help="The dev file name") 353 | parser.add_argument("--model_type", default=None, type=str, 
required=True, 354 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 355 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 356 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 357 | parser.add_argument("--config_name", default="", type=str, 358 | help="Pretrained config name or path if not the same as model_name") 359 | parser.add_argument("--tokenizer_name", default="", type=str, 360 | help="Pretrained tokenizer name or path if not the same as model_name") 361 | parser.add_argument("--cache_dir", default="", type=str, 362 | help="Where do you want to store the pre-trained models downloaded from s3") 363 | parser.add_argument("--task_name", default=None, type=str, required=True, 364 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 365 | parser.add_argument("--output_dir", default=None, type=str, required=True, 366 | help="The output directory where the model predictions and checkpoints will be written.") 367 | 368 | ## Other parameters 369 | parser.add_argument("--second_train_file", default=None, type=str, 370 | help="Used when combining ATOMIC and CWWV") 371 | parser.add_argument("--second_dev_file", default=None, type=str, 372 | help="Used when combining ATOMIC and CWWV") 373 | parser.add_argument("--max_seq_length", default=128, type=int, 374 | help="The maximum total input sequence length after tokenization. Sequences longer " 375 | "than this will be truncated, sequences shorter will be padded.") 376 | parser.add_argument("--do_train", action='store_true', 377 | help="Whether to run training.") 378 | parser.add_argument("--do_eval", action='store_true', 379 | help="Whether to run eval on the dev set.") 380 | parser.add_argument("--evaluate_during_training", action='store_true', 381 | help="Run evaluation during training at each logging step.") 382 | parser.add_argument("--do_lower_case", action='store_true', 383 | help="Set this flag if you are using an uncased model.") 384 | 385 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 386 | help="Batch size per GPU/CPU for training.") 387 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 388 | help="Batch size per GPU/CPU for evaluation.") 389 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 390 | help="Number of updates steps to accumulate before performing a backward/update pass.") 391 | parser.add_argument("--margin", default=1.0, type=float, 392 | help="The margin for ranking loss") 393 | parser.add_argument("--learning_rate", default=1e-5, type=float, 394 | help="The initial learning rate for Adam.") 395 | parser.add_argument("--weight_decay", default=0.01, type=float, 396 | help="Weight deay if we apply some.") 397 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 398 | help="Epsilon for Adam optimizer.") 399 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 400 | help="Max gradient norm.") 401 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 402 | help="Total number of training epochs to perform.") 403 | parser.add_argument("--max_steps", default=-1, type=int, 404 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 405 | parser.add_argument("--warmup_steps", default=0, type=int, 406 | help="Linear warmup over warmup_steps.") 407 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 408 | help="Linear warmup over warmup proportion.") 409 | parser.add_argument('--logging_steps', type=int, default=50, 410 | help="Log every X updates steps.") 411 | parser.add_argument('--save_steps', type=int, default=50, 412 | help="Save checkpoint every X updates steps.") 413 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 414 | help="The file where prediction logits will be written") 415 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 416 | help="The file where eval results will be written") 417 | parser.add_argument("--no_cuda", action='store_true', 418 | help="Avoid using CUDA when available") 419 | parser.add_argument('--overwrite_output_dir', action='store_true', 420 | help="Overwrite the content of the output directory") 421 | parser.add_argument('--seed', type=int, default=2555, 422 | help="random seed for initialization") 423 | parser.add_argument('--fp16', action='store_true', 424 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 425 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 426 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 427 | "See details at https://nvidia.github.io/apex/amp.html") 428 | parser.add_argument("--local_rank", type=int, default=-1, 429 | help="For distributed training: local_rank") 430 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 431 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 432 | args = parser.parse_args() 433 | 434 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir and args.do_train: 435 | raise ValueError("Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format(args.output_dir)) 436 | if not os.path.exists(args.output_dir): 437 | os.makedirs(args.output_dir) 438 | 439 | # Setup CUDA, GPU & distributed training 440 | if args.local_rank == -1 or args.no_cuda: 441 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 442 | args.n_gpu = torch.cuda.device_count() 443 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 444 | torch.cuda.set_device(args.local_rank) 445 | device = torch.device("cuda", args.local_rank) 446 | torch.distributed.init_process_group(backend='nccl') 447 | args.n_gpu = 1 448 | args.device = device 449 | 450 | if args.do_train: 451 | for handler in logging.root.handlers[:]: 452 | logging.root.removeHandler(handler) 453 | # Setup logging 454 | if args.do_train: 455 | log_file = os.path.join(args.output_dir, 'train.log') 456 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 457 | datefmt = '%m/%d/%Y %H:%M:%S', 458 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 459 | filename=log_file) 460 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 461 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 462 | os.system("cp run_pretrain_gpt2.py %s" % os.path.join(args.output_dir, 'run_pretrain_gpt2.py')) 463 | os.system("cp data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 464 | 465 | # Set seed 466 | set_seed(args) 467 | args.task_name = args.task_name.lower() 468 | if args.task_name not in myprocessors: 469 | raise ValueError("Task not found: %s" % (args.task_name)) 470 | 471 | args.model_type = args.model_type.lower() 472 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 473 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 474 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 475 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 476 | 477 | count = count_parameters(model) 478 | print (count) 479 | special_tokens_dict = {'mask_token': ''} 480 | num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) 481 | model.resize_token_embeddings(len(tokenizer)) 482 | 483 | if args.local_rank == 0: 484 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 485 | 486 | model.to(args.device) 487 | 488 | logger.info("Training/evaluation parameters %s", args) 489 | 490 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 491 | 492 | init_result = evaluate(args, model, tokenizer, eval_dataset) 493 | print (init_result) 494 | if args.do_train: 495 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 496 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset) 497 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 498 | # Evaluation 499 | results = {} 500 | if args.do_eval: 501 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 502 | model = 
model_class.from_pretrained(args.output_dir) 503 | model.eval() 504 | model.to(args.device) 505 | result = evaluate(args, model, tokenizer, eval_dataset) 506 | return results 507 | 508 | 509 | if __name__ == "__main__": 510 | main() -------------------------------------------------------------------------------- /src/Training/run_pretrain.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | import numpy as np 25 | import torch 26 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 27 | TensorDataset) 28 | from torch.utils.data.distributed import DistributedSampler 29 | from torch.utils.tensorboard import SummaryWriter 30 | 31 | from tqdm import tqdm, trange 32 | from transformers import (WEIGHTS_NAME, RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 33 | from transformers import AdamW, get_linear_schedule_with_warmup 34 | from data_utils import accuracy, myprocessors, convert_examples_to_features 35 | import json 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 40 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 41 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 42 | MODEL_CLASSES = { 43 | 'roberta-mlm': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 44 | } 45 | 46 | class MyDataset(torch.utils.data.Dataset): 47 | 48 | def __init__(self, data, pad_token, mask_token, max_words_to_mask): 49 | self.data = data 50 | self.pad_token = pad_token 51 | self.mask_token = mask_token 52 | self.max_words_to_mask = max_words_to_mask 53 | 54 | def __len__(self): 55 | return len(self.data) 56 | 57 | def __getitem__(self, idx): 58 | sample = self.data[idx] 59 | return sample, self.pad_token, self.mask_token, self.max_words_to_mask 60 | 61 | def mCollateFn(batch): 62 | batch_input_ids = [] 63 | batch_input_mask = [] 64 | batch_input_labels = [] 65 | batch_label_ids = [] 66 | features = [b[0] for b in batch] 67 | pad_token = batch[0][1] 68 | mask_token = batch[0][2] 69 | MAX_WORDS_TO_MASK = batch[0][3] 70 | max_len = max([len(cand) for f in features for cand in f[0]]) 71 | for f in features: 72 | batch_input_ids.append([]) 73 | batch_input_mask.append([]) 74 | batch_input_labels.append([]) 75 | batch_label_ids.append(f[2]) 76 | for i in range(len(f[0])): 77 | masked_sequences = [] 78 | masked_labels = [] 79 | this_att_mask = [] 80 | sequence = f[0][i] + [pad_token]*(max_len-len(f[0][i])) 81 | label_sequence = f[1][i]+[-100]*(max_len-len(f[1][i])) 82 | valid_indices = [l_i for l_i, l in enumerate(label_sequence) if 
l != -100] 83 | if len(valid_indices) > MAX_WORDS_TO_MASK: 84 | rm_indices = random.sample(valid_indices, (len(valid_indices)-MAX_WORDS_TO_MASK)) 85 | label_sequence = [-100 if l_i in rm_indices else l for l_i, l in enumerate(label_sequence)] 86 | for j, t in enumerate(label_sequence): 87 | if t == -100: 88 | continue 89 | masked_sequences.append(sequence) 90 | masked_labels.append([-100]*max_len) 91 | else: 92 | masked_sequences.append(sequence[:j]+[mask_token]+sequence[j+1:]) 93 | masked_labels.append([-100]*j+[sequence[j]]+[-100]*(max_len-j-1)) 94 | this_att_mask.append([1]*len(f[0][i])+[0]*(max_len-len(f[0][i]))) 95 | batch_input_ids[-1].append(torch.tensor(masked_sequences, dtype=torch.long)) 96 | batch_input_mask[-1].append(torch.tensor(this_att_mask, dtype=torch.long)) 97 | batch_input_labels[-1].append(torch.tensor(masked_labels, dtype=torch.long)) 98 | return batch_input_ids, batch_input_mask, batch_input_labels, torch.tensor(batch_label_ids, dtype=torch.long) 99 | 100 | def set_seed(args): 101 | random.seed(args.seed) 102 | np.random.seed(args.seed) 103 | torch.manual_seed(args.seed) 104 | if args.n_gpu > 0: 105 | torch.cuda.manual_seed_all(args.seed) 106 | 107 | def count_parameters(model): 108 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 109 | 110 | def train(args, train_dataset, model, tokenizer, eval_dataset): 111 | """ Train the model """ 112 | if args.local_rank in [-1, 0]: 113 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 114 | 115 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 116 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 117 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 118 | 119 | if args.max_steps > 0: 120 | t_total = args.max_steps 121 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 122 | else: 123 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 124 | 125 | # Prepare optimizer and schedule (linear warmup and decay) 126 | no_decay = ['bias', 'LayerNorm.weight'] 127 | optimizer_grouped_parameters = [ 128 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 129 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 130 | ] 131 | 132 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 133 | logger.info("warm up steps = %d", warmup_steps) 134 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 135 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 136 | 137 | if args.fp16: 138 | try: 139 | from apex import amp 140 | except ImportError: 141 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 142 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 143 | 144 | # multi-gpu training (should be after apex fp16 initialization) 145 | if args.n_gpu > 1: 146 | model = torch.nn.DataParallel(model) 147 | 148 | # Distributed training (should be after apex fp16 initialization) 149 | if args.local_rank != -1: 150 | model = 
torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 151 | output_device=args.local_rank, 152 | find_unused_parameters=True) 153 | # Train! 154 | logger.info("***** Running training *****") 155 | logger.info(" Num examples = %d", len(train_dataset)) 156 | logger.info(" Num Epochs = %d", args.num_train_epochs) 157 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 158 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 159 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 160 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 161 | logger.info(" Total optimization steps = %d", t_total) 162 | 163 | global_step = 0 164 | tr_loss, logging_loss = 0.0, 0.0 165 | model.zero_grad() 166 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 167 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 168 | curr_best = 0.0 169 | CE = torch.nn.CrossEntropyLoss(reduction='none') 170 | loss_fct = torch.nn.MultiMarginLoss(margin=args.margin) 171 | for _ in train_iterator: 172 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 173 | for step, batch in enumerate(epoch_iterator): 174 | model.train() 175 | num_cand = len(batch[0][0]) 176 | choice_loss = [] 177 | choice_seq_lens = np.array([0]+[len(c) for sample in batch[0] for c in sample]) 178 | choice_seq_lens = np.cumsum(choice_seq_lens) 179 | input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device) 180 | att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device) 181 | input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device) 182 | 183 | if len(input_ids) < args.max_sequence_per_time: 184 | inputs = {'input_ids': input_ids, 185 | 'attention_mask': att_mask} 186 | outputs = model(**inputs) 187 | ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1)) 188 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 189 | else: 190 | ce_loss = [] 191 | for chunk in range(0, len(input_ids), args.max_sequence_per_time): 192 | inputs = {'input_ids': input_ids[chunk:chunk+args.max_sequence_per_time], 193 | 'attention_mask': att_mask[chunk:chunk+args.max_sequence_per_time]} 194 | outputs = model(**inputs) 195 | tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels[chunk:chunk+args.max_sequence_per_time].view(-1)) 196 | tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1) 197 | ce_loss.append(tmp_ce_loss) 198 | ce_loss = torch.cat(ce_loss, dim=0) 199 | # all tokens are valid 200 | for c_i in range(len(choice_seq_lens)-1): 201 | start = choice_seq_lens[c_i] 202 | end = choice_seq_lens[c_i+1] 203 | choice_loss.append(-ce_loss[start:end].sum()/(end-start)) 204 | 205 | choice_loss = torch.stack(choice_loss) 206 | choice_loss = choice_loss.view(-1, num_cand) 207 | loss = loss_fct(choice_loss, batch[3].to(args.device)) 208 | 209 | if args.n_gpu > 1: 210 | loss = loss.mean() # mean() to average on multi-gpu parallel training 211 | if args.gradient_accumulation_steps > 1: 212 | loss = loss / args.gradient_accumulation_steps 213 | 214 | if args.fp16: 215 | with amp.scale_loss(loss, optimizer) as scaled_loss: 216 | scaled_loss.backward() 217 | else: 218 | loss.backward() 219 | 220 | tr_loss += 
loss.item() 221 | if (step + 1) % args.gradient_accumulation_steps == 0: 222 | optimizer.step() 223 | scheduler.step() # Update learning rate schedule 224 | model.zero_grad() 225 | global_step += 1 226 | 227 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 228 | # Log metrics 229 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 230 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 231 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 232 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 233 | logging_loss = tr_loss 234 | 235 | if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0: 236 | results = evaluate(args, model, tokenizer, eval_dataset) 237 | for key, value in results.items(): 238 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 239 | if results['acc'] > curr_best: 240 | curr_best = results['acc'] 241 | # Save model checkpoint 242 | output_dir = args.output_dir 243 | if not os.path.exists(output_dir): 244 | os.makedirs(output_dir) 245 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 246 | model_to_save.save_pretrained(output_dir) 247 | tokenizer.save_pretrained(output_dir) 248 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 249 | logger.info("Saving model checkpoint to %s", output_dir) 250 | 251 | 252 | if args.max_steps > 0 and global_step > args.max_steps: 253 | epoch_iterator.close() 254 | break 255 | if args.max_steps > 0 and global_step > args.max_steps: 256 | train_iterator.close() 257 | break 258 | results = evaluate(args, model, tokenizer, eval_dataset) 259 | for key, value in results.items(): 260 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 261 | if results['acc'] > curr_best: 262 | curr_best = results['acc'] 263 | # Save model checkpoint 264 | output_dir = args.output_dir 265 | if not os.path.exists(output_dir): 266 | os.makedirs(output_dir) 267 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 268 | model_to_save.save_pretrained(output_dir) 269 | tokenizer.save_pretrained(output_dir) 270 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 271 | logger.info("Saving model checkpoint to %s", output_dir) 272 | if args.local_rank in [-1, 0]: 273 | tb_writer.close() 274 | return global_step, tr_loss / global_step 275 | 276 | def save_logits(logits_all, filename): 277 | with open(filename, "w") as f: 278 | for i in range(len(logits_all)): 279 | for j in range(len(logits_all[i])): 280 | f.write(str(logits_all[i][j])) 281 | if j == len(logits_all[i])-1: 282 | f.write("\n") 283 | else: 284 | f.write(" ") 285 | 286 | def evaluate(args, model, tokenizer, eval_dataset): 287 | results = {} 288 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 289 | os.makedirs(args.output_dir) 290 | 291 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 292 | # Note that DistributedSampler samples randomly 293 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 294 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=mCollateFn) 295 | 296 | # Eval! 
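# (Descriptive note, added for readability:) scoring mirrors training — every
# candidate is expanded into one masked copy per scored token (capped by
# --max_words_to_mask), the per-copy MLM losses are averaged over the candidate
# and negated to form its score, and argmax over candidate scores gives the
# prediction.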
297 | logger.info("***** Running evaluation *****") 298 | logger.info(" Num examples = %d", len(eval_dataset)) 299 | logger.info(" Batch size = %d", args.eval_batch_size) 300 | CE = torch.nn.CrossEntropyLoss(reduction='none') 301 | preds = [] 302 | out_label_ids = [] 303 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 304 | model.eval() 305 | with torch.no_grad(): 306 | num_cand = len(batch[0][0]) 307 | choice_loss = [] 308 | choice_seq_lens = np.array([0]+[len(c) for sample in batch[0] for c in sample]) 309 | choice_seq_lens = np.cumsum(choice_seq_lens) 310 | input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device) 311 | att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device) 312 | input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device) 313 | if len(input_ids) < args.max_sequence_per_time: 314 | inputs = {'input_ids': input_ids, 315 | 'attention_mask': att_mask} 316 | outputs = model(**inputs) 317 | ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1)) 318 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 319 | else: 320 | ce_loss = [] 321 | for chunk in range(0, len(input_ids), args.max_sequence_per_time): 322 | inputs = {'input_ids': input_ids[chunk:chunk+args.max_sequence_per_time], 323 | 'attention_mask': att_mask[chunk:chunk+args.max_sequence_per_time]} 324 | outputs = model(**inputs) 325 | tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels[chunk:chunk+args.max_sequence_per_time].view(-1)) 326 | tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1) 327 | ce_loss.append(tmp_ce_loss) 328 | ce_loss = torch.cat(ce_loss, dim=0) 329 | for c_i in range(len(choice_seq_lens)-1): 330 | start = choice_seq_lens[c_i] 331 | end = choice_seq_lens[c_i+1] 332 | choice_loss.append(-ce_loss[start:end].sum()/(end-start)) 333 | choice_loss = torch.stack(choice_loss) 334 | choice_loss = choice_loss.view(-1, num_cand) 335 | preds.append(choice_loss) 336 | out_label_ids.append(batch[3].numpy()) 337 | preds = torch.cat(preds, dim=0).cpu().numpy() 338 | save_logits(preds.tolist(), os.path.join(args.output_dir, args.logits_file)) 339 | preds = np.argmax(preds, axis=1) 340 | result = accuracy(preds, np.concatenate(out_label_ids, axis=0)) 341 | results.update(result) 342 | output_eval_file = os.path.join(args.output_dir, args.results_file) 343 | with open(output_eval_file, "w") as writer: 344 | logger.info("***** Eval results *****") 345 | for key in sorted(result.keys()): 346 | logger.info(" %s = %s", key, str(result[key])) 347 | writer.write("%s = %s\n" % (key, str(result[key]))) 348 | return results 349 | 350 | def write_data(filename, data): 351 | with open(filename, 'w') as fout: 352 | for sample in data: 353 | fout.write(json.dumps(sample)) 354 | fout.write('\n') 355 | 356 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 357 | if args.local_rank not in [-1, 0] and not evaluate: 358 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 359 | processor = myprocessors[task](args) 360 | cached_features_file = os.path.join(args.output_dir, 'cached_{}_{}_{}_{}'.format( 361 | 'dev', 362 | str(args.model_type), 363 | str(args.max_seq_length), 364 | str(task))) 365 | if evaluate and os.path.exists(cached_features_file): 366 | features = torch.load(cached_features_file) 367 | else: 368 | examples = processor.get_dev_examples() if evaluate 
else processor.get_train_examples() 369 | features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 370 | if evaluate: 371 | torch.save(features, cached_features_file) 372 | if args.local_rank == 0 and not evaluate: 373 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 374 | print ('max_words_to_mask is %s for pretraining tasks %s' % (args.max_words_to_mask, task)) 375 | return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask) 376 | 377 | def main(): 378 | parser = argparse.ArgumentParser() 379 | 380 | ## Required parameters 381 | parser.add_argument("--train_file", default=None, type=str, required=True, 382 | help="The train file name") 383 | parser.add_argument("--dev_file", default=None, type=str, required=True, 384 | help="The dev file name") 385 | parser.add_argument("--model_type", default=None, type=str, required=True, 386 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 387 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 388 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 389 | parser.add_argument("--config_name", default="", type=str, 390 | help="Pretrained config name or path if not the same as model_name") 391 | parser.add_argument("--tokenizer_name", default="", type=str, 392 | help="Pretrained tokenizer name or path if not the same as model_name") 393 | parser.add_argument("--cache_dir", default="", type=str, 394 | help="Where do you want to store the pre-trained models downloaded from s3") 395 | parser.add_argument("--task_name", default=None, type=str, required=True, 396 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 397 | parser.add_argument("--output_dir", default=None, type=str, required=True, 398 | help="The output directory where the model predictions and checkpoints will be written.") 399 | 400 | ## Other parameters 401 | parser.add_argument("--second_train_file", default=None, type=str, 402 | help="Used when combining ATOMIC and CWWV") 403 | parser.add_argument("--second_dev_file", default=None, type=str, 404 | help="Used when combining ATOMIC and CWWV") 405 | parser.add_argument("--max_seq_length", default=128, type=int, 406 | help="The maximum total input sequence length after tokenization. 
Sequences longer " 407 | "than this will be truncated, sequences shorter will be padded.") 408 | parser.add_argument("--max_words_to_mask", default=6, type=int, 409 | help="The maximum number of tokens to mask when computing scores") 410 | parser.add_argument("--max_sequence_per_time", default=80, type=int, 411 | help="The maximum number of sequences to feed into the model") 412 | parser.add_argument("--do_train", action='store_true', 413 | help="Whether to run training.") 414 | parser.add_argument("--do_eval", action='store_true', 415 | help="Whether to run eval on the dev set.") 416 | parser.add_argument("--evaluate_during_training", action='store_true', 417 | help="Run evaluation during training at each logging step.") 418 | parser.add_argument("--do_lower_case", action='store_true', 419 | help="Set this flag if you are using an uncased model.") 420 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 421 | help="Batch size per GPU/CPU for training.") 422 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 423 | help="Batch size per GPU/CPU for evaluation.") 424 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 425 | help="Number of updates steps to accumulate before performing a backward/update pass.") 426 | parser.add_argument("--margin", default=1.0, type=float, 427 | help="The margin for ranking loss") 428 | parser.add_argument("--learning_rate", default=1e-5, type=float, 429 | help="The initial learning rate for Adam.") 430 | parser.add_argument("--weight_decay", default=0.01, type=float, 431 | help="Weight deay if we apply some.") 432 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 433 | help="Epsilon for Adam optimizer.") 434 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 435 | help="Max gradient norm.") 436 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 437 | help="Total number of training epochs to perform.") 438 | parser.add_argument("--max_steps", default=-1, type=int, 439 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 440 | parser.add_argument("--warmup_steps", default=0, type=int, 441 | help="Linear warmup over warmup_steps.") 442 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 443 | help="Linear warmup over warmup proportion.") 444 | parser.add_argument('--logging_steps', type=int, default=50, 445 | help="Log every X updates steps.") 446 | parser.add_argument('--save_steps', type=int, default=50, 447 | help="Save checkpoint every X updates steps.") 448 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 449 | help="The file where prediction logits will be written") 450 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 451 | help="The file where eval results will be written") 452 | parser.add_argument("--no_cuda", action='store_true', 453 | help="Avoid using CUDA when available") 454 | parser.add_argument('--overwrite_output_dir', action='store_true', 455 | help="Overwrite the content of the output directory") 456 | parser.add_argument('--seed', type=int, default=2555, 457 | help="random seed for initialization") 458 | parser.add_argument('--fp16', action='store_true', 459 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 460 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 461 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
462 | "See details at https://nvidia.github.io/apex/amp.html") 463 | parser.add_argument("--local_rank", type=int, default=-1, 464 | help="For distributed training: local_rank") 465 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 466 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 467 | args = parser.parse_args() 468 | 469 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir and args.do_train: 470 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 471 | if not os.path.exists(args.output_dir): 472 | os.makedirs(args.output_dir) 473 | 474 | # Setup CUDA, GPU & distributed training 475 | if args.local_rank == -1 or args.no_cuda: 476 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 477 | args.n_gpu = torch.cuda.device_count() 478 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 479 | torch.cuda.set_device(args.local_rank) 480 | device = torch.device("cuda", args.local_rank) 481 | torch.distributed.init_process_group(backend='nccl') 482 | args.n_gpu = 1 483 | args.device = device 484 | 485 | if args.do_train: 486 | for handler in logging.root.handlers[:]: 487 | logging.root.removeHandler(handler) 488 | # Setup logging 489 | if args.do_train: 490 | log_file = os.path.join(args.output_dir, 'train.log') 491 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 492 | datefmt = '%m/%d/%Y %H:%M:%S', 493 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 494 | filename=log_file) 495 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 496 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 497 | os.system("cp run_pretrain.py %s" % os.path.join(args.output_dir, 'run_pretrain.py')) 498 | os.system("cp data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 499 | 500 | # Set seed 501 | set_seed(args) 502 | args.task_name = args.task_name.lower() 503 | if args.task_name not in myprocessors: 504 | raise ValueError("Task not found: %s" % (args.task_name)) 505 | 506 | args.model_type = args.model_type.lower() 507 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 508 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 509 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 510 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 511 | 512 | count = count_parameters(model) 513 | print (count) 514 | 515 | if args.local_rank == 0: 516 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 517 | 518 | model.to(args.device) 519 | 520 | logger.info("Training/evaluation parameters %s", args) 521 | 522 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 523 | if args.do_train: 524 | init_result = evaluate(args, model, tokenizer, eval_dataset) 525 | print (init_result) 526 | if args.do_train: 527 | 
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 528 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset) 529 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 530 | # Evaluation 531 | results = {} 532 | if args.do_eval: 533 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 534 | model = model_class.from_pretrained(args.output_dir) 535 | model.eval() 536 | model.to(args.device) 537 | results.update(evaluate(args, model, tokenizer, eval_dataset)) 538 | return results 539 | 540 | if __name__ == "__main__": 541 | main() --------------------------------------------------------------------------------
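Both pretraining scripts above share the same objective: each answer candidate is scored by its negated, length-normalized token loss, and a multi-class margin loss ranks the gold candidate above the distractors. The following is a minimal, self-contained sketch of that objective; the per-candidate losses and token counts below are invented for illustration and are not taken from the repository.

import torch

torch.manual_seed(0)

margin = 1.0

# Hypothetical per-candidate quantities: summed cross-entropy over the scored
# tokens of each candidate, and the number of scored tokens (made-up values).
summed_ce = torch.tensor([[4.2, 9.1, 7.5],
                          [8.0, 3.3, 6.6]])
num_tokens = torch.tensor([[6.0, 7.0, 5.0],
                           [8.0, 4.0, 6.0]])

# Score = negated average token loss, so the most fluent candidate scores highest.
scores = -(summed_ce / num_tokens)

# Index of the gold candidate for each example.
labels = torch.tensor([0, 1])

# MultiMarginLoss pushes the gold score above every distractor by at least `margin`.
loss_fct = torch.nn.MultiMarginLoss(margin=margin)
loss = loss_fct(scores, labels)

# Evaluation reuses the same scores: the argmax candidate is the prediction.
pred = scores.argmax(dim=1)
print(loss.item(), pred.tolist())

In the actual scripts, the summed cross-entropy and token counts come from GPT2LMHeadModel (causal LM loss over each candidate) or RobertaForMaskedLM (loss over one masked copy per token), and the --margin argument controls the hinge width of the ranking loss.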