├── requirements.txt ├── src ├── Data_generation │ ├── filter_CWWV.py │ ├── generate_from_CWWV.py │ └── generate_from_ATOMIC.py ├── Training │ ├── AFLite │ │ ├── custimized_models.py │ │ ├── run_AFLite.py │ │ └── run_roberta_classification.py │ ├── data_utils.py │ ├── MLM │ │ ├── run_lm_gpt2.py │ │ └── run_mlm_roberta.py │ ├── run_pretrain_gpt2.py │ └── run_pretrain.py └── Evaluation │ ├── evaluate_GPT2.py │ └── evaluate_RoBERTa.py ├── .gitignore ├── README.md └── LICENSE /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.5.1 2 | transformers==3.0.2 3 | overrides==3.0.0 4 | ftfy==5.6 5 | nltk==3.4.5 6 | sentence-transformers==0.3.4 7 | tensorboard==2.0 8 | wordfreq==2.3.2 9 | -------------------------------------------------------------------------------- /src/Data_generation/filter_CWWV.py: -------------------------------------------------------------------------------- 1 | from wordfreq import word_frequency 2 | import json 3 | from tqdm import tqdm 4 | import argparse 5 | import random 6 | import os 7 | random.seed(1) 8 | threshold=1e-06 9 | 10 | def write_data(data, dest): 11 | with open(dest, 'w') as w: 12 | for x in data: 13 | w.write(json.dumps(x) + '\n') 14 | 15 | def get_answer(data): 16 | answers={} 17 | for choice in data['question']['choices']: 18 | answers[choice['label']]=choice['text'] 19 | return answers[data['answerKey']] 20 | 21 | 22 | if __name__=="__main__": 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--input_file", type=str, default=None, required=True, 26 | help="Input file with artificial QA data") 27 | parser.add_argument('--do_split', action="store_true", help="Further split training set into subsets for AFLite") 28 | args = parser.parse_args() 29 | 30 | common_concepts_omcs=[] 31 | with open(args.input_file, 'r') as f: 32 | for line in tqdm(f, total=500000): 33 | qdata=json.loads(line) 34 | head_label=qdata['question']['head'] 35 | source=qdata['question']['source'] 36 | answer=get_answer(qdata) 37 | is_concept=source=='omcs' or (head_label.islower() and answer.islower()) 38 | is_common=(word_frequency(head_label, 'en')>=threshold and word_frequency(answer, 'en')>=threshold) 39 | if is_concept and is_common and source == 'omcs': 40 | common_concepts_omcs.append(qdata) 41 | 42 | print('common concepts omcs', len(common_concepts_omcs)) 43 | random.shuffle(common_concepts_omcs) 44 | train_set = common_concepts_omcs[:int(len(common_concepts_omcs)*0.95)] 45 | dev_set = common_concepts_omcs[int(len(common_concepts_omcs)*0.95):] 46 | basename = os.path.basename(args.input_file) 47 | write_data(train_set, args.input_file.replace(basename, 'train_'+basename)) 48 | write_data(dev_set, args.input_file.replace(basename, 'dev_'+basename)) 49 | if args.do_split: 50 | assert 'random' in args.input_file 51 | print ('splitting train into subsets, which can be used for AFLite (only valid for random strategy)') 52 | train_set_1 = train_set[:int(len(train_set)*0.01)] 53 | train_set_4 = train_set[int(len(train_set)*0.01):int(len(train_set)*0.05)] 54 | train_set_95 = train_set[int(len(train_set)*0.05):] 55 | write_data(train_set_1, 'train_1%_'+args.input_file) 56 | write_data(train_set_4, 'train_4%_'+args.input_file) 57 | write_data(train_set_95, 'train_95%_'+args.input_file) 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 
| __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /src/Training/AFLite/custimized_models.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | import logging 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn import CrossEntropyLoss 8 | 9 | from transformers import BertPreTrainedModel, RobertaConfig, RobertaModel 10 | 11 | class RobertaForMultipleChoice(BertPreTrainedModel): 12 | config_class = RobertaConfig 13 | base_model_prefix = "roberta" 14 | 15 | def __init__(self, config): 16 | super().__init__(config) 17 | 18 | self.roberta = RobertaModel(config) 19 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 20 | self.classifier = nn.Linear(config.hidden_size, 1) 21 | 22 | self.init_weights() 23 | 24 | def forward( 25 | self, 26 | input_ids=None, 27 | token_type_ids=None, 28 | attention_mask=None, 29 | labels=None, 30 | position_ids=None, 31 | head_mask=None, 32 | inputs_embeds=None, 33 | output_attentions=None, 34 | output_hidden_states=None, 35 | ): 36 | num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] 37 | 38 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None 39 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 40 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 41 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 42 | flat_inputs_embeds = ( 43 | inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) 44 | if inputs_embeds is not None 45 | else None 46 | ) 47 | 48 | outputs = self.roberta( 49 | flat_input_ids, 50 | position_ids=flat_position_ids, 51 | token_type_ids=flat_token_type_ids, 52 | attention_mask=flat_attention_mask, 53 | head_mask=head_mask, 54 | inputs_embeds=flat_inputs_embeds, 55 | output_attentions=output_attentions, 56 | output_hidden_states=output_hidden_states, 57 | ) 58 | pooled_output = outputs[1] 59 | 60 | pooled_output = self.dropout(pooled_output) 61 | logits = self.classifier(pooled_output) 62 | reshaped_logits = logits.view(-1, num_choices) 63 | 64 | outputs = (reshaped_logits,pooled_output,) + outputs[2:] # add hidden states and attention if they are here 65 | 66 | if labels is not None: 67 | loss_fct = CrossEntropyLoss() 68 | loss = loss_fct(reshaped_logits, labels) 69 | outputs = (loss,) + outputs 70 | 71 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions) 72 | -------------------------------------------------------------------------------- /src/Training/AFLite/run_AFLite.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import numpy as np 
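# Overview of the AFLite (lightweight adversarial filtering) procedure implemented below:
#   1. Load precomputed per-choice embeddings (<file>_features) and labels (<file>_labels)
#      for the train and dev files.
#   2. While more than `target_size` (20% of the original) training samples remain:
#      run 64 iterations; each one shuffles the data, trains a linear probe
#      (1024 -> 1 score per choice) on a `target_size` slice, and records for every
#      held-out sample whether the probe classified it correctly. A sample's
#      predictability is #correct / #times evaluated; up to the top 2% most
#      predictable samples with predictability > 0.75 are dropped from train
#      (dev is filtered analogously). Stop early if fewer than the cutoff were removed.
#   3. Write the surviving samples to <file>_adv-filter.jsonl next to the input files.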
4 | import torch 5 | from collections import Counter 6 | from tqdm import tqdm 7 | import argparse 8 | correct_count = Counter() 9 | chosen_count = Counter() 10 | dev_correct_count = Counter() 11 | dev_chosen_count = Counter() 12 | 13 | def set_seed(seed): 14 | random.seed(seed) 15 | np.random.seed(seed) 16 | torch.manual_seed(seed) 17 | 18 | def read_data(filename): 19 | data = [] 20 | with open(filename, 'r') as f: 21 | for line in f: 22 | data.append(json.loads(line)) 23 | return data 24 | 25 | def write_data(filename, data): 26 | with open(filename, 'w') as fout: 27 | for sample in data: 28 | fout.write(json.dumps(sample)) 29 | fout.write('\n') 30 | 31 | def predict(model, features, labels): 32 | n_samples, num_cand, feat_dim = features.shape 33 | if len(features) > 500000: 34 | logits = [] 35 | batch_size = int(len(features)/10) 36 | for b in range(0, len(features), batch_size): 37 | batch_logits = model(features[b:b+batch_size].cuda()) 38 | logits.append(batch_logits.squeeze(2).detach().cpu()) 39 | logits = torch.cat(logits, dim=0).numpy() 40 | else: 41 | features = features.cuda() 42 | logits = model(features) 43 | logits = logits.squeeze(2).detach().cpu().numpy() 44 | preds = np.argmax(logits, axis=1) 45 | acc = (preds == labels).mean() 46 | return preds == labels 47 | 48 | def train_classifier(features, labels): 49 | model = torch.nn.Linear(1024, 1) 50 | model.to('cuda') 51 | optimizer = torch.optim.Adam(model.parameters()) 52 | loss_fct = torch.nn.CrossEntropyLoss() 53 | features = features.cuda() 54 | labels = torch.tensor(labels, dtype=torch.long).cuda() 55 | batch_size = int(len(features)/10) 56 | for i in range(3): 57 | 58 | for b in range(0, len(features), batch_size): 59 | logits = model(features[b:b+batch_size]) 60 | loss = loss_fct(logits.squeeze(2), labels[b:b+batch_size]) 61 | loss.backward() 62 | optimizer.step() 63 | model.zero_grad() 64 | return model 65 | 66 | def run_iteration(features, labels, sample_ids, test_features, test_labels, test_sample_ids, target_size): 67 | global correct_count, chosen_count, dev_correct_count, dev_chosen_count 68 | idx = [_ for _ in range(len(features))] 69 | random.shuffle(idx) 70 | features = features[idx] 71 | labels = labels[idx] 72 | sample_ids = [sample_ids[i] for i in idx] 73 | train_size = target_size 74 | train_feat = features[:train_size] 75 | dev_feat = features[train_size:] 76 | train_labels = labels[:train_size] 77 | dev_labels = labels[train_size:] 78 | train_sample_ids = sample_ids[:train_size] 79 | dev_sample_ids = sample_ids[train_size:] 80 | model = train_classifier(train_feat, train_labels) 81 | preds = predict(model, dev_feat, dev_labels) 82 | chosen_count.update(dev_sample_ids) 83 | correct_ids = [dev_sample_ids[sid] for sid in range(len(dev_sample_ids)) if preds[sid]] 84 | correct_count.update(correct_ids) 85 | test_preds = predict(model, test_features, test_labels) 86 | dev_chosen_count.update(test_sample_ids) 87 | test_correct_ids = [test_sample_ids[sid] for sid in range(len(test_sample_ids)) if test_preds[sid]] 88 | dev_correct_count.update(test_correct_ids) 89 | 90 | def main(): 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--train_file", default=None, type=str, required=True, help="train file") 93 | parser.add_argument("--dev_file", default=None, type=str, required=True, help="dev file") 94 | args = parser.parse_args() 95 | set_seed(1) 96 | data = read_data(args.train_file) 97 | dev_data = read_data(args.dev_file) 98 | print (len(data), len(dev_data)) 99 | features = 
torch.load(args.train_file.replace('.jsonl', '_features')) 100 | torch_labels = torch.load(args.train_file.replace('.jsonl', '_labels')) 101 | dev_features = torch.load(args.dev_file.replace('.jsonl', '_features')) 102 | dev_torch_labels = torch.load(args.dev_file.replace('.jsonl', '_labels')) 103 | print (features.shape, dev_features.shape) 104 | if 'correct' in data[0]: 105 | labels = [sample['correct'] for sample in data] 106 | dev_labels = [sample['correct'] for sample in dev_data] 107 | else: 108 | mapping = {'A':0, 'B':1, 'C':2} 109 | labels = [mapping[sample['answerKey']] for sample in data] 110 | dev_labels = [mapping[sample['answerKey']] for sample in dev_data] 111 | print (torch_labels.shape, dev_torch_labels.shape) 112 | print (np.array(labels).shape, np.array(dev_labels).shape) 113 | assert all(np.array(labels) == torch_labels) 114 | assert all(np.array(dev_labels) == dev_torch_labels) 115 | sample_ids = [sample['id'] for sample in data] 116 | dev_sample_ids = [sample['id'] for sample in dev_data] 117 | labels = np.array(labels) 118 | dev_labels = np.array(dev_labels) 119 | target_size = int(len(features)*0.2) 120 | cutoff_size = int(len(features)*0.02) 121 | dev_cutoff_size = int(len(dev_features)*0.02) 122 | print ('target size', target_size) 123 | global correct_count, chosen_count, dev_correct_count, dev_chosen_count 124 | while len(features) > target_size: 125 | correct_count = Counter() 126 | chosen_count = Counter() 127 | dev_correct_count = Counter() 128 | dev_chosen_count = Counter() 129 | for i in tqdm(range(64)): 130 | run_iteration(features, labels, sample_ids, dev_features, dev_labels, dev_sample_ids, target_size) 131 | for k, v in correct_count.items(): 132 | correct_count[k] = float(v)/chosen_count[k] 133 | for k, v in dev_correct_count.items(): 134 | dev_correct_count[k] = float(v)/dev_chosen_count[k] 135 | sorted_correct_count = sorted(correct_count.items(), key=lambda x: x[1], reverse=True) 136 | sorted_dev_correct_count = sorted(dev_correct_count.items(), key=lambda x: x[1], reverse=True) 137 | easy_train = [s[0] for s in sorted_correct_count[:cutoff_size] if s[1] > 0.75] 138 | easy_dev = [s[0] for s in sorted_dev_correct_count[:dev_cutoff_size] if s[1] > 0.75] 139 | 140 | kept_idx = [sid for sid in range(len(sample_ids)) if sample_ids[sid] not in easy_train] 141 | newly_removed = len(features) - len(kept_idx) 142 | features = features[kept_idx] 143 | labels = labels[kept_idx] 144 | sample_ids = [sample_ids[ki] for ki in kept_idx] 145 | dev_kept_ids = [sid for sid in range(len(dev_sample_ids)) if dev_sample_ids[sid] not in easy_dev] 146 | dev_features = dev_features[dev_kept_ids] 147 | dev_labels = dev_labels[dev_kept_ids] 148 | dev_sample_ids = [dev_sample_ids[ki] for ki in dev_kept_ids] 149 | print ('now keeping train', len(kept_idx), 'dev', len(dev_kept_ids)) 150 | if newly_removed < cutoff_size: 151 | break 152 | print ('finally keeping train', len(sample_ids), 'dev', len(dev_sample_ids)) 153 | kept = Counter(sample_ids) 154 | kept_data = [sample for sample in data if sample['id'] in kept] 155 | dev_kept = Counter(dev_sample_ids) 156 | dev_kept_data = [sample for sample in dev_data if sample['id'] in dev_kept] 157 | write_data(args.train_file.replace('.jsonl', '_adv-filter.jsonl'), kept_data) 158 | write_data(args.dev_file.replace('.jsonl', '_adv-filter.jsonl'), dev_kept_data) 159 | 160 | if __name__ == "__main__": 161 | main() -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Knowledge-driven Data Construction for Zero-shot Evaluation in Commonsense Question Answering
2 | This repository contains the code for the paper "Knowledge-driven Data Construction for Zero-shot Evaluation in Commonsense Question Answering" (AAAI 2021). See the full paper [here](https://arxiv.org/abs/2011.03863).
3 | 
4 | Note that our evaluation code is adapted from the [self-talk repo](https://github.com/vered1986/self_talk).
5 | 
6 | ## Environments
7 | This code has been tested with Python 3.7.6, PyTorch 1.5.1, and Transformers 3.0.2. You can install the required packages with
8 | ```
9 | pip install -r requirements.txt
10 | ```
11 | 
12 | ## Data generation
13 | Our synthetic QA sets can be downloaded from [here](https://drive.google.com/file/d/1qp2Exh88m1LT8iyDvt8TOAXhGdHQhP2B/view?usp=sharing); uncompress it and place it in the HyKAS-CSKG root directory.
14 | 
15 | If you would like to generate data from scratch, first `cd` to the `src/Data_generation` directory.
16 | 
17 | For the **ATOMIC** synthetic sets, download ATOMIC from the [official website](https://homes.cs.washington.edu/~msap/atomic/) and uncompress it.
18 | Then run
19 | ```
20 | python generate_from_ATOMIC.py --train_KG atomic/v4_atomic_trn.csv --dev_KG atomic/v4_atomic_dev.csv --strategy random --out_dir ../../data/ATOMIC
21 | ```
22 | 
23 | For **CWWV**, download `cskg_connected.tsv` from [here](https://drive.google.com/file/d/11TiW3pAHnt6l8yuIWpowzOMuM8fq7ff6/view?usp=sharing) and `cache.pkl` from [here](https://drive.google.com/file/d/19tcSaKi-Efz8IH-HX0oBkYtalnqOseZj/view?usp=sharing), then run:
24 | ```
25 | python generate_from_CWWV.py --cskg_file cskg_connected.tsv --lex_cache cache.pkl --out_dir ../../data/CWWV --strategy random
26 | python filter_CWWV.py --input_file ../../data/CWWV/random.jsonl
27 | ```
28 | 
29 | ## Pretraining on Synthetic QA sets
30 | We provide the following pretrained models:
31 | LM | KG | Download
32 | ---|---|---
33 | RoBERTa-Large | ATOMIC | [Download](https://drive.google.com/file/d/1oTYV5YZRlXtMSZW9_pTjyMn6o8yrPU2N/view?usp=sharing)
34 | RoBERTa-Large | CWWV | [Download](https://drive.google.com/file/d/1Ot-x3WJoFWYUTyyDSMeG2CrKDmCTggxM/view?usp=sharing)
35 | RoBERTa-Large | CSKG | [Download](https://drive.google.com/file/d/1nfWtIfrQk4REp7oGUyyn1ShT7aEvMI9E/view?usp=sharing)
36 | GPT2-Large | ATOMIC | [Download](https://drive.google.com/file/d/1lENyTTBogmRIK_M7cu_uxeD8AiWBo7Ko/view?usp=sharing)
37 | GPT2-Large | CWWV | [Download](https://drive.google.com/file/d/1dnqdW-5d6tULZfDaejViVrjuNx-nY8sP/view?usp=sharing)
38 | GPT2-Large | CSKG | [Download](https://drive.google.com/file/d/1VUBAxtyKElmbNTxSkIdPjR88PkEjbc-2/view?usp=sharing)
39 | 
40 | If you would like to train models from scratch, you can use the following commands under `src/Training`.
41 | 
42 | For RoBERTa:
43 | ```
44 | CUDA_VISIBLE_DEVICES=0 python run_pretrain.py --model_type roberta-mlm --model_name_or_path roberta-large --task_name cskg --output_dir ../../out_dir --max_sequence_per_time 200 \
45 | --train_file ../../data/ATOMIC/train_random.jsonl --second_train_file ../../data/CWWV/train_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --second_dev_file \
46 | ../../data/CWWV/dev_random.jsonl --max_seq_length 128 --max_words_to_mask 6 --do_train --do_eval --per_gpu_train_batch_size 2 --gradient_accumulation_steps 16 \
47 | --learning_rate 1e-5 --num_train_epochs 1 --warmup_proportion 0.05 --evaluate_during_training --per_gpu_eval_batch_size 8 --save_steps 6500 --margin 1.0
48 | ```
49 | For GPT2:
50 | ```
51 | CUDA_VISIBLE_DEVICES=0 python run_pretrain_gpt2.py --model_type gpt2 --model_name_or_path gpt2-large --task_name cskg --output_dir ../../out_dir --train_file ../../data/ATOMIC/train_random.jsonl \
52 | --second_train_file ../../data/CWWV/train_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --second_dev_file ../../data/CWWV/dev_random.jsonl \
53 | --max_seq_length 128 --do_train --do_eval --per_gpu_train_batch_size 2 --gradient_accumulation_steps 16 --learning_rate 1e-5 --num_train_epochs 1 --warmup_proportion 0.05 \
54 | --evaluate_during_training --per_gpu_eval_batch_size 8 --save_steps 6500 --margin 1.0
55 | ```
56 | 
57 | ## Evaluation
58 | For LM baselines, `cd` to the `src/Evaluation` directory and run
59 | ```
60 | python evaluate_RoBERTa.py --lm roberta-large --dataset_file DATA_FILE --out_dir ../../results --device 1 --reader TASK_NAME
61 | python evaluate_GPT2.py --lm gpt2-large --dataset_file DATA_FILE --out_dir ../../results --device 1 --reader TASK_NAME
62 | ```
63 | For pretrained models, simply point the `--lm` flag to your model directory, for example:
64 | ```
65 | python evaluate_RoBERTa.py --lm ../../models/roberta_cskg --dataset_file ../../tasks/commonsenseqa_dev.jsonl --out_dir ../../results --device 1 --reader commonsenseqa
66 | python evaluate_GPT2.py --lm ../../models/gpt2_cskg --dataset_file ../../tasks/commonsenseqa_dev.jsonl --out_dir ../../results --device 1 --reader commonsenseqa
67 | ```
68 | 
69 | ## MLM ablation
70 | To run the MLM pretraining experiments (comparison of training regimes), `cd` to `src/Training/MLM` and run
71 | ```
72 | CUDA_VISIBLE_DEVICES=0 python run_mlm_roberta.py --model_type roberta-mlm --model_name_or_path roberta-large --task_name atomicmlm --output_dir ../../out_dir --train_file \
73 | ../../data/ATOMIC/train_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --mlm_probability 0.5 --max_seq_length 128 --max_words_to_mask 6 --max_sequence_per_time 200 \
74 | --do_train --do_eval --per_gpu_train_batch_size 8 --gradient_accumulation_steps 4 --learning_rate 1e-5 --num_train_epochs 3 --warmup_proportion 0.05 --evaluate_during_training \
75 | --per_gpu_eval_batch_size 8 --save_steps 5000
76 | ```
77 | Then follow the same evaluation commands as above to evaluate the models.
78 | 
79 | ## AFLite
80 | To generate adversarially filtered datasets using the AFLite algorithm, first run the data generation code with the `--do_split` flag
81 | ```
82 | python generate_from_ATOMIC.py --train_KG atomic/v4_atomic_trn.csv --dev_KG atomic/v4_atomic_dev.csv --strategy random --out_dir ../../data/ATOMIC --do_split
83 | ```
84 | This will split the training set into 3 subsets. We can then train a feature function; `cd` to the `src/Training/AFLite` directory and run
85 | ```
86 | CUDA_VISIBLE_DEVICES=0 python run_roberta_classification.py --model_type roberta-mc --model_name_or_path roberta-large --task_name cwwv --output_dir ../../out_dir --train_file \
87 | ../../data/ATOMIC/train_4%_random.jsonl --dev_file ../../data/ATOMIC/train_1%_random.jsonl --max_seq_length 128 --per_gpu_eval_batch_size 16 --do_train --do_eval \
88 | --evaluate_during_training --per_gpu_train_batch_size 4 --gradient_accumulation_steps 8 --learning_rate 1e-5 --num_train_epochs 3 --warmup_proportion 0.05 --save_steps 150
89 | ```
90 | Then we compute the embeddings for the remaining 95% of the train set and for the dev set
91 | ```
92 | CUDA_VISIBLE_DEVICES=0 python run_roberta_classification.py --model_type roberta-mc --model_name_or_path roberta-large --task_name cwwv
--output_dir ../../out_dir --train_file \ 93 | ../../data/ATOMIC/train_4%_random.jsonl --dev_file ../../data/ATOMIC/train_95%_random.jsonl --max_seq_length 128 --per_gpu_eval_batch_size 16 --do_eval 94 | CUDA_VISIBLE_DEVICES=0 python run_roberta_classification.py --model_type roberta-mc --model_name_or_path roberta-large --task_name cwwv --output_dir ../../out_dir --train_file \ 95 | ../../data/ATOMIC/train_4%_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl --max_seq_length 128 --per_gpu_eval_batch_size 16 --do_eval 96 | ``` 97 | To run AFLite 98 | ``` 99 | python run_AFLite.py --train_file ../../data/ATOMIC/train_95%_random.jsonl --dev_file ../../data/ATOMIC/dev_random.jsonl 100 | ``` 101 | This will produce the AFLite filtered output files at the same location as input files, which can be used for pretraining the models. 102 | 103 | ## Cite 104 | ``` 105 | @misc{ma2020knowledgedriven, 106 | title={Knowledge-driven Data Construction for Zero-shot Evaluation in Commonsense Question Answering}, 107 | author={Kaixin Ma and Filip Ilievski and Jonathan Francis and Yonatan Bisk and Eric Nyberg and Alessandro Oltramari}, 108 | year={2020}, 109 | eprint={2011.03863}, 110 | archivePrefix={arXiv}, 111 | primaryClass={cs.CL} 112 | } 113 | ``` 114 | -------------------------------------------------------------------------------- /src/Training/data_utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from tqdm import tqdm 4 | import json 5 | import re 6 | import ftfy 7 | import random 8 | from collections import Counter 9 | import unicodedata 10 | import string 11 | import nltk 12 | from nltk.corpus import stopwords 13 | skip_words = set(stopwords.words('english')) 14 | skip_words.add('\'s') 15 | skip_words.add('.') 16 | skip_words.add(',') 17 | PERSON_NAMES = ['Alex', 'Ash', 'Aspen', 'Bali', 'Berkeley', 'Cameron', 'Chris', 'Cody', 'Dana', 'Drew', 'Emory', 'Flynn', 'Gale', 'Jamie', 'Jesse', 18 | 'Kai', 'Kendall', 'Kyle', 'Lee', 'Logan', 'Max', 'Morgan', 'Nico', 'Paris', 'Pat', 'Quinn', 'Ray', 'Robin', 'Rowan', 'Rudy', 'Sam', 'Skylar', 'Sydney', 19 | 'Taylor', 'Tracy', 'West', 'Wynne'] 20 | logger = logging.getLogger(__name__) 21 | 22 | def accuracy(out, labels): 23 | return {'acc': (out == labels).mean()} 24 | 25 | def handle_words(span, tokenizer, keywords=None, is_start=False): 26 | inputs = [] 27 | labels = [] 28 | words = nltk.word_tokenize(span) 29 | for w_i, w in enumerate(words): 30 | if (w_i == 0 and is_start) or w == '.' 
or w == ',' or w.startswith('\''): 31 | w_bpes = tokenizer.tokenize(w) 32 | else: 33 | w_bpes = tokenizer.tokenize(w, add_prefix_space=True) 34 | inputs.extend(w_bpes) 35 | if keywords != None: 36 | if w in keywords: 37 | labels.extend(w_bpes) 38 | else: 39 | labels.extend([-100]*len(w_bpes)) 40 | else: 41 | if w not in PERSON_NAMES and w not in skip_words and w.lower() not in skip_words: 42 | labels.extend(w_bpes) 43 | else: 44 | labels.extend([-100]*len(w_bpes)) 45 | return inputs, labels 46 | 47 | def handle_underscores(suffix, tokenizer, keywords=None, prefix=False): 48 | inputs = [] 49 | labels = [] 50 | if '_' in suffix: 51 | suffix_parts = [i.strip() for i in suffix.split('___')] 52 | for i, part in enumerate(suffix_parts): 53 | if part: 54 | tmp_inputs, tmp_labels = handle_words(part, tokenizer, keywords=keywords, is_start=(i==0 and prefix)) 55 | inputs += tmp_inputs 56 | labels += tmp_labels 57 | 58 | if i != len(suffix_parts) - 1 and suffix_parts[i+1]: 59 | inputs.append(tokenizer.mask_token) 60 | labels.append(-100) 61 | else: 62 | inputs.append(tokenizer.mask_token) 63 | labels.append(-100) 64 | else: 65 | inputs, labels = handle_words(suffix, tokenizer, keywords=keywords, is_start=prefix) 66 | return inputs, labels 67 | 68 | def convert_examples_to_features(examples, tokenizer, max_length=512): 69 | data = [] 70 | for example in examples: 71 | inputs, labels = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 72 | choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']] 73 | input_ids = [inputs+cand[0] for cand in choices] 74 | input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids] 75 | label_ids = [labels+cand[1] for cand in choices] 76 | label_ids = [[t if t == -100 else input_ids[i][t_i] for t_i, t in enumerate(cand)] for i, cand in enumerate(label_ids)] 77 | label_ids = [[-100]+cand+[-100] for cand in label_ids] 78 | input_ids = [tokenizer.prepare_for_model(cand, max_length=max_length, truncation=True)['input_ids'] for cand in input_ids] 79 | data.append([input_ids, label_ids, example['correct']]) 80 | return data 81 | 82 | class ATOMICMLMProcessor(object): 83 | def __init__(self, args): 84 | self.D = [] 85 | self.filelist = [args.train_file, args.dev_file] 86 | 87 | def get_train_examples(self): 88 | self.load_data(self.filelist[0]) 89 | return self.D 90 | 91 | def get_dev_examples(self): 92 | data = [] 93 | with open(self.filelist[1], 'r') as f: 94 | for row in tqdm(f): 95 | sample = json.loads(row) 96 | data.append(sample) 97 | print (len(data)) 98 | return data 99 | 100 | def load_data(self, filename): 101 | with open(filename, "r") as f: 102 | for row in tqdm(f): 103 | sample = json.loads(row) 104 | self.D.append({'id':sample['id'], 'context':sample['context'], 'ending':sample['candidates'][sample['correct']], 'keywords': sample['keywords']}) 105 | print (len(self.D)) 106 | 107 | class ATOMICProcessor(object): 108 | def __init__(self, args): 109 | print ('loading from %s %s' % (args.train_file, args.dev_file)) 110 | self.filelist = [args.train_file, args.dev_file] 111 | self.D = [[], []] 112 | 113 | def get_train_examples(self): 114 | self.load_data(self.filelist[0], 0) 115 | return self.D[0] 116 | 117 | def get_dev_examples(self): 118 | self.load_data(self.filelist[1], 1) 119 | return self.D[1] 120 | 121 | def load_data(self, filename, sid): 122 | with open(filename, "r") as f: 123 | for row in tqdm(f): 124 | sample = json.loads(row) 125 | self.D[sid].append(sample) 126 | print 
(len(self.D[sid])) 127 | 128 | class CWWVProcessor(object): 129 | def __init__(self, args): 130 | self.answerKey_mapping = {'A':0, 'B':1, 'C':2} 131 | self.D = [[], []] 132 | if args.task_name == 'cskg': 133 | print ('loading from %s %s' % (args.second_train_file, args.second_dev_file)) 134 | self.filelist = [args.second_train_file, args.second_dev_file] 135 | else: 136 | print ('loading from %s %s' % (args.train_file, args.dev_file)) 137 | self.filelist = [args.train_file, args.dev_file] 138 | 139 | def get_train_examples(self): 140 | self.load_data(self.filelist[0], 0) 141 | return self.D[0] 142 | 143 | def get_dev_examples(self): 144 | self.load_data(self.filelist[1], 1) 145 | return self.D[1] 146 | 147 | def load_data(self, filename, sid): 148 | skipped = 0 149 | with open(filename, "r") as f: 150 | for row in tqdm(f): 151 | sample = json.loads(row) 152 | context = sample['question']['stem'] 153 | if context.endswith('.'): 154 | context = context[:-1] 155 | if not context.endswith('[MASK]'): 156 | skipped += 1 157 | context_parts = context.split('[MASK]') 158 | context = context_parts[0].strip() 159 | candidates = [c['text']+context_parts[1]+'.' for c in sample['question']['choices']] 160 | else: 161 | context = context[:-7] 162 | candidates = [c['text']+'.' for c in sample['question']['choices']] 163 | label = self.answerKey_mapping[sample['answerKey']] 164 | keywords = nltk.word_tokenize(sample['question']['head']) 165 | keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words] 166 | self.D[sid].append({'id':sample['id'], 'context':context, 'correct':label, 'candidates':candidates, 'keywords':keywords}) 167 | print (len(self.D[sid]), skipped) 168 | 169 | class CWWVMLMProcessor(object): 170 | def __init__(self, args): 171 | self.answerKey_mapping = {'A':0, 'B':1, 'C':2} 172 | self.D = [] 173 | self.filelist = [args.train_file, args.dev_file] 174 | self.args = args 175 | 176 | def get_train_examples(self): 177 | self.load_data(self.filelist[0]) 178 | return self.D 179 | 180 | def get_dev_examples(self): 181 | processor = CSKGProcessor(self.args) 182 | return processor.get_dev_examples() 183 | 184 | def load_data(self, filename): 185 | skipped = 0 186 | with open(filename, "r") as f: 187 | for row in tqdm(f): 188 | sample = json.loads(row) 189 | context = sample['question']['stem'] 190 | if context.endswith('.'): 191 | context = context[:-1] 192 | assert context.endswith('[MASK]') 193 | context = context[:-7] 194 | candidates = [c['text']+'.' 
for c in sample['question']['choices']] 195 | label = self.answerKey_mapping[sample['answerKey']] 196 | keywords = nltk.word_tokenize(sample['question']['head']) 197 | keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words] 198 | self.D.append({'id':sample['id'], 'context':context, 'ending':candidates[label], 'keywords':keywords}) 199 | print (len(self.D)) 200 | 201 | class CSKGProcessor(object): 202 | def __init__(self, args): 203 | # CWWV set always uses second train/dev file params 204 | self.atomicprocessor = ATOMICProcessor(args) 205 | self.cwwvprocessor = CWWVProcessor(args) 206 | 207 | def get_train_examples(self): 208 | cwwv_questions = self.cwwvprocessor.get_train_examples() 209 | atomic_questions = self.atomicprocessor.get_train_examples() 210 | return cwwv_questions+atomic_questions 211 | 212 | def get_dev_examples(self): 213 | cwwv_questions = self.cwwvprocessor.get_dev_examples() 214 | atomic_questions = self.atomicprocessor.get_dev_examples() 215 | return cwwv_questions+atomic_questions 216 | 217 | myprocessors = { 218 | "atomic": ATOMICProcessor, 219 | "cwwv": CWWVProcessor, 220 | "atomicmlm": ATOMICMLMProcessor, 221 | "cwwvmlm": CWWVMLMProcessor, 222 | "cskg": CSKGProcessor 223 | } 224 | -------------------------------------------------------------------------------- /src/Data_generation/generate_from_CWWV.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | import argparse 3 | import sys 4 | import random 5 | import json 6 | from tqdm import tqdm 7 | import pickle as pkl 8 | from string import Template 9 | import numpy as np 10 | from sentence_transformers import SentenceTransformer, util 11 | from os import path 12 | random.seed(1) 13 | num_distractors=2 14 | 15 | good_relations=['/r/Causes', '/r/UsedFor', '/r/CapableOf', '/r/CausesDesire', '/r/IsA', '/r/SymbolOf', '/r/MadeOf', '/r/LocatedNear', '/r/Desires', '/r/AtLocation', '/r/HasProperty', '/r/PartOf', '/r/HasFirstSubevent', '/r/HasLastSubevent'] 16 | 17 | q_sources=set(['CN', 'WD', 'WN']) 18 | dist_only_sources=set(['VG']) 19 | 20 | def format_question(q, a, distractors, q_id, head_label, template, source, rel): 21 | q_entry={} 22 | q_entry['id']=q_id 23 | q_entry['question']={'stem': q} 24 | 25 | answer_key=random.choice(["A", "B", "C"]) 26 | q_entry["answerKey"]=answer_key 27 | if answer_key=="A": 28 | correct_option={"text": a, "label": "A"} 29 | dist1={"text": distractors[0], "label": "B"} 30 | dist2={"text": distractors[1], "label": "C"} 31 | options=[correct_option, dist1, dist2] 32 | elif answer_key=="B": 33 | correct_option={"text": a, "label": "B"} 34 | dist1={"text": distractors[0], "label": "A"} 35 | dist2={"text": distractors[1], "label": "C"} 36 | options=[dist1, correct_option, dist2] 37 | elif answer_key=="C": 38 | correct_option={"text": a, "label": "C"} 39 | dist1={"text": distractors[0], "label": "A"} 40 | dist2={"text": distractors[1], "label": "B"} 41 | options=[dist1, dist2, correct_option] 42 | q_entry["question"]["choices"]=options 43 | q_entry["question"]["head"]=head_label 44 | q_entry["question"]["source"]=source 45 | q_entry["question"]["template"]=template 46 | q_entry["question"]["relation"]=rel 47 | return q_entry 48 | 49 | def select_distractors_noaf(data, head_label, heads, correct_answer, rel): 50 | """Distractors without AFiltering""" 51 | 52 | negatives = [] 53 | 54 | answer_heads=set(head_label.split()) 55 | 56 | candidates=random.choices(list(data), 
k=num_distractors*100) 57 | 58 | for neg in candidates: 59 | distractor_heads=heads[(neg, rel)] 60 | if neg not in negatives and neg!=correct_answer and neg not in correct_answer and correct_answer not in neg and not (distractor_heads & answer_heads): 61 | negatives.append(neg) 62 | if len(negatives)>=num_distractors: 63 | return negatives, -1 64 | print('Not enough') 65 | return None, -1 66 | 67 | def select_distractors_af(data, head_label, heads, correct_answer, rel, question, embeddings, sentence2id, sentences, q_or_a='a'): 68 | """Distractors with AF""" 69 | high_prob = 0.6 70 | low_prob = 0.5 71 | step=0.05 72 | limit_dists=10 73 | if q_or_a=='q': 74 | downsample_size=num_distractors*400 75 | else: 76 | downsample_size=num_distractors*100 77 | 78 | negatives = [] 79 | 80 | answer_heads=set(head_label.split()) 81 | 82 | candidates=random.choices(list(data), k=downsample_size) 83 | 84 | distractors_indices = [sentence2id[sent] for sent in candidates] # todo! 85 | if q_or_a=='a': 86 | compare_index=sentence2id[correct_answer] # todo! 87 | else: # q 88 | compare_index=sentence2id[question_to_sentence(question)] 89 | dist_mapping = {i:val for i, val in enumerate(distractors_indices)} 90 | distractor_emb = embeddings[distractors_indices] 91 | correct_emb = embeddings[compare_index] 92 | cos_scores = util.pytorch_cos_sim(correct_emb, distractor_emb)[0] 93 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 94 | while len(midpoint) < limit_dists: 95 | low_prob -= step 96 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 97 | 98 | x=0 99 | while len(negatives) < num_distractors: 100 | if x>=10: 101 | negatives=None 102 | print('Not enough') 103 | break 104 | sample_idx = random.choice(midpoint) 105 | neg = sentences[dist_mapping[sample_idx.item()]] 106 | distractor_heads=heads[(neg, rel)] 107 | if neg not in negatives and neg!=correct_answer and neg not in correct_answer and correct_answer not in neg and not (distractor_heads & answer_heads): 108 | negatives.append(neg) 109 | x+=1 110 | return negatives, low_prob 111 | 112 | def construct_from_template(h, r): 113 | t={ 114 | "/r/Causes": "$node1 can cause [MASK]", 115 | "/r/UsedFor": "$node1 can be used for [MASK]", 116 | "/r/CapableOf": "$node1 is capable of [MASK]", 117 | "/r/CausesDesire": "$node1 causes desire for [MASK]", 118 | "/r/IsA": "$node1 is a [MASK]", 119 | "/r/SymbolOf": "$node1 is a symbol of [MASK]", 120 | "/r/MadeOf": "$node1 can be made of [MASK]", 121 | "/r/LocatedNear": "$node1 is often located near [MASK]", 122 | "/r/Desires": "$node1 desires [MASK]", 123 | "/r/AtLocation": "$node1 can be found at [MASK]", 124 | "/r/HasProperty": "$node1 has property [MASK]", 125 | "/r/PartOf": "$node1 is part of [MASK]", 126 | "/r/HasFirstSubevent": "$node1 starts by [MASK]", 127 | "/r/HasLastSubevent": "$node1 ends by [MASK]" 128 | } 129 | if r in t.keys(): 130 | temp=Template(t[r]) 131 | question=temp.substitute(node1=h) 132 | template=temp.substitute(node1='{}').replace('[MASK]', '{}') 133 | return question, template 134 | else: 135 | print('ERROR') 136 | 137 | def generate_questions(qa_pairs, rel_tails, answer_heads, output_file, embeddings, sentence2id, sentences, strategy, limit=1000): 138 | q_id=0 139 | all_rels=[] 140 | all_min_probs=[] 141 | with open(output_file, 'w') as w: 142 | for pair, qa_data in tqdm(qa_pairs.items(), total=len(qa_pairs)): 143 | node1, rel=pair 144 | n1_labels=qa_data[0][-1] 145 | for qa in qa_data: 146 | 
q,a, n1_labels, template, head_label, sent_source,distractor_only =qa 147 | if distractor_only or a in head_label: 148 | continue 149 | q_or_a = None 150 | if args.strategy == 'adv-answer': 151 | q_or_a = 'a' 152 | elif args.strategy == 'adv-question': 153 | q_or_a = 'q' 154 | if q_or_a != None: 155 | distractors, min_prob=select_distractors_af(rel_tails[rel], 156 | head_label, 157 | answer_heads, 158 | a, 159 | rel, 160 | q, 161 | embeddings, 162 | sentence2id, 163 | sentences, 164 | q_or_a) 165 | else: 166 | distractors, min_prob=select_distractors_noaf(rel_tails[rel], 167 | head_label, 168 | answer_heads, 169 | a, 170 | rel) 171 | if distractors: 172 | all_min_probs.append(min_prob) 173 | q_entry=format_question(q, a, distractors, q_id, head_label, template, sent_source, rel) 174 | q_id+=1 175 | w.write(json.dumps(q_entry) + '\n') 176 | all_rels.append(rel) 177 | print(Counter(all_rels)) 178 | print(Counter(all_min_probs)) 179 | 180 | def get_labels(data): 181 | if '|' in data: 182 | return data.split('|') 183 | else: 184 | return [data] 185 | 186 | def question_to_sentence(q): 187 | return q.replace('[MASK]', '').strip() 188 | 189 | def make_masked_question(s): 190 | node1_start=s.find('[[') 191 | node1_end=s.find(']]') 192 | node1_label=s[node1_start+2:node1_end] 193 | 194 | node2_start=s.rfind('[[') 195 | node2_end=s.rfind(']]') 196 | new_s=s[:node2_start].replace('[[', '').replace(']]', '') + '[MASK]' + s[node2_end+2:] 197 | 198 | template=s[:node1_start] + '{}' + s[node1_end+2:node2_start] + '{}' + s[node2_end+2:] 199 | 200 | return new_s, node1_label, template 201 | 202 | def make_masked_question_from_lex(sentence, head, tail): 203 | question=sentence.replace(tail, '[MASK]') 204 | template=sentence.replace(head, '{}').replace(tail, '{}') 205 | return question, template 206 | 207 | def token_overlap(x, y): 208 | return bool(set(x.split()) & set(y.split())) 209 | 210 | def build_embeddings(sentences, out_dir, model_name='roberta-large-nli-stsb-mean-tokens'): 211 | emb_file = os.path.join(args.out_dir, 'cwwv_emb.pkl') 212 | if path.exists(emb_file): 213 | print ('embeddings already exists, skip computation') 214 | with open(emb_file, 'rb') as f: 215 | data=pkl.load(f) 216 | embeddings=data['embeddings'] 217 | sentences=data['sentences'] 218 | return embeddings 219 | model = SentenceTransformer(model_name) 220 | embeddings = model.encode(sentences, show_progress_bar=True, device=0, num_workers=4) 221 | with open(emb_file, "wb") as fout: 222 | pkl.dump({'sentences': sentences, 'embeddings': embeddings}, fout, protocol=pkl.HIGHEST_PROTOCOL) 223 | return embeddings 224 | 225 | def create_indices(cskg_file, lex_cache): 226 | qa_pairs=defaultdict(list) 227 | 228 | rel_tails=defaultdict(set) 229 | answer_heads=defaultdict(set) 230 | 231 | all_tails=set() 232 | 233 | q_sents=set() 234 | 235 | with open(cskg_file, 'r') as f: 236 | header=next(f) 237 | for line in f: 238 | fields=line.split('\t') 239 | 240 | # extract existing info 241 | node1=fields[1] 242 | rel=fields[2] 243 | node2=fields[3] 244 | pair=(node1, rel) 245 | node1_labels=get_labels(fields[4]) 246 | #head_tokens=set() 247 | #for n1_label in node1_labels: 248 | # head_tokens |= set(n1_label.split()) 249 | node2_labels=get_labels(fields[5]) 250 | edge_id=fields[0] 251 | source=fields[8] 252 | sentence=fields[9].strip() 253 | 254 | if '|' in source: 255 | source=set(source.split('|')) 256 | else: 257 | source=set([source]) 258 | 259 | if rel not in good_relations or (len(source & (q_sources|dist_only_sources))==0): continue 260 | 
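        # Index structures filled in below and used later when sampling distractors:
        #   rel_tails[rel]            - every tail label seen with this relation,
        #                               i.e. the distractor candidate pool per relation.
        #   answer_heads[(tail, rel)] - head labels observed for this (tail, relation) pair;
        #                               candidates whose recorded heads share tokens with the
        #                               current question head are rejected as distractors.
        #   all_tails / q_sents       - answer texts and question sentences; these are the
        #                               texts that get embedded for the adv-* strategies.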
261 | for answer in node2_labels: 262 | rel_tails[rel].add(answer) 263 | answer_heads[(answer, rel)] |= set(node1_labels) 264 | all_tails.add(answer) 265 | 266 | distractor_only=True 267 | for s in source: 268 | if s in q_sources: 269 | distractor_only=False 270 | if sentence: 271 | question, head_label, template = make_masked_question(sentence) 272 | for answer in node2_labels: 273 | if not token_overlap(head_label, answer): 274 | qa_pairs[pair].append((question, answer, node1_labels, template, head_label, 'omcs', distractor_only)) 275 | q_sents.add(question_to_sentence(question)) 276 | elif lex_cache: 277 | for n1_label in node1_labels: 278 | for answer in node2_labels: 279 | triple=(n1_label, rel, answer) 280 | if triple in lex_cache.keys() and not token_overlap(n1_label, answer): 281 | sentence=lex_cache[triple] 282 | question, template = make_masked_question_from_lex(sentence, n1_label, answer) 283 | if '[MASK]' not in question or template.split().count('{}')!=2 or question.split().count('[MASK]')==1: 284 | question, template = construct_from_template(n1_label, rel) 285 | qa_pairs[pair].append((question, answer, node1_labels, template, n1_label, 'lex', distractor_only)) 286 | q_sents.add(question_to_sentence(question)) 287 | elif not token_overlap(n1_label, answer): 288 | question, template = construct_from_template(n1_label, rel) 289 | qa_pairs[pair].append((question, answer, node1_labels, template, n1_label, 'lex', distractor_only)) 290 | q_sents.add(question_to_sentence(question)) 291 | return qa_pairs, all_tails, rel_tails, answer_heads, list(q_sents) 292 | 293 | if __name__ == '__main__': 294 | parser = argparse.ArgumentParser() 295 | parser.add_argument("--cskg_file", type=str, default=None, required=True, 296 | help="CSKG graph TSV file") 297 | parser.add_argument("--out_dir", type=str, default=None, required=True, 298 | help="Output directory") 299 | parser.add_argument("--limit", type=int, default=1000000000, 300 | help="Limit of CSKG rows to process") 301 | parser.add_argument('--lex_cache', type=str, default='../cache.pkl', 302 | help="Pickle file that contains the cache of the lexicalization.") 303 | parser.add_argument("--strategy", default='random', type=str, required=False, choices=['random', 'adv-answer', 'adv-question'], help="which data generation strategy to use") 304 | args = parser.parse_args() 305 | 306 | lex_cache=None 307 | lex_cache=pkl.load(open(args.lex_cache, 'rb')) 308 | qa_pairs, all_tails, rel_tails, answer_heads, q_sentences=create_indices(args.cskg_file, lex_cache) 309 | print('Collecting sentences') 310 | sentences=list(all_tails) + q_sentences 311 | print(len(sentences), 'sentences', len(all_tails), 'answers', len(qa_pairs.keys()), 'qa pairs') 312 | if args.strategy == 'adv-answer' or args.strategy == 'adv-question': 313 | print ('Using %s strategy' % args.strategy) 314 | print('Computing embeddings') 315 | embeddings=build_embeddings(sentences, args.out_dir) 316 | print(len(embeddings), 'embeddings') 317 | else: 318 | embeddings = None 319 | sentence2id={word:i for i, word in enumerate(sentences)} 320 | output_file = path.join(args.out_dir, args.strategy+'.jsonl') 321 | generate_questions(qa_pairs, rel_tails, answer_heads, output_file, embeddings, sentence2id, sentences, args.strategy, args.limit) 322 | 323 | 324 | -------------------------------------------------------------------------------- /src/Evaluation/evaluate_GPT2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import 
json 4 | import tqdm 5 | import torch 6 | import logging 7 | import argparse 8 | import numpy as np 9 | 10 | from overrides import overrides 11 | from torch.nn import CrossEntropyLoss 12 | from transformers import AutoTokenizer, AutoModelWithLMHead 13 | 14 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 15 | datefmt='%m/%d/%Y %H:%M:%S', 16 | level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class InstanceReader(object): 21 | def to_uniform_fields(self, fields): 22 | pass 23 | 24 | def fields_to_instance(self, fields): 25 | pass 26 | 27 | class PiqaInstanceReader(InstanceReader): 28 | """ 29 | Reads the PIQA dataset into a unified format with context, question, label, and choices. 30 | """ 31 | @overrides 32 | def to_uniform_fields(self, fields): 33 | context = "" 34 | question = fields["goal"] 35 | label = fields.get('label', None) 36 | choices = [fields["sol1"], fields["sol2"]] 37 | return context, question, label, choices 38 | 39 | @overrides 40 | def fields_to_instance(self, fields): 41 | context, question, label, choices = self.to_uniform_fields(fields) 42 | context_with_choices = [f"{question} {choice[0].lower() + choice[1:]}" for choice in choices] 43 | return context, question, label, choices, context_with_choices 44 | 45 | 46 | class SocialIQAInstanceReader(InstanceReader): 47 | """ 48 | Reads the SocialIQa dataset into a unified format with context, question, label, and choices. 49 | """ 50 | def __init__(self): 51 | super(SocialIQAInstanceReader).__init__() 52 | self.QUESTION_TO_ANSWER_PREFIX = { 53 | "What will (.*) want to do next?": r"As a result, [SUBJ] wanted to", 54 | "What will (.*) want to do after?": r"As a result, [SUBJ] wanted to", 55 | "How would (.*) feel afterwards?": r"As a result, [SUBJ] felt", 56 | "How would (.*) feel as a result?": r"As a result, [SUBJ] felt", 57 | "What will (.*) do next?": r"[SUBJ] then", 58 | "How would (.*) feel after?": r"[SUBJ] then", 59 | "How would you describe (.*)?": r"[SUBJ] is seen as", 60 | "What kind of person is (.*)?": r"[SUBJ] is seen as", 61 | "How would you describe (.*) as a person?": r"[SUBJ] is seen as", 62 | "Why did (.*) do that?": r"Before, [SUBJ] wanted", 63 | "Why did (.*) do this?": r"Before, [SUBJ] wanted", 64 | "Why did (.*) want to do this?": r"Before, [SUBJ] wanted", 65 | "What does (.*) need to do beforehand?": r"Before, [SUBJ] needed to", 66 | "What does (.*) need to do before?": r"Before, [SUBJ] needed to", 67 | "What does (.*) need to do before this?": r"Before, [SUBJ] needed to", 68 | "What did (.*) need to do before this?": r"Before, [SUBJ] needed to", 69 | "What will happen to (.*)?": r"[SUBJ] then", 70 | "What will happen to (.*) next?": r"[SUBJ] then" 71 | } 72 | 73 | @overrides 74 | def to_uniform_fields(self, fields): 75 | context = fields['context'] 76 | if not context.endswith("."): 77 | context += "." 78 | 79 | question = fields['question'] 80 | label = fields['correct'] 81 | choices = [fields['answerA'], fields['answerB'], fields['answerC']] 82 | choices = [c + "." 
if not c.endswith(".") else c for c in choices] 83 | label = ord(label) - 65 84 | return context, question, label, choices 85 | 86 | @overrides 87 | def fields_to_instance(self, fields): 88 | context, question, label, choices = self.to_uniform_fields(fields) 89 | 90 | answer_prefix = "" 91 | for template, ans_prefix in self.QUESTION_TO_ANSWER_PREFIX.items(): 92 | m = re.match(template, question) 93 | if m is not None: 94 | subj = m.group(1) 95 | if subj.endswith('?'): 96 | subj = subj[:-1] 97 | answer_prefix = ans_prefix.replace("[SUBJ]", subj) 98 | break 99 | 100 | if answer_prefix == "": 101 | answer_prefix = question.replace("?", "is") 102 | 103 | choices = [ 104 | " ".join((answer_prefix, choice[0].lower() + choice[1:])).replace( 105 | "?", "").replace("wanted to wanted to", "wanted to").replace( 106 | "needed to needed to", "needed to").replace("to to", "to") for choice in choices] 107 | 108 | context_with_choices = [f"{context} {choice}" for choice in choices] 109 | return context, question, label, choices, context_with_choices 110 | 111 | class ATOMICInstanceReader(InstanceReader): 112 | """ 113 | Reads the ATOMIC dataset into a unified format with context, question, label, and choices. 114 | """ 115 | @overrides 116 | def to_uniform_fields(self, fields): 117 | question = fields['context'] 118 | label = fields['correct'] 119 | choices = [fields['candidates'][0], fields['candidates'][1], fields['candidates'][2]] 120 | return '', question, label, choices 121 | 122 | @overrides 123 | def fields_to_instance(self, fields): 124 | context, question, label, choices = self.to_uniform_fields(fields) 125 | context_with_choices = [f"{question} {choice}" for choice in choices] 126 | return context, question, label, choices, context_with_choices 127 | 128 | class CWWVInstanceReader(InstanceReader): 129 | """ 130 | Reads the CWWV dataset into a unified format with context, question, label, and choices. 131 | """ 132 | @overrides 133 | def to_uniform_fields(self, fields): 134 | question = fields['question']['stem'] 135 | if question.endswith('.'): 136 | question = question[:-1] 137 | if not question.endswith('[MASK]'): 138 | print ('should not happen') 139 | exit(0) 140 | question = question[:-7] 141 | label = ['A','B','C'].index(fields['answerKey']) 142 | choices = [fields['question']['choices'][0]['text']+'.', fields['question']['choices'][1]['text']+'.', fields['question']['choices'][2]['text']+'.'] 143 | return '', question, label, choices 144 | 145 | @overrides 146 | def fields_to_instance(self, fields): 147 | context, question, label, choices = self.to_uniform_fields(fields) 148 | context_with_choices = [f"{question} {choice}" for choice in choices] 149 | return context, question, label, choices, context_with_choices 150 | 151 | class WinograndeInstanceReader(InstanceReader): 152 | """ 153 | Reads the WinoGrande dataset into a unified format with context, question, label, and choices. 154 | """ 155 | @overrides 156 | def to_uniform_fields(self, fields): 157 | context = fields['sentence'] 158 | if not context.endswith("."): 159 | context += "." 
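        # WinoGrande stores the answer as "1"/"2"; it is converted to a 0-based
        # label below, and fields_to_instance later substitutes each option into
        # the "_" blank of the sentence.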
160 | 161 | label = fields['answer'] 162 | choices = [fields['option1'], fields['option2']] 163 | label = int(label) - 1 164 | question = '' 165 | return context, question, label, choices 166 | 167 | @overrides 168 | def fields_to_instance(self, fields): 169 | context, question, label, choices = self.to_uniform_fields(fields) 170 | context_with_choices = [context.replace("_", choice) for choice in choices] 171 | return context, question, label, choices, context_with_choices 172 | 173 | 174 | class CommonsenseqaInstanceReader(InstanceReader): 175 | """ 176 | Reads the CommonsenseQA dataset into a unified format with context, question, label, and choices. 177 | """ 178 | @overrides 179 | def to_uniform_fields(self, fields): 180 | context = '' 181 | 182 | question = 'Q: ' + fields['question']['stem'] 183 | label = ['A','B','C','D','E'].index(fields['answerKey']) if "answerKey" in fields else None 184 | choices = ['A: '+ c['text'] for c in fields['question']['choices']] 185 | return context, question, label, choices 186 | 187 | @overrides 188 | def fields_to_instance(self, fields): 189 | context, question, label, choices = self.to_uniform_fields(fields) 190 | context_with_choices = [f"{question} {choice[0].lower() + choice[1:]}" for choice in choices] 191 | return context, question, label, choices, context_with_choices 192 | 193 | class ANLIInstanceReader(InstanceReader): 194 | """ 195 | Reads the aNLI dataset into a unified format with context, question, label, and choices. 196 | """ 197 | @overrides 198 | def to_uniform_fields(self, fields): 199 | label = ['A','B'].index(fields['answerKey']) if "answerKey" in fields else None 200 | choices = [c['statement'] for c in fields['statements']] 201 | return label, choices 202 | 203 | @overrides 204 | def fields_to_instance(self, fields): 205 | label, choices = self.to_uniform_fields(fields) 206 | return None, None, label, None, choices 207 | 208 | INSTANCE_READERS = {"socialiqa": SocialIQAInstanceReader, 209 | "winogrande": WinograndeInstanceReader, 210 | "piqa": PiqaInstanceReader, 211 | "commonsenseqa":CommonsenseqaInstanceReader, 212 | "anli": ANLIInstanceReader, 213 | "atomic": ATOMICInstanceReader, 214 | 'cwwv': CWWVInstanceReader} 215 | 216 | 217 | def main(): 218 | parser = argparse.ArgumentParser() 219 | parser.add_argument("--lm", default="gpt2-large", type=str, required=False, help="language model to use") 220 | parser.add_argument("--dataset_file", default=None, type=str, required=True, help="Jsonl file") 221 | parser.add_argument("--out_dir", default=None, type=str, required=True, help="Out directory for the predictions") 222 | parser.add_argument("--device", default=-1, type=int, required=False, help="GPU device") 223 | parser.add_argument("--cache_dir", default=None, type=str, required=False, help="where the model is cached") 224 | parser.add_argument("--reader", default=None, type=str, required=True, help="which reader to use") 225 | args = parser.parse_args() 226 | logger.info(args) 227 | 228 | task = args.reader 229 | if args.lm != 'gpt2-large': 230 | model_path = ['gpt2']+args.lm.split('/')[-1:]+[task] 231 | model_path = '_'.join([m for m in model_path if m != '']) 232 | out_dir = os.path.join(args.out_dir, model_path) 233 | else: 234 | out_dir = os.path.join(args.out_dir, 'gpt2_'+task) 235 | if os.path.exists(out_dir) and os.listdir(out_dir): 236 | raise ValueError("Output directory ({}) already exists and is not empty.".format(out_dir)) 237 | if not os.path.exists(out_dir): 238 | os.makedirs(out_dir) 239 | # Load the language 
model 240 | device = torch.device(f'cuda:{args.device}') if args.device >= 0 else torch.device("cpu") 241 | model, tokenizer = init_model(args.lm, device, args.cache_dir) 242 | 243 | # Load the dataset 244 | instance_reader = INSTANCE_READERS[args.reader]() 245 | 246 | out_file = os.path.join(out_dir, "predictions.jsonl") 247 | log_file = os.path.join(out_dir, 'results.txt') 248 | gold = [] 249 | predictions = [] 250 | results = [] 251 | pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0 252 | sample_id = 0 253 | # Predict instances 254 | with open(out_file, "w") as f_out: 255 | with open(args.dataset_file) as f_in: 256 | for line in tqdm.tqdm(f_in): 257 | fields = json.loads(line.strip()) 258 | context, question, label, choices, context_with_choices = \ 259 | instance_reader.fields_to_instance(fields) 260 | if sample_id == 0: 261 | results.append(json.dumps(context_with_choices)) 262 | gold.append(label) 263 | # Tokenize and pad 264 | tokenized = [tokenizer.encode(text) for text in context_with_choices] 265 | max_length = max([len(text) for text in tokenized]) 266 | att_mask = torch.zeros((len(tokenized), max_length)).to(device) 267 | for i in range(len(tokenized)): 268 | att_mask[i][:len(tokenized[i])] = 1 269 | tokenized = [text + [pad_token_id] * (max_length - len(text)) for text in tokenized] 270 | tokenized = torch.tensor(tokenized).long().to(device) 271 | prediction = int(np.argmin(get_lm_score(model, tokenized, pad_token_id, att_mask))) 272 | fields["prediction"] = prediction 273 | predictions.append(prediction) 274 | f_out.write(json.dumps(fields) + "\n") 275 | sample_id += 1 276 | 277 | # Don't report accuracy if we don't have the labels 278 | if None not in gold: 279 | accuracy = (np.array(gold)==np.array(predictions)).mean() 280 | print(f"Accuracy: {accuracy:.3f}") 281 | results.append(f"Accuracy : {accuracy:.3f}") 282 | with open(log_file, 'w') as fout: 283 | for line in results: 284 | fout.write(line + '\n') 285 | 286 | 287 | def get_lm_score(model, batch, pad_token_id, att_mask): 288 | """ 289 | Get the cross entropy loss of the texts in batch using the langage model 290 | """ 291 | # Batch: [num_choices, max_length] 292 | with torch.no_grad(): 293 | num_choices, max_length = batch.shape 294 | shift_labels = batch[..., 1:].contiguous().view(-1) 295 | lm_logits = model(batch, attention_mask=att_mask)[0] 296 | shift_logits = lm_logits[..., :-1, :].contiguous() 297 | shift_logits = shift_logits.view(-1, shift_logits.size(-1)) 298 | loss_fct = CrossEntropyLoss(reduction="none", ignore_index=pad_token_id) 299 | loss = loss_fct(shift_logits, shift_labels) 300 | loss = loss.view(num_choices, -1).sum(1).cpu().numpy() 301 | valid_tokens = (batch!=pad_token_id).long().sum(1).cpu().numpy() 302 | loss /= valid_tokens 303 | return loss 304 | 305 | 306 | def init_model(model_name: str, 307 | device: torch.device, cache_dir): 308 | """ 309 | Initialize a pre-trained LM 310 | :param model_name: from MODEL_CLASSES 311 | :param device: CUDA / CPU device 312 | :return: the model and tokenizer 313 | """ 314 | logger.info(f'Initializing {model_name}') 315 | tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) 316 | model = AutoModelWithLMHead.from_pretrained(model_name, cache_dir=cache_dir) 317 | model.to(device) 318 | model.eval() 319 | return model, tokenizer 320 | 321 | 322 | if __name__ == '__main__': 323 | main() 324 | -------------------------------------------------------------------------------- /src/Evaluation/evaluate_RoBERTa.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import tqdm 5 | import torch 6 | import logging 7 | import argparse 8 | import numpy as np 9 | from overrides import overrides 10 | from torch.nn import CrossEntropyLoss 11 | from transformers import RobertaTokenizer, RobertaForMaskedLM 12 | 13 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 14 | datefmt='%m/%d/%Y %H:%M:%S', 15 | level=logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | MAX_SEQUENCE_PER_TIME = 80 18 | 19 | class InstanceReader(object): 20 | def to_uniform_fields(self, fields): 21 | pass 22 | 23 | def fields_to_instance(self, fields): 24 | pass 25 | 26 | class PiqaInstanceReader(InstanceReader): 27 | """ 28 | Reads the PIQA dataset into a unified format with context, question, label, and choices. 29 | """ 30 | @overrides 31 | def to_uniform_fields(self, fields): 32 | context = "" 33 | question = fields["goal"] 34 | label = fields.get('label', None) 35 | choices = [fields["sol1"][0].lower()+fields["sol1"][1:], fields["sol2"][0].lower()+fields["sol2"][1:]] 36 | return context, question, label, choices 37 | 38 | @overrides 39 | def fields_to_instance(self, fields): 40 | context, question, label, choices = self.to_uniform_fields(fields) 41 | return context, question, label, choices 42 | 43 | 44 | class SocialIQAInstanceReader(InstanceReader): 45 | """ 46 | Reads the SocialIQa dataset into a unified format with context, question, label, and choices. 47 | """ 48 | def __init__(self): 49 | super(SocialIQAInstanceReader).__init__() 50 | self.QUESTION_TO_ANSWER_PREFIX = { 51 | "What will (.*) want to do next?": r"As a result, [SUBJ] wanted to", 52 | "What will (.*) want to do after?": r"As a result, [SUBJ] wanted to", 53 | "How would (.*) feel afterwards?": r"As a result, [SUBJ] felt", 54 | "How would (.*) feel as a result?": r"As a result, [SUBJ] felt", 55 | "What will (.*) do next?": r"[SUBJ] then", 56 | "How would (.*) feel after?": r"[SUBJ] then", 57 | "How would you describe (.*)?": r"[SUBJ] is seen as", 58 | "What kind of person is (.*)?": r"[SUBJ] is seen as", 59 | "How would you describe (.*) as a person?": r"[SUBJ] is seen as", 60 | "Why did (.*) do that?": r"Before, [SUBJ] wanted", 61 | "Why did (.*) do this?": r"Before, [SUBJ] wanted", 62 | "Why did (.*) want to do this?": r"Before, [SUBJ] wanted", 63 | "What does (.*) need to do beforehand?": r"Before, [SUBJ] needed to", 64 | "What does (.*) need to do before?": r"Before, [SUBJ] needed to", 65 | "What does (.*) need to do before this?": r"Before, [SUBJ] needed to", 66 | "What did (.*) need to do before this?": r"Before, [SUBJ] needed to", 67 | "What will happen to (.*)?": r"[SUBJ] then", 68 | "What will happen to (.*) next?": r"[SUBJ] then" 69 | } 70 | 71 | @overrides 72 | def to_uniform_fields(self, fields): 73 | context = fields['context'] 74 | if not context.endswith("."): 75 | context += "." 76 | 77 | question = fields['question'] 78 | label = fields['correct'] 79 | choices = [fields['answerA'], fields['answerB'], fields['answerC']] 80 | choices = [c + "." 
if not c.endswith(".") else c for c in choices] 81 | label = ord(label) - 65 82 | return context, question, label, choices 83 | 84 | def convert_choice(self, choice, answer_prefix): 85 | if answer_prefix.endswith('wanted to') and choice.startswith('wanted to'): 86 | choice = choice[9:].strip() 87 | if answer_prefix.endswith('needed to') and choice.startswith('needed to'): 88 | choice = choice[9:].strip() 89 | if answer_prefix.endswith('to') and choice.startswith('to'): 90 | choice = choice[2:].strip() 91 | choice = choice[0].lower() + choice[1:] 92 | return choice 93 | 94 | @overrides 95 | def fields_to_instance(self, fields): 96 | context, question, label, choices = self.to_uniform_fields(fields) 97 | 98 | answer_prefix = "" 99 | for template, ans_prefix in self.QUESTION_TO_ANSWER_PREFIX.items(): 100 | m = re.match(template, question) 101 | if m is not None: 102 | subj = m.group(1) 103 | if subj.endswith('?'): 104 | subj = subj[:-1] 105 | answer_prefix = ans_prefix.replace("[SUBJ]", subj) 106 | break 107 | 108 | if answer_prefix == "": 109 | answer_prefix = question.replace("?", "is") 110 | 111 | question = context + ' ' + answer_prefix 112 | choices = [self.convert_choice(choice, answer_prefix) for choice in choices] 113 | 114 | return context, question, label, choices 115 | 116 | class ATOMICInstanceReader(InstanceReader): 117 | """ 118 | Reads the ATOMIC dataset into a unified format with context, question, label, and choices. 119 | """ 120 | @overrides 121 | def to_uniform_fields(self, fields): 122 | question = fields['context'] 123 | label = fields['correct'] 124 | choices = [fields['candidates'][0], fields['candidates'][1], fields['candidates'][2]] 125 | return '', question, label, choices 126 | 127 | @overrides 128 | def fields_to_instance(self, fields): 129 | context, question, label, choices = self.to_uniform_fields(fields) 130 | #print (question, choices) 131 | return context, question, label, choices 132 | 133 | class CWWVInstanceReader(InstanceReader): 134 | """ 135 | Reads the CWWV dataset into a unified format with context, question, label, and choices. 136 | """ 137 | @overrides 138 | def to_uniform_fields(self, fields): 139 | question = fields['question']['stem'] 140 | if question.endswith('.'): 141 | question = question[:-1] 142 | if not question.endswith('[MASK]'): 143 | print ('should not happen') 144 | exit(0) 145 | question = question[:-7] 146 | label = ['A','B','C'].index(fields['answerKey']) 147 | choices = [fields['question']['choices'][0]['text']+'.', fields['question']['choices'][1]['text']+'.', fields['question']['choices'][2]['text']+'.'] 148 | return '', question, label, choices 149 | 150 | @overrides 151 | def fields_to_instance(self, fields): 152 | context, question, label, choices = self.to_uniform_fields(fields) 153 | return context, question, label, choices 154 | 155 | class WinograndeInstanceReader(InstanceReader): 156 | """ 157 | Reads the WinoGrande dataset into a unified format with context, question, label, and choices. 158 | """ 159 | @overrides 160 | def to_uniform_fields(self, fields): 161 | context = fields['sentence'] 162 | if not context.endswith("."): 163 | context += "." 
164 | context = context.split('_') 165 | label = fields['answer'] 166 | choices = [fields['option1']+context[1], fields['option2']+context[1]] 167 | label = int(label) - 1 168 | question = context[0].strip() 169 | return context, question, label, choices 170 | 171 | @overrides 172 | def fields_to_instance(self, fields): 173 | context, question, label, choices = self.to_uniform_fields(fields) 174 | return context, question, label, choices 175 | 176 | 177 | class CommonsenseqaInstanceReader(InstanceReader): 178 | """ 179 | Reads the CommonsenseQA dataset into a unified format with context, question, label, and choices. 180 | """ 181 | @overrides 182 | def to_uniform_fields(self, fields): 183 | context = '' 184 | question = 'Q: '+ fields['question']['stem'] 185 | label = ['A','B','C','D','E'].index(fields['answerKey']) if "answerKey" in fields else None 186 | choices = ['A: '+c['text'][0].lower()+c['text'][1:] for c in fields['question']['choices']] 187 | return context, question, label, choices 188 | 189 | @overrides 190 | def fields_to_instance(self, fields): 191 | context, question, label, choices = self.to_uniform_fields(fields) 192 | return context, question, label, choices 193 | 194 | class ANLIInstanceReader(InstanceReader): 195 | """ 196 | Reads the aNLI dataset into a unified format with context, question, label, and choices. 197 | """ 198 | @overrides 199 | def to_uniform_fields(self, fields): 200 | context = '' 201 | question = fields['context'] 202 | label = ['A','B'].index(fields['answerKey']) if "answerKey" in fields else None 203 | choices = [c['text']+' '+fields['question']['stem'] for c in fields['question']['choices']] 204 | return context, question, label, choices 205 | 206 | @overrides 207 | def fields_to_instance(self, fields): 208 | context, question, label, choices = self.to_uniform_fields(fields) 209 | return context, question, label, choices 210 | 211 | INSTANCE_READERS = {"socialiqa": SocialIQAInstanceReader, 212 | "winogrande": WinograndeInstanceReader, 213 | "piqa": PiqaInstanceReader, 214 | "commonsenseqa":CommonsenseqaInstanceReader, 215 | "anli": ANLIInstanceReader, 216 | 'atomic': ATOMICInstanceReader, 217 | 'cwwv': CWWVInstanceReader} 218 | 219 | def token_wise_scoring(sequences, label_ids, attention_mask, tokenizer, device, model): 220 | choice_loss = [0 for i in range(len(sequences))] 221 | for i in range(len(sequences)): 222 | tmp_seq_list = [] 223 | tmp_label_list = [] 224 | tmp_attention_mask = [] 225 | curr_label_ids = label_ids[i] 226 | for j, t in enumerate(curr_label_ids): 227 | if t == -100: 228 | continue 229 | tmp_seq = torch.tensor(sequences[i][:j]+[tokenizer.mask_token_id]+sequences[i][j+1:]).long().to(device) 230 | tmp_label = torch.tensor([-100]*j+sequences[i][j:j+1]+[-100]*(len(sequences[i])-j-1)).long().to(device) 231 | tmp_seq_list.append(tmp_seq) 232 | tmp_label_list.append(tmp_label) 233 | tmp_attention_mask.append(torch.tensor(attention_mask[i]).long().to(device)) 234 | tmp_seq_list = torch.stack(tmp_seq_list) 235 | tmp_label_list = torch.stack(tmp_label_list) 236 | tmp_attention_mask = torch.stack(tmp_attention_mask) 237 | if len(tmp_seq_list) < MAX_SEQUENCE_PER_TIME: 238 | loss = get_lm_score(model, tmp_seq_list, tmp_label_list, tmp_attention_mask) 239 | else: 240 | loss = [] 241 | for chunk in range(0, len(tmp_seq_list), MAX_SEQUENCE_PER_TIME): 242 | loss.append(get_lm_score(model, tmp_seq_list[chunk:chunk+MAX_SEQUENCE_PER_TIME], tmp_label_list[chunk:chunk+MAX_SEQUENCE_PER_TIME], tmp_attention_mask[chunk:chunk+MAX_SEQUENCE_PER_TIME])) 
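# Note on the chunking above: token_wise_scoring builds one masked copy of the
# sequence for every position whose label is not -100, so a long sequence can
# expand into many forward passes; once that count reaches
# MAX_SEQUENCE_PER_TIME (80), the copies are scored in chunks to bound GPU
# memory, and the per-chunk losses are concatenated below before averaging.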
243 | loss = np.concatenate(loss) 244 | choice_loss[i] = sum(loss)/len(loss) 245 | prediction = choice_loss.index(min(choice_loss)) 246 | return prediction 247 | 248 | def prepare_input(sequences, label_ids, pad_token_id): 249 | max_length = max([len(text) for text in sequences]) 250 | attention_mask = np.zeros((len(sequences), max_length)) 251 | for i in range(len(sequences)): 252 | attention_mask[i][:len(sequences[i])] = 1 253 | sequences = [text + [pad_token_id] * (max_length - len(text)) for text in sequences] 254 | label_ids = [text + [-100] * (max_length - len(text)) for text in label_ids] 255 | return sequences, label_ids, attention_mask 256 | 257 | def score_task(question, choices, tokenizer, device, model): 258 | pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0 259 | question_ids = tokenizer.encode(question) 260 | choice_ids = [tokenizer.encode(choice, add_prefix_space=True)[1:-1] for choice in choices] 261 | sequences = [question_ids[:-1] + choice_ids[i] +[tokenizer.sep_token_id] for i in range(len(choice_ids))] 262 | label_ids = [[-100]+text[1:-1]+[-100] for text in sequences] 263 | sequences, label_ids, attention_mask = prepare_input(sequences, label_ids, pad_token_id) 264 | prediction = token_wise_scoring(sequences, label_ids, attention_mask, tokenizer, device, model) 265 | return prediction 266 | 267 | def main(): 268 | parser = argparse.ArgumentParser() 269 | parser.add_argument("--lm", default="roberta-large", type=str, required=False, help="language model to use") 270 | parser.add_argument("--dataset_file", default=None, type=str, required=True, help="Jsonl file") 271 | parser.add_argument("--out_dir", default=None, type=str, required=True, help="Out directory for the predictions") 272 | parser.add_argument("--device", default=-1, type=int, required=False, help="GPU device") 273 | parser.add_argument("--cache_dir", default=None, type=str, required=False, help="where the model is cached") 274 | parser.add_argument("--reader", default=None, type=str, required=True, help="which reader to use") 275 | args = parser.parse_args() 276 | logger.info(args) 277 | task = args.reader 278 | if args.lm != 'roberta-large': 279 | model_path = ['roberta']+args.lm.split('/')[-1:]+[task] 280 | model_path = '_'.join([m for m in model_path if m != '']) 281 | out_dir = os.path.join(args.out_dir, model_path) 282 | else: 283 | out_dir = os.path.join(args.out_dir, 'roberta_'+task) 284 | if os.path.exists(out_dir) and os.listdir(out_dir): 285 | raise ValueError("Output directory ({}) already exists and is not empty.".format(out_dir)) 286 | if not os.path.exists(out_dir): 287 | os.makedirs(out_dir) 288 | out_file = os.path.join(out_dir, 'predictions.jsonl') 289 | log_file = os.path.join(out_dir, 'results.txt') 290 | 291 | # Load the language model 292 | device = torch.device(f'cuda:{args.device}') if args.device >= 0 else torch.device("cpu") 293 | model, tokenizer = init_model(args.lm, device, args.cache_dir) 294 | 295 | # Load the dataset 296 | instance_reader = INSTANCE_READERS[args.reader]() 297 | 298 | gold = [] 299 | predictions = [] 300 | results = [] 301 | pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0 302 | print ('currently evaluating the task', task) 303 | # Predict instances 304 | sample_id = 0 305 | with open(out_file, "w") as f_out: 306 | with open(args.dataset_file) as f_in: 307 | for line in tqdm.tqdm(f_in): 308 | fields = json.loads(line.strip()) 309 | context, question, label, choices = \ 310 | 
instance_reader.fields_to_instance(fields) 311 | gold.append(label) 312 | if sample_id == 0: 313 | results.append(json.dumps(context)) 314 | results.append(json.dumps(question)) 315 | results.append(json.dumps(choices)) 316 | prediction = score_task(question, choices, tokenizer, device, model) 317 | fields["prediction"] = prediction 318 | predictions.append(prediction) 319 | f_out.write(json.dumps(fields) + "\n") 320 | sample_id += 1 321 | # Don't report accuracy if we don't have the labels 322 | if None not in gold: 323 | accuracy = (np.array(gold)==np.array(predictions)).mean() 324 | print(f"Accuracy: {accuracy:.3f}") 325 | results.append(f"Accuracy: {accuracy:.3f}") 326 | with open(log_file, 'w') as fout: 327 | for line in results: 328 | fout.write(line + '\n') 329 | 330 | def get_lm_score(model, batch, label_ids, attention_mask): 331 | """ 332 | Get the cross entropy loss of the texts in batch using the langage model 333 | """ 334 | # Batch: [num_choices, max_length] 335 | with torch.no_grad(): 336 | num_choices, max_length = batch.shape 337 | label_ids = label_ids.view(-1) 338 | lm_logits = model(batch, attention_mask=attention_mask)[0] 339 | lm_logits = lm_logits.view(-1, lm_logits.size(-1)) 340 | loss_fct = CrossEntropyLoss(reduction="none") 341 | loss = loss_fct(lm_logits, label_ids) 342 | loss = loss.view(num_choices, -1).sum(1).cpu().numpy() 343 | return loss 344 | 345 | 346 | def init_model(model_name: str, 347 | device: torch.device, cache_dir): 348 | """ 349 | Initialize a pre-trained LM 350 | :param model_name: from MODEL_CLASSES 351 | :param device: CUDA / CPU device 352 | :return: the model and tokenizer 353 | """ 354 | logger.info(f'Initializing {model_name}') 355 | tokenizer = RobertaTokenizer.from_pretrained(model_name, cache_dir=cache_dir) 356 | model = RobertaForMaskedLM.from_pretrained(model_name, cache_dir=cache_dir) 357 | model.to(device) 358 | model.eval() 359 | return model, tokenizer 360 | 361 | if __name__ == '__main__': 362 | main() 363 | -------------------------------------------------------------------------------- /src/Training/MLM/run_lm_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
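For reference, the RoBERTa evaluator above does not fine-tune the model either: score_task concatenates the question with each candidate, and token_wise_scoring masks one token at a time and scores it with the MLM head, so every candidate gets an average masked-token cross-entropy (a pseudo-log-likelihood) and the lowest-loss candidate is predicted. Below is a minimal sketch of that idea, assuming a stock roberta-large checkpoint; the batching, padding, and chunking of the real script are omitted, and the example question/choices are invented.

# Sketch: masked pseudo-likelihood scoring with RoBERTa's MLM head.
import torch
from torch.nn import CrossEntropyLoss
from transformers import RobertaTokenizer, RobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForMaskedLM.from_pretrained("roberta-large").eval()

def pseudo_loss(question, choice):
    ids = tokenizer.encode(question + " " + choice)
    losses = []
    for pos in range(1, len(ids) - 1):  # mask every token except <s> and </s>
        masked = list(ids)
        masked[pos] = tokenizer.mask_token_id
        with torch.no_grad():
            logits = model(torch.tensor([masked]))[0]
        loss = CrossEntropyLoss()(logits[0, pos].unsqueeze(0),
                                  torch.tensor([ids[pos]]))
        losses.append(loss.item())
    return sum(losses) / len(losses)

question = "Alex went camping. Before, Alex needed to"
choices = ["pack a tent.", "eat the tent."]
prediction = min(range(len(choices)), key=lambda i: pseudo_loss(question, choices[i]))

Scoring one masked position at a time is what makes a bidirectional MLM usable as a zero-shot scorer, at the cost of one forward pass per token, which is why the script batches those masked copies and splits them into chunks.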
16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | from tqdm import tqdm, trange 32 | import sys 33 | sys.path.append('../') 34 | sys.path.append('.') 35 | from transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 36 | from transformers import AdamW, get_linear_schedule_with_warmup 37 | from data_utils import myprocessors, handle_underscores 38 | import json 39 | from collections import Counter 40 | logger = logging.getLogger(__name__) 41 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 42 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 43 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 44 | 45 | MODEL_CLASSES = { 46 | 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 47 | } 48 | 49 | class MyDataset(torch.utils.data.Dataset): 50 | def __init__(self, data, mask_token): 51 | self.data = data 52 | self.mask_token = mask_token 53 | 54 | def __len__(self): 55 | return len(self.data) 56 | 57 | def __getitem__(self, idx): 58 | sample = self.data[idx] 59 | return sample, self.mask_token 60 | 61 | def mCollateFn(batch): 62 | batch_input_ids = [] 63 | batch_input_mask = [] 64 | batch_label_ids = [] 65 | mask_token = batch[0][1] 66 | max_len = max([len(f[0]) for f in batch]) 67 | for f in batch: 68 | input_ids = np.full(max_len, mask_token) 69 | input_ids[:len(f[0])] = f[0] 70 | labels = np.array([-100 if f[0][i] == mask_token else f[0][i] for i in range(len(f[0]))]+[-100]*(max_len-len(f[0]))) 71 | mask = np.zeros(max_len) 72 | mask[:len(f[0])] = 1 73 | batch_input_ids.append(input_ids) 74 | batch_input_mask.append(mask) 75 | batch_label_ids.append(labels) 76 | 77 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 78 | batch_input_mask = torch.tensor(batch_input_mask, dtype=torch.long) 79 | batch_label_ids = torch.tensor(batch_label_ids, dtype=torch.long) 80 | return batch_input_ids, batch_input_mask, batch_label_ids 81 | 82 | def convert_examples_to_features(examples, tokenizer, max_length=512): 83 | data = [] 84 | for example in examples: 85 | inputs, _ = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 86 | t_inputs, _ = handle_underscores(example['ending'], tokenizer) 87 | input_ids = tokenizer.convert_tokens_to_ids(inputs+t_inputs) 88 | data.append(input_ids) 89 | return data 90 | 91 | def set_seed(args): 92 | random.seed(args.seed) 93 | np.random.seed(args.seed) 94 | torch.manual_seed(args.seed) 95 | if args.n_gpu > 0: 96 | torch.cuda.manual_seed_all(args.seed) 97 | 98 | def count_parameters(model): 99 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 100 | 101 | def train(args, train_dataset, model, tokenizer): 102 | """ Train the model """ 103 | if args.local_rank in [-1, 0]: 104 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 105 | 106 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 107 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 108 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, 
collate_fn=mCollateFn) 109 | 110 | if args.max_steps > 0: 111 | t_total = args.max_steps 112 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 113 | else: 114 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 115 | 116 | # Prepare optimizer and schedule (linear warmup and decay) 117 | no_decay = ['bias', 'LayerNorm.weight'] 118 | optimizer_grouped_parameters = [ 119 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 120 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 121 | ] 122 | 123 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 124 | logger.info("warm up steps = %d", warmup_steps) 125 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 126 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 127 | 128 | if args.fp16: 129 | try: 130 | from apex import amp 131 | except ImportError: 132 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 133 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 134 | 135 | # multi-gpu training (should be after apex fp16 initialization) 136 | if args.n_gpu > 1: 137 | model = torch.nn.DataParallel(model) 138 | 139 | # Distributed training (should be after apex fp16 initialization) 140 | if args.local_rank != -1: 141 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 142 | output_device=args.local_rank, 143 | find_unused_parameters=True) 144 | # Train! 145 | logger.info("***** Running training *****") 146 | logger.info(" Num examples = %d", len(train_dataset)) 147 | logger.info(" Num Epochs = %d", args.num_train_epochs) 148 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 149 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", 150 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 151 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 152 | logger.info(" Total optimization steps = %d", t_total) 153 | 154 | global_step = 0 155 | tr_loss, logging_loss = 0.0, 0.0 156 | model.zero_grad() 157 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 158 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 159 | for _ in train_iterator: 160 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 161 | for step, batch in enumerate(epoch_iterator): 162 | model.train() 163 | inputs = {'input_ids': batch[0].cuda(), 164 | 'attention_mask': batch[1].cuda(), 165 | 'labels': batch[2].cuda()} 166 | outputs = model(**inputs) 167 | loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) 168 | 169 | if args.n_gpu > 1: 170 | loss = loss.mean() # mean() to average on multi-gpu parallel training 171 | if args.gradient_accumulation_steps > 1: 172 | loss = loss / args.gradient_accumulation_steps 173 | 174 | if args.fp16: 175 | with amp.scale_loss(loss, optimizer) as scaled_loss: 176 | scaled_loss.backward() 177 | else: 178 | loss.backward() 179 | 180 | tr_loss += loss.item() 181 | if (step + 1) % args.gradient_accumulation_steps == 0: 182 | optimizer.step() 183 | scheduler.step() # Update learning rate schedule 184 | model.zero_grad() 185 | global_step += 1 186 | 187 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 188 | # Log metrics 189 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 190 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 191 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 192 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 193 | logging_loss = tr_loss 194 | 195 | 196 | if args.max_steps > 0 and global_step > args.max_steps: 197 | epoch_iterator.close() 198 | break 199 | if args.max_steps > 0 and global_step > args.max_steps: 200 | train_iterator.close() 201 | break 202 | if args.local_rank == -1: # Only evaluate when single GPU otherwise metrics may not average well 203 | # Save model checkpoint 204 | output_dir = os.path.join(args.output_dir, 'epoch%s' % _) 205 | if not os.path.exists(output_dir): 206 | os.makedirs(output_dir) 207 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 208 | model_to_save.save_pretrained(output_dir) 209 | tokenizer.save_pretrained(output_dir) 210 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 211 | logger.info("Saving model checkpoint to %s", output_dir) 212 | if args.local_rank in [-1, 0]: 213 | tb_writer.close() 214 | return global_step, tr_loss / global_step 215 | 216 | 217 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 218 | if args.local_rank not in [-1, 0] and not evaluate: 219 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 220 | 221 | processor = myprocessors[task](args) 222 | examples = processor.get_train_examples() 223 | features = 
convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 224 | if args.local_rank == 0 and not evaluate: 225 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 226 | return MyDataset(features, tokenizer.mask_token_id) 227 | 228 | def main(): 229 | parser = argparse.ArgumentParser() 230 | 231 | ## Required parameters 232 | parser.add_argument("--train_file", default=None, type=str, required=True, 233 | help="The train file name") 234 | parser.add_argument("--dev_file", default=None, type=str, required=True, 235 | help="The dev file name") 236 | parser.add_argument("--model_type", default=None, type=str, required=True, 237 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 238 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 239 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 240 | parser.add_argument("--config_name", default="", type=str, 241 | help="Pretrained config name or path if not the same as model_name") 242 | parser.add_argument("--tokenizer_name", default="", type=str, 243 | help="Pretrained tokenizer name or path if not the same as model_name") 244 | parser.add_argument("--cache_dir", default="", type=str, 245 | help="Where do you want to store the pre-trained models downloaded from s3") 246 | parser.add_argument("--task_name", default=None, type=str, required=True, 247 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 248 | parser.add_argument("--output_dir", default=None, type=str, required=True, 249 | help="The output directory where the model predictions and checkpoints will be written.") 250 | 251 | ## Other parameters 252 | parser.add_argument("--max_seq_length", default=128, type=int, 253 | help="The maximum total input sequence length after tokenization. Sequences longer " 254 | "than this will be truncated, sequences shorter will be padded.") 255 | parser.add_argument("--do_train", action='store_true', 256 | help="Whether to run training.") 257 | parser.add_argument("--do_eval", action='store_true', 258 | help="Whether to run eval on the dev set.") 259 | parser.add_argument("--do_lower_case", action='store_true', 260 | help="Set this flag if you are using an uncased model.") 261 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 262 | help="Batch size per GPU/CPU for training.") 263 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 264 | help="Batch size per GPU/CPU for evaluation.") 265 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 266 | help="Number of updates steps to accumulate before performing a backward/update pass.") 267 | parser.add_argument("--learning_rate", default=1e-5, type=float, 268 | help="The initial learning rate for Adam.") 269 | parser.add_argument("--weight_decay", default=0.01, type=float, 270 | help="Weight deay if we apply some.") 271 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 272 | help="Epsilon for Adam optimizer.") 273 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 274 | help="Max gradient norm.") 275 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 276 | help="Total number of training epochs to perform.") 277 | parser.add_argument("--max_steps", default=-1, type=int, 278 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 279 | parser.add_argument("--warmup_steps", default=0, type=int, 280 | help="Linear warmup over warmup_steps.") 281 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 282 | help="Linear warmup over warmup proportion.") 283 | parser.add_argument('--logging_steps', type=int, default=50, 284 | help="Log every X updates steps.") 285 | parser.add_argument('--save_steps', type=int, default=50, 286 | help="Save checkpoint every X updates steps.") 287 | parser.add_argument("--no_cuda", action='store_true', 288 | help="Avoid using CUDA when available") 289 | parser.add_argument('--overwrite_output_dir', action='store_true', 290 | help="Overwrite the content of the output directory") 291 | parser.add_argument('--seed', type=int, default=2555, 292 | help="random seed for initialization") 293 | parser.add_argument('--fp16', action='store_true', 294 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 295 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 296 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 297 | "See details at https://nvidia.github.io/apex/amp.html") 298 | parser.add_argument("--local_rank", type=int, default=-1, 299 | help="For distributed training: local_rank") 300 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 301 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 302 | args = parser.parse_args() 303 | 304 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir: 305 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 306 | if not os.path.exists(args.output_dir): 307 | os.makedirs(args.output_dir) 308 | 309 | # Setup CUDA, GPU & distributed training 310 | if args.local_rank == -1 or args.no_cuda: 311 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 312 | args.n_gpu = torch.cuda.device_count() 313 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 314 | torch.cuda.set_device(args.local_rank) 315 | device = torch.device("cuda", args.local_rank) 316 | torch.distributed.init_process_group(backend='nccl') 317 | args.n_gpu = 1 318 | args.device = device 319 | 320 | if args.do_train: 321 | for handler in logging.root.handlers[:]: 322 | logging.root.removeHandler(handler) 323 | # Setup logging 324 | if args.do_train: 325 | log_file = os.path.join(args.output_dir, 'train.log') 326 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 327 | datefmt = '%m/%d/%Y %H:%M:%S', 328 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 329 | filename=log_file) 330 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 331 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 332 | os.system("cp run_lm_gpt2.py %s" % os.path.join(args.output_dir, 'run_lm_gpt2.py')) 333 | os.system("cp ../data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 334 | 335 | # Set seed 336 | set_seed(args) 337 | args.task_name = args.task_name.lower() 338 | if args.task_name not in myprocessors: 339 | raise ValueError("Task not found: %s" % (args.task_name)) 340 | 341 | args.model_type = args.model_type.lower() 342 | config_class, model_class, 
tokenizer_class = MODEL_CLASSES[args.model_type] 343 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 344 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 345 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 346 | 347 | count = count_parameters(model) 348 | print (count) 349 | special_tokens_dict = {'mask_token': ''} 350 | num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) 351 | model.resize_token_embeddings(len(tokenizer)) 352 | 353 | if args.local_rank == 0: 354 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 355 | 356 | model.to(args.device) 357 | 358 | logger.info("Training/evaluation parameters %s", args) 359 | if args.do_train: 360 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 361 | global_step, tr_loss = train(args, train_dataset, model, tokenizer) 362 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 363 | # Evaluation 364 | results = {} 365 | 366 | return results 367 | 368 | if __name__ == "__main__": 369 | main() -------------------------------------------------------------------------------- /src/Data_generation/generate_from_ATOMIC.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from tqdm import tqdm 4 | import json 5 | import re 6 | import ftfy 7 | import random 8 | from collections import Counter, defaultdict 9 | import nltk 10 | from nltk.corpus import stopwords 11 | skip_words = set(stopwords.words('english')) 12 | skip_words.add('\'s') 13 | skip_words.add('.') 14 | skip_words.add(',') 15 | import sys 16 | sys.path.append('../') 17 | sys.path.append('.') 18 | import os 19 | import argparse 20 | from Training.data_utils import PERSON_NAMES 21 | from sentence_transformers import SentenceTransformer, util 22 | import pickle 23 | import numpy as np 24 | import torch 25 | 26 | def text_standardize(text): 27 | """ 28 | Borrowed from COMET repo 29 | """ 30 | text = text.replace('—', '-') 31 | text = text.replace('–', '-') 32 | text = text.replace('―', '-') 33 | text = text.replace('…', '...') 34 | text = text.replace('´', "'") 35 | text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) 36 | text = re.sub(r'\s*\n\s*', ' \n ', text) 37 | text = re.sub(r'[^\S\n]+', ' ', text) 38 | return text.strip() 39 | 40 | def overlap_exist(tail, keywords): 41 | tail = nltk.word_tokenize(tail.lower()) 42 | if len(set(tail).intersection(keywords)) > 0: 43 | return True 44 | else: 45 | return False 46 | 47 | def write_data(filename, data): 48 | with open(filename, 'w') as fout: 49 | for sample in data: 50 | fout.write(json.dumps(sample)) 51 | fout.write('\n') 52 | 53 | def read_data(filename): 54 | data = [] 55 | with open(filename, 'r') as f: 56 | for line in f: 57 | data.append(json.loads(line)) 58 | return data 59 | 60 | class ATOMICProcessor(object): 61 | def __init__(self, args): 62 | self.mapping = { 63 | 'xAttr' : '. PersonX is seen as', 64 | 'xIntent' : '. Before, PersonX wanted', 65 | 'xNeed' : '. Before, PersonX needed to', 66 | 'xReact': '. 
As a result, PersonX felt', 67 | 'xWant': '. As a result, PersonX wanted to', 68 | 'xEffect': '. PersonX then', 69 | 'oReact': '. As a result, others felt', 70 | 'oWant': '. As a result, others wanted to', 71 | 'oEffect': '. Others then' 72 | } 73 | self.xset = ['PersonX', 'Personx', 'personX', 'personx', 'Person X', 'Person x', 'person X', 'person x'] 74 | self.yset = ['PersonY', 'Persony', 'personY', 'persony', 'Person Y', 'Person y', 'person Y', 'person y'] 75 | self.zset = ['PersonZ', 'Personz', 'personZ', 'personz', 'Person Z', 'Person z', 'person Z', 'person z'] 76 | self.xset1 = [' X ', ' x ', ' X\'', ' x\'', ' X.', ' x.'] 77 | self.yset1 = [' Y ', ' y ', ' Y\'', ' y\'', ' Y.', ' y.'] 78 | self.zset1 = [' Z ', ' z ', ' Z\'', ' z\'', ' Z.', ' z.'] 79 | self.answerKey_mapping = {} 80 | self.D = [[], []] 81 | self.labels=[] 82 | self.filelist = [args.train_KG, args.dev_KG] 83 | self.tail_keywords = defaultdict(set) 84 | self.adv = False 85 | 86 | def get_person_set(self, context): 87 | person_set = [] 88 | if any([x in context for x in self.xset+self.xset1]): 89 | person_set += self.xset+self.xset1 90 | if any([y in context for y in self.yset+self.yset1]): 91 | person_set += self.yset+self.yset1 92 | if any([z in context for z in self.zset+self.zset1]): 93 | person_set += self.zset+self.zset1 94 | return person_set 95 | 96 | def find_underscore_length(self, seq): 97 | start = "_" 98 | while start in seq: 99 | start += "_" 100 | return start[:-1] 101 | 102 | def fill_names(self, sent, names): 103 | for x in self.xset: 104 | sent = sent.replace(x, names[0]) 105 | for x in self.xset1: 106 | sent = sent.replace(x, x[0]+names[0]+x[-1]) 107 | for y in self.yset: 108 | sent = sent.replace(y, names[1]) 109 | for y in self.yset1: 110 | sent = sent.replace(y, y[0]+names[0]+y[-1]) 111 | for z in self.zset: 112 | sent = sent.replace(z, names[2]) 113 | for z in self.zset1: 114 | sent = sent.replace(z, z[0]+names[0]+z[-1]) 115 | return sent 116 | 117 | def fix_templates(self, context, tail): 118 | if context.endswith('wanted to') and tail.startswith('wanted to'): 119 | tail = tail[9:].strip() 120 | if context.endswith('needed to') and tail.startswith('needed to'): 121 | tail = tail[9:].strip() 122 | if context.endswith('to') and tail.startswith('to'): 123 | tail = tail[2:].strip() 124 | if len(tail) != 0: 125 | tail = tail[0].lower()+tail[1:] 126 | if not tail.endswith('.'): 127 | tail += '.' 
128 | return tail 129 | 130 | def negative_sample(self, prefix, dim, correct_ones, data, person_set, question, correct_answer): 131 | negatives = [] 132 | while len(negatives) < 2: 133 | sample = random.choice(data) 134 | if len(sample[1][dim]) == 0: 135 | continue 136 | neg = random.choice(sample[1][dim]) 137 | if len(set(prefix).intersection(self.tail_keywords[(neg, dim)])) != 0: 138 | continue 139 | if neg in correct_ones: 140 | continue 141 | if neg in negatives: 142 | continue 143 | if neg[:-1] in correct_answer[:-1].split() or correct_answer[:-1] in neg[:-1].split(): 144 | continue 145 | if len(person_set) < len(self.xset+self.xset1)*2 and any([y in neg for y in self.yset+self.yset1]): 146 | continue 147 | if len(person_set) < len(self.xset+self.xset1)*3 and any([z in neg for z in self.zset+self.zset1]): 148 | continue 149 | negatives.append(neg) 150 | return negatives 151 | 152 | def create_dataset(self, data): 153 | generated_data = [] 154 | count = 0 155 | for sample in tqdm(data): 156 | for k, v in sample[1].items(): 157 | if len(v) != 0: 158 | context = text_standardize(ftfy.fix_text(sample[0])) 159 | person_set = self.get_person_set(context) 160 | question = self.mapping[k] 161 | for vv in v: 162 | correct_answer = vv 163 | if overlap_exist(correct_answer, sample[-1]): 164 | continue 165 | negative_answers = self.negative_sample(sample[-1], k, v, data, person_set, context+question, correct_answer) 166 | if negative_answers == None: 167 | continue 168 | names = random.sample(PERSON_NAMES, 3) 169 | new_context = self.fill_names(context+question, names) 170 | correct_answer = self.fill_names(correct_answer, names) 171 | negative_answers = [self.fill_names(neg, names) for neg in negative_answers] 172 | candidates = negative_answers+[correct_answer] 173 | random.shuffle(candidates) 174 | label = candidates.index(correct_answer) 175 | count += 1 176 | generated_data.append({'id':str(count), 'dim':k, 'context':new_context, 'correct':label, 'candidates':candidates, 'keywords': sample[-1]}) 177 | return generated_data 178 | 179 | def get_train_examples(self): 180 | self.load_data(self.filelist[0], 0) 181 | return self.create_dataset(self.D[0]) 182 | 183 | def get_dev_examples(self): 184 | self.load_data(self.filelist[1], 1) 185 | return self.create_dataset(self.D[1]) 186 | 187 | def load_data(self, filename, sid): 188 | skipped = 0 189 | previous = 'random stuff' 190 | prefix = 'random stuff' 191 | cache = None 192 | with open(filename, "r") as f: 193 | csvreader = csv.reader(f) 194 | fields = next(csvreader) 195 | for row in tqdm(csvreader): 196 | if row[0] != previous: 197 | if cache != None: 198 | self.D[sid].append([previous, cache, prefix]) 199 | previous = row[0] 200 | cache = {k:[] for k, v in self.mapping.items()} 201 | row[1:-1] = [json.loads(e) for e in row[1:-1]] 202 | prefix = row[-2] 203 | for i, attr in enumerate(row[1:-2]): 204 | for ending in attr: 205 | ending = ending.lower() 206 | ending = self.fix_templates(self.mapping[fields[i+1]], text_standardize(ftfy.fix_text(ending))) 207 | if '_' in ending: 208 | tok = self.find_underscore_length(ending) 209 | ending = ending.replace(tok, "___") 210 | if ending != 'none.' 
and len(ending) > 0 and ending not in cache[fields[i+1]]: 211 | self.tail_keywords[(ending, fields[i+1])] |= set(prefix) 212 | cache[fields[i+1]].append(ending) 213 | if cache != None: 214 | self.D[sid].append([previous, cache, prefix]) 215 | print (len(self.D[sid])) 216 | 217 | class ATOMICAdvAnswerProcessor(ATOMICProcessor): 218 | def __init__(self, args): 219 | super(ATOMICAdvAnswerProcessor, self).__init__(args) 220 | with open(os.path.join(args.out_dir, 'atomic_tails.pkl'), "rb") as fin: 221 | d = pickle.load(fin) 222 | self.tail_index = d['sentences'] 223 | self.reverse_tail_index = {v:k for k, v in self.tail_index.items()} 224 | self.embeddings = d['embeddings'] 225 | self.lower_bounds = Counter() 226 | self.high_prob = 0.4 227 | self.low_prob = 0.3 228 | self.patience = 10 229 | self.step_size = 0.05 230 | self.downsample_size = 50 231 | self.adv = True 232 | 233 | def negative_sample(self, prefix, dim, correct_ones, data, person_set, question, correct_answer): 234 | negatives = [] 235 | curr_data = random.choices(data, k=self.downsample_size) 236 | distractors = list(set([neg for sample in curr_data for neg in sample[1][dim]])) 237 | distractors = [neg for neg in distractors if len(set(prefix).intersection(self.tail_keywords[(neg, dim)])) == 0] 238 | distractors_mapping = {i:self.tail_index[neg] for i, neg in enumerate(distractors)} 239 | distractors_indices = list(distractors_mapping.values()) 240 | distractor_emb = self.embeddings[distractors_indices] 241 | correct_emb = self.embeddings[self.tail_index[correct_answer]] 242 | cos_scores = util.pytorch_cos_sim(correct_emb, distractor_emb)[0] 243 | high_prob = self.high_prob 244 | low_prob = self.low_prob 245 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 246 | midinf = 0 247 | while len(midpoint) < self.patience and midinf < self.patience: 248 | midinf += 1 249 | low_prob -= self.step_size 250 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 251 | if len(midpoint) == 0: 252 | print ('empty') 253 | return None 254 | infinite = 0 255 | while len(negatives) < 2 and infinite < self.patience: 256 | infinite += 1 257 | sample_idx = random.choice(midpoint) 258 | neg = self.reverse_tail_index[distractors_mapping[sample_idx.item()]] 259 | if neg in correct_ones: 260 | continue 261 | if neg in negatives: 262 | continue 263 | if neg[:-1] in correct_answer[:-1].split() or correct_answer[:-1] in neg[:-1].split(): 264 | continue 265 | if len(person_set) < len(self.xset+self.xset1)*2 and any([y in neg for y in self.yset+self.yset1]): 266 | continue 267 | if len(person_set) < len(self.xset+self.xset1)*3 and any([z in neg for z in self.zset+self.zset1]): 268 | continue 269 | negatives.append(neg) 270 | self.lower_bounds[low_prob] += 1 271 | if len(negatives) < 2: 272 | return None 273 | return negatives 274 | 275 | class ATOMICAdvQuestionProcessor(ATOMICProcessor): 276 | def __init__(self, args): 277 | super(ATOMICAdvQuestionProcessor, self).__init__(args) 278 | with open(os.path.join(args.out_dir, 'atomic_tails.pkl'), "rb") as fin: 279 | d = pickle.load(fin) 280 | self.tail_index = d['sentences'] 281 | self.reverse_tail_index = {v:k for k, v in self.tail_index.items()} 282 | self.tail_embeddings = d['embeddings'] 283 | with open(os.path.join(args.out_dir, 'atomic_heads.pkl'), "rb") as fin: 284 | d = pickle.load(fin) 285 | self.head_index = d['sentences'] 286 | self.revers_head_index = {v:k for k, v in self.head_index.items()} 287 | 
self.head_embeddings = d['embeddings'] 288 | self.lower_bounds = Counter() 289 | self.high_prob = 0.4 290 | self.low_prob = 0.3 291 | self.patience = 10 292 | self.step_size = 0.05 293 | self.downsample_size = 200 294 | self.adv = True 295 | 296 | def negative_sample(self, prefix, dim, correct_ones, data, person_set, question, correct_answer): 297 | negatives = [] 298 | curr_data = random.choices(data, k=self.downsample_size) 299 | distractors = list(set([neg for sample in curr_data for neg in sample[1][dim]])) 300 | distractors = [neg for neg in distractors if len(set(prefix).intersection(self.tail_keywords[(neg, dim)])) == 0] 301 | distractors_mapping = {i:self.tail_index[neg] for i, neg in enumerate(distractors)} 302 | distractors_indices = list(distractors_mapping.values()) 303 | distractor_emb = self.tail_embeddings[distractors_indices] 304 | question_emb = self.head_embeddings[self.head_index[question]] 305 | cos_scores = util.pytorch_cos_sim(question_emb, distractor_emb)[0] 306 | high_prob = self.high_prob 307 | low_prob = self.low_prob 308 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 309 | midinf = 0 310 | while len(midpoint) < self.patience and midinf < self.patience: 311 | midinf += 1 312 | low_prob -= self.step_size 313 | midpoint = np.argwhere((cos_scores.numpy()>low_prob) & (cos_scores.numpy() < high_prob)).squeeze(1) 314 | if len(midpoint) == 0: 315 | print ('empty') 316 | return None 317 | infinite = 0 318 | while len(negatives) < 2 and infinite < self.patience: 319 | infinite += 1 320 | sample_idx = random.choice(midpoint) 321 | neg = self.reverse_tail_index[distractors_mapping[sample_idx.item()]] 322 | if neg in correct_ones: 323 | continue 324 | if neg in negatives: 325 | continue 326 | if neg[:-1] in correct_answer[:-1].split() or correct_answer[:-1] in neg[:-1].split(): 327 | continue 328 | if len(person_set) < len(self.xset+self.xset1)*2 and any([y in neg for y in self.yset+self.yset1]): 329 | continue 330 | if len(person_set) < len(self.xset+self.xset1)*3 and any([z in neg for z in self.zset+self.zset1]): 331 | continue 332 | negatives.append(neg) 333 | self.lower_bounds[low_prob] += 1 334 | if len(negatives) < 2: 335 | return None 336 | return negatives 337 | 338 | def build_embeddings_answers(args): 339 | if os.path.exists(os.path.join(args.out_dir, 'atomic_tails.pkl')): 340 | print ('tail embeddings already exist, skip computation') 341 | return 342 | processor = ATOMICProcessor(args) 343 | model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens') 344 | all_tails = {} 345 | files = [args.train_KG, args.dev_KG] 346 | for file in files: 347 | with open(file, 'r') as f: 348 | csvreader = csv.reader(f) 349 | fields = next(csvreader) 350 | for row in tqdm(csvreader): 351 | row[1:-1] = [json.loads(e) for e in row[1:-1]] 352 | for i, attr in enumerate(row[1:-2]): 353 | for ending in attr: 354 | ending = ending.lower() 355 | if ending != 'none': 356 | tail = text_standardize(ftfy.fix_text(ending)) 357 | tail = processor.fix_templates(processor.mapping[fields[i+1]], tail) 358 | if '_' in tail: 359 | tok = processor.find_underscore_length(tail) 360 | tail = tail.replace(tok, "___") 361 | if tail not in all_tails: 362 | all_tails[tail] = len(all_tails) 363 | print (len(all_tails)) 364 | corpus = [k for k, v in all_tails.items()] 365 | embeddings = model.encode(corpus, show_progress_bar=True, device=0, num_workers=4) 366 | print (len(embeddings), embeddings.shape) 367 | with open(os.path.join(args.out_dir, 
'atomic_tails.pkl'), "wb") as fOut: 368 | pickle.dump({'sentences': all_tails, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL) 369 | 370 | def build_embeddings_question(args): 371 | if os.path.exists(os.path.join(args.out_dir, 'atomic_heads.pkl')): 372 | print ('head embeddings already exist, skip computation') 373 | return 374 | processor = ATOMICProcessor(args) 375 | model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens') 376 | all_heads = {} 377 | files = [args.train_KG, args.dev_KG] 378 | previous = 'random stuff' 379 | for file in files: 380 | with open(file, 'r') as f: 381 | csvreader = csv.reader(f) 382 | fields = next(csvreader) 383 | for row in tqdm(csvreader): 384 | row[1:-1] = [json.loads(e) for e in row[1:-1]] 385 | if row[0] != previous: 386 | previous = row[0] 387 | head = text_standardize(ftfy.fix_text(row[0])) 388 | for i, attr in enumerate(row[1:-2]): 389 | rel = processor.mapping[fields[i+1]] 390 | question = head + rel 391 | if question not in all_heads: 392 | all_heads[question] = len(all_heads) 393 | 394 | print (len(all_heads)) 395 | corpus = list(all_heads.keys()) 396 | embeddings1 = model.encode(corpus[:100000], show_progress_bar=True, device=0, num_workers=4) 397 | embeddings2 = model.encode(corpus[100000:], show_progress_bar=True, device=0, num_workers=4) 398 | embeddings = np.concatenate([embeddings1, embeddings2], axis=0) 399 | print (len(embeddings), embeddings.shape) 400 | with open(os.path.join(args.out_dir, 'atomic_heads.pkl'), "wb") as fOut: 401 | pickle.dump({'sentences': all_heads, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL) 402 | 403 | if __name__ == '__main__': 404 | parser = argparse.ArgumentParser() 405 | parser.add_argument("--train_KG", default=None, type=str, required=True, help="ATOMIC train file") 406 | parser.add_argument("--dev_KG", default=None, type=str, required=True, help="ATOMIC dev file") 407 | parser.add_argument("--strategy", default='random', type=str, required=False, choices=['random', 'adv-answer', 'adv-question'], help="which data generation strategy to use") 408 | parser.add_argument("--out_dir", default=None, type=str, required=True, help="Output dir") 409 | parser.add_argument('--do_split', action="store_true", help="Further split training set into subsets for AFLite") 410 | args = parser.parse_args() 411 | random.seed(1) 412 | np.random.seed(1) 413 | if args.strategy == 'random': 414 | processor = ATOMICProcessor(args) 415 | elif args.strategy == 'adv-answer': 416 | print ('Using adv-answer strategy') 417 | build_embeddings_answers(args) 418 | processor = ATOMICAdvAnswerProcessor(args) 419 | elif args.strategy == 'adv-question': 420 | print ('Using adv-question strategy') 421 | build_embeddings_answers(args) 422 | build_embeddings_question(args) 423 | processor = ATOMICAdvQuestionProcessor(args) 424 | else: 425 | print ('strategy not recognized') 426 | exit(0) 427 | dev_examples = processor.get_dev_examples() 428 | write_data(os.path.join(args.out_dir, 'dev_'+args.strategy+'.jsonl'), dev_examples) 429 | train_examples = processor.get_train_examples() 430 | write_data(os.path.join(args.out_dir, 'train_'+args.strategy+'.jsonl'), train_examples) 431 | if args.do_split: 432 | assert args.strategy == 'random' 433 | random.shuffle(train_examples) 434 | print ('splitting train into subsets, which can be used for AFLite (only valid for random strategy)') 435 | train_examples_1 = train_examples[:int(len(train_examples)*0.01)] 436 | train_examples_4 = 
train_examples[int(len(train_examples)*0.01):int(len(train_examples)*0.05)] 437 | train_examples_95 = train_examples[int(len(train_examples)*0.05):] 438 | write_data(os.path.join(args.out_dir, 'train_1%_'+args.strategy+'.jsonl'), train_examples_1) 439 | write_data(os.path.join(args.out_dir, 'train_4%_'+args.strategy+'.jsonl'), train_examples_4) 440 | write_data(os.path.join(args.out_dir, 'train_95%_'+args.strategy+'.jsonl'), train_examples_95) 441 | 442 | 443 | 444 | 445 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. 
To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. 
If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. 
Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /src/Training/MLM/run_mlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | from tqdm import tqdm, trange 32 | import sys 33 | sys.path.append('../') 34 | sys.path.append('.') 35 | from transformers import (WEIGHTS_NAME, RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 36 | from transformers import AdamW, get_linear_schedule_with_warmup 37 | from data_utils import myprocessors, handle_underscores 38 | from run_pretrain import convert_examples_to_features, MyDataset 39 | from run_pretrain import evaluate as evaluate_func 40 | import json 41 | from collections import Counter 42 | logger = logging.getLogger(__name__) 43 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 44 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 45 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 46 | 47 | MODEL_CLASSES = { 48 | 'roberta-mlm': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 49 | } 50 | 51 | class MLMDataset(torch.utils.data.Dataset): 52 | def __init__(self, data): 53 | self.data = data 54 | 55 | def __len__(self): 56 | return len(self.data) 57 | 58 | def __getitem__(self, idx): 59 | sample = self.data[idx] 60 | return sample 61 | 62 | def mask_tokens(batch_inputs, batch_labels, tokenizer, mlm_probability): 63 | """ 64 | Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 65 | """ 66 | if tokenizer.mask_token is None: 67 | raise ValueError( 68 | "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." 
69 | ) 70 | probability_matrix = torch.full(batch_labels.shape, mlm_probability) 71 | invalid_tokens_mask = [[t==-100 for t in val] for val in batch_labels.tolist()] 72 | probability_matrix.masked_fill_(torch.tensor(invalid_tokens_mask, dtype=torch.bool), value=0.0) 73 | masked_indices = torch.bernoulli(probability_matrix).bool() 74 | batch_labels[~masked_indices] = -100 # We only compute loss on masked tokens 75 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 76 | indices_replaced = torch.bernoulli(torch.full(batch_labels.shape, 0.8)).bool() & masked_indices 77 | batch_inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) 78 | 79 | # 10% of the time, we replace masked input tokens with random word 80 | indices_random = torch.bernoulli(torch.full(batch_labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 81 | random_words = torch.randint(len(tokenizer), batch_labels.shape, dtype=torch.long) 82 | batch_inputs[indices_random] = random_words[indices_random] 83 | 84 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 85 | return batch_inputs, batch_labels 86 | 87 | def mCollateFn(batch): 88 | batch_input_ids = [] 89 | batch_input_mask = [] 90 | batch_label_ids = [] 91 | max_len = max([len(f[0]) for f in batch]) 92 | for f in batch: 93 | input_ids = np.ones(max_len) 94 | input_ids[:len(f[0])] = f[0] 95 | labels = np.full(max_len, -100) 96 | labels[:len(f[1])] = f[1] 97 | mask = np.zeros(max_len) 98 | mask[:len(f[0])] = 1 99 | batch_input_ids.append(input_ids) 100 | batch_input_mask.append(mask) 101 | batch_label_ids.append(labels) 102 | 103 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 104 | batch_input_mask = torch.tensor(batch_input_mask, dtype=torch.long) 105 | batch_label_ids = torch.tensor(batch_label_ids, dtype=torch.long) 106 | return batch_input_ids, batch_input_mask, batch_label_ids 107 | 108 | def convert_examples_to_features_mlm(examples, tokenizer, max_length=512): 109 | data = [] 110 | valid_tokens = 0 111 | total_tokens = 0 112 | for example in examples: 113 | inputs, labels = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 114 | t_inputs, t_labels = handle_underscores(example['ending'], tokenizer) 115 | input_ids = tokenizer.convert_tokens_to_ids(inputs+t_inputs) 116 | label_ids = [t if t == -100 else input_ids[t_i] for t_i, t in enumerate(labels+t_labels)] 117 | valid_tokens += len([t for t in label_ids if t != -100]) 118 | total_tokens += len(label_ids) 119 | input_ids = tokenizer.prepare_for_model(input_ids, max_length=max_length, truncation=True)['input_ids'] 120 | label_ids = [-100] + label_ids + [-100] 121 | data.append([input_ids, label_ids]) 122 | #print (valid_tokens, total_tokens) 123 | return data 124 | 125 | def set_seed(args): 126 | random.seed(args.seed) 127 | np.random.seed(args.seed) 128 | torch.manual_seed(args.seed) 129 | if args.n_gpu > 0: 130 | torch.cuda.manual_seed_all(args.seed) 131 | 132 | def count_parameters(model): 133 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 134 | 135 | def train(args, train_dataset, model, tokenizer, eval_dataset): 136 | """ Train the model """ 137 | if args.local_rank in [-1, 0]: 138 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 139 | 140 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 141 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else 
DistributedSampler(train_dataset) 142 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 143 | 144 | if args.max_steps > 0: 145 | t_total = args.max_steps 146 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 147 | else: 148 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 149 | 150 | # Prepare optimizer and schedule (linear warmup and decay) 151 | no_decay = ['bias', 'LayerNorm.weight'] 152 | optimizer_grouped_parameters = [ 153 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 154 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 155 | ] 156 | 157 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 158 | logger.info("warm up steps = %d", warmup_steps) 159 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 160 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 161 | 162 | if args.fp16: 163 | try: 164 | from apex import amp 165 | except ImportError: 166 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 167 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 168 | 169 | # multi-gpu training (should be after apex fp16 initialization) 170 | if args.n_gpu > 1: 171 | model = torch.nn.DataParallel(model) 172 | 173 | # Distributed training (should be after apex fp16 initialization) 174 | if args.local_rank != -1: 175 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 176 | output_device=args.local_rank, 177 | find_unused_parameters=True) 178 | # Train! 179 | logger.info("***** Running training *****") 180 | logger.info(" Num examples = %d", len(train_dataset)) 181 | logger.info(" Num Epochs = %d", args.num_train_epochs) 182 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 183 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", 184 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 185 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 186 | logger.info(" Total optimization steps = %d", t_total) 187 | 188 | global_step = 0 189 | tr_loss, logging_loss = 0.0, 0.0 190 | model.zero_grad() 191 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 192 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 193 | curr_best = 0.0 194 | for _ in train_iterator: 195 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 196 | for step, batch in enumerate(epoch_iterator): 197 | model.train() 198 | input_ids, mlm_labels = mask_tokens(batch[0], batch[2], tokenizer, args.mlm_probability) 199 | inputs = {'input_ids': input_ids.cuda(), 200 | 'attention_mask': batch[1].cuda(), 201 | 'masked_lm_labels': mlm_labels.cuda()} 202 | outputs = model(**inputs) 203 | loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) 204 | 205 | if args.n_gpu > 1: 206 | loss = loss.mean() # mean() to average on multi-gpu parallel training 207 | if args.gradient_accumulation_steps > 1: 208 | loss = loss / args.gradient_accumulation_steps 209 | 210 | if args.fp16: 211 | with amp.scale_loss(loss, optimizer) as scaled_loss: 212 | scaled_loss.backward() 213 | else: 214 | loss.backward() 215 | 216 | tr_loss += loss.item() 217 | if (step + 1) % args.gradient_accumulation_steps == 0: 218 | optimizer.step() 219 | scheduler.step() # Update learning rate schedule 220 | model.zero_grad() 221 | global_step += 1 222 | 223 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 224 | # Log metrics 225 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 226 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 227 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 228 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 229 | logging_loss = tr_loss 230 | 231 | if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0: 232 | results = evaluate_func(args, model, tokenizer, eval_dataset) 233 | for key, value in results.items(): 234 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 235 | if results['acc'] > curr_best: 236 | curr_best = results['acc'] 237 | # Save model checkpoint 238 | output_dir = args.output_dir 239 | if not os.path.exists(output_dir): 240 | os.makedirs(output_dir) 241 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 242 | model_to_save.save_pretrained(output_dir) 243 | tokenizer.save_pretrained(output_dir) 244 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 245 | logger.info("Saving model checkpoint to %s", output_dir) 246 | 247 | 248 | if args.max_steps > 0 and global_step > args.max_steps: 249 | epoch_iterator.close() 250 | break 251 | if args.max_steps > 0 and global_step > args.max_steps: 252 | train_iterator.close() 253 | break 254 | results = evaluate_func(args, model, tokenizer, eval_dataset) 255 | for key, value in results.items(): 256 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 
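# Same best-checkpoint bookkeeping as the save_steps-gated block above: the
# saved model is only overwritten when this evaluation improves dev accuracy.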
257 | if results['acc'] > curr_best: 258 | curr_best = results['acc'] 259 | # Save model checkpoint 260 | output_dir = args.output_dir 261 | if not os.path.exists(output_dir): 262 | os.makedirs(output_dir) 263 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 264 | model_to_save.save_pretrained(output_dir) 265 | tokenizer.save_pretrained(output_dir) 266 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 267 | logger.info("Saving model checkpoint to %s", output_dir) 268 | if args.local_rank in [-1, 0]: 269 | tb_writer.close() 270 | return global_step, tr_loss / global_step 271 | 272 | 273 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 274 | if args.local_rank not in [-1, 0] and not evaluate: 275 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 276 | 277 | processor = myprocessors[task](args) 278 | examples = processor.get_dev_examples() if evaluate else processor.get_train_examples() 279 | feature_func = convert_examples_to_features if evaluate else convert_examples_to_features_mlm 280 | features = feature_func(examples, tokenizer, max_length=args.max_seq_length) 281 | if args.local_rank == 0 and not evaluate: 282 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 283 | if evaluate: 284 | return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask) 285 | else: 286 | return MLMDataset(features) 287 | 288 | def main(): 289 | parser = argparse.ArgumentParser() 290 | 291 | ## Required parameters 292 | parser.add_argument("--train_file", default=None, type=str, required=True, 293 | help="The train file name") 294 | parser.add_argument("--dev_file", default=None, type=str, required=True, 295 | help="The dev file name") 296 | parser.add_argument("--model_type", default=None, type=str, required=True, 297 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 298 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 299 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 300 | parser.add_argument("--config_name", default="", type=str, 301 | help="Pretrained config name or path if not the same as model_name") 302 | parser.add_argument("--tokenizer_name", default="", type=str, 303 | help="Pretrained tokenizer name or path if not the same as model_name") 304 | parser.add_argument("--cache_dir", default="", type=str, 305 | help="Where do you want to store the pre-trained models downloaded from s3") 306 | parser.add_argument("--task_name", default=None, type=str, required=True, 307 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 308 | parser.add_argument("--output_dir", default=None, type=str, required=True, 309 | help="The output directory where the model predictions and checkpoints will be written.") 310 | 311 | ## Other parameters 312 | parser.add_argument("--mlm_probability", default=0.5, type=float, 313 | help="token masking probability, should be 0.5 for ATOMIC and 0.3 for CSKG") 314 | parser.add_argument("--max_seq_length", default=128, type=int, 315 | help="The maximum total input sequence length after tokenization. 
Sequences longer " 316 | "than this will be truncated, sequences shorter will be padded.") 317 | parser.add_argument("--max_words_to_mask", default=6, type=int, 318 | help="The maximum number of tokens to mask when computing scores") 319 | parser.add_argument("--max_sequence_per_time", default=80, type=int, 320 | help="The maximum number of sequences to feed into the model") 321 | parser.add_argument("--do_train", action='store_true', 322 | help="Whether to run training.") 323 | parser.add_argument("--do_eval", action='store_true', 324 | help="Whether to run eval on the dev set.") 325 | parser.add_argument("--evaluate_during_training", action='store_true', 326 | help="Run evaluation during training at each logging step.") 327 | parser.add_argument("--do_lower_case", action='store_true', 328 | help="Set this flag if you are using an uncased model.") 329 | 330 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 331 | help="Batch size per GPU/CPU for training.") 332 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 333 | help="Batch size per GPU/CPU for evaluation.") 334 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 335 | help="Number of updates steps to accumulate before performing a backward/update pass.") 336 | parser.add_argument("--learning_rate", default=1e-5, type=float, 337 | help="The initial learning rate for Adam.") 338 | parser.add_argument("--weight_decay", default=0.01, type=float, 339 | help="Weight deay if we apply some.") 340 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 341 | help="Epsilon for Adam optimizer.") 342 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 343 | help="Max gradient norm.") 344 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 345 | help="Total number of training epochs to perform.") 346 | parser.add_argument("--max_steps", default=-1, type=int, 347 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 348 | parser.add_argument("--warmup_steps", default=0, type=int, 349 | help="Linear warmup over warmup_steps.") 350 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 351 | help="Linear warmup over warmup proportion.") 352 | parser.add_argument('--logging_steps', type=int, default=50, 353 | help="Log every X updates steps.") 354 | parser.add_argument('--save_steps', type=int, default=50, 355 | help="Save checkpoint every X updates steps.") 356 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 357 | help="The file where prediction logits will be written") 358 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 359 | help="The file where eval results will be written") 360 | parser.add_argument("--no_cuda", action='store_true', 361 | help="Avoid using CUDA when available") 362 | parser.add_argument('--overwrite_output_dir', action='store_true', 363 | help="Overwrite the content of the output directory") 364 | parser.add_argument('--seed', type=int, default=2555, 365 | help="random seed for initialization") 366 | parser.add_argument('--fp16', action='store_true', 367 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 368 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 369 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
370 | "See details at https://nvidia.github.io/apex/amp.html") 371 | parser.add_argument("--local_rank", type=int, default=-1, 372 | help="For distributed training: local_rank") 373 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 374 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 375 | args = parser.parse_args() 376 | 377 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir: 378 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 379 | if not os.path.exists(args.output_dir): 380 | os.makedirs(args.output_dir) 381 | 382 | # Setup CUDA, GPU & distributed training 383 | if args.local_rank == -1 or args.no_cuda: 384 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 385 | args.n_gpu = torch.cuda.device_count() 386 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 387 | torch.cuda.set_device(args.local_rank) 388 | device = torch.device("cuda", args.local_rank) 389 | torch.distributed.init_process_group(backend='nccl') 390 | args.n_gpu = 1 391 | args.device = device 392 | 393 | if args.do_train: 394 | for handler in logging.root.handlers[:]: 395 | logging.root.removeHandler(handler) 396 | # Setup logging 397 | if args.do_train: 398 | log_file = os.path.join(args.output_dir, 'train.log') 399 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 400 | datefmt = '%m/%d/%Y %H:%M:%S', 401 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 402 | filename=log_file) 403 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 404 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 405 | os.system("cp run_mlm_roberta.py %s" % os.path.join(args.output_dir, 'run_mlm_roberta.py')) 406 | os.system("cp ../data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 407 | 408 | # Set seed 409 | set_seed(args) 410 | args.task_name = args.task_name.lower() 411 | if args.task_name not in myprocessors: 412 | raise ValueError("Task not found: %s" % (args.task_name)) 413 | 414 | args.model_type = args.model_type.lower() 415 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 416 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 417 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 418 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 419 | 420 | count = count_parameters(model) 421 | print (count) 422 | 423 | if args.local_rank == 0: 424 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 425 | 426 | model.to(args.device) 427 | 428 | logger.info("Training/evaluation parameters %s", args) 429 | 430 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 431 | if args.do_train: 432 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 433 | global_step, tr_loss = train(args, 
train_dataset, model, tokenizer, eval_dataset) 434 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 435 | # Evaluation 436 | results = {} 437 | return results 438 | 439 | if __name__ == "__main__": 440 | main() -------------------------------------------------------------------------------- /src/Training/AFLite/run_roberta_classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | from tqdm import tqdm, trange 32 | import sys 33 | sys.path.append('../') 34 | sys.path.append('.') 35 | from transformers import (WEIGHTS_NAME, RobertaConfig, RobertaTokenizer) 36 | from transformers import AdamW, get_linear_schedule_with_warmup 37 | from data_utils import accuracy, myprocessors, convert_examples_to_features 38 | import json 39 | from custimized_models import RobertaForMultipleChoice 40 | logger = logging.getLogger(__name__) 41 | 42 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 43 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 44 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 45 | 46 | MODEL_CLASSES = { 47 | 'roberta-mc': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer) 48 | } 49 | 50 | class MyDataset(torch.utils.data.Dataset): 51 | 52 | def __init__(self, data, pad_token, mask_token): 53 | self.data = data 54 | self.pad_token = pad_token 55 | self.mask_token = mask_token 56 | 57 | def __len__(self): 58 | return len(self.data) 59 | 60 | def __getitem__(self, idx): 61 | sample = self.data[idx] 62 | return sample, self.pad_token, self.mask_token 63 | 64 | def mCollateFn(batch): 65 | batch_input_ids = [] 66 | batch_input_mask = [] 67 | batch_label_ids = [] 68 | features = [b[0] for b in batch] 69 | pad_token = batch[0][1] 70 | mask_token = batch[0][2] 71 | max_len = max([len(cand) for f in features for cand in f[0]]) 72 | for f in features: 73 | batch_input_ids.append([]) 74 | batch_input_mask.append([]) 75 | batch_label_ids.append(f[2]) 76 | for i in range(len(f[0])): 77 | sequence = f[0][i] + [pad_token]*(max_len-len(f[0][i])) 78 | att_mask = [1]*len(f[0][i]) + [0]*(max_len-len(f[0][i])) 79 | batch_input_ids[-1].append(sequence) 80 | batch_input_mask[-1].append(att_mask) 81 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 82 | batch_input_mask = 
torch.tensor(batch_input_mask, dtype=torch.long) 83 | return batch_input_ids, batch_input_mask, torch.tensor(batch_label_ids, dtype=torch.long) 84 | 85 | def set_seed(args): 86 | random.seed(args.seed) 87 | np.random.seed(args.seed) 88 | torch.manual_seed(args.seed) 89 | if args.n_gpu > 0: 90 | torch.cuda.manual_seed_all(args.seed) 91 | 92 | def count_parameters(model): 93 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 94 | 95 | def train(args, train_dataset, model, tokenizer, eval_dataset): 96 | """ Train the model """ 97 | if args.local_rank in [-1, 0]: 98 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 99 | 100 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 101 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 102 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 103 | 104 | if args.max_steps > 0: 105 | t_total = args.max_steps 106 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 107 | else: 108 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 109 | 110 | # Prepare optimizer and schedule (linear warmup and decay) 111 | no_decay = ['bias', 'LayerNorm.weight'] 112 | optimizer_grouped_parameters = [ 113 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 114 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 115 | ] 116 | 117 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 118 | logger.info("warm up steps = %d", warmup_steps) 119 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 120 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 121 | 122 | if args.fp16: 123 | try: 124 | from apex import amp 125 | except ImportError: 126 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 127 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 128 | 129 | # multi-gpu training (should be after apex fp16 initialization) 130 | if args.n_gpu > 1: 131 | model = torch.nn.DataParallel(model) 132 | 133 | # Distributed training (should be after apex fp16 initialization) 134 | if args.local_rank != -1: 135 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 136 | output_device=args.local_rank, 137 | find_unused_parameters=True) 138 | # Train! 139 | logger.info("***** Running training *****") 140 | logger.info(" Num examples = %d", len(train_dataset)) 141 | logger.info(" Num Epochs = %d", args.num_train_epochs) 142 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 143 | logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", 144 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 145 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 146 | logger.info(" Total optimization steps = %d", t_total) 147 | 148 | global_step = 0 149 | tr_loss, logging_loss = 0.0, 0.0 150 | model.zero_grad() 151 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 152 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 153 | curr_best = 0.0 154 | for _ in train_iterator: 155 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 156 | for step, batch in enumerate(epoch_iterator): 157 | model.train() 158 | inputs = {'input_ids': batch[0].cuda(), 159 | 'attention_mask': batch[1].cuda(), 160 | 'labels': batch[2].cuda()} 161 | outputs = model(**inputs) 162 | loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) 163 | 164 | if args.n_gpu > 1: 165 | loss = loss.mean() # mean() to average on multi-gpu parallel training 166 | if args.gradient_accumulation_steps > 1: 167 | loss = loss / args.gradient_accumulation_steps 168 | 169 | if args.fp16: 170 | with amp.scale_loss(loss, optimizer) as scaled_loss: 171 | scaled_loss.backward() 172 | else: 173 | loss.backward() 174 | 175 | tr_loss += loss.item() 176 | if (step + 1) % args.gradient_accumulation_steps == 0: 177 | optimizer.step() 178 | scheduler.step() # Update learning rate schedule 179 | model.zero_grad() 180 | global_step += 1 181 | 182 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 183 | # Log metrics 184 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 185 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 186 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 187 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 188 | logging_loss = tr_loss 189 | 190 | if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0: 191 | results = evaluate(args, model, tokenizer, eval_dataset) 192 | for key, value in results.items(): 193 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 194 | if results['acc'] > curr_best: 195 | curr_best = results['acc'] 196 | # Save model checkpoint 197 | output_dir = args.output_dir 198 | if not os.path.exists(output_dir): 199 | os.makedirs(output_dir) 200 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 201 | model_to_save.save_pretrained(output_dir) 202 | tokenizer.save_pretrained(output_dir) 203 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 204 | logger.info("Saving model checkpoint to %s", output_dir) 205 | 206 | 207 | if args.max_steps > 0 and global_step > args.max_steps: 208 | epoch_iterator.close() 209 | break 210 | if args.max_steps > 0 and global_step > args.max_steps: 211 | train_iterator.close() 212 | break 213 | results = evaluate(args, model, tokenizer, eval_dataset) 214 | for key, value in results.items(): 215 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 216 | if results['acc'] > curr_best: 217 | curr_best = results['acc'] 218 | # Save model checkpoint 219 | output_dir 
= args.output_dir 220 | if not os.path.exists(output_dir): 221 | os.makedirs(output_dir) 222 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 223 | model_to_save.save_pretrained(output_dir) 224 | tokenizer.save_pretrained(output_dir) 225 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 226 | logger.info("Saving model checkpoint to %s", output_dir) 227 | if args.local_rank in [-1, 0]: 228 | tb_writer.close() 229 | return global_step, tr_loss / global_step 230 | 231 | def save_logits(logits_all, filename): 232 | with open(filename, "w") as f: 233 | for i in range(len(logits_all)): 234 | for j in range(len(logits_all[i])): 235 | f.write(str(logits_all[i][j])) 236 | if j == len(logits_all[i])-1: 237 | f.write("\n") 238 | else: 239 | f.write(" ") 240 | 241 | def evaluate(args, model, tokenizer, eval_dataset): 242 | results = {} 243 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 244 | os.makedirs(args.output_dir) 245 | 246 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 247 | # Note that DistributedSampler samples randomly 248 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 249 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=mCollateFn) 250 | 251 | # Eval! 252 | logger.info("***** Running evaluation *****") 253 | logger.info(" Num examples = %d", len(eval_dataset)) 254 | logger.info(" Batch size = %d", args.eval_batch_size) 255 | preds = None 256 | out_label_ids = None 257 | features = [] 258 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 259 | model.eval() 260 | with torch.no_grad(): 261 | inputs = {'input_ids': batch[0].cuda(), 262 | 'attention_mask': batch[1].cuda(), 263 | 'labels': batch[2].cuda()} 264 | outputs = model(**inputs) 265 | loss, logits = outputs[:2] 266 | batch_features = outputs[2].view(batch[0].shape[0], batch[0].shape[1], -1).detach().cpu() 267 | features.append(batch_features) 268 | if preds is None: 269 | preds = logits.detach().cpu().numpy() 270 | out_label_ids = inputs['labels'].detach().cpu().numpy() 271 | else: 272 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 273 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 274 | features = torch.cat(features, dim=0) 275 | print (features.shape, out_label_ids.shape) 276 | torch.save(features, args.dev_file.replace('.jsonl', '_features')) 277 | torch.save(out_label_ids, args.dev_file.replace('.jsonl', '_labels')) 278 | save_logits(preds, os.path.join(args.output_dir, args.logits_file)) 279 | preds = np.argmax(preds, axis=1) 280 | result = accuracy(preds, out_label_ids) 281 | results.update(result) 282 | output_eval_file = os.path.join(args.output_dir, args.results_file) 283 | with open(output_eval_file, "w") as writer: 284 | logger.info("***** Eval results *****") 285 | for key in sorted(result.keys()): 286 | logger.info(" %s = %s", key, str(result[key])) 287 | writer.write("%s = %s\n" % (key, str(result[key]))) 288 | return results 289 | 290 | def write_data(filename, data): 291 | with open(filename, 'w') as fout: 292 | for sample in data: 293 | fout.write(json.dumps(sample)) 294 | fout.write('\n') 295 | 296 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 297 | if args.local_rank not in [-1, 0] and not evaluate: 298 | torch.distributed.barrier() # Make sure only the first process 
in distributed training process the dataset, and the others will use the cache 299 | processor = myprocessors[task](args) 300 | examples = processor.get_dev_examples() if evaluate else processor.get_train_examples() 301 | features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 302 | if args.local_rank == 0 and not evaluate: 303 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 304 | return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id) 305 | 306 | def main(): 307 | parser = argparse.ArgumentParser() 308 | 309 | ## Required parameters 310 | parser.add_argument("--train_file", default=None, type=str, required=True, 311 | help="The train file name") 312 | parser.add_argument("--dev_file", default=None, type=str, required=True, 313 | help="The dev file name") 314 | parser.add_argument("--model_type", default=None, type=str, required=True, 315 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 316 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 317 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 318 | parser.add_argument("--config_name", default="", type=str, 319 | help="Pretrained config name or path if not the same as model_name") 320 | parser.add_argument("--tokenizer_name", default="", type=str, 321 | help="Pretrained tokenizer name or path if not the same as model_name") 322 | parser.add_argument("--cache_dir", default="", type=str, 323 | help="Where do you want to store the pre-trained models downloaded from s3") 324 | parser.add_argument("--task_name", default=None, type=str, required=True, 325 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 326 | parser.add_argument("--output_dir", default=None, type=str, required=True, 327 | help="The output directory where the model predictions and checkpoints will be written.") 328 | 329 | ## Other parameters 330 | parser.add_argument("--max_seq_length", default=128, type=int, 331 | help="The maximum total input sequence length after tokenization. 
Sequences longer " 332 | "than this will be truncated, sequences shorter will be padded.") 333 | parser.add_argument("--do_train", action='store_true', 334 | help="Whether to run training.") 335 | parser.add_argument("--do_eval", action='store_true', 336 | help="Whether to run eval on the dev set.") 337 | parser.add_argument("--evaluate_during_training", action='store_true', 338 | help="Run evaluation during training at each logging step.") 339 | parser.add_argument("--do_lower_case", action='store_true', 340 | help="Set this flag if you are using an uncased model.") 341 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 342 | help="Batch size per GPU/CPU for training.") 343 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 344 | help="Batch size per GPU/CPU for evaluation.") 345 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 346 | help="Number of updates steps to accumulate before performing a backward/update pass.") 347 | parser.add_argument("--learning_rate", default=1e-5, type=float, 348 | help="The initial learning rate for Adam.") 349 | parser.add_argument("--weight_decay", default=0.01, type=float, 350 | help="Weight deay if we apply some.") 351 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 352 | help="Epsilon for Adam optimizer.") 353 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 354 | help="Max gradient norm.") 355 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 356 | help="Total number of training epochs to perform.") 357 | parser.add_argument("--max_steps", default=-1, type=int, 358 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 359 | parser.add_argument("--warmup_steps", default=0, type=int, 360 | help="Linear warmup over warmup_steps.") 361 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 362 | help="Linear warmup over warmup proportion.") 363 | parser.add_argument('--logging_steps', type=int, default=50, 364 | help="Log every X updates steps.") 365 | parser.add_argument('--save_steps', type=int, default=50, 366 | help="Save checkpoint every X updates steps.") 367 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 368 | help="The file where prediction logits will be written") 369 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 370 | help="The file where eval results will be written") 371 | parser.add_argument("--no_cuda", action='store_true', 372 | help="Avoid using CUDA when available") 373 | parser.add_argument('--overwrite_output_dir', action='store_true', 374 | help="Overwrite the content of the output directory") 375 | parser.add_argument('--seed', type=int, default=2555, 376 | help="random seed for initialization") 377 | parser.add_argument('--fp16', action='store_true', 378 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 379 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 380 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
381 | "See details at https://nvidia.github.io/apex/amp.html") 382 | parser.add_argument("--local_rank", type=int, default=-1, 383 | help="For distributed training: local_rank") 384 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 385 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 386 | args = parser.parse_args() 387 | 388 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir and args.do_train: 389 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 390 | if not os.path.exists(args.output_dir): 391 | os.makedirs(args.output_dir) 392 | 393 | # Setup CUDA, GPU & distributed training 394 | if args.local_rank == -1 or args.no_cuda: 395 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 396 | args.n_gpu = torch.cuda.device_count() 397 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 398 | torch.cuda.set_device(args.local_rank) 399 | device = torch.device("cuda", args.local_rank) 400 | torch.distributed.init_process_group(backend='nccl') 401 | args.n_gpu = 1 402 | args.device = device 403 | 404 | if args.do_train: 405 | for handler in logging.root.handlers[:]: 406 | logging.root.removeHandler(handler) 407 | # Setup logging 408 | if args.do_train: 409 | log_file = os.path.join(args.output_dir, 'train.log') 410 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 411 | datefmt = '%m/%d/%Y %H:%M:%S', 412 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 413 | filename=log_file) 414 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 415 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 416 | os.system("cp run_roberta_classification.py %s" % os.path.join(args.output_dir, 'run_roberta_classification.py')) 417 | os.system("cp ../data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 418 | 419 | # Set seed 420 | set_seed(args) 421 | args.task_name = args.task_name.lower() 422 | if args.task_name not in myprocessors: 423 | raise ValueError("Task not found: %s" % (args.task_name)) 424 | 425 | args.model_type = args.model_type.lower() 426 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 427 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 428 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 429 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 430 | 431 | count = count_parameters(model) 432 | print (count) 433 | 434 | if args.local_rank == 0: 435 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 436 | 437 | model.to(args.device) 438 | 439 | logger.info("Training/evaluation parameters %s", args) 440 | 441 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 442 | 443 | if args.do_train: 444 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 
evaluate=False) 445 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset) 446 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 447 | # Evaluation 448 | results = {} 449 | if args.do_eval: 450 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 451 | model = model_class.from_pretrained(args.output_dir) 452 | model.eval() 453 | model.to(args.device) 454 | result = evaluate(args, model, tokenizer, eval_dataset) 455 | return results 456 | 457 | if __name__ == "__main__": 458 | main() -------------------------------------------------------------------------------- /src/Training/run_pretrain_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | 25 | import numpy as np 26 | import torch 27 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 28 | TensorDataset) 29 | from torch.utils.data.distributed import DistributedSampler 30 | from torch.utils.tensorboard import SummaryWriter 31 | 32 | from tqdm import tqdm, trange 33 | from transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 34 | from transformers import AdamW, get_linear_schedule_with_warmup 35 | from data_utils import accuracy, myprocessors, handle_underscores 36 | import json 37 | from collections import Counter 38 | logger = logging.getLogger(__name__) 39 | 40 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 41 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 42 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 43 | 44 | MODEL_CLASSES = { 45 | 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) 46 | } 47 | 48 | class MyDataset(torch.utils.data.Dataset): 49 | 50 | def __init__(self, data, mask_token): 51 | self.data = data 52 | self.mask_token = mask_token 53 | 54 | def __len__(self): 55 | return len(self.data) 56 | 57 | def __getitem__(self, idx): 58 | sample = self.data[idx] 59 | return sample, self.mask_token 60 | 61 | 62 | def convert_examples_to_features(examples, tokenizer, max_length=512): 63 | data = [] 64 | for example in examples: 65 | inputs, _ = handle_underscores(example['context'], tokenizer, keywords=example['keywords'], prefix=True) 66 | choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']] 67 | input_ids = [inputs+cand[0] for cand in choices] 68 | input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids] 69 | data.append([input_ids, input_ids, example['correct']]) 70 | return data 71 | 72 | def mCollateFn(batch): 73 | 
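# (Descriptive note, added for readability:) collate helper for the GPT-2
# pretraining objective. It pads every candidate sequence in the batch to the
# longest candidate length using the added mask token id, builds matching
# attention masks and label sequences, and returns tensors shaped
# [batch, num_candidates, seq_len] plus the gold candidate index per example.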
batch_input_ids = [] 74 | batch_input_mask = [] 75 | batch_input_labels =[] 76 | batch_label_ids = [] 77 | features = [b[0] for b in batch] 78 | mask_token = batch[0][1] 79 | max_len = max([len(cand) for f in features for cand in f[0]]) 80 | for f in features: 81 | batch_input_ids.append([]) 82 | batch_input_mask.append([]) 83 | batch_input_labels.append([]) 84 | batch_label_ids.append(f[2]) 85 | for i in range(len(f[0])): 86 | sequence = f[0][i] + [mask_token]*(max_len-len(f[0][i])) 87 | att_mask = [1]*len(f[0][i]) + [0]*(max_len-len(f[0][i])) 88 | label_sequence = f[1][i]+[mask_token]*(max_len-len(f[1][i])) 89 | batch_input_ids[-1].append(sequence) 90 | batch_input_mask[-1].append(att_mask) 91 | batch_input_labels[-1].append(label_sequence) 92 | 93 | batch_input_ids = torch.tensor(batch_input_ids, dtype=torch.long) 94 | batch_input_mask = torch.tensor(batch_input_mask, dtype=torch.long) 95 | batch_input_labels = torch.tensor(batch_input_labels, dtype=torch.long) 96 | batch_label_ids = torch.tensor(batch_label_ids, dtype=torch.long) 97 | return batch_input_ids, batch_input_mask, batch_input_labels, batch_label_ids 98 | 99 | def set_seed(args): 100 | random.seed(args.seed) 101 | np.random.seed(args.seed) 102 | torch.manual_seed(args.seed) 103 | if args.n_gpu > 0: 104 | torch.cuda.manual_seed_all(args.seed) 105 | 106 | def count_parameters(model): 107 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 108 | 109 | def train(args, train_dataset, model, tokenizer, eval_dataset): 110 | """ Train the model """ 111 | if args.local_rank in [-1, 0]: 112 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 113 | 114 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 115 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 116 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 117 | 118 | if args.max_steps > 0: 119 | t_total = args.max_steps 120 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 121 | else: 122 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 123 | 124 | # Prepare optimizer and schedule (linear warmup and decay) 125 | no_decay = ['bias', 'LayerNorm.weight'] 126 | optimizer_grouped_parameters = [ 127 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 128 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 129 | ] 130 | 131 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 132 | logger.info("warm up steps = %d", warmup_steps) 133 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 134 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 135 | 136 | if args.fp16: 137 | try: 138 | from apex import amp 139 | except ImportError: 140 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 141 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 142 | 143 | # multi-gpu training (should be after apex fp16 initialization) 144 | if args.n_gpu > 1: 145 | model = torch.nn.DataParallel(model) 146 | 147 | # 
Distributed training (should be after apex fp16 initialization) 148 | if args.local_rank != -1: 149 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 150 | output_device=args.local_rank, 151 | find_unused_parameters=True) 152 | # Train! 153 | logger.info("***** Running training *****") 154 | logger.info(" Num examples = %d", len(train_dataset)) 155 | logger.info(" Num Epochs = %d", args.num_train_epochs) 156 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 157 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 158 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 159 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 160 | logger.info(" Total optimization steps = %d", t_total) 161 | 162 | global_step = 0 163 | tr_loss, logging_loss = 0.0, 0.0 164 | model.zero_grad() 165 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 166 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 167 | curr_best = 0.0 168 | CE = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=tokenizer.mask_token_id) 169 | loss_fct = torch.nn.MultiMarginLoss(margin=args.margin) 170 | for _ in train_iterator: 171 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 172 | for step, batch in enumerate(epoch_iterator): 173 | model.train() 174 | b_size, num_cand, seq_len = batch[0].shape 175 | input_ids = batch[0].view(-1, seq_len).cuda() 176 | attention_mask = batch[1].view(-1, seq_len).cuda() 177 | input_labels = batch[2].view(-1, seq_len).cuda() 178 | shift_labels = input_labels[..., 1:].contiguous().view(-1) 179 | inputs = {'input_ids': input_ids, 180 | 'attention_mask': attention_mask} 181 | outputs = model(**inputs) 182 | shift_logits = outputs[0][..., :-1, :].contiguous().view(-1, outputs[0].size(-1)) 183 | ce_loss = CE(shift_logits, shift_labels) 184 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 185 | valid_tokens = (input_ids != tokenizer.mask_token_id).long().sum(1) 186 | ce_loss /= valid_tokens 187 | ce_loss = -ce_loss.view(b_size, num_cand) 188 | loss = loss_fct(ce_loss, batch[3].cuda()) 189 | 190 | if args.n_gpu > 1: 191 | loss = loss.mean() # mean() to average on multi-gpu parallel training 192 | if args.gradient_accumulation_steps > 1: 193 | loss = loss / args.gradient_accumulation_steps 194 | 195 | if args.fp16: 196 | with amp.scale_loss(loss, optimizer) as scaled_loss: 197 | scaled_loss.backward() 198 | else: 199 | loss.backward() 200 | 201 | tr_loss += loss.item() 202 | if (step + 1) % args.gradient_accumulation_steps == 0: 203 | optimizer.step() 204 | scheduler.step() # Update learning rate schedule 205 | model.zero_grad() 206 | global_step += 1 207 | 208 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 209 | # Log metrics 210 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 211 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 212 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 213 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 214 | logging_loss = tr_loss 215 | 216 | if args.local_rank == -1 and 
args.evaluate_during_training and global_step % args.save_steps == 0: 217 | results = evaluate(args, model, tokenizer, eval_dataset) 218 | for key, value in results.items(): 219 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 220 | if results['acc'] > curr_best: 221 | curr_best = results['acc'] 222 | # Save model checkpoint 223 | output_dir = args.output_dir 224 | if not os.path.exists(output_dir): 225 | os.makedirs(output_dir) 226 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 227 | model_to_save.save_pretrained(output_dir) 228 | tokenizer.save_pretrained(output_dir) 229 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 230 | logger.info("Saving model checkpoint to %s", output_dir) 231 | 232 | 233 | if args.max_steps > 0 and global_step > args.max_steps: 234 | epoch_iterator.close() 235 | break 236 | if args.max_steps > 0 and global_step > args.max_steps: 237 | train_iterator.close() 238 | break 239 | 240 | results = evaluate(args, model, tokenizer, eval_dataset) 241 | for key, value in results.items(): 242 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 243 | if results['acc'] > curr_best: 244 | curr_best = results['acc'] 245 | # Save model checkpoint 246 | output_dir = args.output_dir 247 | if not os.path.exists(output_dir): 248 | os.makedirs(output_dir) 249 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 250 | model_to_save.save_pretrained(output_dir) 251 | tokenizer.save_pretrained(output_dir) 252 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 253 | logger.info("Saving model checkpoint to %s", output_dir) 254 | if args.local_rank in [-1, 0]: 255 | tb_writer.close() 256 | return global_step, tr_loss / global_step 257 | 258 | def save_logits(logits_all, filename): 259 | with open(filename, "w") as f: 260 | for i in range(len(logits_all)): 261 | for j in range(len(logits_all[i])): 262 | f.write(str(logits_all[i][j])) 263 | if j == len(logits_all[i])-1: 264 | f.write("\n") 265 | else: 266 | f.write(" ") 267 | 268 | def evaluate(args, model, tokenizer, eval_dataset): 269 | results = {} 270 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 271 | os.makedirs(args.output_dir) 272 | 273 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 274 | # Note that DistributedSampler samples randomly 275 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 276 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=mCollateFn) 277 | 278 | # Eval! 
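# (Descriptive note, added for readability:) scoring mirrors training — each
# candidate continuation gets its summed token cross-entropy (pad positions
# carry the added mask token id and are skipped via ignore_index), the sum is
# divided by the number of real tokens and negated so that a higher score means
# a more likely candidate; argmax over candidates gives the prediction.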
279 | logger.info("***** Running evaluation *****") 280 | logger.info(" Num examples = %d", len(eval_dataset)) 281 | logger.info(" Batch size = %d", args.eval_batch_size) 282 | CE = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=tokenizer.mask_token_id) 283 | preds = [] 284 | out_label_ids = [] 285 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 286 | model.eval() 287 | with torch.no_grad(): 288 | b_size, num_cand, seq_len = batch[0].shape 289 | input_ids = batch[0].view(-1, seq_len).cuda() 290 | attention_mask = batch[1].view(-1, seq_len).cuda() 291 | input_labels = batch[2].view(-1, seq_len).cuda() 292 | shift_labels = input_labels[..., 1:].contiguous().view(-1) 293 | inputs = {'input_ids': input_ids, 294 | 'attention_mask': attention_mask} 295 | outputs = model(**inputs) 296 | shift_logits = outputs[0][..., :-1, :].contiguous().view(-1, outputs[0].size(-1)) 297 | ce_loss = CE(shift_logits, shift_labels) 298 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 299 | valid_tokens = (input_ids != tokenizer.mask_token_id).long().sum(1) 300 | ce_loss /= valid_tokens 301 | ce_loss = -ce_loss.view(b_size, num_cand) 302 | 303 | preds.append(ce_loss) 304 | out_label_ids.append(batch[3].numpy()) 305 | preds = torch.cat(preds, dim=0).cpu().numpy() 306 | save_logits(preds.tolist(), os.path.join(args.output_dir, args.logits_file)) 307 | preds = np.argmax(preds, axis=1) 308 | result = accuracy(preds, np.concatenate(out_label_ids)) 309 | results.update(result) 310 | output_eval_file = os.path.join(args.output_dir, args.results_file) 311 | with open(output_eval_file, "w") as writer: 312 | logger.info("***** Eval results *****") 313 | for key in sorted(result.keys()): 314 | logger.info(" %s = %s", key, str(result[key])) 315 | writer.write("%s = %s\n" % (key, str(result[key]))) 316 | return results 317 | 318 | def write_data(filename, data): 319 | with open(filename, 'w') as fout: 320 | for sample in data: 321 | fout.write(json.dumps(sample)) 322 | fout.write('\n') 323 | 324 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 325 | if args.local_rank not in [-1, 0] and not evaluate: 326 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 327 | 328 | processor = myprocessors[task](args) 329 | cached_features_file = os.path.join(args.output_dir, 'cached_{}_{}_{}_{}'.format( 330 | 'dev', 331 | str(args.model_type), 332 | str(args.max_seq_length), 333 | str(task))) 334 | if evaluate and os.path.exists(cached_features_file): 335 | features = torch.load(cached_features_file) 336 | else: 337 | examples = processor.get_dev_examples() if evaluate else processor.get_train_examples() 338 | features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 339 | if evaluate: 340 | torch.save(features, cached_features_file) 341 | if args.local_rank == 0 and not evaluate: 342 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 343 | return MyDataset(features, tokenizer.mask_token_id) 344 | 345 | def main(): 346 | parser = argparse.ArgumentParser() 347 | 348 | ## Required parameters 349 | parser.add_argument("--train_file", default=None, type=str, required=True, 350 | help="The train file name") 351 | parser.add_argument("--dev_file", default=None, type=str, required=True, 352 | help="The dev file name") 353 | parser.add_argument("--model_type", default=None, type=str, 
required=True, 354 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 355 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 356 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 357 | parser.add_argument("--config_name", default="", type=str, 358 | help="Pretrained config name or path if not the same as model_name") 359 | parser.add_argument("--tokenizer_name", default="", type=str, 360 | help="Pretrained tokenizer name or path if not the same as model_name") 361 | parser.add_argument("--cache_dir", default="", type=str, 362 | help="Where do you want to store the pre-trained models downloaded from s3") 363 | parser.add_argument("--task_name", default=None, type=str, required=True, 364 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 365 | parser.add_argument("--output_dir", default=None, type=str, required=True, 366 | help="The output directory where the model predictions and checkpoints will be written.") 367 | 368 | ## Other parameters 369 | parser.add_argument("--second_train_file", default=None, type=str, 370 | help="Used when combining ATOMIC and CWWV") 371 | parser.add_argument("--second_dev_file", default=None, type=str, 372 | help="Used when combining ATOMIC and CWWV") 373 | parser.add_argument("--max_seq_length", default=128, type=int, 374 | help="The maximum total input sequence length after tokenization. Sequences longer " 375 | "than this will be truncated, sequences shorter will be padded.") 376 | parser.add_argument("--do_train", action='store_true', 377 | help="Whether to run training.") 378 | parser.add_argument("--do_eval", action='store_true', 379 | help="Whether to run eval on the dev set.") 380 | parser.add_argument("--evaluate_during_training", action='store_true', 381 | help="Run evaluation during training at each logging step.") 382 | parser.add_argument("--do_lower_case", action='store_true', 383 | help="Set this flag if you are using an uncased model.") 384 | 385 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 386 | help="Batch size per GPU/CPU for training.") 387 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 388 | help="Batch size per GPU/CPU for evaluation.") 389 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 390 | help="Number of updates steps to accumulate before performing a backward/update pass.") 391 | parser.add_argument("--margin", default=1.0, type=float, 392 | help="The margin for ranking loss") 393 | parser.add_argument("--learning_rate", default=1e-5, type=float, 394 | help="The initial learning rate for Adam.") 395 | parser.add_argument("--weight_decay", default=0.01, type=float, 396 | help="Weight deay if we apply some.") 397 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 398 | help="Epsilon for Adam optimizer.") 399 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 400 | help="Max gradient norm.") 401 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 402 | help="Total number of training epochs to perform.") 403 | parser.add_argument("--max_steps", default=-1, type=int, 404 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 405 | parser.add_argument("--warmup_steps", default=0, type=int, 406 | help="Linear warmup over warmup_steps.") 407 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 408 | help="Linear warmup over warmup proportion.") 409 | parser.add_argument('--logging_steps', type=int, default=50, 410 | help="Log every X updates steps.") 411 | parser.add_argument('--save_steps', type=int, default=50, 412 | help="Save checkpoint every X updates steps.") 413 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 414 | help="The file where prediction logits will be written") 415 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 416 | help="The file where eval results will be written") 417 | parser.add_argument("--no_cuda", action='store_true', 418 | help="Avoid using CUDA when available") 419 | parser.add_argument('--overwrite_output_dir', action='store_true', 420 | help="Overwrite the content of the output directory") 421 | parser.add_argument('--seed', type=int, default=2555, 422 | help="random seed for initialization") 423 | parser.add_argument('--fp16', action='store_true', 424 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 425 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 426 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 427 | "See details at https://nvidia.github.io/apex/amp.html") 428 | parser.add_argument("--local_rank", type=int, default=-1, 429 | help="For distributed training: local_rank") 430 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 431 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 432 | args = parser.parse_args() 433 | 434 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir and args.do_train: 435 | raise ValueError("Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format(args.output_dir)) 436 | if not os.path.exists(args.output_dir): 437 | os.makedirs(args.output_dir) 438 | 439 | # Setup CUDA, GPU & distributed training 440 | if args.local_rank == -1 or args.no_cuda: 441 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 442 | args.n_gpu = torch.cuda.device_count() 443 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 444 | torch.cuda.set_device(args.local_rank) 445 | device = torch.device("cuda", args.local_rank) 446 | torch.distributed.init_process_group(backend='nccl') 447 | args.n_gpu = 1 448 | args.device = device 449 | 450 | if args.do_train: 451 | for handler in logging.root.handlers[:]: 452 | logging.root.removeHandler(handler) 453 | # Setup logging 454 | if args.do_train: 455 | log_file = os.path.join(args.output_dir, 'train.log') 456 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 457 | datefmt = '%m/%d/%Y %H:%M:%S', 458 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 459 | filename=log_file) 460 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 461 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 462 | os.system("cp run_pretrain_gpt2.py %s" % os.path.join(args.output_dir, 'run_pretrain_gpt2.py')) 463 | os.system("cp data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 464 | 465 | # Set seed 466 | set_seed(args) 467 | args.task_name = args.task_name.lower() 468 | if args.task_name not in myprocessors: 469 | raise ValueError("Task not found: %s" % (args.task_name)) 470 | 471 | args.model_type = args.model_type.lower() 472 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 473 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 474 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 475 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 476 | 477 | count = count_parameters(model) 478 | print (count) 479 | special_tokens_dict = {'mask_token': ''} 480 | num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) 481 | model.resize_token_embeddings(len(tokenizer)) 482 | 483 | if args.local_rank == 0: 484 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 485 | 486 | model.to(args.device) 487 | 488 | logger.info("Training/evaluation parameters %s", args) 489 | 490 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 491 | 492 | init_result = evaluate(args, model, tokenizer, eval_dataset) 493 | print (init_result) 494 | if args.do_train: 495 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 496 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset) 497 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 498 | # Evaluation 499 | results = {} 500 | if args.do_eval: 501 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 502 | model = 
model_class.from_pretrained(args.output_dir) 503 | model.eval() 504 | model.to(args.device) 505 | result = evaluate(args, model, tokenizer, eval_dataset) 506 | return results 507 | 508 | 509 | if __name__ == "__main__": 510 | main() -------------------------------------------------------------------------------- /src/Training/run_pretrain.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import glob 21 | import logging 22 | import os 23 | import random 24 | import numpy as np 25 | import torch 26 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 27 | TensorDataset) 28 | from torch.utils.data.distributed import DistributedSampler 29 | from torch.utils.tensorboard import SummaryWriter 30 | 31 | from tqdm import tqdm, trange 32 | from transformers import (WEIGHTS_NAME, RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 33 | from transformers import AdamW, get_linear_schedule_with_warmup 34 | from data_utils import accuracy, myprocessors, convert_examples_to_features 35 | import json 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 40 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 41 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 42 | MODEL_CLASSES = { 43 | 'roberta-mlm': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) 44 | } 45 | 46 | class MyDataset(torch.utils.data.Dataset): 47 | 48 | def __init__(self, data, pad_token, mask_token, max_words_to_mask): 49 | self.data = data 50 | self.pad_token = pad_token 51 | self.mask_token = mask_token 52 | self.max_words_to_mask = max_words_to_mask 53 | 54 | def __len__(self): 55 | return len(self.data) 56 | 57 | def __getitem__(self, idx): 58 | sample = self.data[idx] 59 | return sample, self.pad_token, self.mask_token, self.max_words_to_mask 60 | 61 | def mCollateFn(batch): 62 | batch_input_ids = [] 63 | batch_input_mask = [] 64 | batch_input_labels = [] 65 | batch_label_ids = [] 66 | features = [b[0] for b in batch] 67 | pad_token = batch[0][1] 68 | mask_token = batch[0][2] 69 | MAX_WORDS_TO_MASK = batch[0][3] 70 | max_len = max([len(cand) for f in features for cand in f[0]]) 71 | for f in features: 72 | batch_input_ids.append([]) 73 | batch_input_mask.append([]) 74 | batch_input_labels.append([]) 75 | batch_label_ids.append(f[2]) 76 | for i in range(len(f[0])): 77 | masked_sequences = [] 78 | masked_labels = [] 79 | this_att_mask = [] 80 | sequence = f[0][i] + [pad_token]*(max_len-len(f[0][i])) 81 | label_sequence = f[1][i]+[-100]*(max_len-len(f[1][i])) 82 | valid_indices = [l_i for l_i, l in enumerate(label_sequence) if 
l != -100] 83 | if len(valid_indices) > MAX_WORDS_TO_MASK: 84 | rm_indices = random.sample(valid_indices, (len(valid_indices)-MAX_WORDS_TO_MASK)) 85 | label_sequence = [-100 if l_i in rm_indices else l for l_i, l in enumerate(label_sequence)] 86 | for j, t in enumerate(label_sequence): 87 | if t == -100: 88 | continue 89 | masked_sequences.append(sequence) 90 | masked_labels.append([-100]*max_len) 91 | else: 92 | masked_sequences.append(sequence[:j]+[mask_token]+sequence[j+1:]) 93 | masked_labels.append([-100]*j+[sequence[j]]+[-100]*(max_len-j-1)) 94 | this_att_mask.append([1]*len(f[0][i])+[0]*(max_len-len(f[0][i]))) 95 | batch_input_ids[-1].append(torch.tensor(masked_sequences, dtype=torch.long)) 96 | batch_input_mask[-1].append(torch.tensor(this_att_mask, dtype=torch.long)) 97 | batch_input_labels[-1].append(torch.tensor(masked_labels, dtype=torch.long)) 98 | return batch_input_ids, batch_input_mask, batch_input_labels, torch.tensor(batch_label_ids, dtype=torch.long) 99 | 100 | def set_seed(args): 101 | random.seed(args.seed) 102 | np.random.seed(args.seed) 103 | torch.manual_seed(args.seed) 104 | if args.n_gpu > 0: 105 | torch.cuda.manual_seed_all(args.seed) 106 | 107 | def count_parameters(model): 108 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 109 | 110 | def train(args, train_dataset, model, tokenizer, eval_dataset): 111 | """ Train the model """ 112 | if args.local_rank in [-1, 0]: 113 | tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs')) 114 | 115 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 116 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 117 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=mCollateFn) 118 | 119 | if args.max_steps > 0: 120 | t_total = args.max_steps 121 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 122 | else: 123 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 124 | 125 | # Prepare optimizer and schedule (linear warmup and decay) 126 | no_decay = ['bias', 'LayerNorm.weight'] 127 | optimizer_grouped_parameters = [ 128 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 129 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 130 | ] 131 | 132 | warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total) 133 | logger.info("warm up steps = %d", warmup_steps) 134 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98)) 135 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) 136 | 137 | if args.fp16: 138 | try: 139 | from apex import amp 140 | except ImportError: 141 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 142 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 143 | 144 | # multi-gpu training (should be after apex fp16 initialization) 145 | if args.n_gpu > 1: 146 | model = torch.nn.DataParallel(model) 147 | 148 | # Distributed training (should be after apex fp16 initialization) 149 | if args.local_rank != -1: 150 | model = 
torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 151 | output_device=args.local_rank, 152 | find_unused_parameters=True) 153 | # Train! 154 | logger.info("***** Running training *****") 155 | logger.info(" Num examples = %d", len(train_dataset)) 156 | logger.info(" Num Epochs = %d", args.num_train_epochs) 157 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 158 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 159 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 160 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 161 | logger.info(" Total optimization steps = %d", t_total) 162 | 163 | global_step = 0 164 | tr_loss, logging_loss = 0.0, 0.0 165 | model.zero_grad() 166 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 167 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 168 | curr_best = 0.0 169 | CE = torch.nn.CrossEntropyLoss(reduction='none') 170 | loss_fct = torch.nn.MultiMarginLoss(margin=args.margin) 171 | for _ in train_iterator: 172 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 173 | for step, batch in enumerate(epoch_iterator): 174 | model.train() 175 | num_cand = len(batch[0][0]) 176 | choice_loss = [] 177 | choice_seq_lens = np.array([0]+[len(c) for sample in batch[0] for c in sample]) 178 | choice_seq_lens = np.cumsum(choice_seq_lens) 179 | input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device) 180 | att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device) 181 | input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device) 182 | 183 | if len(input_ids) < args.max_sequence_per_time: 184 | inputs = {'input_ids': input_ids, 185 | 'attention_mask': att_mask} 186 | outputs = model(**inputs) 187 | ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1)) 188 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 189 | else: 190 | ce_loss = [] 191 | for chunk in range(0, len(input_ids), args.max_sequence_per_time): 192 | inputs = {'input_ids': input_ids[chunk:chunk+args.max_sequence_per_time], 193 | 'attention_mask': att_mask[chunk:chunk+args.max_sequence_per_time]} 194 | outputs = model(**inputs) 195 | tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels[chunk:chunk+args.max_sequence_per_time].view(-1)) 196 | tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1) 197 | ce_loss.append(tmp_ce_loss) 198 | ce_loss = torch.cat(ce_loss, dim=0) 199 | # all tokens are valid 200 | for c_i in range(len(choice_seq_lens)-1): 201 | start = choice_seq_lens[c_i] 202 | end = choice_seq_lens[c_i+1] 203 | choice_loss.append(-ce_loss[start:end].sum()/(end-start)) 204 | 205 | choice_loss = torch.stack(choice_loss) 206 | choice_loss = choice_loss.view(-1, num_cand) 207 | loss = loss_fct(choice_loss, batch[3].to(args.device)) 208 | 209 | if args.n_gpu > 1: 210 | loss = loss.mean() # mean() to average on multi-gpu parallel training 211 | if args.gradient_accumulation_steps > 1: 212 | loss = loss / args.gradient_accumulation_steps 213 | 214 | if args.fp16: 215 | with amp.scale_loss(loss, optimizer) as scaled_loss: 216 | scaled_loss.backward() 217 | else: 218 | loss.backward() 219 | 220 | tr_loss += 
loss.item() 221 | if (step + 1) % args.gradient_accumulation_steps == 0: 222 | optimizer.step() 223 | scheduler.step() # Update learning rate schedule 224 | model.zero_grad() 225 | global_step += 1 226 | 227 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 228 | # Log metrics 229 | tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) 230 | tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) 231 | tb_writer.add_scalar('Batch_loss', loss.item()*args.gradient_accumulation_steps, global_step) 232 | logger.info(" global_step = %s, average loss = %s", global_step, (tr_loss - logging_loss)/args.logging_steps) 233 | logging_loss = tr_loss 234 | 235 | if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0: 236 | results = evaluate(args, model, tokenizer, eval_dataset) 237 | for key, value in results.items(): 238 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 239 | if results['acc'] > curr_best: 240 | curr_best = results['acc'] 241 | # Save model checkpoint 242 | output_dir = args.output_dir 243 | if not os.path.exists(output_dir): 244 | os.makedirs(output_dir) 245 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 246 | model_to_save.save_pretrained(output_dir) 247 | tokenizer.save_pretrained(output_dir) 248 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 249 | logger.info("Saving model checkpoint to %s", output_dir) 250 | 251 | 252 | if args.max_steps > 0 and global_step > args.max_steps: 253 | epoch_iterator.close() 254 | break 255 | if args.max_steps > 0 and global_step > args.max_steps: 256 | train_iterator.close() 257 | break 258 | results = evaluate(args, model, tokenizer, eval_dataset) 259 | for key, value in results.items(): 260 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 261 | if results['acc'] > curr_best: 262 | curr_best = results['acc'] 263 | # Save model checkpoint 264 | output_dir = args.output_dir 265 | if not os.path.exists(output_dir): 266 | os.makedirs(output_dir) 267 | model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training 268 | model_to_save.save_pretrained(output_dir) 269 | tokenizer.save_pretrained(output_dir) 270 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 271 | logger.info("Saving model checkpoint to %s", output_dir) 272 | if args.local_rank in [-1, 0]: 273 | tb_writer.close() 274 | return global_step, tr_loss / global_step 275 | 276 | def save_logits(logits_all, filename): 277 | with open(filename, "w") as f: 278 | for i in range(len(logits_all)): 279 | for j in range(len(logits_all[i])): 280 | f.write(str(logits_all[i][j])) 281 | if j == len(logits_all[i])-1: 282 | f.write("\n") 283 | else: 284 | f.write(" ") 285 | 286 | def evaluate(args, model, tokenizer, eval_dataset): 287 | results = {} 288 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 289 | os.makedirs(args.output_dir) 290 | 291 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 292 | # Note that DistributedSampler samples randomly 293 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 294 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=mCollateFn) 295 | 296 | # Eval! 
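# (Descriptive note, added for readability:) scoring mirrors training — every
# candidate is expanded into one masked copy per scored token (capped by
# --max_words_to_mask), the per-copy MLM losses are averaged over the candidate
# and negated to form its score, and argmax over candidate scores gives the
# prediction.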
297 | logger.info("***** Running evaluation *****") 298 | logger.info(" Num examples = %d", len(eval_dataset)) 299 | logger.info(" Batch size = %d", args.eval_batch_size) 300 | CE = torch.nn.CrossEntropyLoss(reduction='none') 301 | preds = [] 302 | out_label_ids = [] 303 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 304 | model.eval() 305 | with torch.no_grad(): 306 | num_cand = len(batch[0][0]) 307 | choice_loss = [] 308 | choice_seq_lens = np.array([0]+[len(c) for sample in batch[0] for c in sample]) 309 | choice_seq_lens = np.cumsum(choice_seq_lens) 310 | input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device) 311 | att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device) 312 | input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device) 313 | if len(input_ids) < args.max_sequence_per_time: 314 | inputs = {'input_ids': input_ids, 315 | 'attention_mask': att_mask} 316 | outputs = model(**inputs) 317 | ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1)) 318 | ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1) 319 | else: 320 | ce_loss = [] 321 | for chunk in range(0, len(input_ids), args.max_sequence_per_time): 322 | inputs = {'input_ids': input_ids[chunk:chunk+args.max_sequence_per_time], 323 | 'attention_mask': att_mask[chunk:chunk+args.max_sequence_per_time]} 324 | outputs = model(**inputs) 325 | tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels[chunk:chunk+args.max_sequence_per_time].view(-1)) 326 | tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1) 327 | ce_loss.append(tmp_ce_loss) 328 | ce_loss = torch.cat(ce_loss, dim=0) 329 | for c_i in range(len(choice_seq_lens)-1): 330 | start = choice_seq_lens[c_i] 331 | end = choice_seq_lens[c_i+1] 332 | choice_loss.append(-ce_loss[start:end].sum()/(end-start)) 333 | choice_loss = torch.stack(choice_loss) 334 | choice_loss = choice_loss.view(-1, num_cand) 335 | preds.append(choice_loss) 336 | out_label_ids.append(batch[3].numpy()) 337 | preds = torch.cat(preds, dim=0).cpu().numpy() 338 | save_logits(preds.tolist(), os.path.join(args.output_dir, args.logits_file)) 339 | preds = np.argmax(preds, axis=1) 340 | result = accuracy(preds, np.concatenate(out_label_ids, axis=0)) 341 | results.update(result) 342 | output_eval_file = os.path.join(args.output_dir, args.results_file) 343 | with open(output_eval_file, "w") as writer: 344 | logger.info("***** Eval results *****") 345 | for key in sorted(result.keys()): 346 | logger.info(" %s = %s", key, str(result[key])) 347 | writer.write("%s = %s\n" % (key, str(result[key]))) 348 | return results 349 | 350 | def write_data(filename, data): 351 | with open(filename, 'w') as fout: 352 | for sample in data: 353 | fout.write(json.dumps(sample)) 354 | fout.write('\n') 355 | 356 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 357 | if args.local_rank not in [-1, 0] and not evaluate: 358 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 359 | processor = myprocessors[task](args) 360 | cached_features_file = os.path.join(args.output_dir, 'cached_{}_{}_{}_{}'.format( 361 | 'dev', 362 | str(args.model_type), 363 | str(args.max_seq_length), 364 | str(task))) 365 | if evaluate and os.path.exists(cached_features_file): 366 | features = torch.load(cached_features_file) 367 | else: 368 | examples = processor.get_dev_examples() if evaluate 
else processor.get_train_examples() 369 | features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length) 370 | if evaluate: 371 | torch.save(features, cached_features_file) 372 | if args.local_rank == 0 and not evaluate: 373 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 374 | print ('max_words_to_mask is %s for pretraining tasks %s' % (args.max_words_to_mask, task)) 375 | return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask) 376 | 377 | def main(): 378 | parser = argparse.ArgumentParser() 379 | 380 | ## Required parameters 381 | parser.add_argument("--train_file", default=None, type=str, required=True, 382 | help="The train file name") 383 | parser.add_argument("--dev_file", default=None, type=str, required=True, 384 | help="The dev file name") 385 | parser.add_argument("--model_type", default=None, type=str, required=True, 386 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 387 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 388 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_TYPES)) 389 | parser.add_argument("--config_name", default="", type=str, 390 | help="Pretrained config name or path if not the same as model_name") 391 | parser.add_argument("--tokenizer_name", default="", type=str, 392 | help="Pretrained tokenizer name or path if not the same as model_name") 393 | parser.add_argument("--cache_dir", default="", type=str, 394 | help="Where do you want to store the pre-trained models downloaded from s3") 395 | parser.add_argument("--task_name", default=None, type=str, required=True, 396 | help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys())) 397 | parser.add_argument("--output_dir", default=None, type=str, required=True, 398 | help="The output directory where the model predictions and checkpoints will be written.") 399 | 400 | ## Other parameters 401 | parser.add_argument("--second_train_file", default=None, type=str, 402 | help="Used when combining ATOMIC and CWWV") 403 | parser.add_argument("--second_dev_file", default=None, type=str, 404 | help="Used when combining ATOMIC and CWWV") 405 | parser.add_argument("--max_seq_length", default=128, type=int, 406 | help="The maximum total input sequence length after tokenization. 
Sequences longer " 407 | "than this will be truncated, sequences shorter will be padded.") 408 | parser.add_argument("--max_words_to_mask", default=6, type=int, 409 | help="The maximum number of tokens to mask when computing scores") 410 | parser.add_argument("--max_sequence_per_time", default=80, type=int, 411 | help="The maximum number of sequences to feed into the model") 412 | parser.add_argument("--do_train", action='store_true', 413 | help="Whether to run training.") 414 | parser.add_argument("--do_eval", action='store_true', 415 | help="Whether to run eval on the dev set.") 416 | parser.add_argument("--evaluate_during_training", action='store_true', 417 | help="Run evaluation during training at each logging step.") 418 | parser.add_argument("--do_lower_case", action='store_true', 419 | help="Set this flag if you are using an uncased model.") 420 | parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, 421 | help="Batch size per GPU/CPU for training.") 422 | parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, 423 | help="Batch size per GPU/CPU for evaluation.") 424 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 425 | help="Number of updates steps to accumulate before performing a backward/update pass.") 426 | parser.add_argument("--margin", default=1.0, type=float, 427 | help="The margin for ranking loss") 428 | parser.add_argument("--learning_rate", default=1e-5, type=float, 429 | help="The initial learning rate for Adam.") 430 | parser.add_argument("--weight_decay", default=0.01, type=float, 431 | help="Weight deay if we apply some.") 432 | parser.add_argument("--adam_epsilon", default=1e-6, type=float, 433 | help="Epsilon for Adam optimizer.") 434 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 435 | help="Max gradient norm.") 436 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 437 | help="Total number of training epochs to perform.") 438 | parser.add_argument("--max_steps", default=-1, type=int, 439 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 440 | parser.add_argument("--warmup_steps", default=0, type=int, 441 | help="Linear warmup over warmup_steps.") 442 | parser.add_argument("--warmup_proportion", default=0.05, type=float, 443 | help="Linear warmup over warmup proportion.") 444 | parser.add_argument('--logging_steps', type=int, default=50, 445 | help="Log every X updates steps.") 446 | parser.add_argument('--save_steps', type=int, default=50, 447 | help="Save checkpoint every X updates steps.") 448 | parser.add_argument("--logits_file", default='logits_test.txt', type=str, 449 | help="The file where prediction logits will be written") 450 | parser.add_argument("--results_file", default='eval_results.txt', type=str, 451 | help="The file where eval results will be written") 452 | parser.add_argument("--no_cuda", action='store_true', 453 | help="Avoid using CUDA when available") 454 | parser.add_argument('--overwrite_output_dir', action='store_true', 455 | help="Overwrite the content of the output directory") 456 | parser.add_argument('--seed', type=int, default=2555, 457 | help="random seed for initialization") 458 | parser.add_argument('--fp16', action='store_true', 459 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 460 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 461 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
462 | "See details at https://nvidia.github.io/apex/amp.html") 463 | parser.add_argument("--local_rank", type=int, default=-1, 464 | help="For distributed training: local_rank") 465 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 466 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 467 | args = parser.parse_args() 468 | 469 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir and args.do_train: 470 | raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) 471 | if not os.path.exists(args.output_dir): 472 | os.makedirs(args.output_dir) 473 | 474 | # Setup CUDA, GPU & distributed training 475 | if args.local_rank == -1 or args.no_cuda: 476 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 477 | args.n_gpu = torch.cuda.device_count() 478 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 479 | torch.cuda.set_device(args.local_rank) 480 | device = torch.device("cuda", args.local_rank) 481 | torch.distributed.init_process_group(backend='nccl') 482 | args.n_gpu = 1 483 | args.device = device 484 | 485 | if args.do_train: 486 | for handler in logging.root.handlers[:]: 487 | logging.root.removeHandler(handler) 488 | # Setup logging 489 | if args.do_train: 490 | log_file = os.path.join(args.output_dir, 'train.log') 491 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 492 | datefmt = '%m/%d/%Y %H:%M:%S', 493 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN, 494 | filename=log_file) 495 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 496 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 497 | os.system("cp run_pretrain.py %s" % os.path.join(args.output_dir, 'run_pretrain.py')) 498 | os.system("cp data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py')) 499 | 500 | # Set seed 501 | set_seed(args) 502 | args.task_name = args.task_name.lower() 503 | if args.task_name not in myprocessors: 504 | raise ValueError("Task not found: %s" % (args.task_name)) 505 | 506 | args.model_type = args.model_type.lower() 507 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 508 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, finetuning_task=args.task_name, cache_dir=args.cache_dir) 509 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir) 510 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir) 511 | 512 | count = count_parameters(model) 513 | print (count) 514 | 515 | if args.local_rank == 0: 516 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 517 | 518 | model.to(args.device) 519 | 520 | logger.info("Training/evaluation parameters %s", args) 521 | 522 | eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) 523 | if args.do_train: 524 | init_result = evaluate(args, model, tokenizer, eval_dataset) 525 | print (init_result) 526 | if args.do_train: 527 | 
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 528 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset) 529 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 530 | # Evaluation 531 | results = {} 532 | if args.do_eval: 533 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 534 | model = model_class.from_pretrained(args.output_dir) 535 | model.eval() 536 | model.to(args.device) 537 | results.update(evaluate(args, model, tokenizer, eval_dataset)) 538 | return results 539 | 540 | if __name__ == "__main__": 541 | main() --------------------------------------------------------------------------------
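Both pretraining scripts above share the same objective: each answer candidate is scored by its negated, length-normalized token loss, and a multi-class margin loss ranks the gold candidate above the distractors. The following is a minimal, self-contained sketch of that objective; the per-candidate losses and token counts below are invented for illustration and are not taken from the repository.

import torch

torch.manual_seed(0)

margin = 1.0

# Hypothetical per-candidate quantities: summed cross-entropy over the scored
# tokens of each candidate, and the number of scored tokens (made-up values).
summed_ce = torch.tensor([[4.2, 9.1, 7.5],
                          [8.0, 3.3, 6.6]])
num_tokens = torch.tensor([[6.0, 7.0, 5.0],
                           [8.0, 4.0, 6.0]])

# Score = negated average token loss, so the most fluent candidate scores highest.
scores = -(summed_ce / num_tokens)

# Index of the gold candidate for each example.
labels = torch.tensor([0, 1])

# MultiMarginLoss pushes the gold score above every distractor by at least `margin`.
loss_fct = torch.nn.MultiMarginLoss(margin=margin)
loss = loss_fct(scores, labels)

# Evaluation reuses the same scores: the argmax candidate is the prediction.
pred = scores.argmax(dim=1)
print(loss.item(), pred.tolist())

In the actual scripts, the summed cross-entropy and token counts come from GPT2LMHeadModel (causal LM loss over each candidate) or RobertaForMaskedLM (loss over one masked copy per token), and the --margin argument controls the hinge width of the ranking loss.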