├── .gitignore
├── README.md
├── config.yaml
├── environment.yml
├── eval.py
├── model.py
├── requirements.txt
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
*truth
*tsv
*models
*output
*cache
*idea
*cache*
outputs
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.vscode
output/
*.tsv
*-models/
.DS_Store
# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Minimal Code Base For AI2 Commonsense Leaderboard

## Dependencies

Install apex if you want to use half precision: https://github.com/NVIDIA/apex. A conda environment file is also included for reference; apex may not install cleanly through conda, so remove it from `environment.yml` before creating the environment.

```bash
pip install -r requirements.txt
```

## Train

Modify `config.yaml` as you like and run `python train.py` to train a model. It loads the config file and writes all logs and checkpoints to the `outputs` directory.
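For example (the override form assumes Hydra's standard `key=value` syntax; both keys shown already exist in `config.yaml`):

```bash
# train with config.yaml as-is
python train.py

# optionally override individual config values from the command line
python train.py batch_size=4 max_epochs=3
```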
## Eval

### Get predictions without evaluation

```bash
python eval.py \
  --input_x cache/physicaliqa-train-dev/physicaliqa-train-dev/dev.jsonl \
  --config config.yaml \
  --checkpoint outputs/2020-02-26/20-26-22/lightning_logs/version_6341419/checkpoints/_ckpt_epoch_3_v0.ckpt \
  --output pred.lst
```

### Get predictions with evaluation (accuracy, confidence interval)

```bash
python eval.py \
  --input_x cache/physicaliqa-train-dev/physicaliqa-train-dev/dev.jsonl \
  --config config.yaml \
  --checkpoint outputs/2020-02-26/20-26-22/lightning_logs/version_6341419/checkpoints/_ckpt_epoch_3_v0.ckpt \
  --input_y cache/physicaliqa-train-dev/physicaliqa-train-dev/dev-labels.lst \
  --output pred.lst
```

## Results

### PIQA

| Model | Bootstrapped Accuracy Mean | Bootstrapped Accuracy CI | Accuracy |
|:-------------:|:--------------------------:|:------------------------:|:--------:|
| RoBERTa large (V100) | 77.4 | 75.7 - 79.4 | 77.3 |
| RoBERTa large (K80) | 74.0 | 72.4 - 76.2 | 74.2 |
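
The bootstrapped numbers come from the percentile bootstrap in `eval.py`: dev-set predictions are resampled with replacement 100 times and the 95% interval over the resampled accuracies is reported. A minimal standalone sketch of that procedure (the function name and arguments are illustrative; `labels` and `preds` are aligned lists of gold and predicted choices):

```python
import numpy as np
from sklearn.metrics import accuracy_score

def bootstrap_accuracy_ci(labels, preds, n_boot=100, alpha=0.95, seed=0):
    """Percentile bootstrap over prediction indices, mirroring eval.py."""
    rng = np.random.RandomState(seed)
    stats = []
    for _ in range(n_boot):
        idx = rng.randint(0, len(preds), size=len(preds))  # resample with replacement
        stats.append(accuracy_score([labels[i] for i in idx], [preds[i] for i in idx]))
    lower = np.percentile(stats, (1 - alpha) / 2 * 100)
    upper = np.percentile(stats, (alpha + (1 - alpha) / 2) * 100)
    return float(np.mean(stats)), float(lower), float(upper)
```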
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
task_name: physicaliqa
model: "roberta-large"
accumulate_grad_batches: 8
use_amp: true # Half precision works best on Volta architectures such as the V100
max_epochs: 4
learning_rate: 2e-6
adam_epsilon: 10e-8
warmup_steps: 150
batch_size: 8
max_length: 128
formula: "goal -> sol1|sol2"
train_x: "cache/physicaliqa-train-dev/physicaliqa-train-dev/train.jsonl"
train_y: "cache/physicaliqa-train-dev/physicaliqa-train-dev/train-labels.lst"
val_x: "cache/physicaliqa-train-dev/physicaliqa-train-dev/dev.jsonl"
val_y: "cache/physicaliqa-train-dev/physicaliqa-train-dev/dev-labels.lst"
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
name: py3.7
channels:
  - menpo
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - ca-certificates=2020.1.1=0
  - certifi=2019.11.28=py37_0
  - ld_impl_linux-64=2.33.1=h53a641e_7
  - libedit=3.1.20181209=hc058e9b_0
  - libffi=3.2.1=hd88cf55_4
  - libgcc-ng=9.1.0=hdf63c60_0
  - libstdcxx-ng=9.1.0=hdf63c60_0
  - ncurses=6.2=he6710b0_0
  - openssl=1.1.1d=h7b6447c_4
  - pip=20.0.2=py37_1
  - python=3.7.6=h0371630_2
  - readline=7.0=h7b6447c_5
  - setuptools=46.0.0=py37_0
  - sqlite=3.31.1=h7b6447c_0
  - tk=8.6.8=hbc83047_0
  - wheel=0.34.2=py37_0
  - xz=5.2.4=h14c3975_4
  - zlib=1.2.11=h7b6447c_3
  - pip:
    - absl-py==0.9.0
    - apex==0.1
    - boto3==1.12.18
    - botocore==1.15.18
    - cachetools==4.0.0
    - chardet==3.0.4
    - click==7.1.1
    - docutils==0.15.2
    - filelock==3.0.12
    - future==0.18.2
    - google-auth==1.11.2
    - google-auth-oauthlib==0.4.1
    - grpcio==1.27.2
    - hydra-core==0.11.3
    - idna==2.9
    - jmespath==0.9.5
    - joblib==0.14.1
    - loguru==0.4.1
    - markdown==3.2.1
    - numpy==1.17.4
    - oauthlib==3.1.0
    - omegaconf==1.4.1
    - pandas==0.25.3
    - pillow==7.0.0
    - protobuf==3.11.3
    - pyasn1==0.4.8
    - pyasn1-modules==0.2.8
    - python-dateutil==2.8.1
    - pytorch-lightning==0.6.0
    - pytz==2019.3
    - pyyaml==5.3
    - regex==2020.2.20
    - requests==2.23.0
    - requests-oauthlib==1.3.0
    - rsa==4.0
    - s3transfer==0.3.3
    - sacremoses==0.0.38
    - scikit-learn==0.22.2
    - scipy==1.4.1
    - sentencepiece==0.1.85
    - six==1.14.0
    - tensorboard==2.1.1
    - tokenizers==0.5.0
    - torch==1.4.0
    - torchvision==0.4.2
    - tqdm==4.40.0
    - transformers==2.5.0
    - urllib3==1.25.8
    - werkzeug==1.0.0
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
from typing import *
import torch
from torch.utils.data import DataLoader
from model import Classifier
from loguru import logger
from tqdm import tqdm
import yaml

if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser(description="evaluate script")
    parser.add_argument("--input_x", type=str, required=True)
    parser.add_argument("--config", type=str, required=True)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--input_y", type=str)

    args = parser.parse_args()

    device = 'cpu' if not torch.cuda.is_available() else "cuda"
    checkpoint = torch.load(args.checkpoint, map_location=device)
    with open(args.config, "r") as f:
        model = Classifier(yaml.safe_load(f.read()))
    model.load_state_dict(checkpoint['state_dict'])
    model.to(device)
    model.eval()

    preds: List[int] = []
    for batch in tqdm(DataLoader(model.dataloader(args.input_x, args.input_y), batch_size=model.hparams["batch_size"] * 2, collate_fn=model.collate, shuffle=False)):
        for key in batch:
            if isinstance(batch[key], torch.Tensor):
                batch[key] = batch[key].to(device)

        with torch.no_grad():
            logits = model.forward(batch)
            preds.extend(torch.argmax(logits, dim=1).cpu().detach().numpy().tolist())
    preds = [p + model.label_offset for p in preds]

    if args.input_y:

        from sklearn.metrics import accuracy_score
        import pandas as pd
        import numpy as np

        labels = pd.read_csv(args.input_y, sep='\t', header=None).values.tolist()
        logger.info(f"Accuracy: {accuracy_score(labels, preds):.3f}")

        # Percentile bootstrap over the predictions (100 resamples) for a 95% confidence interval.
        stats = []
        for _ in range(100):
            indices = np.random.randint(0, len(preds), size=len(preds))
            stats.append(accuracy_score([labels[j] for j in indices], [preds[j] for j in indices]))

        alpha = 0.95
        p = ((1.0-alpha)/2.0) * 100
        lower = max(0.0, np.percentile(stats, p))
        p = (alpha+((1.0-alpha)/2.0)) * 100
        upper = min(1.0, np.percentile(stats, p))
        logger.info(f'{alpha*100:.1f}% confidence interval: {lower*100:.1f} - {upper*100:.1f}, average: {np.mean(stats)*100:.1f}')

    with open(args.output, "w") as f:
        f.write("\n".join(map(str, preds)))
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import os
import pathlib
from typing import *
from itertools import cycle

import torch
import pytorch_lightning as pl
import torch.nn as nn
import pandas as pd
import numpy as np
from loguru import logger
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup


class ClassificationDataset(Dataset):

    def __init__(self, instances):
        self.instances = instances

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        return self.instances[idx]


class Classifier(pl.LightningModule):

    def __init__(self, config):
        super().__init__()
        self.hparams = config
        self.root_path = pathlib.Path(__file__).parent.absolute()
        self.embedder = AutoModel.from_pretrained(config["model"], cache_dir=self.root_path / "model_cache")
        self.tokenizer = AutoTokenizer.from_pretrained(config["model"], cache_dir=self.root_path / "model_cache", use_fast=False)

        self.embedder.train()
        self.label_offset = 0
        self.classifier = nn.Linear(self.embedder.config.hidden_size, 1, bias=True)

        self.loss = nn.CrossEntropyLoss(ignore_index=-1, reduction="mean")

        self.classifier.weight.data.normal_(mean=0.0, std=self.embedder.config.initializer_range)
        self.classifier.bias.data.zero_()

    def forward(self, batch):

        assert len(batch["input_ids"].shape) == 2, "LM only takes two-dimensional input"
        assert len(batch["attention_mask"].shape) == 2, "LM only takes two-dimensional input"
        assert len(batch["token_type_ids"].shape) == 2, "LM only takes two-dimensional input"

        # RoBERTa does not use token type ids.
        batch["token_type_ids"] = None if "roberta" in self.hparams["model"] else batch["token_type_ids"]

        results = self.embedder(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], token_type_ids=batch["token_type_ids"])

        # Mean-pool the token embeddings, score each (context, choice) pair,
        # then reshape so each row holds the logits for one example's choices.
        token_embeddings, *_ = results
        logits = self.classifier(token_embeddings.mean(dim=1)).squeeze(dim=1)
        logits = logits.reshape(-1, batch["num_choice"])

        return logits

    def training_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.loss(logits, batch["labels"])
        if self.trainer and self.trainer.use_dp:
            loss = loss.unsqueeze(0)
        return {
            "loss": loss
        }

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.loss(logits, batch["labels"])
        if self.trainer and self.trainer.use_dp:
            loss = loss.unsqueeze(0)
        return {
            'val_loss': loss,
            "val_batch_logits": logits,
            "val_batch_labels": batch["labels"],
        }

    def validation_end(self, outputs):
        val_loss_mean = torch.stack([o['val_loss'] for o in outputs]).mean()
        val_logits = torch.cat([o["val_batch_logits"] for o in outputs])
        val_labels = torch.cat([o["val_batch_labels"] for o in outputs])
        return {
            'val_loss': val_loss_mean,
            "progress_bar": {
                "val_accuracy": torch.sum(val_labels == torch.argmax(val_logits, dim=1)) / (val_labels.shape[0] * 1.0)
            }
        }

    def configure_optimizers(self):
        # Total number of optimization steps; currently unused, but useful if a
        # warmup scheduler (get_linear_schedule_with_warmup) is added later.
        t_total = len(self.train_dataloader()) // self.hparams["accumulate_grad_batches"] * self.hparams["max_epochs"]

        optimizer = AdamW(self.parameters(), lr=float(self.hparams["learning_rate"]), eps=float(self.hparams["adam_epsilon"]))

        return optimizer

    @pl.data_loader
    def train_dataloader(self):
        return DataLoader(self.dataloader(self.root_path / self.hparams["train_x"], self.root_path / self.hparams["train_y"]), batch_size=self.hparams["batch_size"], collate_fn=self.collate)

    @pl.data_loader
    def val_dataloader(self):
        return DataLoader(self.dataloader(self.root_path / self.hparams["val_x"], self.root_path / self.hparams["val_y"]), batch_size=self.hparams["batch_size"], collate_fn=self.collate)

    def dataloader(self, x_path: Union[str, pathlib.Path], y_path: Union[str, pathlib.Path] = None):

        df = pd.read_json(x_path, lines=True)
        if y_path:
            labels = pd.read_csv(y_path, sep='\t', header=None).values.tolist()
            self.label_offset = np.asarray(labels).min()
            df["label"] = np.asarray(labels) - self.label_offset

        df["text"] = df.apply(self.transform(self.hparams["formula"]), axis=1)
        print(df.head())
        # Keep only the columns collate needs; "label" is absent at inference time.
        columns = ["text", "label"] if "label" in df.columns else ["text"]
        return ClassificationDataset(df[columns].to_dict("records"))

    @staticmethod
    def transform(formula):

        def wrapper(row):

            context, choices = formula.split("->")
            # (context + question -> answerA|answerB|answerC)
            # (obs1 + obs2 -> hyp1|hyp2)
            # (ctx_a + ctx_b -> ending_options)
            # (goal -> sol1|sol2)
            context = context.split("+")
            choices = choices.split("|")

            context = " ".join(row[x.strip()] for x in context)
            # A single choice field is expected to already hold a list of options
            # (e.g. ending_options); otherwise each field contributes one option.
            choices = row[choices[0].strip()] if len(choices) == 1 else [row[x.strip()] for x in choices]
            return list(zip(cycle([context]), choices))

        return wrapper

    def collate(self, examples):

        batch_size = len(examples)
        num_choice = len(examples[0]["text"])

        # Flatten every example's (context, choice) pairs into one batch for the tokenizer.
        pairs = [pair for example in examples for pair in example["text"]]
        results = self.tokenizer.batch_encode_plus(pairs, add_special_tokens=True, max_length=self.hparams["max_length"], return_tensors='pt', return_attention_masks=True, pad_to_max_length=True)

        assert results["input_ids"].shape[0] == batch_size * num_choice, f"Invalid shapes {results['input_ids'].shape} {batch_size, num_choice}"

        return {
            "input_ids": results["input_ids"],
            "attention_mask": results["attention_mask"],
            "token_type_ids": results["token_type_ids"],
            "labels": torch.LongTensor([e["label"] for e in examples]) if "label" in examples[0] else None,
            "num_choice": num_choice
        }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.25.3
numpy==1.17.4
pytorch_lightning==0.6.0
hydra_core==0.11.3
loguru==0.4.1
torch==1.4.0
tqdm==4.40.0
transformers==2.5.0
PyYAML==5.3
scikit_learn==0.22.2
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
from typing import *
import hydra
import torch
import random
import numpy as np
from pytorch_lightning import Trainer
from loguru import logger
from model import Classifier


@hydra.main(config_path="config.yaml")
def train(config):

    logger.info(config)

    # Seed everything for (mostly) reproducible runs.
    np.random.seed(42)
    random.seed(42)
    torch.manual_seed(42)

    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    model = Classifier(config)
    trainer = Trainer(
        gradient_clip_val=0,
        num_nodes=1,
        gpus=None if not torch.cuda.is_available() else list(range(torch.cuda.device_count())),
        log_gpu_memory=True,
        show_progress_bar=True,
        accumulate_grad_batches=config["accumulate_grad_batches"],
        max_epochs=config["max_epochs"],
        min_epochs=1,
        val_check_interval=0.1,
        log_save_interval=100,
        row_log_interval=10,
        distributed_backend="ddp",
        use_amp=config["use_amp"],
        weights_summary='top',
        amp_level='O2',
        num_sanity_val_steps=5,
        resume_from_checkpoint=None,
    )
    trainer.fit(model)


if __name__ == "__main__":
    train()
--------------------------------------------------------------------------------
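A closing note on the `formula` entry in `config.yaml`: it tells `Classifier.transform` which JSONL fields form the context and which form the answer choices. A minimal sketch of that mapping on a made-up PIQA-style record (the field names `goal`, `sol1`, `sol2` match `config.yaml`; the record text is invented for illustration):

```python
from itertools import cycle

row = {"goal": "Make the cookies chewy.",
       "sol1": "Take them out of the oven a little early.",
       "sol2": "Bake them for twice as long."}

context_fields, choice_fields = "goal -> sol1|sol2".split("->")
context = " ".join(row[x.strip()] for x in context_fields.split("+"))
choices = [row[x.strip()] for x in choice_fields.split("|")]

# Each (context, choice) pair is tokenized separately by Classifier.collate;
# the model scores every pair and reshapes the logits to (batch, num_choice).
pairs = list(zip(cycle([context]), choices))
print(pairs)
```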