├── .gitignore
├── README.md
├── config.yaml
├── environment.yml
├── eval.py
├── model.py
├── requirements.txt
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
*truth
*tsv
*models
*output
*cache
*idea
*cache*
outputs
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.vscode
output/
*.tsv
*-models/
.DS_Store
# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Minimal Code Base For AI2 Commonsense Leaderboard

## Dependencies

Install apex if you want to use half precision: https://github.com/NVIDIA/apex. A conda environment file is also included for reference; apex may not install cleanly through conda, so remove it from `environment.yml` before creating the environment.

```bash
pip install -r requirements.txt
```

## Train

Modify `config.yaml` as you like and run `python train.py` to train a model. It loads the config file and writes all logs and checkpoints to the `outputs` directory.
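For example (the override form assumes Hydra's standard `key=value` syntax; both keys shown already exist in `config.yaml`):

```bash
# train with config.yaml as-is
python train.py

# optionally override individual config values from the command line
python train.py batch_size=4 max_epochs=3
```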
## Eval

### Get predictions without evaluation

```bash
python eval.py \
  --input_x cache/physicaliqa-train-dev/physicaliqa-train-dev/dev.jsonl \
  --config config.yaml \
  --checkpoint outputs/2020-02-26/20-26-22/lightning_logs/version_6341419/checkpoints/_ckpt_epoch_3_v0.ckpt \
  --output pred.lst
```

### Get predictions with evaluation (accuracy, confidence interval)

```bash
python eval.py \
  --input_x cache/physicaliqa-train-dev/physicaliqa-train-dev/dev.jsonl \
  --config config.yaml \
  --checkpoint outputs/2020-02-26/20-26-22/lightning_logs/version_6341419/checkpoints/_ckpt_epoch_3_v0.ckpt \
  --input_y cache/physicaliqa-train-dev/physicaliqa-train-dev/dev-labels.lst \
  --output pred.lst
```

## Results

### PIQA

| Model | Bootstrapped Accuracy Mean | Bootstrapped Accuracy CI | Accuracy |
|:-------------:|:--------------------------:|:------------------------:|:--------:|
| RoBERTa large (V100) | 77.4 | 75.7 - 79.4 | 77.3 |
| RoBERTa large (K80) | 74.0 | 72.4 - 76.2 | 74.2 |
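
The bootstrapped numbers come from the percentile bootstrap in `eval.py`: dev-set predictions are resampled with replacement 100 times and the 95% interval over the resampled accuracies is reported. A minimal standalone sketch of that procedure (the function name and arguments are illustrative; `labels` and `preds` are aligned lists of gold and predicted choices):

```python
import numpy as np
from sklearn.metrics import accuracy_score

def bootstrap_accuracy_ci(labels, preds, n_boot=100, alpha=0.95, seed=0):
    """Percentile bootstrap over prediction indices, mirroring eval.py."""
    rng = np.random.RandomState(seed)
    stats = []
    for _ in range(n_boot):
        idx = rng.randint(0, len(preds), size=len(preds))  # resample with replacement
        stats.append(accuracy_score([labels[i] for i in idx], [preds[i] for i in idx]))
    lower = np.percentile(stats, (1 - alpha) / 2 * 100)
    upper = np.percentile(stats, (alpha + (1 - alpha) / 2) * 100)
    return float(np.mean(stats)), float(lower), float(upper)
```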
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
task_name: physicaliqa
model: "roberta-large"
accumulate_grad_batches: 8
use_amp: true # Half precision works best on Volta architectures such as the V100
max_epochs: 4
learning_rate: 2e-6
adam_epsilon: 10e-8
warmup_steps: 150
batch_size: 8
max_length: 128
formula: "goal -> sol1|sol2"
train_x: "cache/physicaliqa-train-dev/physicaliqa-train-dev/train.jsonl"
train_y: "cache/physicaliqa-train-dev/physicaliqa-train-dev/train-labels.lst"
val_x: "cache/physicaliqa-train-dev/physicaliqa-train-dev/dev.jsonl"
val_y: "cache/physicaliqa-train-dev/physicaliqa-train-dev/dev-labels.lst"
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
name: py3.7
channels:
  - menpo
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - ca-certificates=2020.1.1=0
  - certifi=2019.11.28=py37_0
  - ld_impl_linux-64=2.33.1=h53a641e_7
  - libedit=3.1.20181209=hc058e9b_0
  - libffi=3.2.1=hd88cf55_4
  - libgcc-ng=9.1.0=hdf63c60_0
  - libstdcxx-ng=9.1.0=hdf63c60_0
  - ncurses=6.2=he6710b0_0
  - openssl=1.1.1d=h7b6447c_4
  - pip=20.0.2=py37_1
  - python=3.7.6=h0371630_2
  - readline=7.0=h7b6447c_5
  - setuptools=46.0.0=py37_0
  - sqlite=3.31.1=h7b6447c_0
  - tk=8.6.8=hbc83047_0
  - wheel=0.34.2=py37_0
  - xz=5.2.4=h14c3975_4
  - zlib=1.2.11=h7b6447c_3
  - pip:
    - absl-py==0.9.0
    - apex==0.1
    - boto3==1.12.18
    - botocore==1.15.18
    - cachetools==4.0.0
    - chardet==3.0.4
    - click==7.1.1
    - docutils==0.15.2
    - filelock==3.0.12
    - future==0.18.2
    - google-auth==1.11.2
    - google-auth-oauthlib==0.4.1
    - grpcio==1.27.2
    - hydra-core==0.11.3
    - idna==2.9
    - jmespath==0.9.5
    - joblib==0.14.1
    - loguru==0.4.1
    - markdown==3.2.1
    - numpy==1.17.4
    - oauthlib==3.1.0
    - omegaconf==1.4.1
    - pandas==0.25.3
    - pillow==7.0.0
    - protobuf==3.11.3
    - pyasn1==0.4.8
    - pyasn1-modules==0.2.8
    - python-dateutil==2.8.1
    - pytorch-lightning==0.6.0
    - pytz==2019.3
    - pyyaml==5.3
    - regex==2020.2.20
    - requests==2.23.0
    - requests-oauthlib==1.3.0
    - rsa==4.0
    - s3transfer==0.3.3
    - sacremoses==0.0.38
    - scikit-learn==0.22.2
    - scipy==1.4.1
    - sentencepiece==0.1.85
    - six==1.14.0
    - tensorboard==2.1.1
    - tokenizers==0.5.0
    - torch==1.4.0
    - torchvision==0.4.2
    - tqdm==4.40.0
    - transformers==2.5.0
    - urllib3==1.25.8
    - werkzeug==1.0.0
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
from typing import *
import torch
from torch.utils.data import DataLoader
from model import Classifier
from loguru import logger
from tqdm import tqdm
import yaml

if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser(description="evaluate script")
    parser.add_argument("--input_x", type=str, required=True)
    parser.add_argument("--config", type=str, required=True)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--input_y", type=str)

    args = parser.parse_args()

    device = 'cpu' if not torch.cuda.is_available() else "cuda"
    checkpoint = torch.load(args.checkpoint, map_location=device)
    with open(args.config, "r") as f:
        model = Classifier(yaml.safe_load(f.read()))
    model.load_state_dict(checkpoint['state_dict'])
    model.to(device)
    model.eval()

    preds: List[int] = []
    for batch in tqdm(DataLoader(model.dataloader(args.input_x, args.input_y), batch_size=model.hparams["batch_size"] * 2, collate_fn=model.collate, shuffle=False)):
        for key in batch:
            if isinstance(batch[key], torch.Tensor):
                batch[key] = batch[key].to(device)

        with torch.no_grad():
            logits = model.forward(batch)
            preds.extend(torch.argmax(logits, dim=1).cpu().detach().numpy().tolist())
    preds = [p + model.label_offset for p in preds]

    if args.input_y:

        from sklearn.metrics import accuracy_score
        import pandas as pd
        import numpy as np

        labels = pd.read_csv(args.input_y, sep='\t', header=None).values.tolist()
        logger.info(f"Accuracy: {accuracy_score(labels, preds):.3f}")

        # Percentile bootstrap over the predictions (100 resamples) for a 95% confidence interval.
        stats = []
        for _ in range(100):
            indices = np.random.randint(0, len(preds), size=len(preds))
            stats.append(accuracy_score([labels[j] for j in indices], [preds[j] for j in indices]))

        alpha = 0.95
        p = ((1.0-alpha)/2.0) * 100
        lower = max(0.0, np.percentile(stats, p))
        p = (alpha+((1.0-alpha)/2.0)) * 100
        upper = min(1.0, np.percentile(stats, p))
        logger.info(f'{alpha*100:.1f}% confidence interval: {lower*100:.1f} - {upper*100:.1f}, average: {np.mean(stats)*100:.1f}')

    with open(args.output, "w") as f:
        f.write("\n".join(map(str, preds)))
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import os
import pathlib
from typing import *
from itertools import cycle

import torch
import pytorch_lightning as pl
import torch.nn as nn
import pandas as pd
import numpy as np
from loguru import logger
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup


class ClassificationDataset(Dataset):

    def __init__(self, instances):
        self.instances = instances

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        return self.instances[idx]


class Classifier(pl.LightningModule):

    def __init__(self, config):
        super().__init__()
        self.hparams = config
        self.root_path = pathlib.Path(__file__).parent.absolute()
        self.embedder = AutoModel.from_pretrained(config["model"], cache_dir=self.root_path / "model_cache")
        self.tokenizer = AutoTokenizer.from_pretrained(config["model"], cache_dir=self.root_path / "model_cache", use_fast=False)

        self.embedder.train()
        self.label_offset = 0
        self.classifier = nn.Linear(self.embedder.config.hidden_size, 1, bias=True)

        self.loss = nn.CrossEntropyLoss(ignore_index=-1, reduction="mean")

        self.classifier.weight.data.normal_(mean=0.0, std=self.embedder.config.initializer_range)
        self.classifier.bias.data.zero_()

    def forward(self, batch):

        assert len(batch["input_ids"].shape) == 2, "LM only takes two-dimensional input"
        assert len(batch["attention_mask"].shape) == 2, "LM only takes two-dimensional input"
        assert len(batch["token_type_ids"].shape) == 2, "LM only takes two-dimensional input"

        # RoBERTa does not use token type ids.
        batch["token_type_ids"] = None if "roberta" in self.hparams["model"] else batch["token_type_ids"]

        results = self.embedder(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], token_type_ids=batch["token_type_ids"])

        # Mean-pool the token embeddings, score each (context, choice) pair,
        # then reshape so each row holds the logits for one example's choices.
        token_embeddings, *_ = results
        logits = self.classifier(token_embeddings.mean(dim=1)).squeeze(dim=1)
        logits = logits.reshape(-1, batch["num_choice"])

        return logits

    def training_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.loss(logits, batch["labels"])
        if self.trainer and self.trainer.use_dp:
            loss = loss.unsqueeze(0)
        return {
            "loss": loss
        }

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.loss(logits, batch["labels"])
        if self.trainer and self.trainer.use_dp:
            loss = loss.unsqueeze(0)
        return {
            'val_loss': loss,
            "val_batch_logits": logits,
            "val_batch_labels": batch["labels"],
        }

    def validation_end(self, outputs):
        val_loss_mean = torch.stack([o['val_loss'] for o in outputs]).mean()
        val_logits = torch.cat([o["val_batch_logits"] for o in outputs])
        val_labels = torch.cat([o["val_batch_labels"] for o in outputs])
        return {
            'val_loss': val_loss_mean,
            "progress_bar": {
                "val_accuracy": torch.sum(val_labels == torch.argmax(val_logits, dim=1)) / (val_labels.shape[0] * 1.0)
            }
        }

    def configure_optimizers(self):
        # Total number of optimization steps; currently unused, but useful if a
        # warmup scheduler (get_linear_schedule_with_warmup) is added later.
        t_total = len(self.train_dataloader()) // self.hparams["accumulate_grad_batches"] * self.hparams["max_epochs"]

        optimizer = AdamW(self.parameters(), lr=float(self.hparams["learning_rate"]), eps=float(self.hparams["adam_epsilon"]))

        return optimizer

    @pl.data_loader
    def train_dataloader(self):
        return DataLoader(self.dataloader(self.root_path / self.hparams["train_x"], self.root_path / self.hparams["train_y"]), batch_size=self.hparams["batch_size"], collate_fn=self.collate)

    @pl.data_loader
    def val_dataloader(self):
        return DataLoader(self.dataloader(self.root_path / self.hparams["val_x"], self.root_path / self.hparams["val_y"]), batch_size=self.hparams["batch_size"], collate_fn=self.collate)

    def dataloader(self, x_path: Union[str, pathlib.Path], y_path: Union[str, pathlib.Path] = None):

        df = pd.read_json(x_path, lines=True)
        if y_path:
            labels = pd.read_csv(y_path, sep='\t', header=None).values.tolist()
            self.label_offset = np.asarray(labels).min()
            df["label"] = np.asarray(labels) - self.label_offset

        df["text"] = df.apply(self.transform(self.hparams["formula"]), axis=1)
        print(df.head())
        # Keep only the columns collate needs; "label" is absent at inference time.
        columns = ["text", "label"] if "label" in df.columns else ["text"]
        return ClassificationDataset(df[columns].to_dict("records"))

    @staticmethod
    def transform(formula):

        def wrapper(row):

            context, choices = formula.split("->")
            # (context + question -> answerA|answerB|answerC)
            # (obs1 + obs2 -> hyp1|hyp2)
            # (ctx_a + ctx_b -> ending_options)
            # (goal -> sol1|sol2)
            context = context.split("+")
            choices = choices.split("|")

            context = " ".join(row[x.strip()] for x in context)
            # A single choice field is expected to already hold a list of options
            # (e.g. ending_options); otherwise each field contributes one option.
            choices = row[choices[0].strip()] if len(choices) == 1 else [row[x.strip()] for x in choices]
            return list(zip(cycle([context]), choices))

        return wrapper

    def collate(self, examples):

        batch_size = len(examples)
        num_choice = len(examples[0]["text"])

        # Flatten every example's (context, choice) pairs into one batch for the tokenizer.
        pairs = [pair for example in examples for pair in example["text"]]
        results = self.tokenizer.batch_encode_plus(pairs, add_special_tokens=True, max_length=self.hparams["max_length"], return_tensors='pt', return_attention_masks=True, pad_to_max_length=True)

        assert results["input_ids"].shape[0] == batch_size * num_choice, f"Invalid shapes {results['input_ids'].shape} {batch_size, num_choice}"

        return {
            "input_ids": results["input_ids"],
            "attention_mask": results["attention_mask"],
            "token_type_ids": results["token_type_ids"],
            "labels": torch.LongTensor([e["label"] for e in examples]) if "label" in examples[0] else None,
            "num_choice": num_choice
        }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.25.3
numpy==1.17.4
pytorch_lightning==0.6.0
hydra_core==0.11.3
loguru==0.4.1
torch==1.4.0
tqdm==4.40.0
transformers==2.5.0
PyYAML==5.3
scikit_learn==0.22.2
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
from typing import *
import hydra
import torch
import random
import numpy as np
from pytorch_lightning import Trainer
from loguru import logger
from model import Classifier


@hydra.main(config_path="config.yaml")
def train(config):

    logger.info(config)

    # Seed everything for (mostly) reproducible runs.
    np.random.seed(42)
    random.seed(42)
    torch.manual_seed(42)

    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    model = Classifier(config)
    trainer = Trainer(
        gradient_clip_val=0,
        num_nodes=1,
        gpus=None if not torch.cuda.is_available() else list(range(torch.cuda.device_count())),
        log_gpu_memory=True,
        show_progress_bar=True,
        accumulate_grad_batches=config["accumulate_grad_batches"],
        max_epochs=config["max_epochs"],
        min_epochs=1,
        val_check_interval=0.1,
        log_save_interval=100,
        row_log_interval=10,
        distributed_backend="ddp",
        use_amp=config["use_amp"],
        weights_summary='top',
        amp_level='O2',
        num_sanity_val_steps=5,
        resume_from_checkpoint=None,
    )
    trainer.fit(model)


if __name__ == "__main__":
    train()
--------------------------------------------------------------------------------
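A closing note on the `formula` entry in `config.yaml`: it tells `Classifier.transform` which JSONL fields form the context and which form the answer choices. A minimal sketch of that mapping on a made-up PIQA-style record (the field names `goal`, `sol1`, `sol2` match `config.yaml`; the record text is invented for illustration):

```python
from itertools import cycle

row = {"goal": "Make the cookies chewy.",
       "sol1": "Take them out of the oven a little early.",
       "sol2": "Bake them for twice as long."}

context_fields, choice_fields = "goal -> sol1|sol2".split("->")
context = " ".join(row[x.strip()] for x in context_fields.split("+"))
choices = [row[x.strip()] for x in choice_fields.split("|")]

# Each (context, choice) pair is tokenized separately by Classifier.collate;
# the model scores every pair and reshapes the logits to (batch, num_choice).
pairs = list(zip(cycle([context]), choices))
print(pairs)
```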