├── example_tasks
│   ├── __init__.py
│   ├── lstm.py
│   └── cifar.py
├── .gitignore
├── report
│   └── template.tex
├── train.py
└── README.md

/example_tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .cifar import CifarTask
2 | from .lstm import LanguageModelingTask
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | .pytest_cache/
3 | data/
4 | submission/data/
5 | __pycache__/
6 | report/*.aux
7 | report/*.fdb_latexmk
8 | report/*.fls
9 | report/*.log
10 | 
--------------------------------------------------------------------------------
/report/template.tex:
--------------------------------------------------------------------------------
1 | \documentclass[twocolumn,a4paper,12pt]{article}
2 | 
3 | \usepackage[utf8]{inputenc}
4 | \usepackage[english]{babel}
5 | \usepackage{amsmath}
6 | \usepackage{amssymb}
7 | \usepackage{lipsum}
8 | 
9 | \title{Your AutoTrain Optimizer}
10 | \author{
11 |     First author \and
12 |     Second author \and
13 |     Third author
14 | }
15 | 
16 | 
17 | \begin{document}
18 | 
19 | \maketitle
20 | 
21 | 
22 | \section{Introduction}
23 | 
24 | \lipsum[1]
25 | 
26 | 
27 | \section{Related work}
28 | 
29 | \lipsum[2-3]
30 | 
31 | 
32 | \section{Method}
33 | 
34 | \lipsum[4-5]
35 | 
36 | 
37 | \section{Results}
38 | 
39 | \lipsum[6-7]
40 | 
41 | \end{document}
42 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | This is an example submission that implements Adam.
5 | """
6 | 
7 | import math
8 | 
9 | import torch
10 | 
11 | 
12 | def train(task):
13 |     batch_size = task.default_batch_size
14 |     target_loss = task.target_test_loss  # the target to reach; checked inside task.test()
15 | 
16 |     learning_rate = 0.001
17 |     beta1 = 0.9
18 |     beta2 = 0.999
19 |     # Small constant for numerical stability in the Adam denominator.
20 |     epsilon = 1e-8
21 |     weight_decay = 1e-5
22 |     n_epochs = 10
23 | 
24 |     # Adam Initialization
25 |     first_moment = [torch.zeros_like(param) for param in task.state]
26 |     second_moment = [torch.zeros_like(param) for param in task.state]
27 |     t = 0
28 | 
29 |     for epoch in range(n_epochs):
30 |         print("Epoch {}".format(epoch))
31 | 
32 |         for batch in task.train_iterator(batch_size=batch_size, shuffle=True):
33 |             # Get a batch gradient
34 |             _, df = task.batch_loss_and_gradient(batch)
35 | 
36 |             # Adam Update
37 |             t += 1
38 |             lr = learning_rate * math.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)  # bias-corrected step size
39 |             for m1, m2, variable, grad in zip(first_moment, second_moment, task.state, df):
40 |                 m1.mul_(beta1).add_((1 - beta1) * grad)  # update the first moment in place
41 |                 m2.mul_(beta2).add_((1 - beta2) * grad * grad)  # update the second moment in place
42 |                 variable.mul_(1 - weight_decay)
43 |                 variable.add_(-lr * m1 / (torch.sqrt(m2) + epsilon))
44 | 
45 |         # As soon as you test your model and the test_loss is lower than task.target_test_loss,
46 |         # your optimizer will be killed and you are done.
47 |         test_loss = task.test(task.state)
48 |         print("Test loss at epoch {}: {:.3f}".format(epoch, test_loss))
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     from example_tasks import CifarTask
53 | 
54 |     task = CifarTask()
55 |     train(task)
56 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoTrain Challenge
2 | 
3 | (This design document corresponds to a challenge that was submitted to NeurIPS but not run.)
4 | 
5 | AutoTrain challenges you to submit optimizers that work reliably on any deep learning task without task-specific tuning.
6 | It separates AutoML into (1) fully automatic training of a model and (2) model selection, and tackles the first aspect.
7 | 
8 | Your submissions will be benchmarked on a secret set of architecture/dataset pairs inspired by common deep learning tasks.
9 | The optimizers need to achieve a target test loss as fast as possible. The fastest on average wins the competition.
10 | 
11 | The winning optimizers will be made publicly available as
12 | open source and bring significant value to practitioners and researchers, by removing
13 | the need for expensive hyperparameter tuning, and by providing fair benchmarking of
14 | all optimizers.
15 | 
16 | ## Submission
17 | 
18 | You are required to submit a ZIP file before the deadline to [autotrain@groupes.epfl.ch](mailto:autotrain@groupes.epfl.ch) containing:
19 | 
20 | - `README.md`: team name and team members,
21 | - `train.py`: code of your optimizer,
22 | - `report.pdf`: a 4-page (two-column) report describing your submission.
23 | 
24 | You can refer to [train.py](./train.py) for a sample submission.
25 | 
26 | ## Rules
27 | 
28 | ### Evaluation
29 | 
30 | Participants are required to submit code for an AutoTrain optimizer, which will be uploaded to the challenge platform. This code will be run on previously unseen architecture / dataset pairs. A submission is executed until the target test loss is reached, until it terminates on its own, or until it consumes more than the maximum allowed resources (time or memory). Submissions are ranked on average time-to-accuracy (on the specified standard cloud instance), since this corresponds best to cost in real-world use cases. More precisely, the time-to-accuracy on each test case is normalized by that test case's baseline time. With this normalization, the final score is the harmonic mean of the time-to-accuracy speed-ups over the different architecture / dataset pairs (the speed-up is defined as the time ratio compared to the baseline).
31 | 
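To make the scoring rule concrete, here is a small illustration with made-up numbers; the real baselines and test cases are not published, and the task names below are only placeholders.

```python
# Illustrative only: hypothetical timings for two test cases, in seconds.
baseline_times = {"task_a": 100.0, "task_b": 400.0}  # reference optimizer
submission_times = {"task_a": 50.0, "task_b": 400.0}  # your optimizer

# Speed-up per test case: baseline time divided by submission time (2.0 and 1.0 here).
speedups = [baseline_times[k] / submission_times[k] for k in baseline_times]

# Final score: harmonic mean of the speed-ups, about 1.33 in this example.
score = len(speedups) / sum(1.0 / s for s in speedups)
print(score)
```

Ranking by average normalized time (lowest first) and ranking by this harmonic-mean speed-up (highest first) give the same ordering, since the harmonic mean is the reciprocal of the mean normalized time.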
32 | ### Unseen architecture / dataset pairs
33 | 
34 | The unseen architecture / dataset pairs on which the submissions will be judged will be modifications of the sample architecture / dataset pairs provided beforehand to the participants. Hence, it is sufficient to ensure that the submitted code does not exceed the maximum resources on the provided sample architecture / dataset pairs. Most importantly, the number of weights of the unknown networks will not exceed that of the provided example models. Further, the range of the following high-level characteristics of the unseen architecture / dataset pairs will be of the same order of magnitude as that of their sample counterparts: i) number of parameters, ii) time required for a forward pass, iii) time required for backprop, and iv) size of the training data. However, the exact values of these characteristics might not match those of any of the provided samples. Further, the architecture of the model, though similar, might not exactly match any of the provided samples.
35 | 
36 | ### Task interface
37 | 
38 | The AutoTrain optimizer can access the (train) data by querying consecutive mini-batches of the desired size. It is allowed to make as many calls as desired (within the resource limits) to the following oracles, which take a mini-batch as input and return:
39 | 
40 | 1. the loss value of the network on the corresponding mini-batch (i.e. inference),
41 | 2. the result of the backprop on the corresponding mini-batch.
42 | 
43 | The optimizer can update the weights as many times as desired. Access to the interface will be synchronous: multiple simultaneous queries are ignored. The interface is based on PyTorch.
44 | 
45 | The optimizer can query the test loss via `test_loss = task.test(task.state)`. The current test loss will be compared against the target **only** when you call this function.
46 | 
47 | The optimizer also has access to the target test loss `task.target_test_loss` and a default batch size `task.default_batch_size`, which is guaranteed not to exceed memory limits for SGD and Adam.
48 | 
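For concreteness, here is a minimal sketch of how these oracles combine into plain SGD, using only the interface described above (`train_iterator`, `batch_loss_and_gradient`, `state`, `test`) and the in-place parameter updates also used by the Adam example in [train.py](./train.py). The learning rate and number of epochs are illustrative, not recommendations.

```python
def train(task):
    learning_rate = 0.1  # illustrative; a real submission should set this robustly

    for epoch in range(5):
        for batch in task.train_iterator(batch_size=task.default_batch_size, shuffle=True):
            # Query the gradient oracle for this mini-batch.
            loss, gradients = task.batch_loss_and_gradient(batch)

            # task.state is a list of parameter tensors; update them in place.
            for param, grad in zip(task.state, gradients):
                param.add_(-learning_rate * grad)

        # Only this call compares the current model against the target test loss.
        test_loss = task.test(task.state)
        print("Epoch {}: test loss {:.3f}".format(epoch, test_loss))
```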
49 | ### Additional rules
50 | 
51 | - Each submission should be accompanied by an informative description (commented code, README, and a writeup of the approach).
52 | - Source code of the submission must be provided. Your optimizer should be implemented in `train.py` and not use any external dependencies (see the Environment section below for the packages that are available).
53 | - Use of external communication, pretraining, or manipulation of the provided oracles (such as backprop) is not allowed; only the use of the results (vectors) of the oracles is permitted.
54 | - We require the winning submission to be publicly released to ensure reproducibility and impact on the community.
55 | 
56 | ### Environment
57 | 
58 | We will evaluate the submissions on a system with Ubuntu 18.04, Anaconda Python 3.7, and CUDA 10.
59 | You can use the packages `torch`, `numpy`, `scipy`, and any other package available in Anaconda Python.
60 | 
61 | ### Optimizer
62 | 
63 | The submitted `train.py` file must define the function:
64 | 
65 | ```python
66 | def train(task: Task):
67 |     """Train the (model, dataset) pair associated to the task.
68 | 
69 |     Args:
70 |         task [Task]: task to optimize. Refer to `src/task.py` for available functions.
71 |     """
72 | ```
73 | 
74 | An example is provided in [train.py](./train.py).
75 | Every time you evaluate the model on the test set (`task.test(task.state)`), you are compared against the target loss and get a chance to win.
76 | 
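In the provided example tasks, reaching the target is signalled locally by a `Done` exception raised inside `task.test` (see `example_tasks/cifar.py` and `example_tasks/lstm.py`); on the challenge platform the run is simply stopped instead. As a rough sketch, assuming it is launched from the repository root, you could exercise a submission locally like this:

```python
from example_tasks import CifarTask
from example_tasks.cifar import Done  # raised locally once the target test loss is reached

from train import train  # your submission's entry point

task = CifarTask()
try:
    train(task)
except Done as result:
    print("Target test loss reached:", result)
```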
77 | ## Organizers
78 | 
79 | - Thijs Vogels, EPFL
80 | - Sai Praneeth Karimireddy, EPFL
81 | - Jean-Baptiste Cordonnier, EPFL
82 | - Michael Tschannen, ETH Zürich
83 | - Fabian Pedregosa, Google
84 | - Sebastian U. Stich, EPFL
85 | - Sharada Mohanty, EPFL
86 | - Marcel Salathé, EPFL
87 | - Martin Jaggi, EPFL
88 | 
89 | Contact: autotrain@groupes.epfl.ch
90 | 
--------------------------------------------------------------------------------
/example_tasks/lstm.py:
--------------------------------------------------------------------------------
1 | import os
2 | from copy import deepcopy
3 | from typing import Iterable, List
4 | 
5 | import numpy as np
6 | import spacy
7 | import torch
8 | import torch.nn as nn
9 | import torchtext
10 | from spacy.symbols import ORTH
11 | from torch.utils.data import DataLoader
12 | 
13 | 
14 | """
15 | This is another example of a task.
16 | It is an implementation of language modeling.
17 | The CifarTask is easier to understand and better documented.
18 | """
19 | 
20 | 
21 | class Batch:
22 |     def __init__(self, x, y, hidden):
23 |         self.x = x
24 |         self.y = y
25 |         self.hidden = hidden
26 | 
27 | 
28 | class LanguageModelingTask:
29 |     def __init__(self):
30 |         self.default_batch_size = 64
31 |         self.target_test_loss = 4.7
32 | 
33 |         self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34 |         self._seed = 34534
35 |         self._epoch = 0
36 | 
37 |         torch.random.manual_seed(self._seed)
38 |         self.text, self.train_loader, self.val_loader = define_dataset(
39 |             self._device, "wikitext2", "data", batch_size=self.default_batch_size
40 |         )
41 | 
42 |         global ITOS
43 |         global STOI
44 |         ITOS = self.text.vocab.itos
45 |         STOI = self.text.vocab.stoi
46 | 
47 |         self._model = self._create_model()
48 |         self._criterion = torch.nn.CrossEntropyLoss().to(self._device)
49 | 
50 |         self.state = [parameter.data for parameter in self._model.parameters()]
51 |         self.buffers = [buffer for buffer in self._model.buffers()]
52 |         self.parameter_names = [name for (name, _) in self._model.named_parameters()]
53 |         self._hidden_container = {"hidden": None}
54 | 
55 |     def train_iterator(self, batch_size: int, shuffle: bool = False) -> Iterable[Batch]:
56 |         """Shuffle is ignored: text cannot be shuffled."""
57 |         self._epoch += 1
58 |         self._hidden_container["hidden"] = self._model.init_hidden(batch_size)
59 |         _, train_loader, _ = define_dataset(
60 |             self._device, "wikitext2", "data", batch_size=batch_size
61 |         )
62 |         return BatchLoader(
63 |             train_loader, self._device, model=self._model, hidden_container=self._hidden_container
64 |         )
65 | 
66 |     def batch_loss(self, batch: Batch) -> float:
67 |         with torch.no_grad():
68 |             prediction, hidden = self._model(batch.x, batch.hidden)
69 |             self._hidden_container["hidden"] = hidden
70 |             loss = self._criterion(
71 |                 prediction.view(-1, self._model.ntokens), batch.y.contiguous().view(-1)
72 |             )
73 |         return loss.item()
74 | 
75 |     def batch_loss_and_gradient(
76 |         self, batch: Batch, rnn_clip=0.4
77 |     ) -> (float, List[torch.Tensor]):
78 |         self._zero_grad()
79 |         prediction, hidden = self._model(batch.x, batch.hidden)
80 |         self._hidden_container["hidden"] = hidden
81 |         f = self._criterion(prediction.view(-1, self._model.ntokens), batch.y.contiguous().view(-1))
82 |         f.backward()
83 |         torch.nn.utils.clip_grad_norm_(self._model.parameters(), rnn_clip)
84 |         df = [parameter.grad for parameter in self._model.parameters()]
85 |         return f.item(), df
86 | 
87 |     def test(self, state=None) -> float:
88 |         self._hidden_container["hidden"] = self._model.init_hidden(self.default_batch_size)
89 |         test_loader = BatchLoader(
90 |             self.val_loader,
91 |             self._device,
92 |             model=self._model,
93 |             hidden_container=self._hidden_container,
94 |         )
95 | 
96 |         if state:
97 |             test_model = self._create_test_model(state)
98 |         else:
99 |             test_model = self._model
100 |         test_model.eval()
101 | 
102 |         losses = []
103 | 
104 |         for batch in test_loader:
105 |             with torch.no_grad():
106 |                 prediction, hidden = test_model(batch.x, batch.hidden)  # evaluate the test model built from `state`
107 |                 self._hidden_container["hidden"] = hidden
108 |                 losses.append(
109 |                     self._criterion(
110 |                         prediction.view(-1, self._model.ntokens), batch.y.contiguous().view(-1)
111 |                     ).item()
112 |                 )
113 | 
114 |         mean_f = np.mean(losses)
115 |         if mean_f < self.target_test_loss:
116 |             raise Done(mean_f)
117 | 
118 |         return mean_f
119 | 
120 |     def _create_model(self):
121 |         torch.random.manual_seed(self._seed)
122 |         model = define_model(self.text)
123 |         model.to(self._device)
124 |         model.train()
125 |         return model
126 | 127 | def _create_test_model(self, state): 128 | test_model = deepcopy(self._model) 129 | test_model.eval() 130 | for param, new_value in zip(test_model.parameters(), state): 131 | param.data = new_value.data 132 | return test_model 133 | 134 | def _zero_grad(self): 135 | self._model.zero_grad() 136 | 137 | 138 | class BatchLoader: 139 | """ 140 | Utility that transforms a dataloader that is an iterable over (x, y) tuples 141 | into an iterable over Batch() tuples, where its contents are already moved 142 | to the selected device. 143 | """ 144 | 145 | def __init__(self, dataloader, device, model, hidden_container): 146 | self.dataloader = dataloader 147 | self.device = device 148 | self._model = model 149 | self._hidden_container = hidden_container 150 | 151 | def __len__(self): 152 | return len(self.dataloader) 153 | 154 | def __iter__(self): 155 | for batch in self.dataloader: 156 | x = batch.text 157 | y = batch.target 158 | hidden = self._model.repackage_hidden(self._hidden_container["hidden"]) 159 | yield Batch(x, y, hidden) 160 | 161 | 162 | def define_dataset( 163 | device, 164 | dataset_name, 165 | dataset_path, 166 | batch_size, 167 | rnn_use_pretrained_emb=False, 168 | rnn_n_hidden=650, 169 | reshuffle_per_epoch=True, 170 | rnn_bptt_len=30, 171 | ): 172 | # create dataset. 173 | TEXT, train, valid, test = _get_dataset(dataset_name, dataset_path) 174 | 175 | # Build vocb. 176 | # we can use some precomputed word embeddings, 177 | # e.g., GloVe vectors with 100, 200, and 300. 178 | if rnn_use_pretrained_emb: 179 | try: 180 | vectors = "glove.6B.{}d".format(rnn_n_hidden) 181 | vectors_cache = os.path.join(dataset_path, ".vector_cache") 182 | except: 183 | vectors, vectors_cache = None, None 184 | else: 185 | vectors, vectors_cache = None, None 186 | TEXT.build_vocab(train, vectors=vectors, vectors_cache=vectors_cache) 187 | 188 | # Partition training data. 189 | train_loader, _ = torchtext.data.BPTTIterator.splits( 190 | (train, valid), 191 | batch_size=batch_size, 192 | bptt_len=rnn_bptt_len, 193 | device=device, 194 | shuffle=reshuffle_per_epoch, 195 | ) 196 | _, val_loader = torchtext.data.BPTTIterator.splits( 197 | (train, valid), 198 | batch_size=batch_size, 199 | bptt_len=rnn_bptt_len, 200 | device=device, 201 | shuffle=reshuffle_per_epoch, 202 | ) 203 | 204 | # get some stat. 205 | return TEXT, train_loader, val_loader 206 | 207 | 208 | def define_model(TEXT, rnn_n_hidden=650, rnn_n_layers=3, rnn_tie_weights=True, drop_rate=0.4): 209 | # get embdding size and num_tokens. 210 | weight_matrix = TEXT.vocab.vectors 211 | 212 | if weight_matrix is not None: 213 | n_tokens, emb_size = weight_matrix.size(0), weight_matrix.size(1) 214 | else: 215 | n_tokens, emb_size = len(TEXT.vocab), rnn_n_hidden 216 | 217 | # create model. 218 | model = RNNModel( 219 | rnn_type="LSTM", 220 | ntoken=n_tokens, 221 | ninp=emb_size, 222 | nhid=rnn_n_hidden, 223 | nlayers=rnn_n_layers, 224 | tie_weights=rnn_tie_weights, 225 | dropout=drop_rate, 226 | ) 227 | 228 | # init the model. 
229 | if weight_matrix is not None: 230 | model.encoder.weight.data.copy_(weight_matrix) 231 | 232 | return model 233 | 234 | 235 | def _get_text(): 236 | spacy_en = spacy.load("en") 237 | spacy_en.tokenizer.add_special_case("", [{ORTH: ""}]) 238 | spacy_en.tokenizer.add_special_case("", [{ORTH: ""}]) 239 | spacy_en.tokenizer.add_special_case("", [{ORTH: ""}]) 240 | 241 | def spacy_tok(text): 242 | return [tok.text for tok in spacy_en.tokenizer(text)] 243 | 244 | TEXT = torchtext.data.Field(lower=True, tokenize=spacy_tok) 245 | return TEXT 246 | 247 | 248 | def _get_dataset(name, datasets_path): 249 | TEXT = _get_text() 250 | 251 | # Load and split data. 252 | if "wikitext2" in name: 253 | train, valid, test = torchtext.datasets.WikiText2.splits(TEXT, root=datasets_path) 254 | elif "ptb" in name: 255 | train, valid, test = torchtext.datasets.PennTreebank.splits(TEXT, root=datasets_path) 256 | return TEXT, train, valid, test 257 | 258 | 259 | class RNNModel(nn.Module): 260 | """Container module with an encoder, a recurrent module, and a decoder.""" 261 | 262 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 263 | super(RNNModel, self).__init__() 264 | self.drop = nn.Dropout(dropout) 265 | self.encoder = nn.Embedding(ntoken, ninp) 266 | if rnn_type in ["LSTM", "GRU"]: 267 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) 268 | else: 269 | try: 270 | nonlinearity = {"RNN_TANH": "tanh", "RNN_RELU": "relu"}[rnn_type] 271 | except KeyError: 272 | raise ValueError( 273 | """An invalid option for `--model` was supplied, 274 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" 275 | ) 276 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) 277 | self.decoder = nn.Linear(nhid, ntoken) 278 | 279 | # Optionally tie weights as in: 280 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 281 | # https://arxiv.org/abs/1608.05859 282 | # and 283 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 284 | # https://arxiv.org/abs/1611.01462 285 | if tie_weights: 286 | if nhid != ninp: 287 | raise ValueError("When using the tied flag, nhid must be equal to emsize") 288 | self.decoder.weight = self.encoder.weight 289 | 290 | self.init_weights() 291 | 292 | self.rnn_type = rnn_type 293 | self.nhid = nhid 294 | self.nlayers = nlayers 295 | self.ntokens = ntoken 296 | 297 | def init_weights(self): 298 | initrange = 0.1 299 | self.encoder.weight.data.uniform_(-initrange, initrange) 300 | self.decoder.bias.data.zero_() 301 | self.decoder.weight.data.uniform_(-initrange, initrange) 302 | 303 | def forward(self, input, hidden): 304 | emb = self.drop(self.encoder(input)) 305 | output, hidden = self.rnn(emb, hidden) 306 | output = self.drop(output) 307 | decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) 308 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden 309 | 310 | def init_hidden(self, bsz): 311 | weight = next(self.parameters()) 312 | if self.rnn_type == "LSTM": 313 | return ( 314 | weight.new_zeros(self.nlayers, bsz, self.nhid), 315 | weight.new_zeros(self.nlayers, bsz, self.nhid), 316 | ) 317 | else: 318 | return weight.new_zeros(self.nlayers, bsz, self.nhid) 319 | 320 | def repackage_hidden(self, h): 321 | """Wraps hidden states in new Tensors, to detach them from their history.""" 322 | if isinstance(h, torch.Tensor): 323 | return h.detach() 324 | else: 325 | return tuple(self.repackage_hidden(v) for v in h) 326 | 327 | 328 | class Done(Exception): 329 | pass 330 | 331 | 332 | ITOS = None # integer to string 333 | STOI = None # string to integer 334 | -------------------------------------------------------------------------------- /example_tasks/cifar.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import Iterable, List 3 | 4 | import numpy as np 5 | import torch 6 | import torchvision 7 | from torch.utils.data import DataLoader, Dataset 8 | 9 | 10 | """ 11 | This file describes the public interface of optimization Tasks 12 | and implements a ResNet18/Cifar10 optimization task to test your 13 | optimizer. An example on how to implement the optimizer is in train.py. 14 | """ 15 | 16 | 17 | class Batch: 18 | def __init__(self, x, y): 19 | self._x = x 20 | self._y = y 21 | 22 | 23 | class CifarTask: 24 | """ 25 | Example implementation of an optimization task. 26 | 27 | Interface: 28 | The following methods are exposed to the challenge participants: 29 | - `train_iterator`: returns an iterator of `Batch`es from the training set, 30 | - `batch_loss`: evaluate the function value of a `Batch`, 31 | - `batch_loss_and_gradient`: evaluate the function value of a `Batch` and compute the gradients, 32 | - `test`: compute the test loss of the model on the test set. 33 | The following attributes are exposed to the challenge participants: 34 | - `default_batch_size` 35 | - `target_test_loss` 36 | - `state` contains a list of current model parameter values 37 | 38 | See documentation below for more information. 39 | 40 | Example: 41 | See /train.py for an example of a Task in use. 
42 | """ 43 | 44 | def __init__(self): 45 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 46 | 47 | self.default_batch_size = 128 48 | self.target_test_loss = 0.25 49 | 50 | self._train_set, self._test_set = self._create_dataset() 51 | # self._train_set = torch.utils.data.Subset( 52 | # self._train_set, np.random.choice(len(self._train_set), 512) 53 | # ) 54 | # self._test_set = torch.utils.data.Subset( 55 | # self._test_set, np.random.choice(len(self._test_set), 500) 56 | # ) 57 | self._test_loader = DataLoader(self._test_set, batch_size=100, shuffle=False, num_workers=1) 58 | 59 | self._model = self._create_model() 60 | self._criterion = torch.nn.CrossEntropyLoss() 61 | 62 | self.state = [parameter.data for parameter in self._model.parameters()] 63 | 64 | def train_iterator(self, batch_size: int, shuffle: bool) -> Iterable[Batch]: 65 | """Create a dataloader serving `Batch`es from the training dataset. 66 | 67 | Example: 68 | >>> for batch in task.train_iterator(batch_size=32, shuffle=True): 69 | ... batch_loss, gradients = task.batch_loss_and_gradient(batch) 70 | """ 71 | train_loader = DataLoader( 72 | self._train_set, 73 | batch_size=batch_size, 74 | shuffle=shuffle, 75 | pin_memory=True, 76 | drop_last=True, 77 | num_workers=2, 78 | ) 79 | 80 | return BatchLoader(train_loader, self.device) 81 | 82 | def batch_loss(self, batch: Batch) -> float: 83 | """ 84 | Evaluate the loss on a batch. 85 | If the model has batch normalization or dropout, this will run in training mode. 86 | """ 87 | return self._criterion(self._model(batch._x), batch._y).item() 88 | 89 | def batch_loss_and_gradient(self, batch: Batch) -> (float, List[torch.Tensor]): 90 | """ 91 | Evaluate the loss and its gradients on a batch. 92 | If the model has batch normalization or dropout, this will run in training mode. 93 | 94 | Returns: 95 | - function value (float) 96 | - gradients (list of tensors in the same order as task.state()) 97 | """ 98 | self._zero_grad() 99 | f = self._criterion(self._model(batch._x), batch._y) 100 | f.backward() 101 | df = [parameter.grad.data for parameter in self._model.parameters()] 102 | return f.item(), df 103 | 104 | def test(self, state) -> float: 105 | """ 106 | Compute the average loss on the test set. 107 | The task is completed as soon as the output is below self.target_test_loss. 108 | If the model has batch normalization or dropout, this will run in eval mode. 
109 | """ 110 | test_model = self._create_test_model(state) 111 | losses = [] 112 | for x, y in self._test_loader: 113 | x = x.to(self.device) 114 | y = y.to(self.device) 115 | with torch.no_grad(): 116 | f = self._criterion(test_model(x), y) 117 | losses.append(f.item()) 118 | mean_f = np.mean(losses) 119 | if mean_f < self.target_test_loss: 120 | raise Done(mean_f) 121 | return mean_f 122 | 123 | def _create_model(self): 124 | """Create a PyTorch module for the model""" 125 | torch.random.manual_seed(42) 126 | model = ResNet(ResNetBlock, [2, 2, 2, 2]) 127 | model.to(self.device) 128 | model.train() 129 | return model 130 | 131 | def _create_dataset(self, data_root="./data"): 132 | """Create train and test datasets""" 133 | dataset = torchvision.datasets.CIFAR10 134 | 135 | data_mean = (0.4914, 0.4822, 0.4465) 136 | data_stddev = (0.2023, 0.1994, 0.2010) 137 | 138 | transform_train = torchvision.transforms.Compose( 139 | [ 140 | torchvision.transforms.RandomCrop(32, padding=4), 141 | torchvision.transforms.RandomHorizontalFlip(), 142 | torchvision.transforms.ToTensor(), 143 | torchvision.transforms.Normalize(data_mean, data_stddev), 144 | ] 145 | ) 146 | 147 | transform_test = torchvision.transforms.Compose( 148 | [ 149 | torchvision.transforms.ToTensor(), 150 | torchvision.transforms.Normalize(data_mean, data_stddev), 151 | ] 152 | ) 153 | 154 | training_set = dataset(root=data_root, train=True, download=True, transform=transform_train) 155 | test_set = dataset(root=data_root, train=False, download=True, transform=transform_test) 156 | 157 | return training_set, test_set 158 | 159 | def _create_test_model(self, state): 160 | test_model = deepcopy(self._model) 161 | test_model.eval() 162 | for param, new_value in zip(test_model.parameters(), state): 163 | param.data = new_value.data 164 | return test_model 165 | 166 | def _zero_grad(self): 167 | for param in self._model.parameters(): 168 | if param.grad is not None: 169 | param.grad.zero_() 170 | 171 | 172 | class Done(Exception): 173 | pass 174 | 175 | 176 | class BatchLoader: 177 | """ 178 | Utility that transforms a dataloader that is an iterable over (x, y) tuples 179 | into an iterable over Batch() tuples, where its contents are already moved 180 | to the selected device. 181 | """ 182 | 183 | def __init__(self, dataloader, device): 184 | self.dataloader = dataloader 185 | self.device = device 186 | 187 | def __len__(self): 188 | return len(self.dataloader) 189 | 190 | def __iter__(self): 191 | for x, y in self.dataloader: 192 | x = x.to(self.device) 193 | y = y.to(self.device) 194 | yield Batch(x, y) 195 | 196 | 197 | class ResNet(torch.nn.Module): 198 | """ 199 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 200 | Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 201 | Source: github.com/kuangliu/pytorch-cifar 202 | """ 203 | 204 | def __init__(self, block, num_blocks, num_classes=10, use_batchnorm=True): 205 | super(ResNet, self).__init__() 206 | self.in_planes = 64 207 | self.use_batchnorm = use_batchnorm 208 | self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 209 | self.bn1 = torch.nn.BatchNorm2d(64) if use_batchnorm else torch.nn.Sequential() 210 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 211 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 212 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 213 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 214 | self.linear = torch.nn.Linear(512 * block.expansion, num_classes) 215 | 216 | def _make_layer(self, block, planes, num_blocks, stride): 217 | strides = [stride] + [1] * (num_blocks - 1) 218 | layers = [] 219 | for stride in strides: 220 | layers.append(block(self.in_planes, planes, stride, self.use_batchnorm)) 221 | self.in_planes = planes * block.expansion 222 | return torch.nn.Sequential(*layers) 223 | 224 | def forward(self, x): 225 | out = torch.nn.functional.relu(self.bn1(self.conv1(x))) 226 | out = self.layer1(out) 227 | out = self.layer2(out) 228 | out = self.layer3(out) 229 | out = self.layer4(out) 230 | out = torch.nn.functional.avg_pool2d(out, 4) 231 | out = out.view(out.size(0), -1) 232 | out = self.linear(out) 233 | return out 234 | 235 | 236 | class ResNetBlock(torch.nn.Module): 237 | """ 238 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 239 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 240 | Source: github.com/kuangliu/pytorch-cifar 241 | """ 242 | 243 | expansion = 1 244 | 245 | def __init__(self, in_planes, planes, stride=1, use_batchnorm=True): 246 | super(ResNetBlock, self).__init__() 247 | self.conv1 = torch.nn.Conv2d( 248 | in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 249 | ) 250 | self.bn1 = torch.nn.BatchNorm2d(planes) 251 | self.conv2 = torch.nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 252 | self.bn2 = torch.nn.BatchNorm2d(planes) 253 | 254 | if not use_batchnorm: 255 | self.bn1 = self.bn2 = torch.nn.Sequential() 256 | 257 | self.shortcut = torch.nn.Sequential() 258 | if stride != 1 or in_planes != self.expansion * planes: 259 | self.shortcut = torch.nn.Sequential( 260 | torch.nn.Conv2d( 261 | in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False 262 | ), 263 | torch.nn.BatchNorm2d(self.expansion * planes) 264 | if use_batchnorm 265 | else torch.nn.Sequential(), 266 | ) 267 | 268 | def forward(self, x): 269 | out = torch.nn.functional.relu(self.bn1(self.conv1(x))) 270 | out = self.bn2(self.conv2(out)) 271 | out += self.shortcut(x) 272 | out = torch.nn.functional.relu(out) 273 | return out 274 | 275 | 276 | class ResNetBottleneck(torch.nn.Module): 277 | """ 278 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 279 | Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 280 | Source: github.com/kuangliu/pytorch-cifar 281 | """ 282 | 283 | expansion = 4 284 | 285 | def __init__(self, in_planes, planes, stride=1, use_batchnorm=True): 286 | super(ResNetBottleneck, self).__init__() 287 | self.conv1 = torch.nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 288 | self.bn1 = torch.nn.BatchNorm2d(planes) 289 | self.conv2 = torch.nn.Conv2d( 290 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 291 | ) 292 | self.bn2 = torch.nn.BatchNorm2d(planes) 293 | self.conv3 = torch.nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 294 | self.bn3 = torch.nn.BatchNorm2d(self.expansion * planes) 295 | 296 | if not use_batchnorm: 297 | self.bn1 = self.bn2 = self.bn3 = torch.nn.Sequential() 298 | 299 | self.shortcut = torch.nn.Sequential() 300 | if stride != 1 or in_planes != self.expansion * planes: 301 | self.shortcut = torch.nn.Sequential( 302 | torch.nn.Conv2d( 303 | in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False 304 | ), 305 | torch.nn.BatchNorm2d(self.expansion * planes) 306 | if use_batchnorm 307 | else torch.nn.Sequential(), 308 | ) 309 | 310 | def forward(self, x): 311 | out = torch.nn.functional.relu(self.bn1(self.conv1(x))) 312 | out = torch.nn.functional.relu(self.bn2(self.conv2(out))) 313 | out = self.bn3(self.conv3(out)) 314 | out += self.shortcut(x) 315 | out = torch.nn.functional.relu(out) 316 | return out 317 | --------------------------------------------------------------------------------