├── example_tasks
│   ├── __init__.py
│   ├── lstm.py
│   └── cifar.py
├── .gitignore
├── report
│   └── template.tex
├── train.py
└── README.md

/example_tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .cifar import CifarTask
2 | from .lstm import LanguageModelingTask
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | .pytest_cache/
3 | data/
4 | submission/data/
5 | __pycache__/
6 | report/*.aux
7 | report/*.fdb_latexmk
8 | report/*.fls
9 | report/*.log
10 | 
--------------------------------------------------------------------------------
/report/template.tex:
--------------------------------------------------------------------------------
1 | \documentclass[twocolumn,a4paper,12pt]{article}
2 | 
3 | \usepackage[utf8]{inputenc}
4 | \usepackage[english]{babel}
5 | \usepackage{amsmath}
6 | \usepackage{amssymb}
7 | \usepackage{lipsum}
8 | 
9 | \title{Your AutoTrain Optimizer}
10 | \author{
11 |     First author \and
12 |     Second author \and
13 |     Third author
14 | }
15 | 
16 | 
17 | \begin{document}
18 | 
19 | \maketitle
20 | 
21 | 
22 | \section{Introduction}
23 | 
24 | \lipsum[1]
25 | 
26 | 
27 | \section{Related work}
28 | 
29 | \lipsum[2-3]
30 | 
31 | 
32 | \section{Method}
33 | 
34 | \lipsum[4-5]
35 | 
36 | 
37 | \section{Results}
38 | 
39 | \lipsum[6-7]
40 | 
41 | \end{document}
42 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | This is an example submission that implements Adam.
5 | """
6 | 
7 | import math
8 | 
9 | import torch
10 | 
11 | 
12 | def train(task):
13 |     batch_size = task.default_batch_size
14 |     target_loss = task.target_test_loss  # the target to reach; checked inside task.test()
15 | 
16 |     learning_rate = 0.001
17 |     beta1 = 0.9
18 |     beta2 = 0.999
19 |     # Small constant for numerical stability in the Adam denominator.
20 |     epsilon = 1e-8
21 |     weight_decay = 1e-5
22 |     n_epochs = 10
23 | 
24 |     # Adam Initialization
25 |     first_moment = [torch.zeros_like(param) for param in task.state]
26 |     second_moment = [torch.zeros_like(param) for param in task.state]
27 |     t = 0
28 | 
29 |     for epoch in range(n_epochs):
30 |         print("Epoch {}".format(epoch))
31 | 
32 |         for batch in task.train_iterator(batch_size=batch_size, shuffle=True):
33 |             # Get a batch gradient
34 |             _, df = task.batch_loss_and_gradient(batch)
35 | 
36 |             # Adam Update
37 |             t += 1
38 |             lr = learning_rate * math.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)  # bias-corrected step size
39 |             for m1, m2, variable, grad in zip(first_moment, second_moment, task.state, df):
40 |                 m1.mul_(beta1).add_((1 - beta1) * grad)  # update the first moment in place
41 |                 m2.mul_(beta2).add_((1 - beta2) * grad * grad)  # update the second moment in place
42 |                 variable.mul_(1 - weight_decay)
43 |                 variable.add_(-lr * m1 / (torch.sqrt(m2) + epsilon))
44 | 
45 |         # As soon as you test your model and the test_loss is lower than task.target_test_loss,
46 |         # your optimizer will be killed and you are done.
47 |         test_loss = task.test(task.state)
48 |         print("Test loss at epoch {}: {:.3f}".format(epoch, test_loss))
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     from example_tasks import CifarTask
53 | 
54 |     task = CifarTask()
55 |     train(task)
56 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoTrain Challenge
2 | 
3 | (This design document corresponds to a challenge that was submitted to NeurIPS but not run.)
4 | 
5 | AutoTrain challenges you to submit optimizers that work reliably on any deep learning task without task-specific tuning.
6 | It separates AutoML into (1) fully automatic training of a model and (2) model selection, and tackles the first aspect.
7 | 
8 | Your submissions will be benchmarked on a secret set of architecture/dataset pairs inspired by common deep learning tasks.
9 | The optimizers need to achieve a target test loss as fast as possible. The fastest on average wins the competition.
10 | 
11 | The winning optimizers will be made publicly available as
12 | open source and bring significant value to practitioners and researchers, by removing
13 | the need for expensive hyperparameter tuning, and by providing fair benchmarking of
14 | all optimizers.
15 | 
16 | ## Submission
17 | 
18 | You are required to submit a ZIP file before the deadline to [autotrain@groupes.epfl.ch](mailto:autotrain@groupes.epfl.ch) containing:
19 | 
20 | - `README.md`: team name and team members,
21 | - `train.py`: code of your optimizer,
22 | - `report.pdf`: a 4-page (two-column) report describing your submission.
23 | 
24 | You can refer to [train.py](./train.py) for a sample submission.
25 | 
26 | ## Rules
27 | 
28 | ### Evaluation
29 | 
30 | Participants are required to submit code for an AutoTrain optimizer, which will be uploaded to the challenge platform. This code will be run on previously unseen architecture / dataset pairs. A submission is executed until the target test loss is reached, until it terminates on its own, or until it consumes more than the maximum allowed resources (time or memory). Submissions are ranked on average time-to-accuracy (on the specified standard cloud instance), since this corresponds best to cost in real-world use cases. More precisely, the time-to-accuracy on each test case is normalized by that test case's baseline time. With this normalization, the final score is the harmonic mean of the time-to-accuracy speed-ups over the different architecture / dataset pairs (the speed-up is defined as the time ratio compared to the baseline).
31 | 
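To make the scoring rule concrete, here is a small illustration with made-up numbers; the real baselines and test cases are not published, and the task names below are only placeholders.

```python
# Illustrative only: hypothetical timings for two test cases, in seconds.
baseline_times = {"task_a": 100.0, "task_b": 400.0}  # reference optimizer
submission_times = {"task_a": 50.0, "task_b": 400.0}  # your optimizer

# Speed-up per test case: baseline time divided by submission time (2.0 and 1.0 here).
speedups = [baseline_times[k] / submission_times[k] for k in baseline_times]

# Final score: harmonic mean of the speed-ups, about 1.33 in this example.
score = len(speedups) / sum(1.0 / s for s in speedups)
print(score)
```

Ranking by average normalized time (lowest first) and ranking by this harmonic-mean speed-up (highest first) give the same ordering, since the harmonic mean is the reciprocal of the mean normalized time.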
32 | ### Unseen architecture / dataset pairs
33 | 
34 | The unseen architecture / dataset pairs on which the submissions will be judged will be modifications of the sample architecture / dataset pairs provided beforehand to the participants. Hence, it is sufficient to ensure that the submitted code does not exceed the maximum resources on the provided sample architecture / dataset pairs. Most importantly, the number of weights of the unknown networks will not exceed that of the provided example models. Further, the range of the following high-level characteristics of the unseen architecture / dataset pairs will be of the same order of magnitude as that of their sample counterparts: i) number of parameters, ii) time required for a forward pass, iii) time required for backprop, and iv) size of the training data. However, the exact values of these characteristics might not match those of any of the provided samples. Further, the architecture of the model, though similar, might not exactly match any of the provided samples.
35 | 
36 | ### Task interface
37 | 
38 | The AutoTrain optimizer can access the (train) data by querying consecutive mini-batches of the desired size. It is allowed to make as many calls as desired (within the resource limits) to the following oracles, which take a mini-batch as input and return:
39 | 
40 | 1. the loss value of the network on the corresponding mini-batch (i.e. inference),
41 | 2. the result of the backprop on the corresponding mini-batch.
42 | 
43 | The optimizer can update the weights as many times as desired. Access to the interface will be synchronous: multiple simultaneous queries are ignored. The interface is based on PyTorch.
44 | 
45 | The optimizer can query the test loss via `test_loss = task.test(task.state)`. The current test loss will be compared against the target **only** when you call this function.
46 | 
47 | The optimizer also has access to the target test loss `task.target_test_loss` and a default batch size `task.default_batch_size`, which is guaranteed not to exceed memory limits for SGD and Adam.
48 | 
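For concreteness, here is a minimal sketch of how these oracles combine into plain SGD, using only the interface described above (`train_iterator`, `batch_loss_and_gradient`, `state`, `test`) and the in-place parameter updates also used by the Adam example in [train.py](./train.py). The learning rate and number of epochs are illustrative, not recommendations.

```python
def train(task):
    learning_rate = 0.1  # illustrative; a real submission should set this robustly

    for epoch in range(5):
        for batch in task.train_iterator(batch_size=task.default_batch_size, shuffle=True):
            # Query the gradient oracle for this mini-batch.
            loss, gradients = task.batch_loss_and_gradient(batch)

            # task.state is a list of parameter tensors; update them in place.
            for param, grad in zip(task.state, gradients):
                param.add_(-learning_rate * grad)

        # Only this call compares the current model against the target test loss.
        test_loss = task.test(task.state)
        print("Epoch {}: test loss {:.3f}".format(epoch, test_loss))
```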
49 | ### Additional rules
50 | 
51 | - Each submission should be accompanied by an informative description (commented code, README, and a writeup of the approach).
52 | - Source code of the submission must be provided. Your optimizer should be implemented in `train.py` and not use any external dependencies (see the Environment section below for the packages that are available).
53 | - Use of external communication, pretraining, or manipulation of the provided oracles (such as backprop) is not allowed; only the use of the results (vectors) of the oracles is permitted.
54 | - We require the winning submission to be publicly released to ensure reproducibility and impact on the community.
55 | 
56 | ### Environment
57 | 
58 | We will evaluate the submissions on a system with Ubuntu 18.04, Anaconda Python 3.7, and CUDA 10.
59 | You can use the packages `torch`, `numpy`, `scipy`, and any other package available in Anaconda Python.
60 | 
61 | ### Optimizer
62 | 
63 | The submitted `train.py` file must define the function:
64 | 
65 | ```python
66 | def train(task: Task):
67 |     """Train the (model, dataset) pair associated to the task.
68 | 
69 |     Args:
70 |         task [Task]: task to optimize. Refer to `src/task.py` for available functions.
71 |     """
72 | ```
73 | 
74 | An example is provided in [train.py](./train.py).
75 | Every time you evaluate the model on the test set (`task.test(task.state)`), you are compared against the target loss and get a chance to win.
76 | 
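In the provided example tasks, reaching the target is signalled locally by a `Done` exception raised inside `task.test` (see `example_tasks/cifar.py` and `example_tasks/lstm.py`); on the challenge platform the run is simply stopped instead. As a rough sketch, assuming it is launched from the repository root, you could exercise a submission locally like this:

```python
from example_tasks import CifarTask
from example_tasks.cifar import Done  # raised locally once the target test loss is reached

from train import train  # your submission's entry point

task = CifarTask()
try:
    train(task)
except Done as result:
    print("Target test loss reached:", result)
```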
77 | ## Organizers
78 | 
79 | - Thijs Vogels, EPFL
80 | - Sai Praneeth Karimireddy, EPFL
81 | - Jean-Baptiste Cordonnier, EPFL
82 | - Michael Tschannen, ETH Zürich
83 | - Fabian Pedregosa, Google
84 | - Sebastian U. Stich, EPFL
85 | - Sharada Mohanty, EPFL
86 | - Marcel Salathé, EPFL
87 | - Martin Jaggi, EPFL
88 | 
89 | Contact: autotrain@groupes.epfl.ch
90 | 
--------------------------------------------------------------------------------
/example_tasks/lstm.py:
--------------------------------------------------------------------------------
1 | import os
2 | from copy import deepcopy
3 | from typing import Iterable, List
4 | 
5 | import numpy as np
6 | import spacy
7 | import torch
8 | import torch.nn as nn
9 | import torchtext
10 | from spacy.symbols import ORTH
11 | from torch.utils.data import DataLoader
12 | 
13 | 
14 | """
15 | This is another example of a task.
16 | It is an implementation of language modeling.
17 | The CifarTask is easier to understand and better documented.
18 | """
19 | 
20 | 
21 | class Batch:
22 |     def __init__(self, x, y, hidden):
23 |         self.x = x
24 |         self.y = y
25 |         self.hidden = hidden
26 | 
27 | 
28 | class LanguageModelingTask:
29 |     def __init__(self):
30 |         self.default_batch_size = 64
31 |         self.target_test_loss = 4.7
32 | 
33 |         self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34 |         self._seed = 34534
35 |         self._epoch = 0
36 | 
37 |         torch.random.manual_seed(self._seed)
38 |         self.text, self.train_loader, self.val_loader = define_dataset(
39 |             self._device, "wikitext2", "data", batch_size=self.default_batch_size
40 |         )
41 | 
42 |         global ITOS
43 |         global STOI
44 |         ITOS = self.text.vocab.itos
45 |         STOI = self.text.vocab.stoi
46 | 
47 |         self._model = self._create_model()
48 |         self._criterion = torch.nn.CrossEntropyLoss().to(self._device)
49 | 
50 |         self.state = [parameter.data for parameter in self._model.parameters()]
51 |         self.buffers = [buffer for buffer in self._model.buffers()]
52 |         self.parameter_names = [name for (name, _) in self._model.named_parameters()]
53 |         self._hidden_container = {"hidden": None}
54 | 
55 |     def train_iterator(self, batch_size: int, shuffle: bool = False) -> Iterable[Batch]:
56 |         """Shuffle is ignored: text cannot be shuffled."""
57 |         self._epoch += 1
58 |         self._hidden_container["hidden"] = self._model.init_hidden(batch_size)
59 |         _, train_loader, _ = define_dataset(
60 |             self._device, "wikitext2", "data", batch_size=batch_size
61 |         )
62 |         return BatchLoader(
63 |             train_loader, self._device, model=self._model, hidden_container=self._hidden_container
64 |         )
65 | 
66 |     def batch_loss(self, batch: Batch) -> float:
67 |         with torch.no_grad():
68 |             prediction, hidden = self._model(batch.x, batch.hidden)
69 |             self._hidden_container["hidden"] = hidden
70 |             loss = self._criterion(
71 |                 prediction.view(-1, self._model.ntokens), batch.y.contiguous().view(-1)
72 |             )
73 |         return loss.item()
74 | 
75 |     def batch_loss_and_gradient(
76 |         self, batch: Batch, rnn_clip=0.4
77 |     ) -> (float, List[torch.Tensor]):
78 |         self._zero_grad()
79 |         prediction, hidden = self._model(batch.x, batch.hidden)
80 |         self._hidden_container["hidden"] = hidden
81 |         f = self._criterion(prediction.view(-1, self._model.ntokens), batch.y.contiguous().view(-1))
82 |         f.backward()
83 |         torch.nn.utils.clip_grad_norm_(self._model.parameters(), rnn_clip)
84 |         df = [parameter.grad for parameter in self._model.parameters()]
85 |         return f.item(), df
86 | 
87 |     def test(self, state=None) -> float:
88 |         self._hidden_container["hidden"] = self._model.init_hidden(self.default_batch_size)
89 |         test_loader = BatchLoader(
90 |             self.val_loader,
91 |             self._device,
92 |             model=self._model,
93 |             hidden_container=self._hidden_container,
94 |         )
95 | 
96 |         if state:
97 |             test_model = self._create_test_model(state)
98 |         else:
99 |             test_model = self._model
100 |         test_model.eval()
101 | 
102 |         losses = []
103 | 
104 |         for batch in test_loader:
105 |             with torch.no_grad():
106 |                 prediction, hidden = test_model(batch.x, batch.hidden)  # evaluate the test model built from `state`
107 |                 self._hidden_container["hidden"] = hidden
108 |                 losses.append(
109 |                     self._criterion(
110 |                         prediction.view(-1, self._model.ntokens), batch.y.contiguous().view(-1)
111 |                     ).item()
112 |                 )
113 | 
114 |         mean_f = np.mean(losses)
115 |         if mean_f < self.target_test_loss:
116 |             raise Done(mean_f)
117 | 
118 |         return mean_f
119 | 
120 |     def _create_model(self):
121 |         torch.random.manual_seed(self._seed)
122 |         model = define_model(self.text)
123 |         model.to(self._device)
124 |         model.train()
125 |         return model
126 | 127 | def _create_test_model(self, state): 128 | test_model = deepcopy(self._model) 129 | test_model.eval() 130 | for param, new_value in zip(test_model.parameters(), state): 131 | param.data = new_value.data 132 | return test_model 133 | 134 | def _zero_grad(self): 135 | self._model.zero_grad() 136 | 137 | 138 | class BatchLoader: 139 | """ 140 | Utility that transforms a dataloader that is an iterable over (x, y) tuples 141 | into an iterable over Batch() tuples, where its contents are already moved 142 | to the selected device. 143 | """ 144 | 145 | def __init__(self, dataloader, device, model, hidden_container): 146 | self.dataloader = dataloader 147 | self.device = device 148 | self._model = model 149 | self._hidden_container = hidden_container 150 | 151 | def __len__(self): 152 | return len(self.dataloader) 153 | 154 | def __iter__(self): 155 | for batch in self.dataloader: 156 | x = batch.text 157 | y = batch.target 158 | hidden = self._model.repackage_hidden(self._hidden_container["hidden"]) 159 | yield Batch(x, y, hidden) 160 | 161 | 162 | def define_dataset( 163 | device, 164 | dataset_name, 165 | dataset_path, 166 | batch_size, 167 | rnn_use_pretrained_emb=False, 168 | rnn_n_hidden=650, 169 | reshuffle_per_epoch=True, 170 | rnn_bptt_len=30, 171 | ): 172 | # create dataset. 173 | TEXT, train, valid, test = _get_dataset(dataset_name, dataset_path) 174 | 175 | # Build vocb. 176 | # we can use some precomputed word embeddings, 177 | # e.g., GloVe vectors with 100, 200, and 300. 178 | if rnn_use_pretrained_emb: 179 | try: 180 | vectors = "glove.6B.{}d".format(rnn_n_hidden) 181 | vectors_cache = os.path.join(dataset_path, ".vector_cache") 182 | except: 183 | vectors, vectors_cache = None, None 184 | else: 185 | vectors, vectors_cache = None, None 186 | TEXT.build_vocab(train, vectors=vectors, vectors_cache=vectors_cache) 187 | 188 | # Partition training data. 189 | train_loader, _ = torchtext.data.BPTTIterator.splits( 190 | (train, valid), 191 | batch_size=batch_size, 192 | bptt_len=rnn_bptt_len, 193 | device=device, 194 | shuffle=reshuffle_per_epoch, 195 | ) 196 | _, val_loader = torchtext.data.BPTTIterator.splits( 197 | (train, valid), 198 | batch_size=batch_size, 199 | bptt_len=rnn_bptt_len, 200 | device=device, 201 | shuffle=reshuffle_per_epoch, 202 | ) 203 | 204 | # get some stat. 205 | return TEXT, train_loader, val_loader 206 | 207 | 208 | def define_model(TEXT, rnn_n_hidden=650, rnn_n_layers=3, rnn_tie_weights=True, drop_rate=0.4): 209 | # get embdding size and num_tokens. 210 | weight_matrix = TEXT.vocab.vectors 211 | 212 | if weight_matrix is not None: 213 | n_tokens, emb_size = weight_matrix.size(0), weight_matrix.size(1) 214 | else: 215 | n_tokens, emb_size = len(TEXT.vocab), rnn_n_hidden 216 | 217 | # create model. 218 | model = RNNModel( 219 | rnn_type="LSTM", 220 | ntoken=n_tokens, 221 | ninp=emb_size, 222 | nhid=rnn_n_hidden, 223 | nlayers=rnn_n_layers, 224 | tie_weights=rnn_tie_weights, 225 | dropout=drop_rate, 226 | ) 227 | 228 | # init the model. 
229 | if weight_matrix is not None: 230 | model.encoder.weight.data.copy_(weight_matrix) 231 | 232 | return model 233 | 234 | 235 | def _get_text(): 236 | spacy_en = spacy.load("en") 237 | spacy_en.tokenizer.add_special_case("", [{ORTH: ""}]) 238 | spacy_en.tokenizer.add_special_case("", [{ORTH: ""}]) 239 | spacy_en.tokenizer.add_special_case("", [{ORTH: ""}]) 240 | 241 | def spacy_tok(text): 242 | return [tok.text for tok in spacy_en.tokenizer(text)] 243 | 244 | TEXT = torchtext.data.Field(lower=True, tokenize=spacy_tok) 245 | return TEXT 246 | 247 | 248 | def _get_dataset(name, datasets_path): 249 | TEXT = _get_text() 250 | 251 | # Load and split data. 252 | if "wikitext2" in name: 253 | train, valid, test = torchtext.datasets.WikiText2.splits(TEXT, root=datasets_path) 254 | elif "ptb" in name: 255 | train, valid, test = torchtext.datasets.PennTreebank.splits(TEXT, root=datasets_path) 256 | return TEXT, train, valid, test 257 | 258 | 259 | class RNNModel(nn.Module): 260 | """Container module with an encoder, a recurrent module, and a decoder.""" 261 | 262 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 263 | super(RNNModel, self).__init__() 264 | self.drop = nn.Dropout(dropout) 265 | self.encoder = nn.Embedding(ntoken, ninp) 266 | if rnn_type in ["LSTM", "GRU"]: 267 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) 268 | else: 269 | try: 270 | nonlinearity = {"RNN_TANH": "tanh", "RNN_RELU": "relu"}[rnn_type] 271 | except KeyError: 272 | raise ValueError( 273 | """An invalid option for `--model` was supplied, 274 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" 275 | ) 276 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) 277 | self.decoder = nn.Linear(nhid, ntoken) 278 | 279 | # Optionally tie weights as in: 280 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 281 | # https://arxiv.org/abs/1608.05859 282 | # and 283 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 284 | # https://arxiv.org/abs/1611.01462 285 | if tie_weights: 286 | if nhid != ninp: 287 | raise ValueError("When using the tied flag, nhid must be equal to emsize") 288 | self.decoder.weight = self.encoder.weight 289 | 290 | self.init_weights() 291 | 292 | self.rnn_type = rnn_type 293 | self.nhid = nhid 294 | self.nlayers = nlayers 295 | self.ntokens = ntoken 296 | 297 | def init_weights(self): 298 | initrange = 0.1 299 | self.encoder.weight.data.uniform_(-initrange, initrange) 300 | self.decoder.bias.data.zero_() 301 | self.decoder.weight.data.uniform_(-initrange, initrange) 302 | 303 | def forward(self, input, hidden): 304 | emb = self.drop(self.encoder(input)) 305 | output, hidden = self.rnn(emb, hidden) 306 | output = self.drop(output) 307 | decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) 308 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden 309 | 310 | def init_hidden(self, bsz): 311 | weight = next(self.parameters()) 312 | if self.rnn_type == "LSTM": 313 | return ( 314 | weight.new_zeros(self.nlayers, bsz, self.nhid), 315 | weight.new_zeros(self.nlayers, bsz, self.nhid), 316 | ) 317 | else: 318 | return weight.new_zeros(self.nlayers, bsz, self.nhid) 319 | 320 | def repackage_hidden(self, h): 321 | """Wraps hidden states in new Tensors, to detach them from their history.""" 322 | if isinstance(h, torch.Tensor): 323 | return h.detach() 324 | else: 325 | return tuple(self.repackage_hidden(v) for v in h) 326 | 327 | 328 | class Done(Exception): 329 | pass 330 | 331 | 332 | ITOS = None # integer to string 333 | STOI = None # string to integer 334 | -------------------------------------------------------------------------------- /example_tasks/cifar.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import Iterable, List 3 | 4 | import numpy as np 5 | import torch 6 | import torchvision 7 | from torch.utils.data import DataLoader, Dataset 8 | 9 | 10 | """ 11 | This file describes the public interface of optimization Tasks 12 | and implements a ResNet18/Cifar10 optimization task to test your 13 | optimizer. An example on how to implement the optimizer is in train.py. 14 | """ 15 | 16 | 17 | class Batch: 18 | def __init__(self, x, y): 19 | self._x = x 20 | self._y = y 21 | 22 | 23 | class CifarTask: 24 | """ 25 | Example implementation of an optimization task. 26 | 27 | Interface: 28 | The following methods are exposed to the challenge participants: 29 | - `train_iterator`: returns an iterator of `Batch`es from the training set, 30 | - `batch_loss`: evaluate the function value of a `Batch`, 31 | - `batch_loss_and_gradient`: evaluate the function value of a `Batch` and compute the gradients, 32 | - `test`: compute the test loss of the model on the test set. 33 | The following attributes are exposed to the challenge participants: 34 | - `default_batch_size` 35 | - `target_test_loss` 36 | - `state` contains a list of current model parameter values 37 | 38 | See documentation below for more information. 39 | 40 | Example: 41 | See /train.py for an example of a Task in use. 
42 | """ 43 | 44 | def __init__(self): 45 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 46 | 47 | self.default_batch_size = 128 48 | self.target_test_loss = 0.25 49 | 50 | self._train_set, self._test_set = self._create_dataset() 51 | # self._train_set = torch.utils.data.Subset( 52 | # self._train_set, np.random.choice(len(self._train_set), 512) 53 | # ) 54 | # self._test_set = torch.utils.data.Subset( 55 | # self._test_set, np.random.choice(len(self._test_set), 500) 56 | # ) 57 | self._test_loader = DataLoader(self._test_set, batch_size=100, shuffle=False, num_workers=1) 58 | 59 | self._model = self._create_model() 60 | self._criterion = torch.nn.CrossEntropyLoss() 61 | 62 | self.state = [parameter.data for parameter in self._model.parameters()] 63 | 64 | def train_iterator(self, batch_size: int, shuffle: bool) -> Iterable[Batch]: 65 | """Create a dataloader serving `Batch`es from the training dataset. 66 | 67 | Example: 68 | >>> for batch in task.train_iterator(batch_size=32, shuffle=True): 69 | ... batch_loss, gradients = task.batch_loss_and_gradient(batch) 70 | """ 71 | train_loader = DataLoader( 72 | self._train_set, 73 | batch_size=batch_size, 74 | shuffle=shuffle, 75 | pin_memory=True, 76 | drop_last=True, 77 | num_workers=2, 78 | ) 79 | 80 | return BatchLoader(train_loader, self.device) 81 | 82 | def batch_loss(self, batch: Batch) -> float: 83 | """ 84 | Evaluate the loss on a batch. 85 | If the model has batch normalization or dropout, this will run in training mode. 86 | """ 87 | return self._criterion(self._model(batch._x), batch._y).item() 88 | 89 | def batch_loss_and_gradient(self, batch: Batch) -> (float, List[torch.Tensor]): 90 | """ 91 | Evaluate the loss and its gradients on a batch. 92 | If the model has batch normalization or dropout, this will run in training mode. 93 | 94 | Returns: 95 | - function value (float) 96 | - gradients (list of tensors in the same order as task.state()) 97 | """ 98 | self._zero_grad() 99 | f = self._criterion(self._model(batch._x), batch._y) 100 | f.backward() 101 | df = [parameter.grad.data for parameter in self._model.parameters()] 102 | return f.item(), df 103 | 104 | def test(self, state) -> float: 105 | """ 106 | Compute the average loss on the test set. 107 | The task is completed as soon as the output is below self.target_test_loss. 108 | If the model has batch normalization or dropout, this will run in eval mode. 
109 | """ 110 | test_model = self._create_test_model(state) 111 | losses = [] 112 | for x, y in self._test_loader: 113 | x = x.to(self.device) 114 | y = y.to(self.device) 115 | with torch.no_grad(): 116 | f = self._criterion(test_model(x), y) 117 | losses.append(f.item()) 118 | mean_f = np.mean(losses) 119 | if mean_f < self.target_test_loss: 120 | raise Done(mean_f) 121 | return mean_f 122 | 123 | def _create_model(self): 124 | """Create a PyTorch module for the model""" 125 | torch.random.manual_seed(42) 126 | model = ResNet(ResNetBlock, [2, 2, 2, 2]) 127 | model.to(self.device) 128 | model.train() 129 | return model 130 | 131 | def _create_dataset(self, data_root="./data"): 132 | """Create train and test datasets""" 133 | dataset = torchvision.datasets.CIFAR10 134 | 135 | data_mean = (0.4914, 0.4822, 0.4465) 136 | data_stddev = (0.2023, 0.1994, 0.2010) 137 | 138 | transform_train = torchvision.transforms.Compose( 139 | [ 140 | torchvision.transforms.RandomCrop(32, padding=4), 141 | torchvision.transforms.RandomHorizontalFlip(), 142 | torchvision.transforms.ToTensor(), 143 | torchvision.transforms.Normalize(data_mean, data_stddev), 144 | ] 145 | ) 146 | 147 | transform_test = torchvision.transforms.Compose( 148 | [ 149 | torchvision.transforms.ToTensor(), 150 | torchvision.transforms.Normalize(data_mean, data_stddev), 151 | ] 152 | ) 153 | 154 | training_set = dataset(root=data_root, train=True, download=True, transform=transform_train) 155 | test_set = dataset(root=data_root, train=False, download=True, transform=transform_test) 156 | 157 | return training_set, test_set 158 | 159 | def _create_test_model(self, state): 160 | test_model = deepcopy(self._model) 161 | test_model.eval() 162 | for param, new_value in zip(test_model.parameters(), state): 163 | param.data = new_value.data 164 | return test_model 165 | 166 | def _zero_grad(self): 167 | for param in self._model.parameters(): 168 | if param.grad is not None: 169 | param.grad.zero_() 170 | 171 | 172 | class Done(Exception): 173 | pass 174 | 175 | 176 | class BatchLoader: 177 | """ 178 | Utility that transforms a dataloader that is an iterable over (x, y) tuples 179 | into an iterable over Batch() tuples, where its contents are already moved 180 | to the selected device. 181 | """ 182 | 183 | def __init__(self, dataloader, device): 184 | self.dataloader = dataloader 185 | self.device = device 186 | 187 | def __len__(self): 188 | return len(self.dataloader) 189 | 190 | def __iter__(self): 191 | for x, y in self.dataloader: 192 | x = x.to(self.device) 193 | y = y.to(self.device) 194 | yield Batch(x, y) 195 | 196 | 197 | class ResNet(torch.nn.Module): 198 | """ 199 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 200 | Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 201 | Source: github.com/kuangliu/pytorch-cifar 202 | """ 203 | 204 | def __init__(self, block, num_blocks, num_classes=10, use_batchnorm=True): 205 | super(ResNet, self).__init__() 206 | self.in_planes = 64 207 | self.use_batchnorm = use_batchnorm 208 | self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 209 | self.bn1 = torch.nn.BatchNorm2d(64) if use_batchnorm else torch.nn.Sequential() 210 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 211 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 212 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 213 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 214 | self.linear = torch.nn.Linear(512 * block.expansion, num_classes) 215 | 216 | def _make_layer(self, block, planes, num_blocks, stride): 217 | strides = [stride] + [1] * (num_blocks - 1) 218 | layers = [] 219 | for stride in strides: 220 | layers.append(block(self.in_planes, planes, stride, self.use_batchnorm)) 221 | self.in_planes = planes * block.expansion 222 | return torch.nn.Sequential(*layers) 223 | 224 | def forward(self, x): 225 | out = torch.nn.functional.relu(self.bn1(self.conv1(x))) 226 | out = self.layer1(out) 227 | out = self.layer2(out) 228 | out = self.layer3(out) 229 | out = self.layer4(out) 230 | out = torch.nn.functional.avg_pool2d(out, 4) 231 | out = out.view(out.size(0), -1) 232 | out = self.linear(out) 233 | return out 234 | 235 | 236 | class ResNetBlock(torch.nn.Module): 237 | """ 238 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 239 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 240 | Source: github.com/kuangliu/pytorch-cifar 241 | """ 242 | 243 | expansion = 1 244 | 245 | def __init__(self, in_planes, planes, stride=1, use_batchnorm=True): 246 | super(ResNetBlock, self).__init__() 247 | self.conv1 = torch.nn.Conv2d( 248 | in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 249 | ) 250 | self.bn1 = torch.nn.BatchNorm2d(planes) 251 | self.conv2 = torch.nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 252 | self.bn2 = torch.nn.BatchNorm2d(planes) 253 | 254 | if not use_batchnorm: 255 | self.bn1 = self.bn2 = torch.nn.Sequential() 256 | 257 | self.shortcut = torch.nn.Sequential() 258 | if stride != 1 or in_planes != self.expansion * planes: 259 | self.shortcut = torch.nn.Sequential( 260 | torch.nn.Conv2d( 261 | in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False 262 | ), 263 | torch.nn.BatchNorm2d(self.expansion * planes) 264 | if use_batchnorm 265 | else torch.nn.Sequential(), 266 | ) 267 | 268 | def forward(self, x): 269 | out = torch.nn.functional.relu(self.bn1(self.conv1(x))) 270 | out = self.bn2(self.conv2(out)) 271 | out += self.shortcut(x) 272 | out = torch.nn.functional.relu(out) 273 | return out 274 | 275 | 276 | class ResNetBottleneck(torch.nn.Module): 277 | """ 278 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 279 | Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 280 | Source: github.com/kuangliu/pytorch-cifar 281 | """ 282 | 283 | expansion = 4 284 | 285 | def __init__(self, in_planes, planes, stride=1, use_batchnorm=True): 286 | super(ResNetBottleneck, self).__init__() 287 | self.conv1 = torch.nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 288 | self.bn1 = torch.nn.BatchNorm2d(planes) 289 | self.conv2 = torch.nn.Conv2d( 290 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 291 | ) 292 | self.bn2 = torch.nn.BatchNorm2d(planes) 293 | self.conv3 = torch.nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 294 | self.bn3 = torch.nn.BatchNorm2d(self.expansion * planes) 295 | 296 | if not use_batchnorm: 297 | self.bn1 = self.bn2 = self.bn3 = torch.nn.Sequential() 298 | 299 | self.shortcut = torch.nn.Sequential() 300 | if stride != 1 or in_planes != self.expansion * planes: 301 | self.shortcut = torch.nn.Sequential( 302 | torch.nn.Conv2d( 303 | in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False 304 | ), 305 | torch.nn.BatchNorm2d(self.expansion * planes) 306 | if use_batchnorm 307 | else torch.nn.Sequential(), 308 | ) 309 | 310 | def forward(self, x): 311 | out = torch.nn.functional.relu(self.bn1(self.conv1(x))) 312 | out = torch.nn.functional.relu(self.bn2(self.conv2(out))) 313 | out = self.bn3(self.conv3(out)) 314 | out += self.shortcut(x) 315 | out = torch.nn.functional.relu(out) 316 | return out 317 | --------------------------------------------------------------------------------