├── .old ├── main-orig.py ├── main-wds.py ├── makeshards.py ├── requirements.txt └── run └── README.md /.old/main-orig.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | import time 6 | import warnings 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.parallel 11 | import torch.backends.cudnn as cudnn 12 | import torch.distributed as dist 13 | import torch.optim 14 | import torch.multiprocessing as mp 15 | import torch.utils.data 16 | import torch.utils.data.distributed 17 | import torchvision.transforms as transforms 18 | import torchvision.datasets as datasets 19 | import torchvision.models as models 20 | 21 | model_names = sorted(name for name in models.__dict__ 22 | if name.islower() and not name.startswith("__") 23 | and callable(models.__dict__[name])) 24 | 25 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 26 | parser.add_argument('data', metavar='DIR', 27 | help='path to dataset') 28 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 29 | choices=model_names, 30 | help='model architecture: ' + 31 | ' | '.join(model_names) + 32 | ' (default: resnet18)') 33 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 34 | help='number of data loading workers (default: 4)') 35 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 36 | help='number of total epochs to run') 37 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 38 | help='manual epoch number (useful on restarts)') 39 | parser.add_argument('-b', '--batch-size', default=256, type=int, 40 | metavar='N', 41 | help='mini-batch size (default: 256), this is the total ' 42 | 'batch size of all GPUs on the current node when ' 43 | 'using Data Parallel or Distributed Data Parallel') 44 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 45 | metavar='LR', help='initial learning rate', dest='lr') 46 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 47 | help='momentum') 48 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 49 | metavar='W', help='weight decay (default: 1e-4)', 50 | dest='weight_decay') 51 | parser.add_argument('-p', '--print-freq', default=10, type=int, 52 | metavar='N', help='print frequency (default: 10)') 53 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 54 | help='path to latest checkpoint (default: none)') 55 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 56 | help='evaluate model on validation set') 57 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 58 | help='use pre-trained model') 59 | parser.add_argument('--world-size', default=-1, type=int, 60 | help='number of nodes for distributed training') 61 | parser.add_argument('--rank', default=-1, type=int, 62 | help='node rank for distributed training') 63 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 64 | help='url used to set up distributed training') 65 | parser.add_argument('--dist-backend', default='nccl', type=str, 66 | help='distributed backend') 67 | parser.add_argument('--seed', default=None, type=int, 68 | help='seed for initializing training. 
') 69 | parser.add_argument('--gpu', default=None, type=int, 70 | help='GPU id to use.') 71 | parser.add_argument('--multiprocessing-distributed', action='store_true', 72 | help='Use multi-processing distributed training to launch ' 73 | 'N processes per node, which has N GPUs. This is the ' 74 | 'fastest way to use PyTorch for either single node or ' 75 | 'multi node data parallel training') 76 | 77 | best_acc1 = 0 78 | 79 | 80 | def main(): 81 | args = parser.parse_args() 82 | 83 | if args.seed is not None: 84 | random.seed(args.seed) 85 | torch.manual_seed(args.seed) 86 | cudnn.deterministic = True 87 | warnings.warn('You have chosen to seed training. ' 88 | 'This will turn on the CUDNN deterministic setting, ' 89 | 'which can slow down your training considerably! ' 90 | 'You may see unexpected behavior when restarting ' 91 | 'from checkpoints.') 92 | 93 | if args.gpu is not None: 94 | warnings.warn('You have chosen a specific GPU. This will completely ' 95 | 'disable data parallelism.') 96 | 97 | if args.dist_url == "env://" and args.world_size == -1: 98 | args.world_size = int(os.environ["WORLD_SIZE"]) 99 | 100 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 101 | 102 | ngpus_per_node = torch.cuda.device_count() 103 | if args.multiprocessing_distributed: 104 | # Since we have ngpus_per_node processes per node, the total world_size 105 | # needs to be adjusted accordingly 106 | args.world_size = ngpus_per_node * args.world_size 107 | # Use torch.multiprocessing.spawn to launch distributed processes: the 108 | # main_worker process function 109 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 110 | else: 111 | # Simply call main_worker function 112 | main_worker(args.gpu, ngpus_per_node, args) 113 | 114 | 115 | def main_worker(gpu, ngpus_per_node, args): 116 | global best_acc1 117 | args.gpu = gpu 118 | 119 | if args.gpu is not None: 120 | print("Use GPU: {} for training".format(args.gpu)) 121 | 122 | if args.distributed: 123 | if args.dist_url == "env://" and args.rank == -1: 124 | args.rank = int(os.environ["RANK"]) 125 | if args.multiprocessing_distributed: 126 | # For multiprocessing distributed training, rank needs to be the 127 | # global rank among all the processes 128 | args.rank = args.rank * ngpus_per_node + gpu 129 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 130 | world_size=args.world_size, rank=args.rank) 131 | # create model 132 | if args.pretrained: 133 | print("=> using pre-trained model '{}'".format(args.arch)) 134 | model = models.__dict__[args.arch](pretrained=True) 135 | else: 136 | print("=> creating model '{}'".format(args.arch)) 137 | model = models.__dict__[args.arch]() 138 | 139 | if args.distributed: 140 | # For multiprocessing distributed, DistributedDataParallel constructor 141 | # should always set the single device scope, otherwise, 142 | # DistributedDataParallel will use all available devices. 
143 | if args.gpu is not None: 144 | torch.cuda.set_device(args.gpu) 145 | model.cuda(args.gpu) 146 | # When using a single GPU per process and per 147 | # DistributedDataParallel, we need to divide the batch size 148 | # ourselves based on the total number of GPUs we have 149 | args.batch_size = int(args.batch_size / ngpus_per_node) 150 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 151 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 152 | else: 153 | model.cuda() 154 | # DistributedDataParallel will divide and allocate batch_size to all 155 | # available GPUs if device_ids are not set 156 | model = torch.nn.parallel.DistributedDataParallel(model) 157 | elif args.gpu is not None: 158 | torch.cuda.set_device(args.gpu) 159 | model = model.cuda(args.gpu) 160 | else: 161 | # DataParallel will divide and allocate batch_size to all available GPUs 162 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 163 | model.features = torch.nn.DataParallel(model.features) 164 | model.cuda() 165 | else: 166 | model = torch.nn.DataParallel(model).cuda() 167 | 168 | # define loss function (criterion) and optimizer 169 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 170 | 171 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 172 | momentum=args.momentum, 173 | weight_decay=args.weight_decay) 174 | 175 | # optionally resume from a checkpoint 176 | if args.resume: 177 | if os.path.isfile(args.resume): 178 | print("=> loading checkpoint '{}'".format(args.resume)) 179 | if args.gpu is None: 180 | checkpoint = torch.load(args.resume) 181 | else: 182 | # Map model to be loaded to specified single gpu. 183 | loc = 'cuda:{}'.format(args.gpu) 184 | checkpoint = torch.load(args.resume, map_location=loc) 185 | args.start_epoch = checkpoint['epoch'] 186 | best_acc1 = checkpoint['best_acc1'] 187 | if args.gpu is not None: 188 | # best_acc1 may be from a checkpoint from a different GPU 189 | best_acc1 = best_acc1.to(args.gpu) 190 | model.load_state_dict(checkpoint['state_dict']) 191 | optimizer.load_state_dict(checkpoint['optimizer']) 192 | print("=> loaded checkpoint '{}' (epoch {})" 193 | .format(args.resume, checkpoint['epoch'])) 194 | else: 195 | print("=> no checkpoint found at '{}'".format(args.resume)) 196 | 197 | cudnn.benchmark = True 198 | 199 | # Data loading code 200 | traindir = os.path.join(args.data, 'train') 201 | valdir = os.path.join(args.data, 'val') 202 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 203 | std=[0.229, 0.224, 0.225]) 204 | 205 | train_dataset = datasets.ImageFolder( 206 | traindir, 207 | transforms.Compose([ 208 | transforms.RandomResizedCrop(224), 209 | transforms.RandomHorizontalFlip(), 210 | transforms.ToTensor(), 211 | normalize, 212 | ])) 213 | 214 | if args.distributed: 215 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 216 | else: 217 | train_sampler = None 218 | 219 | train_loader = torch.utils.data.DataLoader( 220 | train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), 221 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 222 | 223 | val_loader = torch.utils.data.DataLoader( 224 | datasets.ImageFolder(valdir, transforms.Compose([ 225 | transforms.Resize(256), 226 | transforms.CenterCrop(224), 227 | transforms.ToTensor(), 228 | normalize, 229 | ])), 230 | batch_size=args.batch_size, shuffle=False, 231 | num_workers=args.workers, pin_memory=True) 232 | 233 | if args.evaluate: 234 | validate(val_loader, 
model, criterion, args) 235 | return 236 | 237 | for epoch in range(args.start_epoch, args.epochs): 238 | if args.distributed: 239 | train_sampler.set_epoch(epoch) 240 | adjust_learning_rate(optimizer, epoch, args) 241 | 242 | # train for one epoch 243 | train(train_loader, model, criterion, optimizer, epoch, args) 244 | 245 | # evaluate on validation set 246 | acc1 = validate(val_loader, model, criterion, args) 247 | 248 | # remember best acc@1 and save checkpoint 249 | is_best = acc1 > best_acc1 250 | best_acc1 = max(acc1, best_acc1) 251 | 252 | if not args.multiprocessing_distributed or (args.multiprocessing_distributed 253 | and args.rank % ngpus_per_node == 0): 254 | save_checkpoint({ 255 | 'epoch': epoch + 1, 256 | 'arch': args.arch, 257 | 'state_dict': model.state_dict(), 258 | 'best_acc1': best_acc1, 259 | 'optimizer' : optimizer.state_dict(), 260 | }, is_best) 261 | 262 | 263 | def train(train_loader, model, criterion, optimizer, epoch, args): 264 | batch_time = AverageMeter('Time', ':6.3f') 265 | data_time = AverageMeter('Data', ':6.3f') 266 | losses = AverageMeter('Loss', ':.4e') 267 | top1 = AverageMeter('Acc@1', ':6.2f') 268 | top5 = AverageMeter('Acc@5', ':6.2f') 269 | progress = ProgressMeter( 270 | len(train_loader), 271 | [batch_time, data_time, losses, top1, top5], 272 | prefix="Epoch: [{}]".format(epoch)) 273 | 274 | # switch to train mode 275 | model.train() 276 | 277 | end = time.time() 278 | for i, (images, target) in enumerate(train_loader): 279 | # measure data loading time 280 | data_time.update(time.time() - end) 281 | 282 | if args.gpu is not None: 283 | images = images.cuda(args.gpu, non_blocking=True) 284 | target = target.cuda(args.gpu, non_blocking=True) 285 | 286 | # compute output 287 | output = model(images) 288 | loss = criterion(output, target) 289 | 290 | # measure accuracy and record loss 291 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 292 | losses.update(loss.item(), images.size(0)) 293 | top1.update(acc1[0], images.size(0)) 294 | top5.update(acc5[0], images.size(0)) 295 | 296 | # compute gradient and do SGD step 297 | optimizer.zero_grad() 298 | loss.backward() 299 | optimizer.step() 300 | 301 | # measure elapsed time 302 | batch_time.update(time.time() - end) 303 | end = time.time() 304 | 305 | if i % args.print_freq == 0: 306 | progress.display(i) 307 | 308 | 309 | def validate(val_loader, model, criterion, args): 310 | batch_time = AverageMeter('Time', ':6.3f') 311 | losses = AverageMeter('Loss', ':.4e') 312 | top1 = AverageMeter('Acc@1', ':6.2f') 313 | top5 = AverageMeter('Acc@5', ':6.2f') 314 | progress = ProgressMeter( 315 | len(val_loader), 316 | [batch_time, losses, top1, top5], 317 | prefix='Test: ') 318 | 319 | # switch to evaluate mode 320 | model.eval() 321 | 322 | with torch.no_grad(): 323 | end = time.time() 324 | for i, (images, target) in enumerate(val_loader): 325 | if args.gpu is not None: 326 | images = images.cuda(args.gpu, non_blocking=True) 327 | target = target.cuda(args.gpu, non_blocking=True) 328 | 329 | # compute output 330 | output = model(images) 331 | loss = criterion(output, target) 332 | 333 | # measure accuracy and record loss 334 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 335 | losses.update(loss.item(), images.size(0)) 336 | top1.update(acc1[0], images.size(0)) 337 | top5.update(acc5[0], images.size(0)) 338 | 339 | # measure elapsed time 340 | batch_time.update(time.time() - end) 341 | end = time.time() 342 | 343 | if i % args.print_freq == 0: 344 | progress.display(i) 345 | 346 | # TODO: this 
should also be done with the ProgressMeter 347 | print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' 348 | .format(top1=top1, top5=top5)) 349 | 350 | return top1.avg 351 | 352 | 353 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 354 | torch.save(state, filename) 355 | if is_best: 356 | shutil.copyfile(filename, 'model_best.pth.tar') 357 | 358 | 359 | class AverageMeter(object): 360 | """Computes and stores the average and current value""" 361 | def __init__(self, name, fmt=':f'): 362 | self.name = name 363 | self.fmt = fmt 364 | self.reset() 365 | 366 | def reset(self): 367 | self.val = 0 368 | self.avg = 0 369 | self.sum = 0 370 | self.count = 0 371 | 372 | def update(self, val, n=1): 373 | self.val = val 374 | self.sum += val * n 375 | self.count += n 376 | self.avg = self.sum / self.count 377 | 378 | def __str__(self): 379 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 380 | return fmtstr.format(**self.__dict__) 381 | 382 | 383 | class ProgressMeter(object): 384 | def __init__(self, num_batches, meters, prefix=""): 385 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 386 | self.meters = meters 387 | self.prefix = prefix 388 | 389 | def display(self, batch): 390 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 391 | entries += [str(meter) for meter in self.meters] 392 | print('\t'.join(entries)) 393 | 394 | def _get_batch_fmtstr(self, num_batches): 395 | num_digits = len(str(num_batches // 1)) 396 | fmt = '{:' + str(num_digits) + 'd}' 397 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 398 | 399 | 400 | def adjust_learning_rate(optimizer, epoch, args): 401 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 402 | lr = args.lr * (0.1 ** (epoch // 30)) 403 | for param_group in optimizer.param_groups: 404 | param_group['lr'] = lr 405 | 406 | 407 | def accuracy(output, target, topk=(1,)): 408 | """Computes the accuracy over the k top predictions for the specified values of k""" 409 | with torch.no_grad(): 410 | maxk = max(topk) 411 | batch_size = target.size(0) 412 | 413 | _, pred = output.topk(maxk, 1, True, True) 414 | pred = pred.t() 415 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 416 | 417 | res = [] 418 | for k in topk: 419 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 420 | res.append(correct_k.mul_(100.0 / batch_size)) 421 | return res 422 | 423 | 424 | if __name__ == '__main__': 425 | main() 426 | -------------------------------------------------------------------------------- /.old/main-wds.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import random 5 | import shutil 6 | import time 7 | import warnings 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.parallel 12 | import torch.backends.cudnn as cudnn 13 | import torch.distributed as dist 14 | import torch.optim 15 | import torch.multiprocessing as mp 16 | import torch.utils.data 17 | import torch.utils.data.distributed 18 | import torchvision.transforms as transforms 19 | import torchvision.datasets as datasets 20 | import torchvision.models as models 21 | import webdataset as wds 22 | 23 | model_names = sorted(name for name in models.__dict__ 24 | if name.islower() and not name.startswith("__") 25 | and callable(models.__dict__[name])) 26 | 27 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 28 | parser.add_argument('--data', metavar='DIR', default='./data', 29 | 
help='path to dataset') 30 | parser.add_argument('--loader', default='wds', help='loader to use: orig, wds') 31 | parser.add_argument('--shuffle', type=int, default=1000, help='shuffle buffer size for WebDataset') 32 | parser.add_argument('--trainshards', default='./shards/imagenet-train-{000000..001281}.tar', help='path/URL for ImageNet shards', 33 | ) 34 | parser.add_argument('--trainsize', type=int, default=1281167, help='ImageNet training set size') 35 | parser.add_argument('--augmentation', default='full') 36 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 37 | choices=model_names, 38 | help='model architecture: ' + 39 | ' | '.join(model_names) + 40 | ' (default: resnet18)') 41 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 42 | help='number of data loading workers (default: 4)') 43 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 44 | help='number of total epochs to run') 45 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 46 | help='manual epoch number (useful on restarts)') 47 | parser.add_argument('-b', '--batch-size', default=256, type=int, 48 | metavar='N', 49 | help='mini-batch size (default: 256), this is the total ' 50 | 'batch size of all GPUs on the current node when ' 51 | 'using Data Parallel or Distributed Data Parallel') 52 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 53 | metavar='LR', help='initial learning rate', dest='lr') 54 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 55 | help='momentum') 56 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 57 | metavar='W', help='weight decay (default: 1e-4)', 58 | dest='weight_decay') 59 | parser.add_argument('-p', '--print-freq', default=10, type=int, 60 | metavar='N', help='print frequency (default: 10)') 61 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 62 | help='path to latest checkpoint (default: none)') 63 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 64 | help='evaluate model on validation set') 65 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 66 | help='use pre-trained model') 67 | parser.add_argument('--world-size', default=-1, type=int, 68 | help='number of nodes for distributed training') 69 | parser.add_argument('--rank', default=-1, type=int, 70 | help='node rank for distributed training') 71 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 72 | help='url used to set up distributed training') 73 | parser.add_argument('--dist-backend', default='nccl', type=str, 74 | help='distributed backend') 75 | parser.add_argument('--seed', default=None, type=int, 76 | help='seed for initializing training. ') 77 | parser.add_argument('--gpu', default=None, type=int, 78 | help='GPU id to use.') 79 | parser.add_argument('--multiprocessing-distributed', action='store_true', 80 | help='Use multi-processing distributed training to launch ' 81 | 'N processes per node, which has N GPUs. This is the ' 82 | 'fastest way to use PyTorch for either single node or ' 83 | 'multi node data parallel training') 84 | 85 | best_acc1 = 0 86 | 87 | 88 | def main(): 89 | args = parser.parse_args() 90 | 91 | if args.seed is not None: 92 | random.seed(args.seed) 93 | torch.manual_seed(args.seed) 94 | cudnn.deterministic = True 95 | warnings.warn('You have chosen to seed training. 
' 96 | 'This will turn on the CUDNN deterministic setting, ' 97 | 'which can slow down your training considerably! ' 98 | 'You may see unexpected behavior when restarting ' 99 | 'from checkpoints.') 100 | 101 | if args.gpu is not None: 102 | warnings.warn('You have chosen a specific GPU. This will completely ' 103 | 'disable data parallelism.') 104 | 105 | if args.dist_url == "env://" and args.world_size == -1: 106 | args.world_size = int(os.environ["WORLD_SIZE"]) 107 | 108 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 109 | 110 | ngpus_per_node = torch.cuda.device_count() 111 | if args.multiprocessing_distributed: 112 | # Since we have ngpus_per_node processes per node, the total world_size 113 | # needs to be adjusted accordingly 114 | args.world_size = ngpus_per_node * args.world_size 115 | # Use torch.multiprocessing.spawn to launch distributed processes: the 116 | # main_worker process function 117 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 118 | else: 119 | # Simply call main_worker function 120 | main_worker(args.gpu, ngpus_per_node, args) 121 | 122 | 123 | def main_worker(gpu, ngpus_per_node, args): 124 | global best_acc1 125 | args.gpu = gpu 126 | 127 | if args.gpu is not None: 128 | print("Use GPU: {} for training".format(args.gpu)) 129 | 130 | if args.distributed: 131 | if args.dist_url == "env://" and args.rank == -1: 132 | args.rank = int(os.environ["RANK"]) 133 | if args.multiprocessing_distributed: 134 | # For multiprocessing distributed training, rank needs to be the 135 | # global rank among all the processes 136 | args.rank = args.rank * ngpus_per_node + gpu 137 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 138 | world_size=args.world_size, rank=args.rank) 139 | # create model 140 | if args.pretrained: 141 | print("=> using pre-trained model '{}'".format(args.arch)) 142 | model = models.__dict__[args.arch](pretrained=True) 143 | else: 144 | print("=> creating model '{}'".format(args.arch)) 145 | model = models.__dict__[args.arch]() 146 | 147 | if args.distributed: 148 | # For multiprocessing distributed, DistributedDataParallel constructor 149 | # should always set the single device scope, otherwise, 150 | # DistributedDataParallel will use all available devices. 
151 | if args.gpu is not None: 152 | torch.cuda.set_device(args.gpu) 153 | model.cuda(args.gpu) 154 | # When using a single GPU per process and per 155 | # DistributedDataParallel, we need to divide the batch size 156 | # ourselves based on the total number of GPUs we have 157 | args.batch_size = int(args.batch_size / ngpus_per_node) 158 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 159 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 160 | else: 161 | model.cuda() 162 | # DistributedDataParallel will divide and allocate batch_size to all 163 | # available GPUs if device_ids are not set 164 | model = torch.nn.parallel.DistributedDataParallel(model) 165 | elif args.gpu is not None: 166 | torch.cuda.set_device(args.gpu) 167 | model = model.cuda(args.gpu) 168 | else: 169 | # DataParallel will divide and allocate batch_size to all available GPUs 170 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 171 | model.features = torch.nn.DataParallel(model.features) 172 | model.cuda() 173 | else: 174 | model = torch.nn.DataParallel(model).cuda() 175 | 176 | # define loss function (criterion) and optimizer 177 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 178 | 179 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 180 | momentum=args.momentum, 181 | weight_decay=args.weight_decay) 182 | 183 | # optionally resume from a checkpoint 184 | if args.resume: 185 | if os.path.isfile(args.resume): 186 | print("=> loading checkpoint '{}'".format(args.resume)) 187 | if args.gpu is None: 188 | checkpoint = torch.load(args.resume) 189 | else: 190 | # Map model to be loaded to specified single gpu. 191 | loc = 'cuda:{}'.format(args.gpu) 192 | checkpoint = torch.load(args.resume, map_location=loc) 193 | args.start_epoch = checkpoint['epoch'] 194 | best_acc1 = checkpoint['best_acc1'] 195 | if args.gpu is not None: 196 | # best_acc1 may be from a checkpoint from a different GPU 197 | best_acc1 = best_acc1.to(args.gpu) 198 | model.load_state_dict(checkpoint['state_dict']) 199 | optimizer.load_state_dict(checkpoint['optimizer']) 200 | print("=> loaded checkpoint '{}' (epoch {})" 201 | .format(args.resume, checkpoint['epoch'])) 202 | else: 203 | print("=> no checkpoint found at '{}'".format(args.resume)) 204 | 205 | cudnn.benchmark = True 206 | 207 | train_loader = eval(f"make_train_loader_{args.loader}")(args) 208 | 209 | val_loader = make_val_loader(args) 210 | 211 | if args.evaluate: 212 | validate(val_loader, model, criterion, args) 213 | return 214 | 215 | for epoch in range(args.start_epoch, args.epochs): 216 | if args.distributed: 217 | sampler = getattr(train_loader, "sampler", None) 218 | if sampler is not None: 219 | sampler.set_epoch(epoch) 220 | adjust_learning_rate(optimizer, epoch, args) 221 | 222 | # train for one epoch 223 | train(train_loader, model, criterion, optimizer, epoch, args) 224 | 225 | # evaluate on validation set 226 | acc1 = validate(val_loader, model, criterion, args) 227 | 228 | # remember best acc@1 and save checkpoint 229 | is_best = acc1 > best_acc1 230 | best_acc1 = max(acc1, best_acc1) 231 | 232 | if not args.multiprocessing_distributed or (args.multiprocessing_distributed 233 | and args.rank % ngpus_per_node == 0): 234 | save_checkpoint({ 235 | 'epoch': epoch + 1, 236 | 'arch': args.arch, 237 | 'state_dict': model.state_dict(), 238 | 'best_acc1': best_acc1, 239 | 'optimizer' : optimizer.state_dict(), 240 | }, is_best) 241 | 242 | def make_train_transform(args): 243 | normalize = 
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 244 | 245 | if args.augmentation == "full": 246 | return transforms.Compose( 247 | [ 248 | transforms.RandomResizedCrop(224), 249 | transforms.RandomHorizontalFlip(), 250 | transforms.ToTensor(), 251 | normalize, 252 | ] 253 | ) 254 | elif args.augmentation == "simple": 255 | print("=> using simple augmentation") 256 | return transforms.Compose( 257 | [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize,] 258 | ) 259 | 260 | 261 | def make_val_transform(args): 262 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 263 | 264 | return transforms.Compose( 265 | [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize,] 266 | ) 267 | 268 | 269 | def make_train_loader_orig(args): 270 | print("=> using file based loader") 271 | traindir = os.path.join(args.data, "train") 272 | train_transform = make_train_transform(args) 273 | train_dataset = datasets.ImageFolder(traindir, train_transform) 274 | args.trainsize = len(train_dataset) 275 | 276 | if args.distributed: 277 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 278 | else: 279 | train_sampler = None 280 | 281 | train_loader = torch.utils.data.DataLoader( 282 | train_dataset, 283 | batch_size=args.batch_size, 284 | shuffle=(train_sampler is None), 285 | num_workers=args.workers, 286 | pin_memory=True, 287 | sampler=train_sampler, 288 | ) 289 | 290 | return train_loader 291 | 292 | 293 | def identity(x): 294 | return x 295 | 296 | 297 | def worker_urls(urls): 298 | result = wds.worker_urls(urls) 299 | print("worker_urls returning", len(result), "of", len(urls), "urls", file=sys.stderr) 300 | return result 301 | 302 | 303 | def make_train_loader_wds(args): 304 | print("=> using WebDataset loader") 305 | train_transform = make_train_transform(args) 306 | num_batches = args.trainsize // args.batch_size 307 | train_dataset = ( 308 | wds.Dataset(args.trainshards, length=num_batches, shard_selection=worker_urls) 309 | .shuffle(args.shuffle) 310 | .decode("pil") 311 | .to_tuple("jpg;png;jpeg cls") 312 | .map_tuple(train_transform, identity) 313 | .batched(args.batch_size) 314 | ) 315 | train_loader = torch.utils.data.DataLoader( 316 | train_dataset, batch_size=None, shuffle=False, num_workers=args.workers, 317 | ) 318 | return train_loader 319 | 320 | 321 | def make_val_loader(args): 322 | valdir = os.path.join(args.data, "val") 323 | val_transform = make_val_transform(args) 324 | val_dataset = datasets.ImageFolder(valdir, val_transform) 325 | val_loader = torch.utils.data.DataLoader( 326 | val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True 327 | ) 328 | return val_loader 329 | 330 | 331 | 332 | 333 | def train(train_loader, model, criterion, optimizer, epoch, args): 334 | batch_time = AverageMeter('Time', ':6.3f') 335 | data_time = AverageMeter('Data', ':6.3f') 336 | losses = AverageMeter('Loss', ':.4e') 337 | top1 = AverageMeter('Acc@1', ':6.2f') 338 | top5 = AverageMeter('Acc@5', ':6.2f') 339 | progress = ProgressMeter( 340 | len(train_loader), 341 | [batch_time, data_time, losses, top1, top5], 342 | prefix="Epoch: [{}]".format(epoch)) 343 | 344 | # switch to train mode 345 | model.train() 346 | 347 | end = time.time() 348 | for i, (images, target) in enumerate(train_loader): 349 | # measure data loading time 350 | data_time.update(time.time() - end) 351 | 352 | if args.gpu is not None: 353 | 
images = images.cuda(args.gpu, non_blocking=True) 354 | target = target.cuda(args.gpu, non_blocking=True) 355 | 356 | # compute output 357 | output = model(images) 358 | loss = criterion(output, target) 359 | 360 | # measure accuracy and record loss 361 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 362 | losses.update(loss.item(), images.size(0)) 363 | top1.update(acc1[0], images.size(0)) 364 | top5.update(acc5[0], images.size(0)) 365 | 366 | # compute gradient and do SGD step 367 | optimizer.zero_grad() 368 | loss.backward() 369 | optimizer.step() 370 | 371 | # measure elapsed time 372 | batch_time.update(time.time() - end) 373 | end = time.time() 374 | 375 | if i % args.print_freq == 0: 376 | progress.display(i) 377 | 378 | 379 | def validate(val_loader, model, criterion, args): 380 | batch_time = AverageMeter('Time', ':6.3f') 381 | losses = AverageMeter('Loss', ':.4e') 382 | top1 = AverageMeter('Acc@1', ':6.2f') 383 | top5 = AverageMeter('Acc@5', ':6.2f') 384 | progress = ProgressMeter( 385 | len(val_loader), 386 | [batch_time, losses, top1, top5], 387 | prefix='Test: ') 388 | 389 | # switch to evaluate mode 390 | model.eval() 391 | 392 | with torch.no_grad(): 393 | end = time.time() 394 | for i, (images, target) in enumerate(val_loader): 395 | if args.gpu is not None: 396 | images = images.cuda(args.gpu, non_blocking=True) 397 | target = target.cuda(args.gpu, non_blocking=True) 398 | 399 | # compute output 400 | output = model(images) 401 | loss = criterion(output, target) 402 | 403 | # measure accuracy and record loss 404 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 405 | losses.update(loss.item(), images.size(0)) 406 | top1.update(acc1[0], images.size(0)) 407 | top5.update(acc5[0], images.size(0)) 408 | 409 | # measure elapsed time 410 | batch_time.update(time.time() - end) 411 | end = time.time() 412 | 413 | if i % args.print_freq == 0: 414 | progress.display(i) 415 | 416 | # TODO: this should also be done with the ProgressMeter 417 | print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' 418 | .format(top1=top1, top5=top5)) 419 | 420 | return top1.avg 421 | 422 | 423 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 424 | torch.save(state, filename) 425 | if is_best: 426 | shutil.copyfile(filename, 'model_best.pth.tar') 427 | 428 | 429 | class AverageMeter(object): 430 | """Computes and stores the average and current value""" 431 | def __init__(self, name, fmt=':f'): 432 | self.name = name 433 | self.fmt = fmt 434 | self.reset() 435 | 436 | def reset(self): 437 | self.val = 0 438 | self.avg = 0 439 | self.sum = 0 440 | self.count = 0 441 | 442 | def update(self, val, n=1): 443 | self.val = val 444 | self.sum += val * n 445 | self.count += n 446 | self.avg = self.sum / self.count 447 | 448 | def __str__(self): 449 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 450 | return fmtstr.format(**self.__dict__) 451 | 452 | 453 | class ProgressMeter(object): 454 | def __init__(self, num_batches, meters, prefix=""): 455 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 456 | self.meters = meters 457 | self.prefix = prefix 458 | 459 | def display(self, batch): 460 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 461 | entries += [str(meter) for meter in self.meters] 462 | print('\t'.join(entries)) 463 | 464 | def _get_batch_fmtstr(self, num_batches): 465 | num_digits = len(str(num_batches // 1)) 466 | fmt = '{:' + str(num_digits) + 'd}' 467 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 468 | 469 | 470 | def 
adjust_learning_rate(optimizer, epoch, args): 471 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 472 | lr = args.lr * (0.1 ** (epoch // 30)) 473 | for param_group in optimizer.param_groups: 474 | param_group['lr'] = lr 475 | 476 | 477 | def accuracy(output, target, topk=(1,)): 478 | """Computes the accuracy over the k top predictions for the specified values of k""" 479 | with torch.no_grad(): 480 | maxk = max(topk) 481 | batch_size = target.size(0) 482 | 483 | _, pred = output.topk(maxk, 1, True, True) 484 | pred = pred.t() 485 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 486 | 487 | res = [] 488 | for k in topk: 489 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 490 | res.append(correct_k.mul_(100.0 / batch_size)) 491 | return res 492 | 493 | 494 | if __name__ == '__main__': 495 | main() 496 | -------------------------------------------------------------------------------- /.old/makeshards.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path 4 | import random 5 | import argparse 6 | 7 | from torchvision import datasets 8 | 9 | import webdataset as wds 10 | 11 | 12 | parser = argparse.ArgumentParser("""Generate sharded dataset from original ImageNet data.""") 13 | parser.add_argument("--splits", default="train,val", help="which splits to write") 14 | parser.add_argument( 15 | "--filekey", action="store_true", help="use file as key (default: index)" 16 | ) 17 | parser.add_argument("--maxsize", type=float, default=1e9) 18 | parser.add_argument("--maxcount", type=float, default=1000) 19 | parser.add_argument( 20 | "--shards", default="./shards", help="directory where shards are written" 21 | ) 22 | parser.add_argument( 23 | "--data", 24 | default="./data", 25 | help="directory containing ImageNet data distribution suitable for torchvision.datasets", 26 | ) 27 | args = parser.parse_args() 28 | 29 | 30 | assert args.maxsize > 10000000 31 | assert args.maxcount < 1000000 32 | 33 | 34 | if not os.path.isdir(os.path.join(args.data, "train")): 35 | print(f"{args.data}: should be directory containing ImageNet", file=sys.stderr) 36 | print(f"suitable as argument for torchvision.datasets.ImageNet(...)", file=sys.stderr) 37 | sys.exit(1) 38 | 39 | 40 | if not os.path.isdir(os.path.join(args.shards, ".")): 41 | print(f"{args.shards}: should be a writable destination directory for shards", file=sys.stderr) 42 | sys.exit(1) 43 | 44 | 45 | splits = args.splits.split(",") 46 | 47 | 48 | def readfile(fname): 49 | "Read a binary file from disk." 50 | with open(fname, "rb") as stream: 51 | return stream.read() 52 | 53 | 54 | all_keys = set() 55 | 56 | 57 | def write_dataset(imagenet, base="./shards", split="train"): 58 | 59 | # We're using the torchvision ImageNet dataset 60 | # to parse the metadata; however, we will read 61 | # the compressed images directly from disk (to 62 | # avoid having to reencode them) 63 | ds = datasets.ImageNet(imagenet, split=split) 64 | nimages = len(ds.imgs) 65 | print("# nimages", nimages) 66 | 67 | # We shuffle the indexes to make sure that we 68 | # don't get any large sequences of a single class 69 | # in the dataset. 70 | indexes = list(range(nimages)) 71 | random.shuffle(indexes) 72 | 73 | # This is the output pattern under which we write shards. 
74 | pattern = os.path.join(base, f"imagenet-{split}-%06d.tar") 75 | 76 | with wds.ShardWriter(pattern, maxsize=int(args.maxsize), maxcount=int(args.maxcount)) as sink: 77 | for i in indexes: 78 | 79 | # Internal information from the ImageNet dataset 80 | # instance: the file name and the numerical class. 81 | fname, cls = ds.imgs[i] 82 | assert cls == ds.targets[i] 83 | 84 | # Read the JPEG-compressed image file contents. 85 | image = readfile(fname) 86 | 87 | # Construct a unique key from the filename. 88 | key = os.path.splitext(os.path.basename(fname))[0] 89 | 90 | # Useful check. 91 | assert key not in all_keys 92 | all_keys.add(key) 93 | 94 | # Construct a sample. 95 | xkey = key if args.filekey else "%07d" % i 96 | sample = {"__key__": xkey, "jpg": image, "cls": cls} 97 | 98 | # Write the sample to the sharded tar archives. 99 | sink.write(sample) 100 | 101 | 102 | for split in splits: 103 | print("# split", split) 104 | write_dataset(args.data, base=args.shards, split=split) 105 | -------------------------------------------------------------------------------- /.old/requirements.txt: -------------------------------------------------------------------------------- 1 | braceexpand 2 | numpy 3 | scipy 4 | tk 5 | matplotlib 6 | torch 7 | torchvision 8 | jupyterlab 9 | bash_kernel 10 | -e git+git://github.com/tmbdev/webdataset#egg=webdataset 11 | -------------------------------------------------------------------------------- /.old/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | die() { 5 | echo "ERROR: $*" 6 | exit 1 7 | } 8 | 9 | 10 | cmd_clean() { # remove temporary files 11 | rm -rf venv shards 12 | rm -f data 13 | rm -f *-info.txt *.pth *.pth.tar *.log 14 | } 15 | 16 | 17 | cmd_venv() { # set up a virtualenv 18 | test -d venv || python3 -m venv venv 19 | source venv/bin/activate 20 | pip3 install -U pip 21 | pip3 install -U -r requirements.txt 22 | python3 -m bash_kernel.install 23 | pip3 install -U neovim 24 | pip3 install -U jupyterlab 25 | pip3 install -U pytest 26 | } 27 | 28 | cmd_makeshards() { # make ./shards from ./data 29 | test -d ./venv || die "'./run venv' first to create the virtual environment" 30 | test -f ./venv/bin/activate || die "no venv/bin/activate found" 31 | test -e ./data || die "make a symlink from the ImageNet data directory to ./data" 32 | test -e ./data/train || die "./data/train: not found" 33 | test -e ./data/val || die "./data/val: not found" 34 | rm -rf ./shards/* 35 | mkdir -p ./shards 36 | source venv/bin/activate 37 | export OMP_NUM_THREADS=1 38 | python3 ./makeshards.py "$@" 39 | } 40 | 41 | cmd_train() { # run a training job against ./shards 42 | test -d ./venv || die "'./run venv' first to create the virtual environment" 43 | test -f ./venv/bin/activate || die "no venv/bin/activate found" 44 | test -e ./data || die "make a symlink from the ImageNet data directory to ./data" 45 | test -d ./shards || die "run 'makeshards' first" 46 | source venv/bin/activate 47 | export OMP_NUM_THREADS=1 48 | python3 ./main-wds.py "$@" 49 | } 50 | 51 | 52 | cmd=${1:-help} 53 | shift 54 | case $cmd in 55 | help) 56 | echo; echo available commands:; echo 57 | grep '^cmd_[_0-9a-z]*() {' "$0" | sed 's/cmd_//;s/\(.*\)() *{* *#* */\1 -- /' 58 | ;; 59 | *.py) 60 | set -e 61 | cmd_venv > venv.log 62 | source venv/bin/activate 63 | export OMP_NUM_THREADS=1 64 | python3 "$cmd" "$@" 65 | ;; 66 | *) 67 | set -e 68 | eval "cmd_$cmd" "$@" 69 | ;; 70 | esac 71 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo has moved. Please have a look at: 2 | 3 | - [tmbdev/webdataset-examples](http://github.com/tmbdev/webdataset-examples) for a simple modification to the original PyTorch ImageNet training example 4 | - [tmbdev/webdataset-lightning](http://github.com/tmbdev/webdataset-lightning) for an example of how to use WebDataset with PyTorch Lightning 5 | 6 | --------------------------------------------------------------------------------
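For readers consulting the archived scripts above rather than the new repositories, the `./run` helper shows how the pieces were chained together. The following is a minimal quick-start sketch based on that script, not a tested recipe: it assumes the commands are executed in the directory holding the scripts (they now live under `.old/`), and `/path/to/imagenet` is a placeholder for a real ImageNet directory laid out for `torchvision.datasets.ImageNet`.

```bash
# Create the virtualenv and install the pinned requirements (requirements.txt).
./run venv

# Symlink an existing ImageNet tree (containing train/ and val/) to ./data.
ln -s /path/to/imagenet ./data

# Shard the dataset into ./shards via makeshards.py; extra flags are passed
# through, e.g. --maxcount controls the number of samples per shard.
./run makeshards --maxcount 1000

# Train against the shards with the WebDataset loader in main-wds.py.
./run train --loader wds --arch resnet18 --epochs 90
```

Note that `main-wds.py` performs batching inside the WebDataset pipeline (`.batched(args.batch_size)`) and therefore constructs its `DataLoader` with `batch_size=None`, passing the nominal epoch length to `wds.Dataset` as `length=args.trainsize // args.batch_size`.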