├── .old ├── main-orig.py ├── main-wds.py ├── makeshards.py ├── requirements.txt └── run └── README.md /.old/main-orig.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | import time 6 | import warnings 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.parallel 11 | import torch.backends.cudnn as cudnn 12 | import torch.distributed as dist 13 | import torch.optim 14 | import torch.multiprocessing as mp 15 | import torch.utils.data 16 | import torch.utils.data.distributed 17 | import torchvision.transforms as transforms 18 | import torchvision.datasets as datasets 19 | import torchvision.models as models 20 | 21 | model_names = sorted(name for name in models.__dict__ 22 | if name.islower() and not name.startswith("__") 23 | and callable(models.__dict__[name])) 24 | 25 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 26 | parser.add_argument('data', metavar='DIR', 27 | help='path to dataset') 28 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 29 | choices=model_names, 30 | help='model architecture: ' + 31 | ' | '.join(model_names) + 32 | ' (default: resnet18)') 33 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 34 | help='number of data loading workers (default: 4)') 35 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 36 | help='number of total epochs to run') 37 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 38 | help='manual epoch number (useful on restarts)') 39 | parser.add_argument('-b', '--batch-size', default=256, type=int, 40 | metavar='N', 41 | help='mini-batch size (default: 256), this is the total ' 42 | 'batch size of all GPUs on the current node when ' 43 | 'using Data Parallel or Distributed Data Parallel') 44 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 45 | metavar='LR', help='initial learning rate', dest='lr') 46 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 47 | help='momentum') 48 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 49 | metavar='W', help='weight decay (default: 1e-4)', 50 | dest='weight_decay') 51 | parser.add_argument('-p', '--print-freq', default=10, type=int, 52 | metavar='N', help='print frequency (default: 10)') 53 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 54 | help='path to latest checkpoint (default: none)') 55 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 56 | help='evaluate model on validation set') 57 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 58 | help='use pre-trained model') 59 | parser.add_argument('--world-size', default=-1, type=int, 60 | help='number of nodes for distributed training') 61 | parser.add_argument('--rank', default=-1, type=int, 62 | help='node rank for distributed training') 63 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 64 | help='url used to set up distributed training') 65 | parser.add_argument('--dist-backend', default='nccl', type=str, 66 | help='distributed backend') 67 | parser.add_argument('--seed', default=None, type=int, 68 | help='seed for initializing training. 
') 69 | parser.add_argument('--gpu', default=None, type=int, 70 | help='GPU id to use.') 71 | parser.add_argument('--multiprocessing-distributed', action='store_true', 72 | help='Use multi-processing distributed training to launch ' 73 | 'N processes per node, which has N GPUs. This is the ' 74 | 'fastest way to use PyTorch for either single node or ' 75 | 'multi node data parallel training') 76 | 77 | best_acc1 = 0 78 | 79 | 80 | def main(): 81 | args = parser.parse_args() 82 | 83 | if args.seed is not None: 84 | random.seed(args.seed) 85 | torch.manual_seed(args.seed) 86 | cudnn.deterministic = True 87 | warnings.warn('You have chosen to seed training. ' 88 | 'This will turn on the CUDNN deterministic setting, ' 89 | 'which can slow down your training considerably! ' 90 | 'You may see unexpected behavior when restarting ' 91 | 'from checkpoints.') 92 | 93 | if args.gpu is not None: 94 | warnings.warn('You have chosen a specific GPU. This will completely ' 95 | 'disable data parallelism.') 96 | 97 | if args.dist_url == "env://" and args.world_size == -1: 98 | args.world_size = int(os.environ["WORLD_SIZE"]) 99 | 100 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 101 | 102 | ngpus_per_node = torch.cuda.device_count() 103 | if args.multiprocessing_distributed: 104 | # Since we have ngpus_per_node processes per node, the total world_size 105 | # needs to be adjusted accordingly 106 | args.world_size = ngpus_per_node * args.world_size 107 | # Use torch.multiprocessing.spawn to launch distributed processes: the 108 | # main_worker process function 109 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 110 | else: 111 | # Simply call main_worker function 112 | main_worker(args.gpu, ngpus_per_node, args) 113 | 114 | 115 | def main_worker(gpu, ngpus_per_node, args): 116 | global best_acc1 117 | args.gpu = gpu 118 | 119 | if args.gpu is not None: 120 | print("Use GPU: {} for training".format(args.gpu)) 121 | 122 | if args.distributed: 123 | if args.dist_url == "env://" and args.rank == -1: 124 | args.rank = int(os.environ["RANK"]) 125 | if args.multiprocessing_distributed: 126 | # For multiprocessing distributed training, rank needs to be the 127 | # global rank among all the processes 128 | args.rank = args.rank * ngpus_per_node + gpu 129 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 130 | world_size=args.world_size, rank=args.rank) 131 | # create model 132 | if args.pretrained: 133 | print("=> using pre-trained model '{}'".format(args.arch)) 134 | model = models.__dict__[args.arch](pretrained=True) 135 | else: 136 | print("=> creating model '{}'".format(args.arch)) 137 | model = models.__dict__[args.arch]() 138 | 139 | if args.distributed: 140 | # For multiprocessing distributed, DistributedDataParallel constructor 141 | # should always set the single device scope, otherwise, 142 | # DistributedDataParallel will use all available devices. 
143 | if args.gpu is not None: 144 | torch.cuda.set_device(args.gpu) 145 | model.cuda(args.gpu) 146 | # When using a single GPU per process and per 147 | # DistributedDataParallel, we need to divide the batch size 148 | # ourselves based on the total number of GPUs we have 149 | args.batch_size = int(args.batch_size / ngpus_per_node) 150 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 151 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 152 | else: 153 | model.cuda() 154 | # DistributedDataParallel will divide and allocate batch_size to all 155 | # available GPUs if device_ids are not set 156 | model = torch.nn.parallel.DistributedDataParallel(model) 157 | elif args.gpu is not None: 158 | torch.cuda.set_device(args.gpu) 159 | model = model.cuda(args.gpu) 160 | else: 161 | # DataParallel will divide and allocate batch_size to all available GPUs 162 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 163 | model.features = torch.nn.DataParallel(model.features) 164 | model.cuda() 165 | else: 166 | model = torch.nn.DataParallel(model).cuda() 167 | 168 | # define loss function (criterion) and optimizer 169 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 170 | 171 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 172 | momentum=args.momentum, 173 | weight_decay=args.weight_decay) 174 | 175 | # optionally resume from a checkpoint 176 | if args.resume: 177 | if os.path.isfile(args.resume): 178 | print("=> loading checkpoint '{}'".format(args.resume)) 179 | if args.gpu is None: 180 | checkpoint = torch.load(args.resume) 181 | else: 182 | # Map model to be loaded to specified single gpu. 183 | loc = 'cuda:{}'.format(args.gpu) 184 | checkpoint = torch.load(args.resume, map_location=loc) 185 | args.start_epoch = checkpoint['epoch'] 186 | best_acc1 = checkpoint['best_acc1'] 187 | if args.gpu is not None: 188 | # best_acc1 may be from a checkpoint from a different GPU 189 | best_acc1 = best_acc1.to(args.gpu) 190 | model.load_state_dict(checkpoint['state_dict']) 191 | optimizer.load_state_dict(checkpoint['optimizer']) 192 | print("=> loaded checkpoint '{}' (epoch {})" 193 | .format(args.resume, checkpoint['epoch'])) 194 | else: 195 | print("=> no checkpoint found at '{}'".format(args.resume)) 196 | 197 | cudnn.benchmark = True 198 | 199 | # Data loading code 200 | traindir = os.path.join(args.data, 'train') 201 | valdir = os.path.join(args.data, 'val') 202 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 203 | std=[0.229, 0.224, 0.225]) 204 | 205 | train_dataset = datasets.ImageFolder( 206 | traindir, 207 | transforms.Compose([ 208 | transforms.RandomResizedCrop(224), 209 | transforms.RandomHorizontalFlip(), 210 | transforms.ToTensor(), 211 | normalize, 212 | ])) 213 | 214 | if args.distributed: 215 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 216 | else: 217 | train_sampler = None 218 | 219 | train_loader = torch.utils.data.DataLoader( 220 | train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), 221 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 222 | 223 | val_loader = torch.utils.data.DataLoader( 224 | datasets.ImageFolder(valdir, transforms.Compose([ 225 | transforms.Resize(256), 226 | transforms.CenterCrop(224), 227 | transforms.ToTensor(), 228 | normalize, 229 | ])), 230 | batch_size=args.batch_size, shuffle=False, 231 | num_workers=args.workers, pin_memory=True) 232 | 233 | if args.evaluate: 234 | validate(val_loader, 
model, criterion, args) 235 | return 236 | 237 | for epoch in range(args.start_epoch, args.epochs): 238 | if args.distributed: 239 | train_sampler.set_epoch(epoch) 240 | adjust_learning_rate(optimizer, epoch, args) 241 | 242 | # train for one epoch 243 | train(train_loader, model, criterion, optimizer, epoch, args) 244 | 245 | # evaluate on validation set 246 | acc1 = validate(val_loader, model, criterion, args) 247 | 248 | # remember best acc@1 and save checkpoint 249 | is_best = acc1 > best_acc1 250 | best_acc1 = max(acc1, best_acc1) 251 | 252 | if not args.multiprocessing_distributed or (args.multiprocessing_distributed 253 | and args.rank % ngpus_per_node == 0): 254 | save_checkpoint({ 255 | 'epoch': epoch + 1, 256 | 'arch': args.arch, 257 | 'state_dict': model.state_dict(), 258 | 'best_acc1': best_acc1, 259 | 'optimizer' : optimizer.state_dict(), 260 | }, is_best) 261 | 262 | 263 | def train(train_loader, model, criterion, optimizer, epoch, args): 264 | batch_time = AverageMeter('Time', ':6.3f') 265 | data_time = AverageMeter('Data', ':6.3f') 266 | losses = AverageMeter('Loss', ':.4e') 267 | top1 = AverageMeter('Acc@1', ':6.2f') 268 | top5 = AverageMeter('Acc@5', ':6.2f') 269 | progress = ProgressMeter( 270 | len(train_loader), 271 | [batch_time, data_time, losses, top1, top5], 272 | prefix="Epoch: [{}]".format(epoch)) 273 | 274 | # switch to train mode 275 | model.train() 276 | 277 | end = time.time() 278 | for i, (images, target) in enumerate(train_loader): 279 | # measure data loading time 280 | data_time.update(time.time() - end) 281 | 282 | if args.gpu is not None: 283 | images = images.cuda(args.gpu, non_blocking=True) 284 | target = target.cuda(args.gpu, non_blocking=True) 285 | 286 | # compute output 287 | output = model(images) 288 | loss = criterion(output, target) 289 | 290 | # measure accuracy and record loss 291 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 292 | losses.update(loss.item(), images.size(0)) 293 | top1.update(acc1[0], images.size(0)) 294 | top5.update(acc5[0], images.size(0)) 295 | 296 | # compute gradient and do SGD step 297 | optimizer.zero_grad() 298 | loss.backward() 299 | optimizer.step() 300 | 301 | # measure elapsed time 302 | batch_time.update(time.time() - end) 303 | end = time.time() 304 | 305 | if i % args.print_freq == 0: 306 | progress.display(i) 307 | 308 | 309 | def validate(val_loader, model, criterion, args): 310 | batch_time = AverageMeter('Time', ':6.3f') 311 | losses = AverageMeter('Loss', ':.4e') 312 | top1 = AverageMeter('Acc@1', ':6.2f') 313 | top5 = AverageMeter('Acc@5', ':6.2f') 314 | progress = ProgressMeter( 315 | len(val_loader), 316 | [batch_time, losses, top1, top5], 317 | prefix='Test: ') 318 | 319 | # switch to evaluate mode 320 | model.eval() 321 | 322 | with torch.no_grad(): 323 | end = time.time() 324 | for i, (images, target) in enumerate(val_loader): 325 | if args.gpu is not None: 326 | images = images.cuda(args.gpu, non_blocking=True) 327 | target = target.cuda(args.gpu, non_blocking=True) 328 | 329 | # compute output 330 | output = model(images) 331 | loss = criterion(output, target) 332 | 333 | # measure accuracy and record loss 334 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 335 | losses.update(loss.item(), images.size(0)) 336 | top1.update(acc1[0], images.size(0)) 337 | top5.update(acc5[0], images.size(0)) 338 | 339 | # measure elapsed time 340 | batch_time.update(time.time() - end) 341 | end = time.time() 342 | 343 | if i % args.print_freq == 0: 344 | progress.display(i) 345 | 346 | # TODO: this 
should also be done with the ProgressMeter 347 | print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' 348 | .format(top1=top1, top5=top5)) 349 | 350 | return top1.avg 351 | 352 | 353 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 354 | torch.save(state, filename) 355 | if is_best: 356 | shutil.copyfile(filename, 'model_best.pth.tar') 357 | 358 | 359 | class AverageMeter(object): 360 | """Computes and stores the average and current value""" 361 | def __init__(self, name, fmt=':f'): 362 | self.name = name 363 | self.fmt = fmt 364 | self.reset() 365 | 366 | def reset(self): 367 | self.val = 0 368 | self.avg = 0 369 | self.sum = 0 370 | self.count = 0 371 | 372 | def update(self, val, n=1): 373 | self.val = val 374 | self.sum += val * n 375 | self.count += n 376 | self.avg = self.sum / self.count 377 | 378 | def __str__(self): 379 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 380 | return fmtstr.format(**self.__dict__) 381 | 382 | 383 | class ProgressMeter(object): 384 | def __init__(self, num_batches, meters, prefix=""): 385 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 386 | self.meters = meters 387 | self.prefix = prefix 388 | 389 | def display(self, batch): 390 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 391 | entries += [str(meter) for meter in self.meters] 392 | print('\t'.join(entries)) 393 | 394 | def _get_batch_fmtstr(self, num_batches): 395 | num_digits = len(str(num_batches // 1)) 396 | fmt = '{:' + str(num_digits) + 'd}' 397 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 398 | 399 | 400 | def adjust_learning_rate(optimizer, epoch, args): 401 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 402 | lr = args.lr * (0.1 ** (epoch // 30)) 403 | for param_group in optimizer.param_groups: 404 | param_group['lr'] = lr 405 | 406 | 407 | def accuracy(output, target, topk=(1,)): 408 | """Computes the accuracy over the k top predictions for the specified values of k""" 409 | with torch.no_grad(): 410 | maxk = max(topk) 411 | batch_size = target.size(0) 412 | 413 | _, pred = output.topk(maxk, 1, True, True) 414 | pred = pred.t() 415 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 416 | 417 | res = [] 418 | for k in topk: 419 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 420 | res.append(correct_k.mul_(100.0 / batch_size)) 421 | return res 422 | 423 | 424 | if __name__ == '__main__': 425 | main() 426 | -------------------------------------------------------------------------------- /.old/main-wds.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import random 5 | import shutil 6 | import time 7 | import warnings 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.parallel 12 | import torch.backends.cudnn as cudnn 13 | import torch.distributed as dist 14 | import torch.optim 15 | import torch.multiprocessing as mp 16 | import torch.utils.data 17 | import torch.utils.data.distributed 18 | import torchvision.transforms as transforms 19 | import torchvision.datasets as datasets 20 | import torchvision.models as models 21 | import webdataset as wds 22 | 23 | model_names = sorted(name for name in models.__dict__ 24 | if name.islower() and not name.startswith("__") 25 | and callable(models.__dict__[name])) 26 | 27 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 28 | parser.add_argument('--data', metavar='DIR', default='./data', 29 | 
help='path to dataset') 30 | parser.add_argument('--loader', default='wds', help='loader to use: orig, wds') 31 | parser.add_argument('--shuffle', type=int, default=1000, help='shuffle buffer size for WebDataset') 32 | parser.add_argument('--trainshards', default='./shards/imagenet-train-{000000..001281}.tar', help='path/URL for ImageNet shards', 33 | ) 34 | parser.add_argument('--trainsize', type=int, default=1281167, help='ImageNet training set size') 35 | parser.add_argument('--augmentation', default='full') 36 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 37 | choices=model_names, 38 | help='model architecture: ' + 39 | ' | '.join(model_names) + 40 | ' (default: resnet18)') 41 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 42 | help='number of data loading workers (default: 4)') 43 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 44 | help='number of total epochs to run') 45 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 46 | help='manual epoch number (useful on restarts)') 47 | parser.add_argument('-b', '--batch-size', default=256, type=int, 48 | metavar='N', 49 | help='mini-batch size (default: 256), this is the total ' 50 | 'batch size of all GPUs on the current node when ' 51 | 'using Data Parallel or Distributed Data Parallel') 52 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 53 | metavar='LR', help='initial learning rate', dest='lr') 54 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 55 | help='momentum') 56 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 57 | metavar='W', help='weight decay (default: 1e-4)', 58 | dest='weight_decay') 59 | parser.add_argument('-p', '--print-freq', default=10, type=int, 60 | metavar='N', help='print frequency (default: 10)') 61 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 62 | help='path to latest checkpoint (default: none)') 63 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 64 | help='evaluate model on validation set') 65 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 66 | help='use pre-trained model') 67 | parser.add_argument('--world-size', default=-1, type=int, 68 | help='number of nodes for distributed training') 69 | parser.add_argument('--rank', default=-1, type=int, 70 | help='node rank for distributed training') 71 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 72 | help='url used to set up distributed training') 73 | parser.add_argument('--dist-backend', default='nccl', type=str, 74 | help='distributed backend') 75 | parser.add_argument('--seed', default=None, type=int, 76 | help='seed for initializing training. ') 77 | parser.add_argument('--gpu', default=None, type=int, 78 | help='GPU id to use.') 79 | parser.add_argument('--multiprocessing-distributed', action='store_true', 80 | help='Use multi-processing distributed training to launch ' 81 | 'N processes per node, which has N GPUs. This is the ' 82 | 'fastest way to use PyTorch for either single node or ' 83 | 'multi node data parallel training') 84 | 85 | best_acc1 = 0 86 | 87 | 88 | def main(): 89 | args = parser.parse_args() 90 | 91 | if args.seed is not None: 92 | random.seed(args.seed) 93 | torch.manual_seed(args.seed) 94 | cudnn.deterministic = True 95 | warnings.warn('You have chosen to seed training. 
' 96 | 'This will turn on the CUDNN deterministic setting, ' 97 | 'which can slow down your training considerably! ' 98 | 'You may see unexpected behavior when restarting ' 99 | 'from checkpoints.') 100 | 101 | if args.gpu is not None: 102 | warnings.warn('You have chosen a specific GPU. This will completely ' 103 | 'disable data parallelism.') 104 | 105 | if args.dist_url == "env://" and args.world_size == -1: 106 | args.world_size = int(os.environ["WORLD_SIZE"]) 107 | 108 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 109 | 110 | ngpus_per_node = torch.cuda.device_count() 111 | if args.multiprocessing_distributed: 112 | # Since we have ngpus_per_node processes per node, the total world_size 113 | # needs to be adjusted accordingly 114 | args.world_size = ngpus_per_node * args.world_size 115 | # Use torch.multiprocessing.spawn to launch distributed processes: the 116 | # main_worker process function 117 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 118 | else: 119 | # Simply call main_worker function 120 | main_worker(args.gpu, ngpus_per_node, args) 121 | 122 | 123 | def main_worker(gpu, ngpus_per_node, args): 124 | global best_acc1 125 | args.gpu = gpu 126 | 127 | if args.gpu is not None: 128 | print("Use GPU: {} for training".format(args.gpu)) 129 | 130 | if args.distributed: 131 | if args.dist_url == "env://" and args.rank == -1: 132 | args.rank = int(os.environ["RANK"]) 133 | if args.multiprocessing_distributed: 134 | # For multiprocessing distributed training, rank needs to be the 135 | # global rank among all the processes 136 | args.rank = args.rank * ngpus_per_node + gpu 137 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 138 | world_size=args.world_size, rank=args.rank) 139 | # create model 140 | if args.pretrained: 141 | print("=> using pre-trained model '{}'".format(args.arch)) 142 | model = models.__dict__[args.arch](pretrained=True) 143 | else: 144 | print("=> creating model '{}'".format(args.arch)) 145 | model = models.__dict__[args.arch]() 146 | 147 | if args.distributed: 148 | # For multiprocessing distributed, DistributedDataParallel constructor 149 | # should always set the single device scope, otherwise, 150 | # DistributedDataParallel will use all available devices. 
151 | if args.gpu is not None: 152 | torch.cuda.set_device(args.gpu) 153 | model.cuda(args.gpu) 154 | # When using a single GPU per process and per 155 | # DistributedDataParallel, we need to divide the batch size 156 | # ourselves based on the total number of GPUs we have 157 | args.batch_size = int(args.batch_size / ngpus_per_node) 158 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 159 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 160 | else: 161 | model.cuda() 162 | # DistributedDataParallel will divide and allocate batch_size to all 163 | # available GPUs if device_ids are not set 164 | model = torch.nn.parallel.DistributedDataParallel(model) 165 | elif args.gpu is not None: 166 | torch.cuda.set_device(args.gpu) 167 | model = model.cuda(args.gpu) 168 | else: 169 | # DataParallel will divide and allocate batch_size to all available GPUs 170 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 171 | model.features = torch.nn.DataParallel(model.features) 172 | model.cuda() 173 | else: 174 | model = torch.nn.DataParallel(model).cuda() 175 | 176 | # define loss function (criterion) and optimizer 177 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 178 | 179 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 180 | momentum=args.momentum, 181 | weight_decay=args.weight_decay) 182 | 183 | # optionally resume from a checkpoint 184 | if args.resume: 185 | if os.path.isfile(args.resume): 186 | print("=> loading checkpoint '{}'".format(args.resume)) 187 | if args.gpu is None: 188 | checkpoint = torch.load(args.resume) 189 | else: 190 | # Map model to be loaded to specified single gpu. 191 | loc = 'cuda:{}'.format(args.gpu) 192 | checkpoint = torch.load(args.resume, map_location=loc) 193 | args.start_epoch = checkpoint['epoch'] 194 | best_acc1 = checkpoint['best_acc1'] 195 | if args.gpu is not None: 196 | # best_acc1 may be from a checkpoint from a different GPU 197 | best_acc1 = best_acc1.to(args.gpu) 198 | model.load_state_dict(checkpoint['state_dict']) 199 | optimizer.load_state_dict(checkpoint['optimizer']) 200 | print("=> loaded checkpoint '{}' (epoch {})" 201 | .format(args.resume, checkpoint['epoch'])) 202 | else: 203 | print("=> no checkpoint found at '{}'".format(args.resume)) 204 | 205 | cudnn.benchmark = True 206 | 207 | train_loader = eval(f"make_train_loader_{args.loader}")(args) 208 | 209 | val_loader = make_val_loader(args) 210 | 211 | if args.evaluate: 212 | validate(val_loader, model, criterion, args) 213 | return 214 | 215 | for epoch in range(args.start_epoch, args.epochs): 216 | if args.distributed: 217 | sampler = getattr(train_loader, "sampler", None) 218 | if sampler is not None: 219 | sampler.set_epoch(epoch) 220 | adjust_learning_rate(optimizer, epoch, args) 221 | 222 | # train for one epoch 223 | train(train_loader, model, criterion, optimizer, epoch, args) 224 | 225 | # evaluate on validation set 226 | acc1 = validate(val_loader, model, criterion, args) 227 | 228 | # remember best acc@1 and save checkpoint 229 | is_best = acc1 > best_acc1 230 | best_acc1 = max(acc1, best_acc1) 231 | 232 | if not args.multiprocessing_distributed or (args.multiprocessing_distributed 233 | and args.rank % ngpus_per_node == 0): 234 | save_checkpoint({ 235 | 'epoch': epoch + 1, 236 | 'arch': args.arch, 237 | 'state_dict': model.state_dict(), 238 | 'best_acc1': best_acc1, 239 | 'optimizer' : optimizer.state_dict(), 240 | }, is_best) 241 | 242 | def make_train_transform(args): 243 | normalize = 
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 244 | 245 | if args.augmentation == "full": 246 | return transforms.Compose( 247 | [ 248 | transforms.RandomResizedCrop(224), 249 | transforms.RandomHorizontalFlip(), 250 | transforms.ToTensor(), 251 | normalize, 252 | ] 253 | ) 254 | elif args.augmentation == "simple": 255 | print("=> using simple augmentation") 256 | return transforms.Compose( 257 | [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize,] 258 | ) 259 | 260 | 261 | def make_val_transform(args): 262 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 263 | 264 | return transforms.Compose( 265 | [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize,] 266 | ) 267 | 268 | 269 | def make_train_loader_orig(args): 270 | print("=> using file based loader") 271 | traindir = os.path.join(args.data, "train") 272 | train_transform = make_train_transform(args) 273 | train_dataset = datasets.ImageFolder(traindir, train_transform) 274 | args.trainsize = len(train_dataset) 275 | 276 | if args.distributed: 277 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 278 | else: 279 | train_sampler = None 280 | 281 | train_loader = torch.utils.data.DataLoader( 282 | train_dataset, 283 | batch_size=args.batch_size, 284 | shuffle=(train_sampler is None), 285 | num_workers=args.workers, 286 | pin_memory=True, 287 | sampler=train_sampler, 288 | ) 289 | 290 | return train_loader 291 | 292 | 293 | def identity(x): 294 | return x 295 | 296 | 297 | def worker_urls(urls): 298 | result = wds.worker_urls(urls) 299 | print("worker_urls returning", len(result), "of", len(urls), "urls", file=sys.stderr) 300 | return result 301 | 302 | 303 | def make_train_loader_wds(args): 304 | print("=> using WebDataset loader") 305 | train_transform = make_train_transform(args) 306 | num_batches = args.trainsize // args.batch_size 307 | train_dataset = ( 308 | wds.Dataset(args.trainshards, length=num_batches, shard_selection=worker_urls) 309 | .shuffle(args.shuffle) 310 | .decode("pil") 311 | .to_tuple("jpg;png;jpeg cls") 312 | .map_tuple(train_transform, identity) 313 | .batched(args.batch_size) 314 | ) 315 | train_loader = torch.utils.data.DataLoader( 316 | train_dataset, batch_size=None, shuffle=False, num_workers=args.workers, 317 | ) 318 | return train_loader 319 | 320 | 321 | def make_val_loader(args): 322 | valdir = os.path.join(args.data, "val") 323 | val_transform = make_val_transform(args) 324 | val_dataset = datasets.ImageFolder(valdir, val_transform) 325 | val_loader = torch.utils.data.DataLoader( 326 | val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True 327 | ) 328 | return val_loader 329 | 330 | 331 | 332 | 333 | def train(train_loader, model, criterion, optimizer, epoch, args): 334 | batch_time = AverageMeter('Time', ':6.3f') 335 | data_time = AverageMeter('Data', ':6.3f') 336 | losses = AverageMeter('Loss', ':.4e') 337 | top1 = AverageMeter('Acc@1', ':6.2f') 338 | top5 = AverageMeter('Acc@5', ':6.2f') 339 | progress = ProgressMeter( 340 | len(train_loader), 341 | [batch_time, data_time, losses, top1, top5], 342 | prefix="Epoch: [{}]".format(epoch)) 343 | 344 | # switch to train mode 345 | model.train() 346 | 347 | end = time.time() 348 | for i, (images, target) in enumerate(train_loader): 349 | # measure data loading time 350 | data_time.update(time.time() - end) 351 | 352 | if args.gpu is not None: 353 | 
images = images.cuda(args.gpu, non_blocking=True) 354 | target = target.cuda(args.gpu, non_blocking=True) 355 | 356 | # compute output 357 | output = model(images) 358 | loss = criterion(output, target) 359 | 360 | # measure accuracy and record loss 361 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 362 | losses.update(loss.item(), images.size(0)) 363 | top1.update(acc1[0], images.size(0)) 364 | top5.update(acc5[0], images.size(0)) 365 | 366 | # compute gradient and do SGD step 367 | optimizer.zero_grad() 368 | loss.backward() 369 | optimizer.step() 370 | 371 | # measure elapsed time 372 | batch_time.update(time.time() - end) 373 | end = time.time() 374 | 375 | if i % args.print_freq == 0: 376 | progress.display(i) 377 | 378 | 379 | def validate(val_loader, model, criterion, args): 380 | batch_time = AverageMeter('Time', ':6.3f') 381 | losses = AverageMeter('Loss', ':.4e') 382 | top1 = AverageMeter('Acc@1', ':6.2f') 383 | top5 = AverageMeter('Acc@5', ':6.2f') 384 | progress = ProgressMeter( 385 | len(val_loader), 386 | [batch_time, losses, top1, top5], 387 | prefix='Test: ') 388 | 389 | # switch to evaluate mode 390 | model.eval() 391 | 392 | with torch.no_grad(): 393 | end = time.time() 394 | for i, (images, target) in enumerate(val_loader): 395 | if args.gpu is not None: 396 | images = images.cuda(args.gpu, non_blocking=True) 397 | target = target.cuda(args.gpu, non_blocking=True) 398 | 399 | # compute output 400 | output = model(images) 401 | loss = criterion(output, target) 402 | 403 | # measure accuracy and record loss 404 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 405 | losses.update(loss.item(), images.size(0)) 406 | top1.update(acc1[0], images.size(0)) 407 | top5.update(acc5[0], images.size(0)) 408 | 409 | # measure elapsed time 410 | batch_time.update(time.time() - end) 411 | end = time.time() 412 | 413 | if i % args.print_freq == 0: 414 | progress.display(i) 415 | 416 | # TODO: this should also be done with the ProgressMeter 417 | print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' 418 | .format(top1=top1, top5=top5)) 419 | 420 | return top1.avg 421 | 422 | 423 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 424 | torch.save(state, filename) 425 | if is_best: 426 | shutil.copyfile(filename, 'model_best.pth.tar') 427 | 428 | 429 | class AverageMeter(object): 430 | """Computes and stores the average and current value""" 431 | def __init__(self, name, fmt=':f'): 432 | self.name = name 433 | self.fmt = fmt 434 | self.reset() 435 | 436 | def reset(self): 437 | self.val = 0 438 | self.avg = 0 439 | self.sum = 0 440 | self.count = 0 441 | 442 | def update(self, val, n=1): 443 | self.val = val 444 | self.sum += val * n 445 | self.count += n 446 | self.avg = self.sum / self.count 447 | 448 | def __str__(self): 449 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 450 | return fmtstr.format(**self.__dict__) 451 | 452 | 453 | class ProgressMeter(object): 454 | def __init__(self, num_batches, meters, prefix=""): 455 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 456 | self.meters = meters 457 | self.prefix = prefix 458 | 459 | def display(self, batch): 460 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 461 | entries += [str(meter) for meter in self.meters] 462 | print('\t'.join(entries)) 463 | 464 | def _get_batch_fmtstr(self, num_batches): 465 | num_digits = len(str(num_batches // 1)) 466 | fmt = '{:' + str(num_digits) + 'd}' 467 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 468 | 469 | 470 | def 
adjust_learning_rate(optimizer, epoch, args): 471 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 472 | lr = args.lr * (0.1 ** (epoch // 30)) 473 | for param_group in optimizer.param_groups: 474 | param_group['lr'] = lr 475 | 476 | 477 | def accuracy(output, target, topk=(1,)): 478 | """Computes the accuracy over the k top predictions for the specified values of k""" 479 | with torch.no_grad(): 480 | maxk = max(topk) 481 | batch_size = target.size(0) 482 | 483 | _, pred = output.topk(maxk, 1, True, True) 484 | pred = pred.t() 485 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 486 | 487 | res = [] 488 | for k in topk: 489 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 490 | res.append(correct_k.mul_(100.0 / batch_size)) 491 | return res 492 | 493 | 494 | if __name__ == '__main__': 495 | main() 496 | -------------------------------------------------------------------------------- /.old/makeshards.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path 4 | import random 5 | import argparse 6 | 7 | from torchvision import datasets 8 | 9 | import webdataset as wds 10 | 11 | 12 | parser = argparse.ArgumentParser("""Generate sharded dataset from original ImageNet data.""") 13 | parser.add_argument("--splits", default="train,val", help="which splits to write") 14 | parser.add_argument( 15 | "--filekey", action="store_true", help="use file as key (default: index)" 16 | ) 17 | parser.add_argument("--maxsize", type=float, default=1e9) 18 | parser.add_argument("--maxcount", type=float, default=1000) 19 | parser.add_argument( 20 | "--shards", default="./shards", help="directory where shards are written" 21 | ) 22 | parser.add_argument( 23 | "--data", 24 | default="./data", 25 | help="directory containing ImageNet data distribution suitable for torchvision.datasets", 26 | ) 27 | args = parser.parse_args() 28 | 29 | 30 | assert args.maxsize > 10000000 31 | assert args.maxcount < 1000000 32 | 33 | 34 | if not os.path.isdir(os.path.join(args.data, "train")): 35 | print(f"{args.data}: should be directory containing ImageNet", file=sys.stderr) 36 | print(f"suitable as argument for torchvision.datasets.ImageNet(...)", file=sys.stderr) 37 | sys.exit(1) 38 | 39 | 40 | if not os.path.isdir(os.path.join(args.shards, ".")): 41 | print(f"{args.shards}: should be a writable destination directory for shards", file=sys.stderr) 42 | sys.exit(1) 43 | 44 | 45 | splits = args.splits.split(",") 46 | 47 | 48 | def readfile(fname): 49 | "Read a binary file from disk." 50 | with open(fname, "rb") as stream: 51 | return stream.read() 52 | 53 | 54 | all_keys = set() 55 | 56 | 57 | def write_dataset(imagenet, base="./shards", split="train"): 58 | 59 | # We're using the torchvision ImageNet dataset 60 | # to parse the metadata; however, we will read 61 | # the compressed images directly from disk (to 62 | # avoid having to reencode them) 63 | ds = datasets.ImageNet(imagenet, split=split) 64 | nimages = len(ds.imgs) 65 | print("# nimages", nimages) 66 | 67 | # We shuffle the indexes to make sure that we 68 | # don't get any large sequences of a single class 69 | # in the dataset. 70 | indexes = list(range(nimages)) 71 | random.shuffle(indexes) 72 | 73 | # This is the output pattern under which we write shards. 
74 | pattern = os.path.join(base, f"imagenet-{split}-%06d.tar") 75 | 76 | with wds.ShardWriter(pattern, maxsize=int(args.maxsize), maxcount=int(args.maxcount)) as sink: 77 | for i in indexes: 78 | 79 | # Internal information from the ImageNet dataset 80 | # instance: the file name and the numerical class. 81 | fname, cls = ds.imgs[i] 82 | assert cls == ds.targets[i] 83 | 84 | # Read the JPEG-compressed image file contents. 85 | image = readfile(fname) 86 | 87 | # Construct a unique key from the filename. 88 | key = os.path.splitext(os.path.basename(fname))[0] 89 | 90 | # Useful check. 91 | assert key not in all_keys 92 | all_keys.add(key) 93 | 94 | # Construct a sample. 95 | xkey = key if args.filekey else "%07d" % i 96 | sample = {"__key__": xkey, "jpg": image, "cls": cls} 97 | 98 | # Write the sample to the sharded tar archives. 99 | sink.write(sample) 100 | 101 | 102 | for split in splits: 103 | print("# split", split) 104 | write_dataset(args.data, base=args.shards, split=split) 105 | -------------------------------------------------------------------------------- /.old/requirements.txt: -------------------------------------------------------------------------------- 1 | braceexpand 2 | numpy 3 | scipy 4 | tk 5 | matplotlib 6 | torch 7 | torchvision 8 | jupyterlab 9 | bash_kernel 10 | -e git+git://github.com/tmbdev/webdataset#egg=webdataset 11 | -------------------------------------------------------------------------------- /.old/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | die() { 5 | echo "ERROR: $*" 6 | exit 1 7 | } 8 | 9 | 10 | cmd_clean() { # remove temporary files 11 | rm -rf venv shards 12 | rm -f data 13 | rm -f *-info.txt *.pth *.pth.tar *.log 14 | } 15 | 16 | 17 | cmd_venv() { # set up a virtualenv 18 | test -d venv || python3 -m venv venv 19 | source venv/bin/activate 20 | pip3 install -U pip 21 | pip3 install -U -r requirements.txt 22 | python3 -m bash_kernel.install 23 | pip3 install -U neovim 24 | pip3 install -U jupyterlab 25 | pip3 install -U pytest 26 | } 27 | 28 | cmd_makeshards() { # make ./shards from ./data 29 | test -d ./venv || die "'./run venv' first to create the virtual environment" 30 | test -f ./venv/bin/activate || die "no venv/bin/activate found" 31 | test -e ./data || die "make a symlink from the ImageNet data directory to ./data" 32 | test -e ./data/train || die "./data/train: not found" 33 | test -e ./data/val || die "./data/val: not found" 34 | rm -rf ./shards/* 35 | mkdir -p ./shards 36 | source venv/bin/activate 37 | export OMP_NUM_THREADS=1 38 | python3 ./makeshards.py "$@" 39 | } 40 | 41 | cmd_train() { # run a training job against ./shards 42 | test -d ./venv || die "'./run venv' first to create the virtual environment" 43 | test -f ./venv/bin/activate || die "no venv/bin/activate found" 44 | test -e ./data || die "make a symlink from the ImageNet data directory to ./data" 45 | test -d ./shards || die "run 'makeshards' first" 46 | source venv/bin/activate 47 | export OMP_NUM_THREADS=1 48 | python3 ./main-wds.py "$@" 49 | } 50 | 51 | 52 | cmd=${1:-help} 53 | shift 54 | case $cmd in 55 | help) 56 | echo; echo available commands:; echo 57 | grep '^cmd_[_0-9a-z]*() {' "$0" | sed 's/cmd_//;s/\(.*\)() *{* *#* */\1 -- /' 58 | ;; 59 | *.py) 60 | set -e 61 | cmd_venv > venv.log 62 | source venv/bin/activate 63 | export OMP_NUM_THREADS=1 64 | python3 "$cmd" "$@" 65 | ;; 66 | *) 67 | set -e 68 | eval "cmd_$cmd" "$@" 69 | ;; 70 | esac 71 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo has moved. Please have a look at: 2 | 3 | - [tmbdev/webdataset-examples](http://github.com/tmbdev/webdataset-examples) for a simple modification to the original PyTorch ImageNet training example 4 | - [tmbdev/webdataset-lightning](http://github.com/tmbdev/webdataset-lightning) for an example of how to use WebDataset with PyTorch Lightning 5 | 6 | --------------------------------------------------------------------------------
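For readers consulting the archived scripts above rather than the new repositories, the `./run` helper shows how the pieces were chained together. The following is a minimal quick-start sketch based on that script, not a tested recipe: it assumes the commands are executed in the directory holding the scripts (they now live under `.old/`), and `/path/to/imagenet` is a placeholder for a real ImageNet directory laid out for `torchvision.datasets.ImageNet`.

```bash
# Create the virtualenv and install the pinned requirements (requirements.txt).
./run venv

# Symlink an existing ImageNet tree (containing train/ and val/) to ./data.
ln -s /path/to/imagenet ./data

# Shard the dataset into ./shards via makeshards.py; extra flags are passed
# through, e.g. --maxcount controls the number of samples per shard.
./run makeshards --maxcount 1000

# Train against the shards with the WebDataset loader in main-wds.py.
./run train --loader wds --arch resnet18 --epochs 90
```

Note that `main-wds.py` performs batching inside the WebDataset pipeline (`.batched(args.batch_size)`) and therefore constructs its `DataLoader` with `batch_size=None`, passing the nominal epoch length to `wds.Dataset` as `length=args.trainsize // args.batch_size`.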