├── README.md
├── algorithm.png
├── main_moco_pretraining_v3.py
├── moco
│   ├── __init__.py
│   ├── builder.py
│   └── loader.py
└── pretrain_cub.sh

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MoCov3-pytorch
Custom implementation of MoCo v3 [[arxiv]](https://arxiv.org/abs/2104.02057). I made minor modifications to the official MoCo repository [[github]](https://github.com/facebookresearch/moco).

The ViT part is not implemented; this code is for reference only.

![](algorithm.png)
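## Usage

`pretrain_cub.sh` shows an example multi-GPU launch (ResNet-50, cosine schedule, MoCo v2 augmentation); point the script at any `ImageFolder`-style dataset that has a `train/` subdirectory.

To reuse a pre-trained backbone downstream, keep only the query encoder from a saved checkpoint. A minimal sketch (the checkpoint path is a placeholder; the key prefix assumes the default DistributedDataParallel training in this repo):

```python
import torch
import torchvision.models as models

# checkpoints are written as checkpoint_{epoch:04d}.pth.tar by main_moco_pretraining_v3.py
ckpt = torch.load('path/to/checkpoint_0199.pth.tar', map_location='cpu')
state_dict = ckpt['state_dict']

# keep only the query-encoder backbone; drop the DDP 'module.' prefix and the projection head (fc)
prefix = 'module.encoder_q.'
backbone_state = {k[len(prefix):]: v for k, v in state_dict.items()
                  if k.startswith(prefix) and not k.startswith(prefix + 'fc')}

model = models.resnet50()
msg = model.load_state_dict(backbone_state, strict=False)
print(msg.missing_keys)  # only the classifier fc.* weights should be missing
```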
--------------------------------------------------------------------------------
/algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CupidJay/MoCov3-pytorch/1636a392e2a4a92bd841c61f0085d894369aeb8f/algorithm.png
--------------------------------------------------------------------------------
/main_moco_pretraining_v3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
import builtins
import math
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

import moco.loader
import moco.builder

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))


parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: resnet50)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 32)')
parser.add_argument('--epochs', default=200, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.03, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--schedule', default=[120, 160], nargs='*', type=int,
                    help='learning rate schedule (when to drop lr by 10x)')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum of SGD solver')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('--pretrained', default='', type=str, metavar='PATH',
                    help='path to pretrained checkpoint (default: none), used for finetuning')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training.')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--gpus', type=str, default='0',
                    help='comma-separated GPU ids to expose via CUDA_VISIBLE_DEVICES')
parser.add_argument('--save-dir', type=str, default='checkpoints',
                    help='where to save models')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

# moco specific configs:
parser.add_argument('--moco-dim', default=128, type=int,
                    help='feature dimension (default: 128)')
parser.add_argument('--moco-k', default=65536, type=int,
                    help='queue size; number of negative keys (default: 65536)')
parser.add_argument('--moco-m', default=0.999, type=float,
                    help='moco momentum of updating key encoder (default: 0.999)')
parser.add_argument('--moco-t', default=0.07, type=float,
                    help='softmax temperature (default: 0.07)')

# options for moco v2
parser.add_argument('--mlp', action='store_true',
                    help='use mlp head')
parser.add_argument('--aug-plus', action='store_true',
                    help='use moco v2 data augmentation')
parser.add_argument('--cos', action='store_true',
                    help='use cosine lr schedule')


def main():
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    args.save_dir = os.path.join(args.save_dir, 'self-similarity', args.arch)
    if args.pretrained:
        args.save_dir = os.path.join(args.save_dir, 'pretrained_from_{}'.format(args.pretrained.split('/')[-1]))
    args.save_dir = os.path.join(args.save_dir, 'gpus_{}_lr_{}_bs_{}_epochs_{}'.format(len(args.gpus.split(',')),
                                                                                       args.lr,
                                                                                       args.batch_size,
                                                                                       args.epochs))
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)


def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))

    model = moco.builder.MoCoV3(
        models.__dict__[args.arch],
        args.moco_dim, args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optionally start from pretrained weights (used for finetuning)
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading pretrained weight '{}'".format(args.pretrained))
            if args.gpu is None:
                checkpoint = torch.load(args.pretrained)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.pretrained, map_location=loc)
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded pretrained weight '{}'".format(args.pretrained))
        else:
            print("=> no pretrained weight found at '{}'".format(args.pretrained))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    augmentation = [
        transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
        transforms.RandomApply([
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)  # not strengthened
        ], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ]

    train_dataset = datasets.ImageFolder(
        traindir,
        moco.loader.TwoCropsTransform(transforms.Compose(augmentation)))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    train_start = time.time()

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            if epoch % 50 == 0 or epoch == args.epochs - 1:
                save_checkpoint({
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, is_best=False, root=args.save_dir, filename='checkpoint_{:04d}.pth.tar'.format(epoch))
    train_end = time.time()

    print('total training time elapses {} hours'.format((train_end - train_start) / 3600.0))

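# The contrastive loss below follows the MoCo v3 pseudocode (see algorithm.png,
# https://arxiv.org/abs/2104.02057): for each query q_i the positive is the key
# k_i from the other view of the same image, and the negatives are the keys of
# the other images in the (per-GPU) batch, so the targets are simply the
# diagonal indices 0..N-1 of the NxN logits matrix. The loss is scaled by
# 2*tau as in the paper's pseudocode, and train() symmetrizes it as
# ctr(q1, k2) + ctr(q2, k1).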
def ctr(q, k, criterion, tau):
    logits = torch.mm(q, k.t())
    N = q.size(0)
    labels = range(N)
    labels = torch.LongTensor(labels).cuda()
    loss = criterion(logits / tau, labels)
    return 2 * tau * loss


def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    #top1 = AverageMeter('Acc@1', ':6.2f')
    #top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, _) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        #print(images[0].size())
        #print(images[1].size())

        if args.gpu is not None:
            images[0] = images[0].cuda(args.gpu, non_blocking=True)
            images[1] = images[1].cuda(args.gpu, non_blocking=True)

        # compute output
        #if epoch % 2 == 0:
        q1, q2, k1, k2 = model(x1=images[0], x2=images[1])

        loss = ctr(q1, k2, criterion, args.moco_t) + ctr(q2, k1, criterion, args.moco_t)
        #else:
        #    output, target = model(im_q=images[1], im_k=images[0])

        # acc1/acc5 are (K+1)-way contrast classifier accuracy
        # measure accuracy and record loss
        #acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images[0].size(0))
        #top1.update(acc1[0], images[0].size(0))
        #top5.update(acc5[0], images[0].size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)


def save_checkpoint(state, is_best, root, filename='checkpoint.pth.tar'):
    torch.save(state, os.path.join(root, filename))
    if is_best:
        shutil.copyfile(os.path.join(root, filename), os.path.join(root, 'model_best.pth.tar'))


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'

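# With --cos the learning rate follows a half-cosine from the base lr towards 0
# over args.epochs: lr(epoch) = lr * 0.5 * (1 + cos(pi * epoch / epochs)).
# For example, with --lr 0.3 and 200 epochs this gives 0.3 at epoch 0 and 0.15
# at epoch 100. Without --cos, the lr is divided by 10 at each --schedule milestone.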
def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate based on schedule"""
    lr = args.lr
    if args.cos:  # cosine lr schedule
        lr *= 0.5 * (1. + math.cos(math.pi * epoch / args.epochs))
    else:  # stepwise lr schedule
        for milestone in args.schedule:
            lr *= 0.1 if epoch >= milestone else 1.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/moco/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
--------------------------------------------------------------------------------
/moco/builder.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
import torch.nn as nn
import random


class MoCoV3(nn.Module):
    """
    Build a MoCo v3 model with a query encoder (plus predictor head) and a
    momentum key encoder. Unlike MoCo v1/v2, no memory queue is used: the
    negatives are the other keys in the batch.
    MoCo v3: https://arxiv.org/abs/2104.02057
    """
    def __init__(self, base_encoder, dim=128, K=65536, m=0.999, T=0.07, mlp=False):
        """
        dim: feature dimension (default: 128)
        K: kept for interface compatibility with MoCo v1/v2; unused here (no queue)
        m: moco momentum of updating key encoder (default: 0.999)
        T: softmax temperature (default: 0.07)
        """
        super(MoCoV3, self).__init__()

        self.K = K
        self.m = m
        self.T = T

        # create the encoders
        # num_classes is the output fc dimension
        self.encoder_q = base_encoder(num_classes=dim)
        self.encoder_k = base_encoder(num_classes=dim)

        if mlp:  # hack: brute-force replacement of the fc layer with a 2-layer MLP head
            dim_mlp = self.encoder_q.fc.weight.shape[1]
            self.encoder_q.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_q.fc)
            self.encoder_k.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_k.fc)

        # prediction head, applied on top of the query projections only
        self.predictor = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(inplace=True),
            nn.Linear(dim, dim),
        )

        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data.copy_(param_q.data)  # initialize
            param_k.requires_grad = False  # not updated by gradient

    @torch.no_grad()
    def _momentum_update_key_encoder(self):
        """
        Momentum update of the key encoder
        """
        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data = param_k.data * self.m + param_q.data * (1. - self.m)

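    # MoCo v3 is asymmetric: both views are passed through the query encoder and
    # the predictor MLP (with gradients), while the momentum key encoder produces
    # the targets under torch.no_grad(). The key encoder is never updated by
    # backprop, only by the momentum (EMA) update above.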
    def forward(self, x1, x2):
        """
        Input:
            x1: a batch of query images
            x2: a batch of key images
        Output:
            q1, q2, k1, k2
        """

        # compute query features
        q1, q2 = self.predictor(self.encoder_q(x1)), self.predictor(self.encoder_q(x2))

        q1 = nn.functional.normalize(q1, dim=1)
        q2 = nn.functional.normalize(q2, dim=1)

        # compute key features
        with torch.no_grad():  # no gradient to keys
            self._momentum_update_key_encoder()  # update the key encoder

            k1, k2 = self.encoder_k(x1), self.encoder_k(x2)  # keys: NxC
            k1 = nn.functional.normalize(k1, dim=1)
            k2 = nn.functional.normalize(k2, dim=1)

        return q1, q2, k1, k2

--------------------------------------------------------------------------------
/moco/loader.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from PIL import ImageFilter
import random


class TwoCropsTransform:
    """Take two random crops of one image as the query and key."""

    def __init__(self, base_transform):
        self.base_transform = base_transform

    def __call__(self, x):
        q = self.base_transform(x)
        k = self.base_transform(x)
        return [q, k]


class GaussianBlur(object):
    """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709"""

    def __init__(self, sigma=[.1, 2.]):
        self.sigma = sigma

    def __call__(self, x):
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x
--------------------------------------------------------------------------------
/pretrain_cub.sh:
--------------------------------------------------------------------------------
python main_moco_pretraining_v3.py \
  -a resnet50 \
  --lr 0.3 \
  --batch-size 256 --epochs 200 \
  --dist-url 'tcp://localhost:10005' --multiprocessing-distributed --world-size 1 --rank 0 \
  --gpus 8,9,10,11 \
  --mlp --moco-t 0.2 --moco-k 4096 --aug-plus --cos \
  /opt/caoyh/datasets/cub200
--------------------------------------------------------------------------------