├── README.md
├── algorithm.png
├── main_moco_pretraining_v3.py
├── moco
│   ├── __init__.py
│   ├── builder.py
│   └── loader.py
└── pretrain_cub.sh

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MoCov3-pytorch
Custom implementation of MoCo v3 [[arxiv]](https://arxiv.org/abs/2104.02057). I made minor modifications to the official MoCo repository [[github]](https://github.com/facebookresearch/moco).

The ViT part is not implemented; this code is for reference only.

![](algorithm.png)
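## Usage

`pretrain_cub.sh` shows an example multi-GPU launch (ResNet-50, cosine schedule, MoCo v2 augmentation); point the script at any `ImageFolder`-style dataset that has a `train/` subdirectory.

To reuse a pre-trained backbone downstream, keep only the query encoder from a saved checkpoint. A minimal sketch (the checkpoint path is a placeholder; the key prefix assumes the default DistributedDataParallel training in this repo):

```python
import torch
import torchvision.models as models

# checkpoints are written as checkpoint_{epoch:04d}.pth.tar by main_moco_pretraining_v3.py
ckpt = torch.load('path/to/checkpoint_0199.pth.tar', map_location='cpu')
state_dict = ckpt['state_dict']

# keep only the query-encoder backbone; drop the DDP 'module.' prefix and the projection head (fc)
prefix = 'module.encoder_q.'
backbone_state = {k[len(prefix):]: v for k, v in state_dict.items()
                  if k.startswith(prefix) and not k.startswith(prefix + 'fc')}

model = models.resnet50()
msg = model.load_state_dict(backbone_state, strict=False)
print(msg.missing_keys)  # only the classifier fc.* weights should be missing
```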
--------------------------------------------------------------------------------
/algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CupidJay/MoCov3-pytorch/1636a392e2a4a92bd841c61f0085d894369aeb8f/algorithm.png
--------------------------------------------------------------------------------
/main_moco_pretraining_v3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
import builtins
import math
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

import moco.loader
import moco.builder

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))


parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: resnet50)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 32)')
parser.add_argument('--epochs', default=200, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.03, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--schedule', default=[120, 160], nargs='*', type=int,
                    help='learning rate schedule (when to drop lr by 10x)')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum of SGD solver')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('--pretrained', default='', type=str, metavar='PATH',
                    help='path to pretrained checkpoint (default: none), used for finetuning')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training.')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--gpus', type=str, default='0',
                    help='comma-separated GPU ids to expose via CUDA_VISIBLE_DEVICES')
parser.add_argument('--save-dir', type=str, default='checkpoints',
                    help='where to save models')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

# moco specific configs:
parser.add_argument('--moco-dim', default=128, type=int,
                    help='feature dimension (default: 128)')
parser.add_argument('--moco-k', default=65536, type=int,
                    help='queue size; number of negative keys (default: 65536)')
parser.add_argument('--moco-m', default=0.999, type=float,
                    help='moco momentum of updating key encoder (default: 0.999)')
parser.add_argument('--moco-t', default=0.07, type=float,
                    help='softmax temperature (default: 0.07)')

# options for moco v2
parser.add_argument('--mlp', action='store_true',
                    help='use mlp head')
parser.add_argument('--aug-plus', action='store_true',
                    help='use moco v2 data augmentation')
parser.add_argument('--cos', action='store_true',
                    help='use cosine lr schedule')


def main():
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    args.save_dir = os.path.join(args.save_dir, 'self-similarity', args.arch)
    if args.pretrained:
        args.save_dir = os.path.join(args.save_dir, 'pretrained_from_{}'.format(args.pretrained.split('/')[-1]))
    args.save_dir = os.path.join(args.save_dir, 'gpus_{}_lr_{}_bs_{}_epochs_{}'.format(len(args.gpus.split(',')),
                                                                                       args.lr,
                                                                                       args.batch_size,
                                                                                       args.epochs))
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)


def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))

    model = moco.builder.MoCoV3(
        models.__dict__[args.arch],
        args.moco_dim, args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optionally start from pretrained weights (used for finetuning)
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading pretrained weight '{}'".format(args.pretrained))
            if args.gpu is None:
                checkpoint = torch.load(args.pretrained)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.pretrained, map_location=loc)
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded pretrained weight '{}'".format(args.pretrained))
        else:
            print("=> no pretrained weight found at '{}'".format(args.pretrained))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    augmentation = [
        transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
        transforms.RandomApply([
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)  # not strengthened
        ], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ]

    train_dataset = datasets.ImageFolder(
        traindir,
        moco.loader.TwoCropsTransform(transforms.Compose(augmentation)))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    train_start = time.time()

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            if epoch % 50 == 0 or epoch == args.epochs - 1:
                save_checkpoint({
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, is_best=False, root=args.save_dir, filename='checkpoint_{:04d}.pth.tar'.format(epoch))
    train_end = time.time()

    print('total training time elapses {} hours'.format((train_end - train_start) / 3600.0))

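# The contrastive loss below follows the MoCo v3 pseudocode (see algorithm.png,
# https://arxiv.org/abs/2104.02057): for each query q_i the positive is the key
# k_i from the other view of the same image, and the negatives are the keys of
# the other images in the (per-GPU) batch, so the targets are simply the
# diagonal indices 0..N-1 of the NxN logits matrix. The loss is scaled by
# 2*tau as in the paper's pseudocode, and train() symmetrizes it as
# ctr(q1, k2) + ctr(q2, k1).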
def ctr(q, k, criterion, tau):
    logits = torch.mm(q, k.t())
    N = q.size(0)
    labels = range(N)
    labels = torch.LongTensor(labels).cuda()
    loss = criterion(logits / tau, labels)
    return 2 * tau * loss


def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    #top1 = AverageMeter('Acc@1', ':6.2f')
    #top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, _) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        #print(images[0].size())
        #print(images[1].size())

        if args.gpu is not None:
            images[0] = images[0].cuda(args.gpu, non_blocking=True)
            images[1] = images[1].cuda(args.gpu, non_blocking=True)

        # compute output
        #if epoch % 2 == 0:
        q1, q2, k1, k2 = model(x1=images[0], x2=images[1])

        loss = ctr(q1, k2, criterion, args.moco_t) + ctr(q2, k1, criterion, args.moco_t)
        #else:
        #    output, target = model(im_q=images[1], im_k=images[0])

        # acc1/acc5 are (K+1)-way contrast classifier accuracy
        # measure accuracy and record loss
        #acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images[0].size(0))
        #top1.update(acc1[0], images[0].size(0))
        #top5.update(acc5[0], images[0].size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)


def save_checkpoint(state, is_best, root, filename='checkpoint.pth.tar'):
    torch.save(state, os.path.join(root, filename))
    if is_best:
        shutil.copyfile(os.path.join(root, filename), os.path.join(root, 'model_best.pth.tar'))


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'

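# With --cos the learning rate follows a half-cosine from the base lr towards 0
# over args.epochs: lr(epoch) = lr * 0.5 * (1 + cos(pi * epoch / epochs)).
# For example, with --lr 0.3 and 200 epochs this gives 0.3 at epoch 0 and 0.15
# at epoch 100. Without --cos, the lr is divided by 10 at each --schedule milestone.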
def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate based on schedule"""
    lr = args.lr
    if args.cos:  # cosine lr schedule
        lr *= 0.5 * (1. + math.cos(math.pi * epoch / args.epochs))
    else:  # stepwise lr schedule
        for milestone in args.schedule:
            lr *= 0.1 if epoch >= milestone else 1.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/moco/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
--------------------------------------------------------------------------------
/moco/builder.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
import torch.nn as nn
import random


class MoCoV3(nn.Module):
    """
    Build a MoCo v3 model with a query encoder (plus predictor head) and a
    momentum key encoder. Unlike MoCo v1/v2, no memory queue is used: the
    negatives are the other keys in the batch.
    MoCo v3: https://arxiv.org/abs/2104.02057
    """
    def __init__(self, base_encoder, dim=128, K=65536, m=0.999, T=0.07, mlp=False):
        """
        dim: feature dimension (default: 128)
        K: kept for interface compatibility with MoCo v1/v2; unused here (no queue)
        m: moco momentum of updating key encoder (default: 0.999)
        T: softmax temperature (default: 0.07)
        """
        super(MoCoV3, self).__init__()

        self.K = K
        self.m = m
        self.T = T

        # create the encoders
        # num_classes is the output fc dimension
        self.encoder_q = base_encoder(num_classes=dim)
        self.encoder_k = base_encoder(num_classes=dim)

        if mlp:  # hack: brute-force replacement of the fc layer with a 2-layer MLP head
            dim_mlp = self.encoder_q.fc.weight.shape[1]
            self.encoder_q.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_q.fc)
            self.encoder_k.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_k.fc)

        # prediction head, applied on top of the query projections only
        self.predictor = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(inplace=True),
            nn.Linear(dim, dim),
        )

        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data.copy_(param_q.data)  # initialize
            param_k.requires_grad = False  # not updated by gradient

    @torch.no_grad()
    def _momentum_update_key_encoder(self):
        """
        Momentum update of the key encoder
        """
        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data = param_k.data * self.m + param_q.data * (1. - self.m)

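    # MoCo v3 is asymmetric: both views are passed through the query encoder and
    # the predictor MLP (with gradients), while the momentum key encoder produces
    # the targets under torch.no_grad(). The key encoder is never updated by
    # backprop, only by the momentum (EMA) update above.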
    def forward(self, x1, x2):
        """
        Input:
            x1: a batch of query images
            x2: a batch of key images
        Output:
            q1, q2, k1, k2
        """

        # compute query features
        q1, q2 = self.predictor(self.encoder_q(x1)), self.predictor(self.encoder_q(x2))

        q1 = nn.functional.normalize(q1, dim=1)
        q2 = nn.functional.normalize(q2, dim=1)

        # compute key features
        with torch.no_grad():  # no gradient to keys
            self._momentum_update_key_encoder()  # update the key encoder

            k1, k2 = self.encoder_k(x1), self.encoder_k(x2)  # keys: NxC
            k1 = nn.functional.normalize(k1, dim=1)
            k2 = nn.functional.normalize(k2, dim=1)

        return q1, q2, k1, k2

--------------------------------------------------------------------------------
/moco/loader.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from PIL import ImageFilter
import random


class TwoCropsTransform:
    """Take two random crops of one image as the query and key."""

    def __init__(self, base_transform):
        self.base_transform = base_transform

    def __call__(self, x):
        q = self.base_transform(x)
        k = self.base_transform(x)
        return [q, k]


class GaussianBlur(object):
    """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709"""

    def __init__(self, sigma=[.1, 2.]):
        self.sigma = sigma

    def __call__(self, x):
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x
--------------------------------------------------------------------------------
/pretrain_cub.sh:
--------------------------------------------------------------------------------
python main_moco_pretraining_v3.py \
  -a resnet50 \
  --lr 0.3 \
  --batch-size 256 --epochs 200 \
  --dist-url 'tcp://localhost:10005' --multiprocessing-distributed --world-size 1 --rank 0 \
  --gpus 8,9,10,11 \
  --mlp --moco-t 0.2 --moco-k 4096 --aug-plus --cos \
  /opt/caoyh/datasets/cub200
--------------------------------------------------------------------------------