├── PFNet_train_test.py
├── README.md
├── lib
    ├── car_multilable_rois.py
    ├── layer_utils
    │   └── roi_pooling
    │   │   ├── _ext
    │   │       └── roi_pooling
    │   │       │   ├── __init__.py
    │   │       │   └── _roi_pooling.so
    │   │   ├── build.py
    │   │   ├── roi_pool.py
    │   │   ├── roi_pool_py.py
    │   │   └── src
    │   │       ├── cuda
    │   │           ├── roi_pooling_kernel.cu
    │   │           ├── roi_pooling_kernel.cu.o
    │   │           └── roi_pooling_kernel.h
    │   │       ├── roi_pooling.c
    │   │       ├── roi_pooling.h
    │   │       ├── roi_pooling_cuda.c
    │   │       └── roi_pooling_cuda.h
    └── transforms_with_rois.py
├── part proposal
    ├── CarProposalSSW_par.m
    ├── Car_get_database_SSW.m
    ├── Readme.md
    └── roisWarpperforPytorch_generatetxt.m
└── pic
    ├── PFNet.jpg
    └── dog_loss_acc1.png


/PFNet_train_test.py:
--------------------------------------------------------------------------------
  1 | # Copyright (C) 2018 Jingyun Liang et al.
  2 | # All rights reserved.
  3 | 
  4 | import argparse
  5 | import os
  6 | import shutil
  7 | import time
  8 | import sys
  9 | import matplotlib.pyplot as plt
 10 | import matplotlib.ticker as ticker
 11 | import math
 12 | import pprint
 13 | import numbers
 14 | 
 15 | import torch
 16 | import torch.nn as nn
 17 | import torch.nn.parallel
 18 | import torch.backends.cudnn as cudnn
 19 | import torch.distributed as dist
 20 | import torch.optim
 21 | import torch.utils.data
 22 | import torch.utils.data.distributed
 23 | import torchvision.models as models
 24 | import torch.nn.functional as F
 25 | 
 26 | 
 27 | # dataset preparation, self-defined transforms with rois and spp layer
 28 | from lib.car_multilable_rois import ImageFolder as car_multi
 29 | import lib.transforms_with_rois as transforms
 30 | from lib.layer_utils.roi_pooling.roi_pool import RoIPoolFunction
 31 | 
 32 | 
 33 | model_names = sorted(name for name in models.__dict__
 34 |     if name.islower() and not name.startswith("__")
 35 |     and callable(models.__dict__[name]))
 36 | 
 37 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
 38 | parser.add_argument('--data', metavar='DIR', default='none',
 39 |                     help='path to dataset')
 40 | parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
 41 |                     choices=model_names,
 42 |                     help='model architecture: ' +
 43 |                         ' | '.join(model_names) +
 44 |                         ' (default: vgg19)')
 45 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
 46 |                     help='number of data loading workers (default: 4)')
 47 | parser.add_argument('--epochs', default=90, type=int, metavar='N',
 48 |                     help='number of total epochs to run')
 49 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
 50 |                     help='manual epoch number (useful on restarts)')
 51 | parser.add_argument('-b', '--batch-size', default=256, type=int,
 52 |                     metavar='N', help='mini-batch size (default: 256)')
 53 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
 54 |                     metavar='LR', help='initial learning rate')
 55 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
 56 |                     help='momentum')
 57 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
 58 |                     metavar='W', help='weight decay (default: 1e-4)')
 59 | parser.add_argument('--print-freq', '-p', default=10, type=int,
 60 |                     metavar='N', help='print frequency (default: 10, no internel ouput: 0)')
 61 | parser.add_argument('--resume', default='', type=str, metavar='PATH',
 62 |                     help='path to latest checkpoint (default: none)')
 63 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
 64 |                     help='evaluate model on validation set')
 65 | parser.add_argument('--world-size', default=1, type=int,
 66 |                     help='number of distributed processes')
 67 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
 68 |                     help='url used to set up distributed training')
 69 | parser.add_argument('--dist-backend', default='gloo', type=str,
 70 |                     help='distributed backend')
 71 | parser.add_argument('--input-crop', default=224, type=int,
 72 |                     help='input image crop size (default: 224)')
 73 | parser.add_argument('--input-scale', default=256, type=int,
 74 |                     help='input image scale size (default: 256)')
 75 | parser.add_argument('--lr-stepsize', '--learning-rate-stepsize', default=30, type=int,
 76 |                     metavar='LR', help='learning rate stepsize')
 77 | parser.add_argument('--num-Classes', type=int,
 78 |                     help='number of dataset classes')
 79 | parser.add_argument('--maximum-Rois', dest='maximumRois', default=100, type=int,
 80 |                     help='maximum number of rois')
 81 | 
 82 | 
 83 | best_prec1 = 0
 84 | plot_statistic = {"train_loss":[],"test_loss":[],"train_acc1":[],"test_acc1":[]}
 85 | 
 86 | 
 87 | def main():
 88 |     global args, best_prec1, modelDir, log_file, plot_statistic
 89 | 
 90 |     args = parser.parse_args()
 91 |     args.data = 'car' # dataset name: cub car aircraft
 92 |     args.numClasses = 196 # cub 200 car 196 aircraft 100
 93 |     args.arch = 'vgg19' # backbone CNN
 94 |     args.maximumRois = 500 # number of rois
 95 |     modelDir = args.data +'_'+ args.arch +'_test' # checkpoint dir
 96 |     args.resume = os.path.join(modelDir, 'epoch-' + '15' + '-checkpoint.pth.tar') # 1,2,3, 0 for no resume checkpoint
 97 |     args.evaluate = True
 98 | 
 99 |     args.epochs = 20
100 |     args.batch_size = 1
101 |     args.lr = 1e-4
102 |     args.lr_stepsize = 10
103 |     args.weight_decay = 5e-4
104 | 
105 |     args.workers = 2
106 |     args.print_freq = 10
107 |     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
108 |     args.distributed = args.world_size > 1
109 | 
110 |     timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
111 |     log_file = modelDir + "_{}.log".format(timestamp)
112 |     if not os.path.exists(modelDir):
113 |         os.mkdir(modelDir)
114 |     shutil.copy(os.path.abspath(__file__),modelDir)
115 |     os.rename(os.path.join(modelDir,os.path.basename(__file__)),\
116 |               os.path.join(modelDir,os.path.basename(__file__))[:-3]+"_{}.py".format(timestamp))
117 | 
118 |     printlog(args)
119 | 
120 |     if args.distributed:
121 |         dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
122 |                                 world_size=args.world_size)
123 | 
124 | 
125 |     # create model
126 |     printlog("=> using imagenet pre-trained model '{}'".format(args.arch))
127 |     if 'vgg' in args.arch :
128 |         model = models.__dict__[args.arch](pretrained=True)
129 |         model.classifier._modules['6'] = nn.Linear(model.classifier[6].in_features, args.numClasses)
130 |         model = VggBasedNet_PFNet(originalModel = model)
131 |     else:
132 |         raise ValueError
133 | 
134 |     printlog(model)
135 | 
136 |     if not args.distributed:
137 |         model = torch.nn.DataParallel(model.cuda())
138 |     else:
139 |         model.cuda()
140 |         model = torch.nn.parallel.DistributedDataParallel(model)
141 | 
142 |     # define optimizer
143 |     params = []
144 |     if 'vgg' in args.arch :
145 |         for key, value in dict(model.named_parameters()).items():
146 |             if value.requires_grad:
147 |                 if 'features' in key or 'conv5' in key:
148 |                     smaller_lr = 0.1
149 |                 else:
150 |                     smaller_lr = 1
151 | 
152 |                 if 'bias' in key:
153 |                     params += [{'params':[value],'lr':args.lr*smaller_lr, 'weight_decay': False and args.weight_decay or 0}]
154 |                 else:
155 |                     params += [{'params':[value],'lr':args.lr*smaller_lr, 'weight_decay': args.weight_decay}]
156 |             else:
157 |                 printlog('layer --{0}-- is fixed.'.format(key))
158 | 
159 |         optimizer = torch.optim.SGD(params, momentum=args.momentum)
160 |     else:
161 |         raise ValueError
162 | 
163 |     # optionally resume from a checkpoint
164 |     if args.resume :
165 |         printlog("=> loading specified checkpoint '{}'".format(args.resume))
166 | 
167 |         if os.path.isfile(args.resume):
168 |             checkpoint = torch.load(args.resume)
169 |             args.start_epoch = checkpoint['epoch']
170 |             best_prec1 = checkpoint['best_prec1']
171 |             model.load_state_dict(checkpoint['state_dict'])
172 |             optimizer.load_state_dict(checkpoint['optimizer'])
173 |             plot_statistic = checkpoint['loss_acc1']
174 |             printlog("=> loaded checkpoint '{}' (epoch {})"
175 |                   .format(args.resume, checkpoint['epoch']))
176 |         else:
177 |             printlog("=> no checkpoint found at '{}'".format(args.resume))
178 | 
179 |     cudnn.benchmark = True
180 | 
181 | 
182 |     # prepare data
183 |     train_loader,val_loader,train_sampler = get_data_loader()
184 | 
185 |     # define loss
186 |     criterion = [BinaryLogLoss().cuda(), PartAttentionLoss().cuda()]
187 | 
188 |     # model testing
189 |     if args.evaluate:
190 |         validate(val_loader, model, criterion)
191 |         return
192 | 
193 |     # model training
194 |     for epoch in range(args.start_epoch, args.epochs):
195 |         if args.distributed:
196 |             train_sampler.set_epoch(epoch)
197 |         adjust_learning_rate(optimizer, epoch)
198 | 
199 |         # train
200 |         train(train_loader, model, criterion, optimizer, epoch)
201 | 
202 |         # test
203 |         prec1 = validate(val_loader, model, criterion)
204 | 
205 |         # save checkpoint
206 |         best_prec1 = max(prec1, best_prec1)
207 |         save_checkpoint({
208 |             'epoch': epoch + 1,
209 |             'arch': args.arch,
210 |             'state_dict': model.state_dict(),
211 |             'best_prec1': best_prec1,
212 |             'optimizer' : optimizer.state_dict(),
213 |             'loss_acc1':plot_statistic,
214 |         })
215 | 
216 |     showPlot(plot_statistic)
217 |     printlog('Training done, the best test_acc1 is {0} in Epoch {1}'.format(best_prec1,plot_statistic["test_acc1"].index(best_prec1)))
218 | 
219 | 
def train(train_loader, model, criterion, optimizer, epoch):
    """Train ``model`` for one epoch and record the epoch averages.

    Each batch from ``train_loader`` is ``((images, rois), target)``.  Only
    the RoIs of the first sample (``inputs[1][0]``) are used, so this assumes
    a batch size of 1, which is what main() configures.  ``criterion`` is a
    pair: ``criterion[0]`` is applied to the fused image-level output and
    ``criterion[1]`` to the per-RoI softmax matrix; the two are summed.

    The epoch's average loss and top-1 accuracy are appended to the global
    ``plot_statistic``.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.train()

    tick = time.time()
    for step, (inputs, target) in enumerate(train_loader):
        # time spent waiting for the data loader
        data_time.update(time.time() - tick)

        images = inputs[0].cuda()          # image tensor
        rois = inputs[1][0, :, :].cuda()   # rois matrix of the first sample
        target = target.cuda()
        images_var = torch.autograd.Variable(images)
        rois_var = torch.autograd.Variable(rois)
        target_var = torch.autograd.Variable(target)

        # forward: image-level loss + part-level loss
        output, softMatrix = model(images_var, rois_var)
        image_loss = criterion[0](output, target_var)
        part_loss = criterion[1](softMatrix, target_var)
        loss = image_loss + part_loss

        # measure accuracy and record loss
        n_samples = images.size(0)
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], n_samples)
        top1.update(prec1[0], n_samples)
        top5.update(prec5[0], n_samples)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - tick)
        tick = time.time()

        # a print frequency of 0 disables the per-batch output
        if args.print_freq and step % args.print_freq == 0:
            printlog('Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec_1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                epoch, step, len(train_loader), batch_time=batch_time,
                data_time=data_time, loss=losses, top1=top1))

    printlog('Epoch {0} \t\t\t Model {1} \t Time {2}'.format(epoch, modelDir,time.strftime("%H-%M-%S")))
    printlog('Train Loss {loss.avg:.4f}   top1 {top1.avg:.3f}  BatchTime{batch_time.avg:.3f}'
          .format(loss = losses, top1=top1, batch_time=batch_time))

    plot_statistic["train_loss"].append(losses.avg)
    plot_statistic["train_acc1"].append(top1.avg)
275 | 
276 | 
def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Batches have the same ``((images, rois), target)`` layout as in training
    and, as there, only the RoIs of the first sample are used.  The average
    loss and top-1 accuracy are appended to the global ``plot_statistic`` and
    the loss/accuracy plot is re-rendered before returning.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    tick = time.time()
    for step, (inputs, target) in enumerate(val_loader):
        images = inputs[0].cuda()
        rois = inputs[1][0, :, :].cuda()
        target = target.cuda()
        # volatile Variables: no graph is kept for backward
        images_var = torch.autograd.Variable(images, volatile=True)
        rois_var = torch.autograd.Variable(rois, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output and the combined image-level + part-level loss
        output, softMatrix = model(images_var, rois_var)
        image_loss = criterion[0](output, target_var)
        part_loss = criterion[1](softMatrix, target_var)
        loss = image_loss + part_loss

        # measure accuracy and record loss
        n_samples = images.size(0)
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], n_samples)
        top1.update(prec1[0], n_samples)
        top5.update(prec5[0], n_samples)

        # measure elapsed time
        batch_time.update(time.time() - tick)
        tick = time.time()

        # a print frequency of 0 disables the per-batch output
        if args.print_freq and step % args.print_freq == 0:
            printlog('Test: [{0}/{1}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec_1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                step, len(val_loader), batch_time=batch_time, loss=losses,
                top1=top1))

    printlog('Test  Loss {loss.avg:.4f}   top1 {top1.avg:.3f}  BatchTime{batch_time.avg:.3f}'
          .format(loss = losses, top1=top1, batch_time=batch_time))

    plot_statistic["test_loss"].append(losses.avg)
    plot_statistic["test_acc1"].append(top1.avg)
    showPlot(plot_statistic)

    return top1.avg
327 | 
328 | 
def save_checkpoint(state):
    """Serialize ``state`` to ``<modelDir>/epoch-<N>-checkpoint.pth.tar``.

    ``N`` is taken from ``state['epoch']``; ``modelDir`` is the module-level
    checkpoint directory set by main().
    """
    checkpoint_name = 'epoch-{}-checkpoint.pth.tar'.format(state['epoch'])
    torch.save(state, os.path.join(modelDir, checkpoint_name))
333 | 
334 | 
class AverageMeter(object):
    """Track the latest value and the running weighted average of a metric.

    Attributes (all reset to 0 by ``reset``):
        val:   the most recent value passed to ``update``.
        sum:   weighted sum of all values seen so far.
        count: total weight (e.g. number of samples) seen so far.
        avg:   ``sum / count``, or 0 while ``count`` is still 0.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times (e.g. a mean over ``n`` samples)."""
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against a zero total weight (e.g. an empty batch arriving
        # first), which previously raised ZeroDivisionError.
        self.avg = self.sum / self.count if self.count else 0
351 | 
352 | 
def adjust_learning_rate(optimizer, epoch):
    """Set every parameter group's learning rate for ``epoch``.

    The rate is ``args.lr`` multiplied by 0.1 once for every
    ``args.lr_stepsize`` completed epochs.

    NOTE(review): the same value is written to all parameter groups, so any
    per-group learning rate chosen when the optimizer was built is replaced
    on the first call.
    """
    completed_steps = epoch // args.lr_stepsize
    decayed_lr = args.lr * (0.1 ** completed_steps)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr
358 | 
359 | 
def accuracy(output, target, topk=(1,)):
    """Compute the precision@k (in percent) for each k in ``topk``.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        target: 2-D tensor with one row per sample; the ground-truth class of
            a sample is taken to be the index of the largest entry in its row.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one single-element tensor per k, holding the percentage
        of samples whose ground-truth class is among the k highest scores.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # pred: (maxk, batch) class indices, best prediction in row 0
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()

    # reduce the per-class target matrix to one class index per sample
    _, target = torch.max(target, dim=1)

    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # ``correct`` can be non-contiguous (it derives from a transposed
        # tensor), in which case view(-1) raises for k > 1; make it
        # contiguous before flattening.
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
378 | 
379 | 
def printlog(output):
    """Print ``output`` on screen and append it to the run's log file.

    The log file is ``<modelDir>/<log_file>`` (both module-level names set by
    main()).  The screen copy uses ``print``; the file copy is pretty-printed
    with ``pprint``, so strings appear there in their quoted ``repr`` form.
    """
    print(output)

    # Write straight to the file instead of temporarily swapping sys.stdout:
    # the handle is always closed and stdout can never be left redirected if
    # pretty-printing raises.
    with open(os.path.join(modelDir, log_file), 'a') as logfile:
        pprint.pprint(output, stream=logfile)
390 | 
391 | 
def showPlot(plot_statistic):
    """Plot the loss and accuracy histories and save them as an image.

    Args:
        plot_statistic: dict with the list-valued keys ``train_loss``,
            ``test_loss``, ``train_acc1`` and ``test_acc1``.

    The figure (losses in the left subplot, top-1 accuracies in the right
    one) is written to ``<modelDir>/loss_acc1.png``.
    """
    plt.clf()
    loss_axes = plt.subplot(121)
    acc_axes = plt.subplot(122)
    # A Locator instance is bound to the axis it is installed on, so each
    # subplot gets its own instead of sharing a single one.
    loss_axes.xaxis.set_major_locator(ticker.MultipleLocator(base=10))
    acc_axes.xaxis.set_major_locator(ticker.MultipleLocator(base=10))
    loss_axes.plot(plot_statistic["train_loss"],label="train_loss")
    acc_axes.plot(plot_statistic["train_acc1"],label="train_acc1")
    loss_axes.plot(plot_statistic["test_loss"],label="test_loss")
    acc_axes.plot(plot_statistic["test_acc1"],label="test_acc1")
    loss_axes.legend()
    acc_axes.legend()
    plt.savefig(os.path.join(modelDir,'loss_acc1.png'))
407 | 
def get_data_loader():
    """Build the training and validation data loaders.

    Reads the module-level ``args`` (data, input_crop, batch_size, workers,
    distributed).

    Returns:
        ``(train_loader, val_loader, train_sampler)``; ``train_sampler`` is a
        ``DistributedSampler`` when ``args.distributed`` is set, else ``None``.

    Raises:
        ValueError: if ``args.data`` is not a supported dataset name.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Training and validation share the same scaling; training additionally
    # applies a random horizontal flip.
    train_transforms = transforms.Compose([
            transforms.Scale(args.input_crop,scaleheight=[250,350,450,550,650]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    val_transforms = transforms.Compose([
            transforms.Scale(args.input_crop,scaleheight=[250,350,450,550,650]),#79.75% for above test
            transforms.ToTensor(),
            normalize,
        ])

    if args.data == 'car':
        train_dataset = car_multi(args.data, 'trainval',transform=train_transforms)
        val_dataset = car_multi(args.data, 'test',transform=val_transforms)
    else:
        raise ValueError("unsupported dataset '{}'".format(args.data))

    if args.distributed:
        # The sampler partitions the *dataset* across processes; it was
        # previously handed the transform pipeline by mistake.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # shuffle and sampler are mutually exclusive in DataLoader
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    return train_loader,val_loader,train_sampler
445 | 
446 | 
class VggBasedNet_PFNet(nn.Module):
    """PFNet built on top of a VGG-style model.

    Reuses ``originalModel.features`` without its last layer, pools one 7x7
    feature per RoI (spatial scale 1/16) and classifies every RoI with
    ``originalModel.classifier``.
    """
    def __init__(self, originalModel):
        super(VggBasedNet_PFNet, self).__init__()
        backbone_layers = list(originalModel.features)
        # drop the final backbone layer (presumably the last max-pool, whose
        # role is taken over by RoI pooling -- TODO confirm)
        self.features = nn.Sequential(*backbone_layers[:-1])
        self.roipooling = RoIPoolFunction(7, 7, 1. / 16.)
        self.classifier = originalModel.classifier

    def forward(self, x, rois):
        """Return ``(image_prediction, softMatrix)``.

        ``softMatrix`` holds one softmax distribution per RoI (row);
        ``image_prediction`` is their sum over RoIs divided by
        ``args.maximumRois``, kept as a single row.
        """
        # part feature extractor
        feature_map = self.features(x)
        pooled = self.roipooling(feature_map, rois)
        flattened = pooled.view(pooled.size(0), -1)
        part_scores = self.classifier(flattened)

        # two-level loss: per-part probabilities and their fusion
        softMatrix = F.softmax(part_scores, dim=1)
        image_prediction = softMatrix.sum(dim=0, keepdim=True) / args.maximumRois

        return image_prediction, softMatrix
467 | 
468 | 
class VggFtNet(nn.Module):
    """Plain VGG fine-tuning baseline (whole image, no part features).

    Keeps the complete ``features`` stack and the classifier of
    ``originalModel``.  An RoI pooling op is created for interface parity
    but is not used by ``forward``.
    """
    def __init__(self, originalModel):
        super(VggFtNet, self).__init__()
        backbone_layers = list(originalModel.features)
        self.features = nn.Sequential(*backbone_layers)
        self.roipooling = RoIPoolFunction(7, 7, 1. / 16.)
        self.classifier = originalModel.classifier

    def forward(self, x, rois):
        """Classify the whole image; ``rois`` is accepted but ignored."""
        feature_map = self.features(x)
        flattened = feature_map.view(feature_map.size(0), -1)
        return self.classifier(flattened)
484 | 
485 | 
class BinaryLogLoss(nn.Module):
    """Image-level loss: binary version of the log loss."""
    def __init__(self):
        super(BinaryLogLoss, self).__init__()

    def forward(self, input, target):
        """Return the summed loss ``-log(c * (x - 0.5) + 0.5)``.

        ``x`` (``input``) is read as the probability that an attribute is
        active and ``c`` (``target``) as its label, +1 meaning active, so
        ``x`` must lie in [0, 1].  ``x`` is first squeezed to
        ``x * 0.9999 + 1e-5`` so the logarithm never sees exactly 0.
        """
        squeezed = input * 0.9999 + 1e-5
        likelihood = target.mul(squeezed - 0.5) + 0.5
        return -likelihood.log().sum()
497 | 
498 | 
class PartAttentionLoss(nn.Module):
    """Part-level loss: a log loss down-weighted for confident parts."""
    def __init__(self):
        super(PartAttentionLoss, self).__init__()
        # exponent applied to (1 - p_t) in the modulating factor
        self.lamda = 1

    def forward(self, softMatrix, target):
        """Return ``5 * sum((1 - p_t)^lamda * -log(p_t)) / number_of_rows``.

        ``p_t`` is ``target * (softMatrix - 0.5) + 0.5`` squeezed to
        ``p * 0.9999 + 1e-5`` so the logarithm never sees exactly 0; the sum
        runs over every entry and is divided by ``softMatrix.size(0)``.
        """
        p_t = (target.mul(softMatrix - 0.5) + 0.5) * 0.9999 + 1e-5
        modulator = torch.pow(1 - p_t, self.lamda)
        weighted_log = p_t.log().mul(modulator)
        return -weighted_log.sum() / softMatrix.size(0) * 5
510 | 
511 | 
512 | 
513 | if __name__ == '__main__':
514 |     main()
515 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Codes for *PFNet: A Novel Part Fusion Network for Fine-grained Visual Categorization*
 2 | This repository holds the PyTorch(V0.3.0) code for PFNet.
 3 | 
 4 | ## Introduction
 5 | 
 6 | The existing methods in fine-grained visual categorization focus on integrating multiple deep CNN models or complicated attention mechanisms, resulting in increasingly cumbersome networks. In addition, most methods rely on part annotations, which require expensive expert guidance. In this paper, without extra annotation, we propose a novel part fusion network (PFNet) to effectively fuse discriminative image parts for classification. More specifically, PFNet consists of a part feature extractor to extract part features and a two-level classification network to utilize part-level and image-level features simultaneously. Part-level features are trained with the weighted part loss, which embeds a weighting mechanism based on different parts' characteristics. Easy parts, hard parts and background parts are proposed and discriminatively used for classification. Moreover, part-level features are fused to form an image-level feature so as to introduce global supervision and generate final predictions. Experiments on three popular benchmark datasets show that our framework achieves competitive performance compared with the state-of-the-art.
 7 | 
 8 | ![alt text](https://github.com/MichaelLiang12/PFNet-FGVC/blob/master/pic/PFNet.jpg "visualization")
 9 | 
10 | ## Prepare Datasets
11 | 
12 | Prepare the corresponding datasets ([CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Cars](http://ai.stanford.edu/~jkrause/cars/car_dataset.html) or [FGVC-Aircraft](http://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/)) before training PFNet. For quick start, you can download the dataset [Stanford Cars](http://ai.stanford.edu/~jkrause/cars/car_dataset.html), proposed rois files [car_rois500.tar.gz](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP) and prepared train/test split file [car_splits.tar.gz](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP). Unzip these files and organize them in the current working directory as follows:
13 | ```
14 | -car
15 | --car_ims
16 | ---000001.jpg
17 | 
18 | --car_rois500
19 | ---car_ims
20 | ----000001.txt
21 | 
22 | --split
23 | ---Acura Integra Type R 2001_test.txt
24 | ```
25 | 
26 | For part proposal, we also provide code for generating part proposals using [Selective Search Window](https://koen.me/research/selectivesearch/). Please refer to the guide provided in our `part proposal` directory.
27 | 
28 | 
29 | 
30 | ## Usage
31 | 
32 | 1, Download this repo recursively:
33 | ```shell
34 | git clone --recursive https://github.com/MichaelLiang12/PFNet-FGVC.git
35 | ```
36 | 2, Build RoiPooling module
37 | 
38 | Please follow the instructions in [pytorch-faster-rcnn](https://github.com/ruotianluo/pytorch-faster-rcnn#installation). We use the RoiPooling module implemented by them. Note that if you also use `Ubuntu14.04+Cuda8.0+TitanX`, you might not need to compile again.
39 | 
40 | 
41 | 3, Run `PFNet_train_test.py`
42 | 
43 | You can modify fundamental parameters in the `main()` function. The training process should look like the figure below. By setting `args.evaluate = True`, you can download [our model](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP) and test it directly.
44 | 
45 | ![alt text](https://github.com/MichaelLiang12/PFNet-FGVC/blob/master/pic/dog_loss_acc1.png "visualization")
46 | 
47 | ## Citation
48 | For Selective Search Window and RoiPooling module.
49 | ```
50 | @article{uijlings2013selective,
51 |   title={Selective search for object recognition},
52 |   author={Uijlings, Jasper RR and Van De Sande, Koen EA and Gevers, Theo and Smeulders, Arnold WM},
53 |   journal={International Journal of Computer Vision},
54 |   volume={104},
55 |   number={2},
56 |   pages={154--171},
57 |   year={2013},
58 |   publisher={Springer}
59 | }
60 | 
61 | @article{chen17implementation,
62 |     Author = {Xinlei Chen and Abhinav Gupta},
63 |     Title = {An Implementation of Faster RCNN with Study for Region Sampling},
64 |     Journal = {arXiv preprint arXiv:1702.02138},
65 |     Year = {2017}
66 | }
67 | ```
68 | ## Citation for our PFNet
69 | ```
70 | @Article{Liang2018,
71 | author="Liang, Jingyun
72 | and Guo, Jinlin
73 | and Guo, Yanming
74 | and Lao, Songyang",
75 | title="PFNet: a novel part fusion network for fine-grained visual categorization",
76 | journal="Multimedia Tools and Applications",
77 | year="2018",
78 | month="Dec",
79 | day="15",
80 | issn="1573-7721",
81 | doi="10.1007/s11042-018-7047-5",
82 | url="https://doi.org/10.1007/s11042-018-7047-5"
83 | }
84 | ```
85 | 
86 | 
87 | [View Paper](https://doi.org/10.1007/s11042-018-7047-5)
88 | 


--------------------------------------------------------------------------------
/lib/car_multilable_rois.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import os
  3 | import os.path
  4 | import tarfile
  5 | from urllib.parse import urlparse
  6 | 
  7 | import numpy as np
  8 | import torch
  9 | import torch.utils.data as data
 10 | from PIL import Image
 11 | 
 12 | import io
 13 | 
 14 | 
 15 | object_categories = ['AM General Hummer SUV 2000','Acura RL Sedan 2012','Acura TL Sedan 2012','Acura TL Type-S 2008','Acura TSX Sedan 2012','Acura Integra Type R 2001','Acura ZDX Hatchback 2012','Aston Martin V8 Vantage Convertible 2012','Aston Martin V8 Vantage Coupe 2012','Aston Martin Virage Convertible 2012','Aston Martin Virage Coupe 2012','Audi RS 4 Convertible 2008','Audi A5 Coupe 2012','Audi TTS Coupe 2012','Audi R8 Coupe 2012','Audi V8 Sedan 1994','Audi 100 Sedan 1994','Audi 100 Wagon 1994','Audi TT Hatchback 2011','Audi S6 Sedan 2011','Audi S5 Convertible 2012','Audi S5 Coupe 2012','Audi S4 Sedan 2012','Audi S4 Sedan 2007','Audi TT RS Coupe 2012','BMW ActiveHybrid 5 Sedan 2012','BMW 1 Series Convertible 2012','BMW 1 Series Coupe 2012','BMW 3 Series Sedan 2012','BMW 3 Series Wagon 2012','BMW 6 Series Convertible 2007','BMW X5 SUV 2007','BMW X6 SUV 2012','BMW M3 Coupe 2012','BMW M5 Sedan 2010','BMW M6 Convertible 2010','BMW X3 SUV 2012','BMW Z4 Convertible 2012','Bentley Continental Supersports Conv. 
Convertible 2012','Bentley Arnage Sedan 2009','Bentley Mulsanne Sedan 2011','Bentley Continental GT Coupe 2012','Bentley Continental GT Coupe 2007','Bentley Continental Flying Spur Sedan 2007','Bugatti Veyron 16.4 Convertible 2009','Bugatti Veyron 16.4 Coupe 2009','Buick Regal GS 2012','Buick Rainier SUV 2007','Buick Verano Sedan 2012','Buick Enclave SUV 2012','Cadillac CTS-V Sedan 2012','Cadillac SRX SUV 2012','Cadillac Escalade EXT Crew Cab 2007','Chevrolet Silverado 1500 Hybrid Crew Cab 2012','Chevrolet Corvette Convertible 2012','Chevrolet Corvette ZR1 2012','Chevrolet Corvette Ron Fellows Edition Z06 2007','Chevrolet Traverse SUV 2012','Chevrolet Camaro Convertible 2012','Chevrolet HHR SS 2010','Chevrolet Impala Sedan 2007','Chevrolet Tahoe Hybrid SUV 2012','Chevrolet Sonic Sedan 2012','Chevrolet Express Cargo Van 2007','Chevrolet Avalanche Crew Cab 2012','Chevrolet Cobalt SS 2010','Chevrolet Malibu Hybrid Sedan 2010','Chevrolet TrailBlazer SS 2009','Chevrolet Silverado 2500HD Regular Cab 2012','Chevrolet Silverado 1500 Classic Extended Cab 2007','Chevrolet Express Van 2007','Chevrolet Monte Carlo Coupe 2007','Chevrolet Malibu Sedan 2007','Chevrolet Silverado 1500 Extended Cab 2012','Chevrolet Silverado 1500 Regular Cab 2012','Chrysler Aspen SUV 2009','Chrysler Sebring Convertible 2010','Chrysler Town and Country Minivan 2012','Chrysler 300 SRT-8 2010','Chrysler Crossfire Convertible 2008','Chrysler PT Cruiser Convertible 2008','Daewoo Nubira Wagon 2002','Dodge Caliber Wagon 2012','Dodge Caliber Wagon 2007','Dodge Caravan Minivan 1997','Dodge Ram Pickup 3500 Crew Cab 2010','Dodge Ram Pickup 3500 Quad Cab 2009','Dodge Sprinter Cargo Van 2009','Dodge Journey SUV 2012','Dodge Dakota Crew Cab 2010','Dodge Dakota Club Cab 2007','Dodge Magnum Wagon 2008','Dodge Challenger SRT8 2011','Dodge Durango SUV 2012','Dodge Durango SUV 2007','Dodge Charger Sedan 2012','Dodge Charger SRT-8 2009','Eagle Talon Hatchback 1998','FIAT 500 Abarth 2012','FIAT 500 Convertible 
2012','Ferrari FF Coupe 2012','Ferrari California Convertible 2012','Ferrari 458 Italia Convertible 2012','Ferrari 458 Italia Coupe 2012','Fisker Karma Sedan 2012','Ford F-450 Super Duty Crew Cab 2012','Ford Mustang Convertible 2007','Ford Freestar Minivan 2007','Ford Expedition EL SUV 2009','Ford Edge SUV 2012','Ford Ranger SuperCab 2011','Ford GT Coupe 2006','Ford F-150 Regular Cab 2012','Ford F-150 Regular Cab 2007','Ford Focus Sedan 2007','Ford E-Series Wagon Van 2012','Ford Fiesta Sedan 2012','GMC Terrain SUV 2012','GMC Savana Van 2012','GMC Yukon Hybrid SUV 2012','GMC Acadia SUV 2012','GMC Canyon Extended Cab 2012','Geo Metro Convertible 1993','HUMMER H3T Crew Cab 2010','HUMMER H2 SUT Crew Cab 2009','Honda Odyssey Minivan 2012','Honda Odyssey Minivan 2007','Honda Accord Coupe 2012','Honda Accord Sedan 2012','Hyundai Veloster Hatchback 2012','Hyundai Santa Fe SUV 2012','Hyundai Tucson SUV 2012','Hyundai Veracruz SUV 2012','Hyundai Sonata Hybrid Sedan 2012','Hyundai Elantra Sedan 2007','Hyundai Accent Sedan 2012','Hyundai Genesis Sedan 2012','Hyundai Sonata Sedan 2012','Hyundai Elantra Touring Hatchback 2012','Hyundai Azera Sedan 2012','Infiniti G Coupe IPL 2012','Infiniti QX56 SUV 2011','Isuzu Ascender SUV 2008','Jaguar XK XKR 2012','Jeep Patriot SUV 2012','Jeep Wrangler SUV 2012','Jeep Liberty SUV 2012','Jeep Grand Cherokee SUV 2012','Jeep Compass SUV 2012','Lamborghini Reventon Coupe 2008','Lamborghini Aventador Coupe 2012','Lamborghini Gallardo LP 570-4 Superleggera 2012','Lamborghini Diablo Coupe 2001','Land Rover Range Rover SUV 2012','Land Rover LR2 SUV 2012','Lincoln Town Car Sedan 2011','MINI Cooper Roadster Convertible 2012','Maybach Landaulet Convertible 2012','Mazda Tribute SUV 2011','McLaren MP4-12C Coupe 2012','Mercedes-Benz 300-Class Convertible 1993','Mercedes-Benz C-Class Sedan 2012','Mercedes-Benz SL-Class Coupe 2009','Mercedes-Benz E-Class Sedan 2012','Mercedes-Benz S-Class Sedan 2012','Mercedes-Benz Sprinter Van 2012','Mitsubishi Lancer 
Sedan 2012','Nissan Leaf Hatchback 2012','Nissan NV Passenger Van 2012','Nissan Juke Hatchback 2012','Nissan 240SX Coupe 1998','Plymouth Neon Coupe 1999','Porsche Panamera Sedan 2012','Ram CV Cargo Van Minivan 2012','Rolls-Royce Phantom Drophead Coupe Convertible 2012','Rolls-Royce Ghost Sedan 2012','Rolls-Royce Phantom Sedan 2012','Scion xD Hatchback 2012','Spyker C8 Convertible 2009','Spyker C8 Coupe 2009','Suzuki Aerio Sedan 2007','Suzuki Kizashi Sedan 2012','Suzuki SX4 Hatchback 2012','Suzuki SX4 Sedan 2012','Tesla Model S Sedan 2012','Toyota Sequoia SUV 2012','Toyota Camry Sedan 2012','Toyota Corolla Sedan 2012','Toyota 4Runner SUV 2012','Volkswagen Golf Hatchback 2012','Volkswagen Golf Hatchback 1991','Volkswagen Beetle Hatchback 2012','Volvo C30 Hatchback 2012','Volvo 240 Sedan 1993','Volvo XC90 SUV 2007','smart fortwo Convertible 2012']
 16 | 
 17 | def read_image_label(file):
 18 |     print('[dataset] read ' + file)
 19 |     data = dict()
 20 |     with open(file, 'r') as f:
 21 |         for line in f:
 22 |             tmp = line.split(' ')
 23 |             name = tmp[0]
 24 |             label = int(tmp[-1])
 25 |             data[name] = label
 26 |     return data
 27 | 
 28 | 
 29 | def read_object_labels(root, set):
 30 |     path_labels = os.path.join(root, 'split')
 31 |     labeled_data = dict()
 32 |     num_classes = len(object_categories)
 33 | 
 34 |     for i in range(num_classes):
 35 |         file = os.path.join(path_labels, object_categories[i] + '_' + set + '.txt')
 36 |         data = read_image_label(file)
 37 | 
 38 |         if i == 0:
 39 |             for (name, label) in data.items():
 40 |                 labels = np.zeros(num_classes)
 41 |                 labels[i] = label
 42 |                 labeled_data[name] = labels
 43 |         else:
 44 |             for (name, label) in data.items():
 45 |                 labeled_data[name][i] = label
 46 | 
 47 |     return labeled_data
 48 | 
 49 | def write_object_labels_csv(file, labeled_data):
 50 |     # write a csv file
 51 |     print('[dataset] write file %s' % file)
 52 |     with open(file, 'w') as csvfile:
 53 |         fieldnames = ['name']
 54 |         fieldnames.extend(object_categories)
 55 |         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 56 | 
 57 |         writer.writeheader()
 58 |         for (name, labels) in labeled_data.items():
 59 |             example = {'name': name}
 60 |             for i in range(len(fieldnames)-1):
 61 |                 example[fieldnames[i + 1]] = int(labels[i])
 62 |             writer.writerow(example)
 63 | 
 64 |     csvfile.close()
 65 | 
 66 | def read_object_labels_csv(file, header=True):
 67 |     images = []
 68 |     num_categories = 0
 69 |     print('[dataset] read', file)
 70 |     with open(file, 'r') as f:
 71 |         reader = csv.reader(f)
 72 |         rownum = 0
 73 |         for row in reader:
 74 |             if header and rownum == 0:
 75 |                 header = row
 76 |             else:
 77 |                 if num_categories == 0:
 78 |                     num_categories = len(row) - 1
 79 |                 name = row[0]
 80 |                 labels = (np.asarray(row[1:num_categories + 1])).astype(np.float32)
 81 |                 labels = torch.from_numpy(labels)
 82 |                 item = (name, labels)
 83 |                 images.append(item)
 84 |             rownum += 1
 85 |     return images
 86 | 
 87 | 
 88 | class ImageFolder(data.Dataset):
 89 |     def __init__(self, root, set, transform=None, target_transform=None):
 90 |         self.root = root
 91 |         self.path_images = root
 92 |         self.path_rois = os.path.join(root,'car_rois500')
 93 |         self.set = set
 94 |         self.transform = transform
 95 |         self.target_transform = target_transform
 96 |         # download dataset
 97 | 
 98 |         # define path of csv file
 99 |         path_csv = os.path.join(self.root, 'path_label_csv')
100 |         # define filename of csv file
101 |         file_csv = os.path.join(path_csv, 'classification_' + set + '.csv')
102 | 
103 |         # create the csv file if necessary
104 |         if not os.path.exists(file_csv):
105 |             if not os.path.exists(path_csv):  # create dir if necessary
106 |                 os.makedirs(path_csv)
107 |             # generate csv file
108 |             labeled_data = read_object_labels(self.root, self.set)
109 |             # write csv file
110 |             write_object_labels_csv(file_csv, labeled_data)
111 | 
112 |             write_rois_pt(self.path_rois, labeled_data.keys())
113 | 
114 |         self.classes = object_categories
115 |         self.images = read_object_labels_csv(file_csv)
116 | 
117 |         print('[dataset] car classification set=%s number of classes=%d  number of images=%d' % (
118 |             set, len(self.classes), len(self.images)))
119 | 
120 |     def __getitem__(self, index):
121 |         path, target = self.images[index]
122 |         img = Image.open(os.path.join(self.path_images, path + '.jpg')).convert('RGB')
123 |         rois = torch.load(os.path.join(self.path_rois, path + '.pt'))
124 | 
125 |         if self.transform is not None:
126 |             img,rois = self.transform(img,rois)
127 |         if self.target_transform is not None:
128 |             target = self.target_transform(target)
129 | 
130 |         return (img, rois), target
131 | 
132 |     def __len__(self):
133 |         return len(self.images)
134 | 
135 |     def get_number_classes(self):
136 |         return len(self.classes)
137 | 
def write_rois_pt(path_rois, imagename):
    """Convert the RoI text file of every image into a saved float tensor.

    For each name, ``<path_rois>/<name>.txt`` is parsed with ``np.loadtxt``
    and stored next to it as ``<name>.pt``.

    Args:
        path_rois: Directory holding the ``.txt`` files; the ``.pt`` files
            are written to the same directory.
        imagename: Iterable of image names (without extension).
    """
    print('[dataset] transfer rois.txt to rois.pt file')
    for path in imagename:
        # ndmin=2 keeps a file with a single RoI row as shape (1, k); without
        # it np.loadtxt collapses the result to a 1-D array.
        boxes = np.loadtxt(os.path.join(path_rois, path + '.txt'), dtype=float, ndmin=2)
        rois = torch.from_numpy(boxes).type(torch.FloatTensor)
        torch.save(rois, os.path.join(path_rois, path + '.pt'))
145 | 
146 | 
147 | 


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from torch.utils.ffi import _wrap_function
 3 | from ._roi_pooling import lib as _lib, ffi as _ffi
 4 | 
 5 | __all__ = []
 6 | def _import_symbols(locals):
 7 |     for symbol in dir(_lib):
 8 |         fn = getattr(_lib, symbol)
 9 |         locals[symbol] = _wrap_function(fn, _ffi)
10 |         __all__.append(symbol)
11 | 
12 | _import_symbols(locals())
13 | 


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/_ext/roi_pooling/_roi_pooling.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingyunLiang/PFNet-FGVC/a1dcf1ddd7427b4a907ab126653dcad505599cc4/lib/layer_utils/roi_pooling/_ext/roi_pooling/_roi_pooling.so


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/build.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from torch.utils.ffi import create_extension
 4 | 
 5 | 
 6 | sources = ['src/roi_pooling.c']
 7 | headers = ['src/roi_pooling.h']
 8 | defines = []
 9 | with_cuda = False
10 | 
11 | if torch.cuda.is_available():
12 |     print('Including CUDA code.')
13 |     sources += ['src/roi_pooling_cuda.c']
14 |     headers += ['src/roi_pooling_cuda.h']
15 |     defines += [('WITH_CUDA', None)]
16 |     with_cuda = True
17 | 
18 | this_file = os.path.dirname(os.path.realpath(__file__))
19 | print(this_file)
20 | extra_objects = ['src/cuda/roi_pooling_kernel.cu.o']
21 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
22 | 
23 | ffi = create_extension(
24 |     '_ext.roi_pooling',
25 |     headers=headers,
26 |     sources=sources,
27 |     define_macros=defines,
28 |     relative_to=__file__,
29 |     with_cuda=with_cuda,
30 |     extra_objects=extra_objects
31 | )
32 | 
33 | if __name__ == '__main__':
34 |     ffi.build()
35 | 


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/roi_pool.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.autograd import Function
 3 | from ._ext import roi_pooling
 4 | 
 5 | 
 6 | class RoIPoolFunction(Function):
 7 |     def __init__(self, pooled_height, pooled_width, spatial_scale):
 8 |         self.pooled_width = int(pooled_width)
 9 |         self.pooled_height = int(pooled_height)
10 |         self.spatial_scale = float(spatial_scale)
11 |         self.output = None
12 |         self.argmax = None
13 |         self.rois = None
14 |         self.feature_size = None
15 | 
16 |     def forward(self, features, rois):
17 |         batch_size, num_channels, data_height, data_width = features.size()
18 |         num_rois = rois.size()[0]
19 |         output = torch.zeros(num_rois, num_channels, self.pooled_height, self.pooled_width)
20 |         argmax = torch.IntTensor(num_rois, num_channels, self.pooled_height, self.pooled_width).zero_()
21 | 
22 |         if not features.is_cuda:
23 |             _features = features.permute(0, 2, 3, 1)
24 |             roi_pooling.roi_pooling_forward(self.pooled_height, self.pooled_width, self.spatial_scale,
25 |                                             _features, rois, output)
26 |             # output = output.cuda()
27 |         else:
28 |             output = output.cuda()
29 |             argmax = argmax.cuda()
30 | 
31 |             roi_pooling.roi_pooling_forward_cuda(self.pooled_height, self.pooled_width, self.spatial_scale,
32 |                                                  features, rois, output, argmax)
33 | 
34 |             self.output = output
35 |             self.argmax = argmax
36 |             self.rois = rois
37 |             self.feature_size = features.size()
38 | 
39 |         return output
40 | 
41 |     def backward(self, grad_output):
42 |         assert(self.feature_size is not None and grad_output.is_cuda)
43 | 
44 |         batch_size, num_channels, data_height, data_width = self.feature_size
45 | 
46 |         grad_input = torch.zeros(batch_size, num_channels, data_height, data_width).cuda()
47 |         roi_pooling.roi_pooling_backward_cuda(self.pooled_height, self.pooled_width, self.spatial_scale,
48 |                                               grad_output, self.rois, grad_input, self.argmax)
49 | 
50 |         # print grad_input
51 | 
52 |         return grad_input, None
53 | 
54 | 
class RoIPool(torch.nn.Module):
    """Module wrapper that applies ``RoIPoolFunction`` with fixed settings."""

    def __init__(self, pooled_height, pooled_width, spatial_scale):
        """Record the pooled output size and the RoI coordinate scale."""
        super(RoIPool, self).__init__()

        self.pooled_height = int(pooled_height)
        self.pooled_width = int(pooled_width)
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois):
        """Pool ``rois`` from ``features`` with a freshly built function object."""
        pool_fn = RoIPoolFunction(self.pooled_height, self.pooled_width, self.spatial_scale)
        return pool_fn(features, rois)
65 | 


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/roi_pool_py.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torch.autograd import Variable
 4 | import numpy as np
 5 | 
 6 | 
 7 | class RoIPool(nn.Module):
 8 |     def __init__(self, pooled_height, pooled_width, spatial_scale):
 9 |         super(RoIPool, self).__init__()
10 |         self.pooled_width = int(pooled_width)
11 |         self.pooled_height = int(pooled_height)
12 |         self.spatial_scale = float(spatial_scale)
13 | 
14 |     def forward(self, features, rois):
15 |         batch_size, num_channels, data_height, data_width = features.size()
16 |         num_rois = rois.size()[0]
17 |         outputs = Variable(torch.zeros(num_rois, num_channels, self.pooled_height, self.pooled_width)).cuda()
18 | 
19 |         for roi_ind, roi in enumerate(rois):
20 |             batch_ind = int(roi[0].data[0])
21 |             roi_start_w, roi_start_h, roi_end_w, roi_end_h = np.round(
22 |                 roi[1:].data.cpu().numpy() * self.spatial_scale).astype(int)
23 |             roi_width = max(roi_end_w - roi_start_w + 1, 1)
24 |             roi_height = max(roi_end_h - roi_start_h + 1, 1)
25 |             bin_size_w = float(roi_width) / float(self.pooled_width)
26 |             bin_size_h = float(roi_height) / float(self.pooled_height)
27 | 
28 |             for ph in range(self.pooled_height):
29 |                 hstart = int(np.floor(ph * bin_size_h))
30 |                 hend = int(np.ceil((ph + 1) * bin_size_h))
31 |                 hstart = min(data_height, max(0, hstart + roi_start_h))
32 |                 hend = min(data_height, max(0, hend + roi_start_h))
33 |                 for pw in range(self.pooled_width):
34 |                     wstart = int(np.floor(pw * bin_size_w))
35 |                     wend = int(np.ceil((pw + 1) * bin_size_w))
36 |                     wstart = min(data_width, max(0, wstart + roi_start_w))
37 |                     wend = min(data_width, max(0, wend + roi_start_w))
38 | 
39 |                     is_empty = (hend <= hstart) or(wend <= wstart)
40 |                     if is_empty:
41 |                         outputs[roi_ind, :, ph, pw] = 0
42 |                     else:
43 |                         data = features[batch_ind]
44 |                         outputs[roi_ind, :, ph, pw] = torch.max(
45 |                             torch.max(data[:, hstart:hend, wstart:wend], 1)[0], 2)[0].view(-1)
46 | 
47 |         return outputs
48 | 
49 | 


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu:
--------------------------------------------------------------------------------
  1 | #ifdef __cplusplus
  2 | extern "C" {
  3 | #endif
  4 | 
  5 | #include <stdio.h>
  6 | #include <math.h>
  7 | #include <float.h>
  8 | #include "roi_pooling_kernel.h"
  9 | 
 10 | #define CUDA_1D_KERNEL_LOOP(i, n)                            \
 11 |   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
 12 |        i += blockDim.x * gridDim.x)
 13 | 
 14 | 
 15 | __global__ void ROIPoolForward(const int nthreads, const float* bottom_data,
 16 |     const float spatial_scale, const int height, const int width,
 17 |     const int channels, const int pooled_height, const int pooled_width,
 18 |     const float* bottom_rois, float* top_data, int* argmax_data)
 19 | {
 20 |     CUDA_1D_KERNEL_LOOP(index, nthreads)
 21 |     {
 22 |         // (n, c, ph, pw) is an element in the pooled output
 23 |         int n = index;
 24 |         int pw = n % pooled_width;
 25 |         n /= pooled_width;
 26 |         int ph = n % pooled_height;
 27 |         n /= pooled_height;
 28 |         int c = n % channels;
 29 |         n /= channels;
 30 | 
 31 |         bottom_rois += n * 5;
 32 |         int roi_batch_ind = bottom_rois[0];
 33 |         int roi_start_w = round(bottom_rois[1] * spatial_scale);
 34 |         int roi_start_h = round(bottom_rois[2] * spatial_scale);
 35 |         int roi_end_w = round(bottom_rois[3] * spatial_scale);
 36 |         int roi_end_h = round(bottom_rois[4] * spatial_scale);
 37 | 
 38 |         // Force malformed ROIs to be 1x1
 39 |         int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
 40 |         int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
 41 |         float bin_size_h = (float)(roi_height) / (float)(pooled_height);
 42 |         float bin_size_w = (float)(roi_width) / (float)(pooled_width);
 43 | 
 44 |         int hstart = (int)(floor((float)(ph) * bin_size_h));
 45 |         int wstart = (int)(floor((float)(pw) * bin_size_w));
 46 |         int hend = (int)(ceil((float)(ph + 1) * bin_size_h));
 47 |         int wend = (int)(ceil((float)(pw + 1) * bin_size_w));
 48 | 
 49 |         // Add roi offsets and clip to input boundaries
 50 |         hstart = fminf(fmaxf(hstart + roi_start_h, 0), height);
 51 |         hend = fminf(fmaxf(hend + roi_start_h, 0), height);
 52 |         wstart = fminf(fmaxf(wstart + roi_start_w, 0), width);
 53 |         wend = fminf(fmaxf(wend + roi_start_w, 0), width);
 54 |         bool is_empty = (hend <= hstart) || (wend <= wstart);
 55 | 
 56 |         // Define an empty pooling region to be zero
 57 |         float maxval = is_empty ? 0 : -FLT_MAX;
 58 |         // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
 59 |         int maxidx = -1;
 60 |         bottom_data += roi_batch_ind * channels * height * width;
 61 |         for (int h = hstart; h < hend; ++h) {
 62 |             for (int w = wstart; w < wend; ++w) {
 63 |     //            int bottom_index = (h * width + w) * channels + c;
 64 |                 int bottom_index = (c * height + h) * width + w;
 65 |                 if (bottom_data[bottom_index] > maxval) {
 66 |                     maxval = bottom_data[bottom_index];
 67 |                     maxidx = bottom_index;
 68 |                 }
 69 |             }
 70 |         }
 71 |         top_data[index] = maxval;
 72 |         if (argmax_data != NULL)
 73 |             argmax_data[index] = maxidx;
 74 |     }
 75 | }
 76 | 
 77 | 
 78 | int ROIPoolForwardLaucher(
 79 |     const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
 80 |     const int width, const int channels, const int pooled_height,
 81 |     const int pooled_width, const float* bottom_rois,
 82 |     float* top_data, int* argmax_data, cudaStream_t stream)
 83 | {
 84 |     const int kThreadsPerBlock = 1024;
 85 |     const int output_size = num_rois * pooled_height * pooled_width * channels;
 86 |     cudaError_t err;
 87 | 
 88 | 
 89 |     ROIPoolForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
 90 |       output_size, bottom_data, spatial_scale, height, width, channels, pooled_height,
 91 |       pooled_width, bottom_rois, top_data, argmax_data);
 92 | 
 93 |     err = cudaGetLastError();
 94 |     if(cudaSuccess != err)
 95 |     {
 96 |         fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
 97 |         exit( -1 );
 98 |     }
 99 | 
100 |     return 1;
101 | }
102 | 
103 | 
// Backward RoI max pooling: each loop iteration computes the gradient of one
// input element (n, c, h, w) by summing the top gradients of every pooled
// unit whose recorded argmax equals this element's flat index.
__global__ void ROIPoolBackward(const int nthreads, const float* top_diff,
    const int* argmax_data, const int num_rois, const float spatial_scale,
    const int height, const int width, const int channels,
    const int pooled_height, const int pooled_width, float* bottom_diff,
    const float* bottom_rois) {
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {
        // (n, c, h, w) is an element of the input gradient
        int rest = index;
        const int w = rest % width;
        rest /= width;
        const int h = rest % height;
        rest /= height;
        const int c = rest % channels;
        const int n = rest / channels;

        float grad_sum = 0;
        // Accumulate gradient over all ROIs that pooled this element
        for (int roi_n = 0; roi_n < num_rois; ++roi_n)
        {
            const float* roi = bottom_rois + roi_n * 5;
            const int roi_batch_ind = roi[0];
            // Only ROIs belonging to this element's image can contribute
            if (n != roi_batch_ind) {
                continue;
            }

            const int roi_start_w = round(roi[1] * spatial_scale);
            const int roi_start_h = round(roi[2] * spatial_scale);
            const int roi_end_w = round(roi[3] * spatial_scale);
            const int roi_end_h = round(roi[4] * spatial_scale);

            // Skip ROIs that do not cover (h, w)
            if (w < roi_start_w || w > roi_end_w ||
                h < roi_start_h || h > roi_end_h) {
                continue;
            }

            const int roi_offset = roi_n * pooled_height * pooled_width * channels;
            const float* roi_top_diff = top_diff + roi_offset;
            const int* roi_argmax = argmax_data + roi_offset;

            // Force malformed ROIs to be 1x1
            const int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
            const int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);

            const float bin_size_h = (float)(roi_height) / (float)(pooled_height);
            const float bin_size_w = (float)(roi_width) / (float)(pooled_width);

            // Range of pooled units that could have pooled this element
            int phstart = floor((float)(h - roi_start_h) / bin_size_h);
            int phend = ceil((float)(h - roi_start_h + 1) / bin_size_h);
            int pwstart = floor((float)(w - roi_start_w) / bin_size_w);
            int pwend = ceil((float)(w - roi_start_w + 1) / bin_size_w);

            phstart = fminf(fmaxf(phstart, 0), pooled_height);
            phend = fminf(fmaxf(phend, 0), pooled_height);
            pwstart = fminf(fmaxf(pwstart, 0), pooled_width);
            pwend = fminf(fmaxf(pwend, 0), pooled_width);

            for (int ph = phstart; ph < phend; ++ph) {
                for (int pw = pwstart; pw < pwend; ++pw) {
                    const int pooled_index = (c * pooled_height + ph) * pooled_width + pw;
                    if (roi_argmax[pooled_index] == index)
                    {
                        grad_sum += roi_top_diff[pooled_index];
                    }
                }
            }
        }
        bottom_diff[index] = grad_sum;
    }
}
180 | 
// Launch ROIPoolBackward on `stream` with one thread per input-gradient
// element. Returns 1; if the launch reports a CUDA error, the message is
// printed to stderr and the process exits with status -1.
int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* bottom_diff, const int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    const int output_size = batch_size * height * width * channels;
    // Round up so that every gradient element gets a thread.
    const int num_blocks = (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock;

    ROIPoolBackward<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
      output_size, top_diff, argmax_data, num_rois, spatial_scale, height, width, channels, pooled_height,
      pooled_width, bottom_diff, bottom_rois);

    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}
203 | 
204 | 
205 | #ifdef __cplusplus
206 | }
207 | #endif
208 | 
209 | 
210 | 


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingyunLiang/PFNet-FGVC/a1dcf1ddd7427b4a907ab126653dcad505599cc4/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.h:
--------------------------------------------------------------------------------
 1 | #ifndef _ROI_POOLING_KERNEL
 2 | #define _ROI_POOLING_KERNEL
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
/*
 * Launches the forward RoI max-pooling kernel on `stream`.
 * bottom_data:  input feature map; bottom_rois: 5 floats per RoI.
 * top_data:     pooled output, num_rois * channels * pooled_height * pooled_width values.
 * argmax_data:  index of the selected input element per output value (-1 if none).
 * Returns 1; the implementation exits the process on a CUDA launch error.
 */
int ROIPoolForwardLaucher(
    const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* top_data, int* argmax_data, cudaStream_t stream);


/*
 * Launches the backward RoI max-pooling kernel on `stream`.
 * top_diff:     gradient of the pooled output; argmax_data: indices recorded by the forward pass.
 * bottom_diff:  receives batch_size * channels * height * width gradient values.
 * Returns 1; the implementation exits the process on a CUDA launch error.
 */
int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* bottom_diff, const int* argmax_data, cudaStream_t stream);
19 | 
20 | #ifdef __cplusplus
21 | }
22 | #endif
23 | 
24 | #endif
25 | 
26 | 


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/src/roi_pooling.c:
--------------------------------------------------------------------------------
  1 | #include <TH/TH.h>
  2 | #include <math.h>
  3 | 
  4 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale,
  5 |                         THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output)
  6 | {
  7 |     // Grab the input tensor
  8 |     float * data_flat = THFloatTensor_data(features);
  9 |     float * rois_flat = THFloatTensor_data(rois);
 10 | 
 11 |     float * output_flat = THFloatTensor_data(output);
 12 | 
 13 |     // Number of ROIs
 14 |     int num_rois = THFloatTensor_size(rois, 0);
 15 |     int size_rois = THFloatTensor_size(rois, 1);
 16 |     // batch size
 17 |     int batch_size = THFloatTensor_size(features, 0);
 18 |     if(batch_size != 1)
 19 |     {
 20 |         return 0;
 21 |     }
 22 |     // data height
 23 |     int data_height = THFloatTensor_size(features, 1);
 24 |     // data width
 25 |     int data_width = THFloatTensor_size(features, 2);
 26 |     // Number of channels
 27 |     int num_channels = THFloatTensor_size(features, 3);
 28 | 
 29 |     // Set all element of the output tensor to -inf.
 30 |     THFloatStorage_fill(THFloatTensor_storage(output), -1);
 31 | 
 32 |     // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
 33 |     int index_roi = 0;
 34 |     int index_output = 0;
 35 |     int n;
 36 |     for (n = 0; n < num_rois; ++n)
 37 |     {
 38 |         int roi_batch_ind = rois_flat[index_roi + 0];
 39 |         int roi_start_w = round(rois_flat[index_roi + 1] * spatial_scale);
 40 |         int roi_start_h = round(rois_flat[index_roi + 2] * spatial_scale);
 41 |         int roi_end_w = round(rois_flat[index_roi + 3] * spatial_scale);
 42 |         int roi_end_h = round(rois_flat[index_roi + 4] * spatial_scale);
 43 |         //      CHECK_GE(roi_batch_ind, 0);
 44 |         //      CHECK_LT(roi_batch_ind, batch_size);
 45 | 
 46 |         int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
 47 |         int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
 48 |         float bin_size_h = (float)(roi_height) / (float)(pooled_height);
 49 |         float bin_size_w = (float)(roi_width) / (float)(pooled_width);
 50 | 
 51 |         int index_data = roi_batch_ind * data_height * data_width * num_channels;
 52 |         const int output_area = pooled_width * pooled_height;
 53 | 
 54 |         int c, ph, pw;
 55 |         for (ph = 0; ph < pooled_height; ++ph)
 56 |         {
 57 |             for (pw = 0; pw < pooled_width; ++pw)
 58 |             {
 59 |                 int hstart = (floor((float)(ph) * bin_size_h));
 60 |                 int wstart = (floor((float)(pw) * bin_size_w));
 61 |                 int hend = (ceil((float)(ph + 1) * bin_size_h));
 62 |                 int wend = (ceil((float)(pw + 1) * bin_size_w));
 63 | 
 64 |                 hstart = fminf(fmaxf(hstart + roi_start_h, 0), data_height);
 65 |                 hend = fminf(fmaxf(hend + roi_start_h, 0), data_height);
 66 |                 wstart = fminf(fmaxf(wstart + roi_start_w, 0), data_width);
 67 |                 wend = fminf(fmaxf(wend + roi_start_w, 0), data_width);
 68 | 
 69 |                 const int pool_index = index_output + (ph * pooled_width + pw);
 70 |                 int is_empty = (hend <= hstart) || (wend <= wstart);
 71 |                 if (is_empty)
 72 |                 {
 73 |                     for (c = 0; c < num_channels * output_area; c += output_area)
 74 |                     {
 75 |                         output_flat[pool_index + c] = 0;
 76 |                     }
 77 |                 }
 78 |                 else
 79 |                 {
 80 |                     int h, w, c;
 81 |                     for (h = hstart; h < hend; ++h)
 82 |                     {
 83 |                         for (w = wstart; w < wend; ++w)
 84 |                         {
 85 |                             for (c = 0; c < num_channels; ++c)
 86 |                             {
 87 |                                 const int index = (h * data_width + w) * num_channels + c;
 88 |                                 if (data_flat[index_data + index] > output_flat[pool_index + c * output_area])
 89 |                                 {
 90 |                                     output_flat[pool_index + c * output_area] = data_flat[index_data + index];
 91 |                                 }
 92 |                             }
 93 |                         }
 94 |                     }
 95 |                 }
 96 |             }
 97 |         }
 98 | 
 99 |         // Increment ROI index
100 |         index_roi += size_rois;
101 |         index_output += pooled_height * pooled_width * num_channels;
102 |     }
103 |     return 1;
104 | }


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/src/roi_pooling.h:
--------------------------------------------------------------------------------
/* CPU RoI max pooling; returns 1 on success, 0 when the feature batch size is not 1. */
int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale,
                        THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output);


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/src/roi_pooling_cuda.c:
--------------------------------------------------------------------------------
 1 | #include <THC/THC.h>
 2 | #include <math.h>
 3 | #include "cuda/roi_pooling_kernel.h"
 4 | 
 5 | extern THCState *state;
 6 | 
 7 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale,
 8 |                         THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax)
 9 | {
10 |     // Grab the input tensor
11 |     float * data_flat = THCudaTensor_data(state, features);
12 |     float * rois_flat = THCudaTensor_data(state, rois);
13 | 
14 |     float * output_flat = THCudaTensor_data(state, output);
15 |     int * argmax_flat = THCudaIntTensor_data(state, argmax);
16 | 
17 |     // Number of ROIs
18 |     int num_rois = THCudaTensor_size(state, rois, 0);
19 |     int size_rois = THCudaTensor_size(state, rois, 1);
20 |     if (size_rois != 5)
21 |     {
22 |         return 0;
23 |     }
24 | 
25 |     // batch size
26 |     int batch_size = THCudaTensor_size(state, features, 0);
27 |     if (batch_size != 1)
28 |     {
29 |         return 0;
30 |     }
31 |     // data height
32 |     int data_height = THCudaTensor_size(state, features, 2);
33 |     // data width
34 |     int data_width = THCudaTensor_size(state, features, 3);
35 |     // Number of channels
36 |     int num_channels = THCudaTensor_size(state, features, 1);
37 | 
38 |     cudaStream_t stream = THCState_getCurrentStream(state);
39 | 
40 |     ROIPoolForwardLaucher(
41 |         data_flat, spatial_scale, num_rois, data_height,
42 |         data_width, num_channels, pooled_height,
43 |         pooled_width, rois_flat,
44 |         output_flat, argmax_flat, stream);
45 | 
46 |     return 1;
47 | }
48 | 
49 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale,
50 |                         THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax)
51 | {
52 |     // Grab the input tensor
53 |     float * top_grad_flat = THCudaTensor_data(state, top_grad);
54 |     float * rois_flat = THCudaTensor_data(state, rois);
55 | 
56 |     float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad);
57 |     int * argmax_flat = THCudaIntTensor_data(state, argmax);
58 | 
59 |     // Number of ROIs
60 |     int num_rois = THCudaTensor_size(state, rois, 0);
61 |     int size_rois = THCudaTensor_size(state, rois, 1);
62 |     if (size_rois != 5)
63 |     {
64 |         return 0;
65 |     }
66 | 
67 |     // batch size
68 |     int batch_size = THCudaTensor_size(state, bottom_grad, 0);
69 |     if (batch_size != 1)
70 |     {
71 |         return 0;
72 |     }
73 |     // data height
74 |     int data_height = THCudaTensor_size(state, bottom_grad, 2);
75 |     // data width
76 |     int data_width = THCudaTensor_size(state, bottom_grad, 3);
77 |     // Number of channels
78 |     int num_channels = THCudaTensor_size(state, bottom_grad, 1);
79 | 
80 |     cudaStream_t stream = THCState_getCurrentStream(state);
81 |     ROIPoolBackwardLaucher(
82 |         top_grad_flat, spatial_scale, batch_size, num_rois, data_height,
83 |         data_width, num_channels, pooled_height,
84 |         pooled_width, rois_flat,
85 |         bottom_grad_flat, argmax_flat, stream);
86 | 
87 |     return 1;
88 | }


--------------------------------------------------------------------------------
/lib/layer_utils/roi_pooling/src/roi_pooling_cuda.h:
--------------------------------------------------------------------------------
// CUDA forward RoI pooling.
// Returns 0 without launching anything when `rois` does not have 5 columns
// or when dimension 0 of `features` is not 1; otherwise launches the forward
// kernel on the current THC stream and returns 1.
int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale,
                        THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax);

// CUDA backward RoI pooling.
// Returns 0 without launching anything when `rois` does not have 5 columns
// or when dimension 0 of `bottom_grad` is not 1; otherwise launches the
// backward kernel on the current THC stream and returns 1.
int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale,
                        THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax);


--------------------------------------------------------------------------------
/lib/transforms_with_rois.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import torch
  3 | import math
  4 | import random
  5 | from PIL import Image, ImageOps
  6 | try:
  7 |     import accimage
  8 | except ImportError:
  9 |     accimage = None
 10 | import numpy as np
 11 | import numbers
 12 | import types
 13 | import collections
 14 | 
 15 | 
 16 | class Compose(object):
 17 |     """Composes several transforms together.
 18 | 
 19 |     Args:
 20 |         transforms (list of ``Transform`` objects): list of transforms to compose.
 21 | 
 22 |     Example:
 23 |         >>> transforms.Compose([
 24 |         >>>     transforms.CenterCrop(10),
 25 |         >>>     transforms.ToTensor(),
 26 |         >>> ])
 27 |     """
 28 | 
 29 |     def __init__(self, transforms):
 30 |         self.transforms = transforms
 31 | 
 32 |     def __call__(self, img,rois):
 33 |         for t in self.transforms:
 34 |             if 'RandomSizedCrop' in t.__str__() \
 35 |                     or 'CenterCrop' in t.__str__() :
 36 | 
 37 |                 img,rois = t(img,rois)
 38 |                 rois = FixRois(img.size, rois)
 39 |             elif 'RandomHorizontalFlip' in t.__str__() \
 40 |                     or 'Scale' in t.__str__():
 41 | 
 42 |                 img,rois = t(img,rois)
 43 |             else:
 44 |                 img = t(img)
 45 | 
 46 |         # for vgg16
 47 |         # rois = OffSet(rois, (img.shape[2],img.shape[1]), o0=8.5, o=9.5, stride=[16,16])
 48 | 
 49 |         return img, rois
 50 | 
 51 | 
 52 | class ToTensor(object):
 53 |     """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor.
 54 | 
 55 |     Converts a PIL.Image or numpy.ndarray (H x W x C) in the range
 56 |     [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
 57 |     """
 58 | 
 59 |     def __call__(self, pic):
 60 |         """
 61 |         Args:
 62 |             pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.
 63 | 
 64 |         Returns:
 65 |             Tensor: Converted image.
 66 |         """
 67 |         if isinstance(pic, np.ndarray):
 68 |             # handle numpy array
 69 |             img = torch.from_numpy(pic.transpose((2, 0, 1)))
 70 |             # backward compatibility
 71 |             return img.float().div(255)
 72 | 
 73 |         if accimage is not None and isinstance(pic, accimage.Image):
 74 |             nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
 75 |             pic.copyto(nppic)
 76 |             return torch.from_numpy(nppic)
 77 | 
 78 |         # handle PIL Image
 79 |         if pic.mode == 'I':
 80 |             img = torch.from_numpy(np.array(pic, np.int32, copy=False))
 81 |         elif pic.mode == 'I;16':
 82 |             img = torch.from_numpy(np.array(pic, np.int16, copy=False))
 83 |         else:
 84 |             img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
 85 |         # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
 86 |         if pic.mode == 'YCbCr':
 87 |             nchannel = 3
 88 |         elif pic.mode == 'I;16':
 89 |             nchannel = 1
 90 |         else:
 91 |             nchannel = len(pic.mode)
 92 |         img = img.view(pic.size[1], pic.size[0], nchannel)
 93 |         # put it from HWC to CHW format
 94 |         # yikes, this transpose takes 80% of the loading time/CPU
 95 |         img = img.transpose(0, 1).transpose(0, 2).contiguous()
 96 |         if isinstance(img, torch.ByteTensor):
 97 |             return img.float().div(255)
 98 |         else:
 99 |             return img
100 | 
101 | 
class ToPILImage(object):
    """Turn a tensor or ndarray into a PIL Image.

    Accepts a torch.*Tensor laid out as C x H x W or a numpy ndarray laid
    out as H x W x C. Float tensors are multiplied by 255 and cast to bytes
    before conversion.
    """

    def __call__(self, pic):
        """
        Args:
            pic (Tensor or numpy.ndarray): Image to be converted to PIL.Image.

        Returns:
            PIL.Image: Image converted to PIL.Image.

        """
        npimg = pic
        if isinstance(pic, torch.FloatTensor):
            pic = pic.mul(255).byte()
        if torch.is_tensor(pic):
            # CHW tensor -> HWC array
            npimg = np.transpose(pic.numpy(), (1, 2, 0))
        assert isinstance(npimg, np.ndarray), 'pic should be Tensor or ndarray'

        mode = None
        if npimg.shape[2] == 1:
            # single channel: drop the channel axis, mode follows the dtype
            npimg = npimg[:, :, 0]
            dtype = npimg.dtype
            if dtype == np.uint8:
                mode = 'L'
            elif dtype == np.int16:
                mode = 'I;16'
            elif dtype == np.int32:
                mode = 'I'
            elif dtype == np.float32:
                mode = 'F'
        elif npimg.dtype == np.uint8:
            mode = 'RGB'

        assert mode is not None, '{} is not supported'.format(npimg.dtype)
        return Image.fromarray(npimg, mode=mode)
141 | 
142 | 
class Normalize(object):
    """Normalize a tensor image with mean and standard deviation.

    Each leading-dimension slice of the tensor is updated in place as
    ``channel = (channel - mean) / std`` using the matching entries of
    ``mean`` and ``std``.

    Args:
        mean (sequence): Sequence of means, one per channel.
        std (sequence): Sequence of standard deviations, one per channel.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor image of size (C, H, W) to be normalized.

        Returns:
            Tensor: The same tensor object, normalized in place.
        """
        # TODO: make efficient
        per_channel = zip(tensor, self.mean, self.std)
        for channel, mean, std in per_channel:
            channel.sub_(mean)
            channel.div_(std)
        return tensor
172 | 
173 | 
class Scale(object):
    """Rescale the input PIL.Image to the given size and resize its RoIs to match.

    Args:
        size (sequence or int): Desired output size. If size is a sequence like
            (w, h), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e, if height > width, then image will be rescaled to
            (size * height / width, size)
        interpolation (int, optional): Desired interpolation. Default is
            ``PIL.Image.BILINEAR``
        scaleheight (sequence of int, optional): When given, ``size`` is
            ignored and the output height is drawn at random from this
            sequence; the width follows from the image's aspect ratio.
    """

    # Largest width/height accepted for a randomly drawn multi-scale size.
    _MAX_SIDE = 700
    # Output width used when ten draws all exceed ``_MAX_SIDE``.
    _FALLBACK_WIDTH = 650

    def __init__(self, size, interpolation=Image.BILINEAR, scaleheight=None):
        # ``collections.Iterable`` was removed in Python 3.10; fall back to
        # the old location only on interpreters without ``collections.abc``.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable
        assert isinstance(size, int) or (isinstance(size, Iterable) and len(size) == 2)
        self.size = size
        self.interpolation = interpolation
        self.scaleheight = scaleheight

    def _resize(self, img, rois, out_size):
        """Resize ``img`` to ``out_size`` (w, h) and map ``rois`` accordingly."""
        return img.resize(out_size, self.interpolation), ResizeRois(img.size, out_size, rois)

    def __call__(self, img, rois):
        """
        Args:
            img (PIL.Image): Image to be scaled.
            rois: RoIs belonging to ``img``; passed to ``ResizeRois``.

        Returns:
            tuple: ``(rescaled image, resized rois)``.
        """
        if self.scaleheight is not None:
            for _ in range(10):
                oh = self.scaleheight[random.randint(0, len(self.scaleheight) - 1)]
                ow = int(img.size[0] / img.size[1] * oh)

                # Both sides must fit. The original ``oh<=700 & ow <=700``
                # parsed as the chained comparison ``oh <= (700 & ow) <= 700``
                # because ``&`` binds tighter than ``<=``.
                if oh <= self._MAX_SIDE and ow <= self._MAX_SIDE:
                    return self._resize(img, rois, (ow, oh))

            # No acceptable draw: fix the width and derive the height.
            ow = self._FALLBACK_WIDTH
            oh = int(img.size[1] / img.size[0] * ow)
            return self._resize(img, rois, (ow, oh))

        if isinstance(self.size, int):
            w, h = img.size
            # Smaller edge already matches: nothing to do.
            if (w <= h and w == self.size) or (h <= w and h == self.size):
                return img, rois
            if w < h:
                ow = self.size
                oh = int(self.size * h / w)
            else:
                oh = self.size
                ow = int(self.size * w / h)
            return self._resize(img, rois, (ow, oh))

        return self._resize(img, rois, self.size)
238 | 
239 | 
class CenterCrop(object):
    """Crop the given PIL.Image around its center and adjust the RoIs.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made.
    """

    def __init__(self, size):
        is_scalar = isinstance(size, numbers.Number)
        self.size = (int(size), int(size)) if is_scalar else size

    def __call__(self, img, rois):
        """
        Args:
            img (PIL.Image): Image to be cropped.
            rois: RoIs belonging to ``img``; passed to ``RemoveOuterRois``.

        Returns:
            tuple: ``(cropped image, rois in crop coordinates)``.
        """
        img_w, img_h = img.size
        crop_h, crop_w = self.size
        left = int(round((img_w - crop_w) / 2.))
        top = int(round((img_h - crop_h) / 2.))
        box = (left, top, left + crop_w, top + crop_h)
        return img.crop(box), RemoveOuterRois(box, rois)
268 | 
269 | 
class Pad(object):
    """Placeholder for padding a PIL.Image; not implemented in this module.

    The constructor validates and stores its arguments, but calling the
    transform always raises ``NotImplementedError``.

    Args:
        padding (number): Padding on each border. Only a single number is
            accepted by the constructor.
        fill: Pixel fill value (number, str or tuple). Default is 0.
    """

    def __init__(self, padding, fill=0):
        assert isinstance(padding, numbers.Number)
        assert isinstance(fill, numbers.Number) or isinstance(fill, str) or isinstance(fill, tuple)
        self.padding = padding
        self.fill = fill

    def __call__(self, img):
        """
        Args:
            img (PIL.Image): Image to be padded.

        Raises:
            NotImplementedError: Always.
        """
        # The unreachable ``ImageOps.expand`` call that used to follow the
        # raise has been removed.
        raise NotImplementedError('Pad is not implemented in this module')
296 | 
297 | 
class Lambda(object):
    """Wrap a user-supplied function so it can be used as a transform.

    Args:
        lambd (function): Lambda/function to be used for transform.
    """

    def __init__(self, lambd):
        assert isinstance(lambd, types.LambdaType)
        self.lambd = lambd

    def __call__(self, img):
        """Return the result of calling the wrapped function on ``img``."""
        func = self.lambd
        return func(img)
311 | 
312 | 
class RandomCrop(object):
    """Crop the given PIL.Image at a random location and adjust the RoIs.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made.
        padding (int, optional): Padding on each border of the image.
            Default is 0, i.e no padding. Padded crops are not supported:
            any value greater than 0 makes the call raise
            ``NotImplementedError``.
    """

    def __init__(self, size, padding=0):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
        self.padding = padding

    def __call__(self, img, rois):
        """
        Args:
            img (PIL.Image): Image to be cropped.
            rois: RoIs belonging to ``img``; passed to ``RemoveOuterRois``.

        Returns:
            tuple: ``(cropped image, rois in crop coordinates)``. When the
            image already has the target size both inputs are returned
            unchanged.

        Raises:
            NotImplementedError: If ``padding`` is greater than 0.
        """
        if self.padding > 0:
            # Fail before doing any work: the image used to be expanded
            # first and the result then thrown away by the raise.
            raise NotImplementedError

        w, h = img.size
        th, tw = self.size
        if w == tw and h == th:
            return img, rois

        x1 = random.randint(0, w - tw)
        y1 = random.randint(0, h - th)
        box = (x1, y1, x1 + tw, y1 + th)
        return img.crop(box), RemoveOuterRois(box, rois)
354 | 
355 | 
class RandomHorizontalFlip(object):
    """Mirror the given PIL.Image left-to-right with probability 0.5, together with its RoIs."""

    def __call__(self, img, rois):
        """
        Args:
            img (PIL.Image): Image to be flipped.
            rois: RoIs belonging to ``img``; columns 1 and 3 are rewritten
                in place when the image is flipped.

        Returns:
            tuple: ``(image, rois)``, flipped or untouched.
        """
        if random.random() >= 0.5:
            return img, rois

        # Mirror columns 1 and 3 about the image width and swap them so the
        # smaller value stays in column 1.
        width = img.size[0]
        rois[:, [1, 3]] = width + 1 - rois[:, [3, 1]]
        return img.transpose(Image.FLIP_LEFT_RIGHT), rois
371 | 
class RandomSizedCrop(object):
    """Crop the given PIL.Image to a random size and aspect ratio, then resize it.

    Up to ten times, a crop covering 0.08 to 1.0 of the image area with an
    aspect ratio between 3/4 and 4/3 is drawn; the first one that fits inside
    the image is cut out and resized to ``(size, size)``. RoIs are moved into
    the crop and resized along with the image. If no draw fits, the image is
    scaled and center-cropped instead.

    Args:
        size: edge length of the square output
        interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img, rois):
        for _ in range(10):
            full_area = img.size[0] * img.size[1]
            target_area = random.uniform(0.08, 1.0) * full_area
            aspect_ratio = random.uniform(3. / 4, 4. / 3)

            crop_w = int(round(math.sqrt(target_area * aspect_ratio)))
            crop_h = int(round(math.sqrt(target_area / aspect_ratio)))

            # randomly swap orientation
            if random.random() < 0.5:
                crop_w, crop_h = crop_h, crop_w

            # the drawn crop must fit inside the image; otherwise redraw
            if crop_w > img.size[0] or crop_h > img.size[1]:
                continue

            left = random.randint(0, img.size[0] - crop_w)
            top = random.randint(0, img.size[1] - crop_h)
            box = (left, top, left + crop_w, top + crop_h)

            img = img.crop(box)
            rois = RemoveOuterRois(box, rois)
            assert(img.size == (crop_w, crop_h))

            out_size = (self.size, self.size)
            return img.resize(out_size, self.interpolation), ResizeRois(img.size, out_size, rois)

        # Fallback: scale, then crop around the center
        scale = Scale(self.size, interpolation=self.interpolation)
        crop = CenterCrop(self.size)

        img, rois = scale(img, rois)
        return crop(img, rois)
417 | 
def RemoveOuterRois(crop, rois):
    """Move RoIs into the coordinate frame of a crop and clamp them to it.

    Intended to be called right after cropping an image. Despite the name,
    no rows are removed here: boxes are only shifted and clamped, so boxes
    outside the crop become degenerate.

    Args:
        crop (tuple): Crop box ``(x1, y1, x2, y2)`` in the original image.
        rois (Tensor): One row per box; columns 1..4 hold x1, y1, x2, y2
            and are overwritten in place.

    Returns:
        Tensor: The same ``rois`` tensor, modified in place.
    """
    x1, y1, x2, y2 = crop
    crop_w = x2 - x1
    crop_h = y2 - y1

    # might be inaccurate due to crop interpolation
    rois[:, 1] = torch.clamp(rois[:, 1] - x1 + 1, min=1)
    rois[:, 2] = torch.clamp(rois[:, 2] - y1 + 1, min=1)
    rois[:, 3] = torch.clamp(rois[:, 3] - x1 + 1, max=crop_w)
    # y2 must be shifted by the crop's y offset (it used to subtract x1).
    rois[:, 4] = torch.clamp(rois[:, 4] - y1 + 1, max=crop_h)

    return rois
427 | 
def ResizeRois(sizeIn, sizeOut, rois):
    """Rescale RoIs to follow an image resize.

    Intended to be called right after resizing an image.

    Args:
        sizeIn (number or tuple): Image size before the resize, ``(w, h)``;
            a single number means a square image.
        sizeOut (number or tuple): Image size after the resize, ``(w, h)``;
            a single number means a square image.
        rois (Tensor): One row per box; columns 1..4 hold x1, y1, x2, y2
            and are overwritten in place.

    Returns:
        Tensor: The same ``rois`` tensor, modified in place. Coordinates are
        rounded, with x1/y1 clamped to at least 1 and x2/y2 clamped to at
        most the output width/height.
    """
    if isinstance(sizeIn, numbers.Number):
        inw = inh = int(sizeIn)
    else:
        inw, inh = sizeIn
    if isinstance(sizeOut, numbers.Number):
        outw = outh = int(sizeOut)
    else:
        outw, outh = sizeOut

    # relative box center and width/height, [index x1(horizontal) y1(vertical) x2 y2]
    bxr = (rois[:, 1] + rois[:, 3]) / 2 / inw
    byr = (rois[:, 2] + rois[:, 4]) / 2 / inh
    bwr = (rois[:, 3] - rois[:, 1]) / inw
    bhr = (rois[:, 4] - rois[:, 2]) / inh

    # box center and width/height in the output image
    bxnew = outw * bxr
    bynew = outh * byr
    bwnew = outw * bwr
    bhnew = outh * bhr

    rois[:, 1] = torch.clamp(torch.round(bxnew - bwnew / 2), min=1)
    rois[:, 2] = torch.clamp(torch.round(bynew - bhnew / 2), min=1)
    rois[:, 3] = torch.clamp(torch.round(bxnew + bwnew / 2), max=outw)
    # bottom edge = center + half the HEIGHT (it used to add bynew / 2).
    rois[:, 4] = torch.clamp(torch.round(bynew + bhnew / 2), max=outh)

    return rois
456 | 
def FixRois(size, rois):
    """Drop RoIs that are meaningless for an image of the given size.

    Meant to be used after a crop, with ``size`` describing the current
    image. A row is kept only when x1 >= 1, y1 >= 1, x1 < x2, y1 < y2,
    x2 <= size[0] and y2 <= size[1]. Duplicate rows are merged.

    Args:
        size (sequence): ``(width, height)`` of the current image.
        rois (Tensor): One row per box; columns 1..4 hold x1, y1, x2, y2.

    Returns:
        FloatTensor: The surviving unique rows, in the row order produced by
        ``np.unique``.
    """
    # (a whole-image box could be appended here; currently disabled)
    # rois_ = np.concatenate((rois.numpy(),np.array([[0,1,1,size[0],size[1]]])),axis=0)
    boxes = rois.numpy()
    left, top = boxes[:, 1], boxes[:, 2]
    right, bottom = boxes[:, 3], boxes[:, 4]

    keep = (left >= 1) & (top >= 1) & \
        (left < right) & (top < bottom) & \
        (right <= size[0]) & (bottom <= size[1])

    unique_rows = np.unique(boxes[keep], axis=0)
    return torch.from_numpy(unique_rows).type(torch.FloatTensor)
467 | 
def OffSet(rois, size, o0=8.5, o=9.5, stride=[16,16]):
    """Shift RoI corners by fixed offsets, then filter them with ``FixRois``.

    Columns 1 and 2 (x1, y1) are increased by ``-o0 + o + stride[0] * 0.5``;
    columns 3 and 4 (x2, y2) are increased by ``-o0 - o - stride[1] * 0.5``.
    The shifts are applied to ``rois`` in place.

    Args:
        rois (Tensor): One row per box, columns 1..4 = x1(horizontal,width)
            y1(vertical,height) x2 y2.
        size (sequence): ``(width, height)`` handed to ``FixRois``.
        o0, o (float): Offset terms of the shift.
        stride (sequence): Two stride values used in the shift.

    Returns:
        FloatTensor: Result of ``FixRois(size, rois)`` on the shifted boxes.
    """
    top_left_shift = (-o0+o + stride[0]*0.5)
    bottom_right_shift = (-o0-o - stride[1]*0.5)

    for column in (1, 2):
        rois[:, column] += top_left_shift
    for column in (3, 4):
        rois[:, column] += bottom_right_shift

    return FixRois(size, rois)
477 | 
478 | 


--------------------------------------------------------------------------------
/part proposal/CarProposalSSW_par.m:
--------------------------------------------------------------------------------
  1 | function  imdb = CarProposalSSW_par(imdb, cubDir)
  2 | % This demo shows how to use the software described in our IJCV paper: 
  3 | %   Selective Search for Object Recognition,
  4 | %   J.R.R. Uijlings, K.E.A. van de Sande, T. Gevers, A.W.M. Smeulders, IJCV 2013
  5 | %%
  6 | addpath('Dependencies');
  7 | 
  8 | fprintf('Demo of how to run the code for:\n');
  9 | fprintf('   J. Uijlings, K. van de Sande, T. Gevers, A. Smeulders\n');
 10 | fprintf('   Segmentation as Selective Search for Object Recognition\n');
 11 | fprintf('   IJCV 2013\n\n');
 12 | 
 13 | % Compile anisotropic gaussian filter
 14 | if(~exist('anigauss'))
 15 |     fprintf('Compiling the anisotropic gauss filtering of:\n');
 16 |     fprintf('   J. Geusebroek, A. Smeulders, and J. van de Weijer\n');
 17 |     fprintf('   Fast anisotropic gauss filtering\n');
 18 |     fprintf('   IEEE Transactions on Image Processing, 2003\n');
 19 |     fprintf('Source code/Project page:\n');
 20 |     fprintf('   http://staff.science.uva.nl/~mark/downloads.html#anigauss\n\n');
 21 |     mex Dependencies/anigaussm/anigauss_mex.c Dependencies/anigaussm/anigauss.c -output anigauss
 22 | end
 23 | 
 24 | if(~exist('mexCountWordsIndex'))
 25 |     mex Dependencies/mexCountWordsIndex.cpp
 26 | end
 27 | 
 28 | % Compile the code of Felzenszwalb and Huttenlocher, IJCV 2004.
 29 | if(~exist('mexFelzenSegmentIndex'))
 30 |     fprintf('Compiling the segmentation algorithm of:\n');
 31 |     fprintf('   P. Felzenszwalb and D. Huttenlocher\n');
 32 |     fprintf('   Efficient Graph-Based Image Segmentation\n');
 33 |     fprintf('   International Journal of Computer Vision, 2004\n');
 34 |     fprintf('Source code/Project page:\n');
 35 |     fprintf('   http://www.cs.brown.edu/~pff/segment/\n');
 36 |     fprintf('Note: A small Matlab wrapper was made. See demo.m for usage\n\n');
 37 | %     fprintf('   
 38 |     mex Dependencies/FelzenSegment/mexFelzenSegmentIndex.cpp -output mexFelzenSegmentIndex;
 39 | end
 40 | 
 41 | %%
 42 | % Parameters. Note that this controls the number of hierarchical
 43 | % segmentations which are combined.
 44 | colorTypes = {'Hsv', 'Lab', 'RGI', 'H', 'Intensity'};
 45 | 
 46 | % Here you specify which similarity functions to use in merging
 47 | simFunctionHandles = {@SSSimColourTextureSizeFillOrig, @SSSimTextureSizeFill, @SSSimBoxFillOrig, @SSSimSize};
 48 | 
 49 | % Thresholds for the Felzenszwalb and Huttenlocher segmentation algorithm.
 50 | % Note that by default, we set minSize = k, and sigma = 0.8.
 51 | ks = [50 100 150 300]; % controls size of segments of initial segmentation. 
 52 | sigma = 0.8;
 53 | 
 54 | % After segmentation, filter out boxes which have a width/height smaller
 55 | % than minBoxWidth (default = 20 pixels).
 56 | minBoxWidth = 0;%20
 57 | 
 58 | % Comment the following three lines for the 'quality' version
 59 | % colorTypes = colorTypes(1:2); % 'Fast' uses HSV and Lab
 60 | % simFunctionHandles = simFunctionHandles(1:2); % Two different merging strategies
 61 | % ks = ks(1:2);
 62 | 
 63 | % Test the boxes
 64 | % load('GroundTruthVOC2007test.mat'); % Load ground truth boxes and images and image names
 65 | fprintf('After box extraction, boxes smaller than %d pixels will be removed\n', minBoxWidth);
 66 | fprintf('Obtaining boxes for Cub:\n');
 67 | totalTime = 0;
 68 | 
 69 | imsize = zeros(numel(imdb.images.name),3) ;
 70 | parfor i=1:length(imdb.images.name)
 71 |     fprintf('Selective Search for %d image\n', i);
 72 |     
 73 |     boxesT = [];
 74 |     priorityT = [];
 75 |     
 76 |     % VOCopts.img
 77 |     im = imread(fullfile(cubDir,imdb.images.name{i}));
 78 |     if size(im,3) == 1
 79 |         im = cat(3, im, im, im) ;
 80 |         imwrite(im, fullfile(cubDir,imdb.images.name{i})) ;
 81 |     end 
 82 |     imsize(i,:) = size(im) ;
 83 |     
 84 |     imageScale =1 ;% deal with large images
 85 |     if size(im,1) >=1000 || size(im,2) >=1000
 86 |         im = imresize(im,0.5) ;
 87 |         imageScale = 0.5 ;
 88 |     end
 89 |     
 90 |     idx = 1;
 91 |     for j=1:length(ks)
 92 |         k = ks(j); % Segmentation threshold k
 93 |         minSize = k; % We set minSize = k
 94 |         for n = 1:length(colorTypes)
 95 |             colorType = colorTypes{n};
 96 |             tic;
 97 |             [boxesTT blobIndIm blobBoxes hierarchy priorityTT] = Image2HierarchicalGrouping(im, sigma, k, minSize, colorType, simFunctionHandles);
 98 |             totalTime = totalTime + toc;
 99 |             idx = idx + 1;
100 |             
101 |             boxesT = [boxesT ;boxesTT];
102 |             priorityT = [priorityT ; priorityTT];
103 |         end
104 |     end
105 |     
106 |     priority = priorityT; % Concatenate priorities
107 |     
108 |     % Do pseudo random sorting as in paper
109 |     priority = priority .* rand(size(priority));
110 |     [priority sortIds] = sort(priority, 'ascend');
111 |     boxesT = boxesT(sortIds,:);
112 |     
113 |     % add by Michael
114 |     boxScores{i} = priority;
115 |     boxes{i} =  boxesT/imageScale; % Concatenate boxes from all hierarchies
116 |     
117 | end
118 | fprintf('\n');
119 | 
120 | %%
121 | tic
122 | for i=1:length(boxes)
123 |     [boxes{i} boxScores{i}] = FilterBoxesWidth(boxes{i}, minBoxWidth, boxScores{i});
124 |     [boxes{i} boxScores{i}]= BoxRemoveDuplicates(boxes{i}, boxScores{i});
125 | end
126 | totalTime = totalTime + toc;
127 | 
128 | imdb.images.boxes = boxes;
129 | imdb.images.boxScores = boxScores;
130 | 
131 | imdb.images.size = imsize(:,1:2) ;
132 | 
133 | fprintf('Time per image: %.2f\nNow evaluating the boxes on Cub...\n', totalTime ./ length(imdb.images.name));
134 | 
135 | % %%
136 | % [boxAbo boxMabo boScores avgNumBoxes] = BoxAverageBestOverlap(gtBoxes, gtImIds, boxes);
137 | % 
138 | % fprintf('Mean Average Best Overlap for the box-based locations: %.3f\n', boxMabo);


--------------------------------------------------------------------------------
/part proposal/Car_get_database_SSW.m:
--------------------------------------------------------------------------------
 1 | function imdb = cars_get_database_SSW(varargin)
 2 | % Modified from 2015 Tsung-Yu Lin, Aruni RoyChowdhury, Subhransu Maji.
 3 | % used to prepare the car-196 dataset of imdb.mat for MatCovNet. 
 4 | % imdb.images.boxes stores the proposed rois.
 5 | 
 6 | 
 7 | carsDir = '/raid/L/Fine-grained Dataset/Cars-196';
 8 | useCropped = false;
 9 | ifval = true;
10 | 
11 | if useCropped
12 |     imdb.imageDir = fullfile(carsDir, 'images_cropped') ;
13 | else
14 |     imdb.imageDir = fullfile(carsDir);
15 | end
16 | 
17 | imdb.maskDir = fullfile(carsDir, 'masks'); % doesn't exist
18 | imdb.sets = {'train', 'val', 'test'};
19 | 
20 | load(fullfile(carsDir, 'cars_annos'));
21 | 
22 | % Class names
23 | imdb.classes.name = class_names';
24 | 
25 | 
26 | N = numel(annotations);
27 | 
28 | imdb.images.name = cell(N, 1);
29 | imdb.images.id = 1:N;
30 | imdb.images.label = zeros(1,N);
31 | imdb.images.bounds = zeros(4, N);
32 | imdb.images.set = 3.*ones(1, N);
33 | imdb.images.difficult = false(1, N) ; 
34 | 
35 | % Image names
36 | for i=1:numel(annotations)
37 | 
38 |     imdb.images.name{i} = annotations(i).relative_im_path;
39 | 
40 |     % Class labels
41 |     imdb.images.label(i) = annotations(i).class;
42 | 
43 |     % Bounding boxes
44 |     
45 |     imdb.images.bounds(:,i) = round([annotations(i).bbox_x1 annotations(i).bbox_y1 annotations(i).bbox_x2 annotations(i).bbox_y2]');
46 | 
47 |     % Image sets
48 |     if(~annotations(i).test)
49 |         imdb.images.set(i) = 1;
50 |     end
51 | 
52 | 
53 | end
54 | 
55 | % Class labels
56 | % modified by Michael
57 | classLabel = imdb.images.label ;
58 | imdb.images.label = -ones(numel(imdb.classes.name),numel(classLabel));
59 | for i=1:numel(classLabel)
60 |     imdb.images.label(classLabel(i),i)=1;
61 | end
62 | 
63 | % Image size, update  it in CubProposalSSW_par.m
64 | imdb.images.size = [] ;
65 | 
66 | % Image size
67 | imdb.images.size = [] ;
68 | 
69 | % add image files to imdb
70 | % imdb.images.image = vl_imreadjpeg(strcat([imdb.imageDir filesep], imdb.images.name) , 'NumThreads', 8 );
71 | 
72 | 
73 | if(ifval)
74 | 
75 | trainSize = numel(find(imdb.images.set==1));
76 | validSize = round(trainSize/3);
77 | 
78 | trainIdx = find(imdb.images.set==1);
79 | 
80 | % set 1/3 of train set to validation
81 | valIdx = trainIdx(randperm(trainSize, validSize));
82 | imdb.images.set(valIdx) = 2;
83 | 
84 | end
85 | 
86 | 
87 | imdb.meta.classes = imdb.classes.name ;
88 | imdb.meta.inUse = true(1,numel(imdb.meta.classes)) ;
89 | 
90 | 
91 | % add by Michael
92 | % calculate proposals using SSW
93 | addpath('SelectiveSearchCodeIJCV');
94 | addpath(fullfile('SelectiveSearchCodeIJCV', 'Dependencies'));
95 | imdb = CarProposalSSW_par(imdb, carsDir) ;
96 | 
97 | save('data/Car/car_imdb.mat','-struct', 'imdb', '-v7.3');
98 | 
99 | 


--------------------------------------------------------------------------------
/part proposal/Readme.md:
--------------------------------------------------------------------------------
 1 | ## This directory is used to generate rois for PFNet. 
 2 | 
 3 | 
1. Download [Selective Search](https://koen.me/research/selectivesearch/) and extract it here. It is a MATLAB toolbox that generates selective search windows (SSW).
 5 | 
2. Run `Car_get_database_SSW.m` to get a `car_imdb.mat` file for the dataset, which contains image data, rois and other metadata. Please make sure you have moved the images to a suitable directory. Note that the rois themselves are generated by `CarProposalSSW_par.m`, which this script calls.
 7 | 
3. `roisWarpperforPytorch_generatetxt.m` uses `car_imdb.mat` to generate the `.txt` files for PyTorch. Generated rois for CUB-200-2011, Stanford Cars and FGVC-Aircraft are [provided](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP). Here is an example:
 9 | ```
10 | 0 2 2 1024 768
11 | 0 194 76 336 258
12 | 0 218 2 638 458
13 | 0 2 16 1024 454
14 | 0 638 466 792 580
15 | 0 2 318 1024 768
16 | 0 652 404 1024 768
17 | ```
Each line represents a proposed bounding box. The five values, e.g. `0 2 2 1024 768`, are the identifier, x1 (horizontal), y1 (vertical), x2 and y2, respectively.
19 | 
20 | 


--------------------------------------------------------------------------------
/part proposal/roisWarpperforPytorch_generatetxt.m:
--------------------------------------------------------------------------------
% Convert the proposals stored in car_imdb.mat into one .txt file per image.
% Each output row is written as [0 x1 y1 x2 y2], space separated.
imdb = load('data/Car/car_imdb.mat') ;

% -------------------------------------------------------------------------
% Pass 1: filter, de-duplicate and cap the boxes of every image.

maxNum = 500 ;   % maximum number of boxes kept per image
for i=1:numel(imdb.images.name)
    bbox = imdb.images.boxes{i};% one box per row; presumably [y1 x1 y2 x2], see pass 2
    imsize = imdb.images.size(i,:) ;   % read but not used below

    % keep boxes whose extent exceeds 20 along both axes
    isGood = (bbox(:,3)-bbox(:,1))>20 & (bbox(:,4)-bbox(:,2))>20;
    bbox = bbox(isGood,:);
    
    % remove boxes that coincide once coordinates are divided by 16 and
    % rounded; the first occurrence is kept and the original order restored
    [dummy, uniqueIdx] = unique(round(bbox/16), 'rows', 'first');
    uniqueIdx = sort(uniqueIdx);
    bbox = bbox(uniqueIdx,:);
    
    % limit number for training (the cap currently applies to every image;
    % the disabled condition would have exempted set 3)
    if 1 % imdb.images.set(i)~=3
        nB = min(size(bbox,1),maxNum);
    else
        nB = size(bbox,1);
    end
    
    imdb.images.boxes{i} = bbox(1:nB,:);
    i
end

mkdir('car_ims') ;

% Pass 2: write one text file per image next to the image's relative path,
% replacing the last four characters of the name with '.txt'.
parfor i = 1:numel(imdb.images.name)
    rois_ = imdb.images.boxes{i} ;% y1(vertical) x1(horizontal)  y2 x2
    % prepend a zero column and reorder the columns to (x1,y1,x2,y2)
    rois = [zeros(size(rois_,1),1) rois_(:,2) rois_(:,1) rois_(:,4) rois_(:,3)] ;% input (x1,y1,x2,y2)
    dlmwrite(fullfile([imdb.images.name{i}(1:end-4) '.txt']),rois,' ') ;
    i
end
38 | 


--------------------------------------------------------------------------------
/pic/PFNet.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingyunLiang/PFNet-FGVC/a1dcf1ddd7427b4a907ab126653dcad505599cc4/pic/PFNet.jpg


--------------------------------------------------------------------------------
/pic/dog_loss_acc1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingyunLiang/PFNet-FGVC/a1dcf1ddd7427b4a907ab126653dcad505599cc4/pic/dog_loss_acc1.png


--------------------------------------------------------------------------------