├── PFNet_train_test.py ├── README.md ├── lib ├── car_multilable_rois.py ├── layer_utils │ └── roi_pooling │ │ ├── _ext │ │ └── roi_pooling │ │ │ ├── __init__.py │ │ │ └── _roi_pooling.so │ │ ├── build.py │ │ ├── roi_pool.py │ │ ├── roi_pool_py.py │ │ └── src │ │ ├── cuda │ │ ├── roi_pooling_kernel.cu │ │ ├── roi_pooling_kernel.cu.o │ │ └── roi_pooling_kernel.h │ │ ├── roi_pooling.c │ │ ├── roi_pooling.h │ │ ├── roi_pooling_cuda.c │ │ └── roi_pooling_cuda.h └── transforms_with_rois.py ├── part proposal ├── CarProposalSSW_par.m ├── Car_get_database_SSW.m ├── Readme.md └── roisWarpperforPytorch_generatetxt.m └── pic ├── PFNet.jpg └── dog_loss_acc1.png /PFNet_train_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Jingyun Liang et al. 2 | # All rights reserved. 3 | 4 | import argparse 5 | import os 6 | import shutil 7 | import time 8 | import sys 9 | import matplotlib.pyplot as plt 10 | import matplotlib.ticker as ticker 11 | import math 12 | import pprint 13 | import numbers 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.parallel 18 | import torch.backends.cudnn as cudnn 19 | import torch.distributed as dist 20 | import torch.optim 21 | import torch.utils.data 22 | import torch.utils.data.distributed 23 | import torchvision.models as models 24 | import torch.nn.functional as F 25 | 26 | 27 | # dataset preparation, self-defined transforms with rois and spp layer 28 | from lib.car_multilable_rois import ImageFolder as car_multi 29 | import lib.transforms_with_rois as transforms 30 | from lib.layer_utils.roi_pooling.roi_pool import RoIPoolFunction 31 | 32 | 33 | model_names = sorted(name for name in models.__dict__ 34 | if name.islower() and not name.startswith("__") 35 | and callable(models.__dict__[name])) 36 | 37 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 38 | parser.add_argument('--data', metavar='DIR', default='none', 39 | help='path to 
dataset') 40 | parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18', 41 | choices=model_names, 42 | help='model architecture: ' + 43 | ' | '.join(model_names) + 44 | ' (default: vgg19)') 45 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 46 | help='number of data loading workers (default: 4)') 47 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 48 | help='number of total epochs to run') 49 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 50 | help='manual epoch number (useful on restarts)') 51 | parser.add_argument('-b', '--batch-size', default=256, type=int, 52 | metavar='N', help='mini-batch size (default: 256)') 53 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 54 | metavar='LR', help='initial learning rate') 55 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 56 | help='momentum') 57 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 58 | metavar='W', help='weight decay (default: 1e-4)') 59 | parser.add_argument('--print-freq', '-p', default=10, type=int, 60 | metavar='N', help='print frequency (default: 10, no internel ouput: 0)') 61 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 62 | help='path to latest checkpoint (default: none)') 63 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 64 | help='evaluate model on validation set') 65 | parser.add_argument('--world-size', default=1, type=int, 66 | help='number of distributed processes') 67 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 68 | help='url used to set up distributed training') 69 | parser.add_argument('--dist-backend', default='gloo', type=str, 70 | help='distributed backend') 71 | parser.add_argument('--input-crop', default=224, type=int, 72 | help='input image crop size (default: 224)') 73 | parser.add_argument('--input-scale', default=256, 
type=int, 74 | help='input image scale size (default: 256)') 75 | parser.add_argument('--lr-stepsize', '--learning-rate-stepsize', default=30, type=int, 76 | metavar='LR', help='learning rate stepsize') 77 | parser.add_argument('--num-Classes', type=int, 78 | help='number of dataset classes') 79 | parser.add_argument('--maximum-Rois', dest='maximumRois', default=100, type=int, 80 | help='maximum number of rois') 81 | 82 | 83 | best_prec1 = 0 84 | plot_statistic = {"train_loss":[],"test_loss":[],"train_acc1":[],"test_acc1":[]} 85 | 86 | 87 | def main(): 88 | global args, best_prec1, modelDir, log_file, plot_statistic 89 | 90 | args = parser.parse_args() 91 | args.data = 'car' # dataset name: cub car aircraft 92 | args.numClasses = 196 # cub 200 car 196 aircraft 100 93 | args.arch = 'vgg19' # backbone CNN 94 | args.maximumRois = 500 # number of rois 95 | modelDir = args.data +'_'+ args.arch +'_test' # checkpoint dir 96 | args.resume = os.path.join(modelDir, 'epoch-' + '15' + '-checkpoint.pth.tar') # 1,2,3, 0 for no resume checkpoint 97 | args.evaluate = True 98 | 99 | args.epochs = 20 100 | args.batch_size = 1 101 | args.lr = 1e-4 102 | args.lr_stepsize = 10 103 | args.weight_decay = 5e-4 104 | 105 | args.workers = 2 106 | args.print_freq = 10 107 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 108 | args.distributed = args.world_size > 1 109 | 110 | timestamp = time.strftime("%Y-%m-%d_%H-%M-%S") 111 | log_file = modelDir + "_{}.log".format(timestamp) 112 | if not os.path.exists(modelDir): 113 | os.mkdir(modelDir) 114 | shutil.copy(os.path.abspath(__file__),modelDir) 115 | os.rename(os.path.join(modelDir,os.path.basename(__file__)),\ 116 | os.path.join(modelDir,os.path.basename(__file__))[:-3]+"_{}.py".format(timestamp)) 117 | 118 | printlog(args) 119 | 120 | if args.distributed: 121 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 122 | world_size=args.world_size) 123 | 124 | 125 | # create model 126 | printlog("=> using imagenet 
pre-trained model '{}'".format(args.arch)) 127 | if 'vgg' in args.arch : 128 | model = models.__dict__[args.arch](pretrained=True) 129 | model.classifier._modules['6'] = nn.Linear(model.classifier[6].in_features, args.numClasses) 130 | model = VggBasedNet_PFNet(originalModel = model) 131 | else: 132 | raise ValueError 133 | 134 | printlog(model) 135 | 136 | if not args.distributed: 137 | model = torch.nn.DataParallel(model.cuda()) 138 | else: 139 | model.cuda() 140 | model = torch.nn.parallel.DistributedDataParallel(model) 141 | 142 | # define optimizer 143 | params = [] 144 | if 'vgg' in args.arch : 145 | for key, value in dict(model.named_parameters()).items(): 146 | if value.requires_grad: 147 | if 'features' in key or 'conv5' in key: 148 | smaller_lr = 0.1 149 | else: 150 | smaller_lr = 1 151 | 152 | if 'bias' in key: 153 | params += [{'params':[value],'lr':args.lr*smaller_lr, 'weight_decay': False and args.weight_decay or 0}] 154 | else: 155 | params += [{'params':[value],'lr':args.lr*smaller_lr, 'weight_decay': args.weight_decay}] 156 | else: 157 | printlog('layer --{0}-- is fixed.'.format(key)) 158 | 159 | optimizer = torch.optim.SGD(params, momentum=args.momentum) 160 | else: 161 | raise ValueError 162 | 163 | # optionally resume from a checkpoint 164 | if args.resume : 165 | printlog("=> loading specified checkpoint '{}'".format(args.resume)) 166 | 167 | if os.path.isfile(args.resume): 168 | checkpoint = torch.load(args.resume) 169 | args.start_epoch = checkpoint['epoch'] 170 | best_prec1 = checkpoint['best_prec1'] 171 | model.load_state_dict(checkpoint['state_dict']) 172 | optimizer.load_state_dict(checkpoint['optimizer']) 173 | plot_statistic = checkpoint['loss_acc1'] 174 | printlog("=> loaded checkpoint '{}' (epoch {})" 175 | .format(args.resume, checkpoint['epoch'])) 176 | else: 177 | printlog("=> no checkpoint found at '{}'".format(args.resume)) 178 | 179 | cudnn.benchmark = True 180 | 181 | 182 | # prepare data 183 | 
def train(train_loader, model, criterion, optimizer, epoch):
    """Train the model for one epoch and record the epoch statistics.

    Args:
        train_loader: iterable yielding ``(inputs, target)`` pairs, where
            ``inputs[0]`` is the image tensor and ``inputs[1]`` holds the rois.
        model: network called as ``model(images, rois)``; it returns the
            image-level output and the per-roi soft matrix.
        criterion: two-element sequence; ``criterion[0]`` is applied to the
            image-level output and ``criterion[1]`` to the soft matrix.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only for logging.

    Side effects:
        Appends the average loss and top-1 accuracy of the epoch to the
        module-level ``plot_statistic`` and writes progress via ``printlog``.
    """
    step_timer = AverageMeter()
    load_timer = AverageMeter()
    loss_meter = AverageMeter()
    acc1_meter = AverageMeter()
    acc5_meter = AverageMeter()

    model.train()

    tick = time.time()
    for step, (inputs, labels) in enumerate(train_loader):
        # time spent waiting for the data loader
        load_timer.update(time.time() - tick)

        images = inputs[0].cuda()          # image tensor
        rois = inputs[1][0, :, :].cuda()   # rois matrix of the first sample
        labels = labels.cuda()
        n_images = images.size(0)

        images_var = torch.autograd.Variable(images)
        rois_var = torch.autograd.Variable(rois)
        labels_var = torch.autograd.Variable(labels)

        # forward pass and two-level loss
        output, softMatrix = model(images_var, rois_var)
        image_loss = criterion[0](output, labels_var)
        part_loss = criterion[1](softMatrix, labels_var)
        loss = image_loss + part_loss

        # bookkeeping: accuracy and loss
        prec1, prec5 = accuracy(output.data, labels, topk=(1, 5))
        loss_meter.update(loss.data[0], n_images)
        acc1_meter.update(prec1[0], n_images)
        acc5_meter.update(prec5[0], n_images)

        # backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # time spent on the whole iteration
        step_timer.update(time.time() - tick)
        tick = time.time()

        # print_freq == 0 disables the intermediate output
        if args.print_freq and step % args.print_freq == 0:
            printlog('Epoch: [{0}][{1}/{2}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                     'Prec_1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                         epoch, step, len(train_loader),
                         batch_time=step_timer, data_time=load_timer,
                         loss=loss_meter, top1=acc1_meter))

    stamp = time.strftime("%H-%M-%S")
    printlog('Epoch {0} \t\t\t Model {1} \t Time {2}'.format(epoch, modelDir, stamp))
    printlog('Train Loss {loss.avg:.4f} top1 {top1.avg:.3f} BatchTime{batch_time.avg:.3f}'
             .format(loss=loss_meter, top1=acc1_meter, batch_time=step_timer))

    plot_statistic["train_loss"].append(loss_meter.avg)
    plot_statistic["train_acc1"].append(acc1_meter.avg)
class AverageMeter(object):
    """Track the most recent value and the running weighted average of a metric.

    Attributes (all reset to 0 by ``reset``):
        val: the value passed to the latest ``update`` call.
        sum: weighted sum of all values seen so far.
        count: total weight (number of samples) seen so far.
        avg: ``sum / count``; stays at its previous value while ``count`` is 0.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples and refresh the average.

        Args:
            val: the new measurement (a number or anything supporting
                ``*`` and ``/`` with numbers).
            n: weight of the measurement, typically the batch size.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty batch as the very first update)
        # used to raise ZeroDivisionError; keep the previous average instead.
        if self.count:
            self.avg = self.sum / self.count
def get_data_loader():
    """Build the training and validation data loaders.

    Reads ``args.input_crop``, ``args.data``, ``args.distributed``,
    ``args.batch_size`` and ``args.workers`` from the module-level ``args``.

    Returns:
        A ``(train_loader, val_loader, train_sampler)`` tuple.
        ``train_sampler`` is a ``DistributedSampler`` over the training
        dataset when ``args.distributed`` is set, otherwise ``None``.

    Raises:
        ValueError: if ``args.data`` names a dataset other than ``'car'``.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_transforms = transforms.Compose([
        transforms.Scale(args.input_crop, scaleheight=[250, 350, 450, 550, 650]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    val_transforms = transforms.Compose([
        # original author's note: 79.75% for above test
        transforms.Scale(args.input_crop, scaleheight=[250, 350, 450, 550, 650]),
        transforms.ToTensor(),
        normalize,
    ])

    if args.data == 'car':
        train_dataset = car_multi(args.data, 'trainval', transform=train_transforms)
        val_dataset = car_multi(args.data, 'test', transform=val_transforms)
    else:
        raise ValueError("unsupported dataset '{}'".format(args.data))

    if args.distributed:
        # The sampler must partition the dataset; the original code passed the
        # transform pipeline here, which has no length and cannot be sampled.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # shuffle and sampler are mutually exclusive in DataLoader, so shuffling
    # is only requested when no sampler is in use.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    return train_loader, val_loader, train_sampler
class PartAttentionLoss(nn.Module):
    """Part attention loss: a focal-style weighted log loss over part scores.

    For every entry, ``p_t`` is ``softMatrix`` where ``target`` is +1,
    ``1 - softMatrix`` where ``target`` is -1 and 0.5 where ``target`` is 0
    (slightly squashed into ``[1e-5, 0.99991]`` so the log stays finite).
    The loss is ``-log(p_t) * (1 - p_t) ** lamda``, summed over all entries,
    divided by the number of rows of ``softMatrix`` and multiplied by ``scale``.

    Args:
        lamda: exponent of the ``(1 - p_t)`` weighting term (default 1, the
            value that was previously hard-coded).
        scale: constant multiplier applied to the averaged loss (default 5,
            the value that was previously hard-coded).
    """

    def __init__(self, lamda=1, scale=5):
        super(PartAttentionLoss, self).__init__()
        self.lamda = lamda
        self.scale = scale

    def forward(self, softMatrix, target):
        """Return the scalar part attention loss.

        Args:
            softMatrix: per-part class probabilities; values are expected in
                ``[0, 1]``. NOTE(review): callers pass one row per roi --
                confirm against the model's forward pass.
            target: label tensor broadcastable against ``softMatrix``.
        """
        # Probability assigned to the labelled outcome, kept away from 0 and 1.
        p_t = (target.mul(softMatrix - 0.5) + 0.5) * 0.9999 + 1e-5
        weighted = p_t.log().mul(torch.pow(1 - p_t, self.lamda))
        return -weighted.sum() / softMatrix.size(0) * self.scale
More specifically, PFNet consists of a part feature extractor to extract part features and a two-level classification network to utilize part-level and image-level features simultaneously. Part-level features are trained with the weighted part loss, which embeds a weighting mechanism based on different parts' characteristics. Easy parts, hard parts and background parts are proposed and discriminatively used for classification. Moreover, part-level features are fused to form an image-level feature so as to introduce global supervision and generate final predictions. Experiments on three popular benchmark datasets show that our framework achieves competitive performance compared with the state-of-the-art. 7 | 8 | ![alt text](https://github.com/MichaelLiang12/PFNet-FGVC/blob/master/pic/PFNet.jpg "visualization") 9 | 10 | ## Prepare Datasets 11 | 12 | Prepare the corresponding datasets ([CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Cars](http://ai.stanford.edu/~jkrause/cars/car_dataset.html) or [FGVC-Aircraft](http://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/)) before training PFNet. For a quick start, you can download the [Stanford Cars](http://ai.stanford.edu/~jkrause/cars/car_dataset.html) dataset, the proposed RoI files [car_rois500.tar.gz](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP) and the prepared train/test split file [car_splits.tar.gz](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP). Unzip these files and organize them in the current working directory as follows: 13 | ``` 14 | -car 15 | --car_ims 16 | ---000001.jpg 17 | 18 | --car_rois500 19 | ---car_ims 20 | ----000001.txt 21 | 22 | --split 23 | ---Acura Integra Type R 2001_test.txt 24 | ``` 25 | 26 | For part proposals, we also provide code for generating them using [Selective Search Window](https://koen.me/research/selectivesearch/). Please refer to the guide provided in our `part proposal` directory.
27 | 28 | 29 | 30 | ## Usage 31 | 32 | 1, Download this repo recursively: 33 | ```shell 34 | git clone --recursive https://github.com/MichaelLiang12/PFNet-FGVC.git 35 | ``` 36 | 2, Build RoiPooling module 37 | 38 | Please follow the instructions in [pytorch-faster-rcnn](https://github.com/ruotianluo/pytorch-faster-rcnn#installation). We use the RoiPooling module implemented by them. Note that if you also use `Ubuntu14.04+Cuda8.0+TitanX`, you might not need to compile again. 39 | 40 | 41 | 3, Run `PFNet_train_test.py` 42 | 43 | You can modify fundamental parameters in the `main()` function. The training process should look like the figure below. To test directly, download [our model](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP) and set `args.evaluate = True`. 44 | 45 | ![alt text](https://github.com/MichaelLiang12/PFNet-FGVC/blob/master/pic/dog_loss_acc1.png "visualization") 46 | 47 | ## Citation 48 | For Selective Search Window and RoiPooling module. 49 | ``` 50 | @article{uijlings2013selective, 51 | title={Selective search for object recognition}, 52 | author={Uijlings, Jasper RR and Van De Sande, Koen EA and Gevers, Theo and Smeulders, Arnold WM}, 53 | journal={International Journal of Computer Vision}, 54 | volume={104}, 55 | number={2}, 56 | pages={154--171}, 57 | year={2013}, 58 | publisher={Springer} 59 | } 60 | 61 | @article{chen17implementation, 62 | Author = {Xinlei Chen and Abhinav Gupta}, 63 | Title = {An Implementation of Faster RCNN with Study for Region Sampling}, 64 | Journal = {arXiv preprint arXiv:1702.02138}, 65 | Year = {2017} 66 | } 67 | ``` 68 | ## Citation for our PFNet 69 | ``` 70 | @Article{Liang2018, 71 | author="Liang, Jingyun 72 | and Guo, Jinlin 73 | and Guo, Yanming 74 | and Lao, Songyang", 75 | title="PFNet: a novel part fusion network for fine-grained visual categorization", 76 | journal="Multimedia Tools and Applications", 77 | year="2018", 78 | month="Dec", 79 | day="15", 80 | issn="1573-7721", 81 |
doi="10.1007/s11042-018-7047-5", 82 | url="https://doi.org/10.1007/s11042-018-7047-5" 83 | } 84 | ``` 85 | 86 | 87 | [View Paper](https://doi.org/10.1007/s11042-018-7047-5) 88 | -------------------------------------------------------------------------------- /lib/car_multilable_rois.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import os.path 4 | import tarfile 5 | from urllib.parse import urlparse 6 | 7 | import numpy as np 8 | import torch 9 | import torch.utils.data as data 10 | from PIL import Image 11 | 12 | import io 13 | 14 | 15 | object_categories = ['AM General Hummer SUV 2000','Acura RL Sedan 2012','Acura TL Sedan 2012','Acura TL Type-S 2008','Acura TSX Sedan 2012','Acura Integra Type R 2001','Acura ZDX Hatchback 2012','Aston Martin V8 Vantage Convertible 2012','Aston Martin V8 Vantage Coupe 2012','Aston Martin Virage Convertible 2012','Aston Martin Virage Coupe 2012','Audi RS 4 Convertible 2008','Audi A5 Coupe 2012','Audi TTS Coupe 2012','Audi R8 Coupe 2012','Audi V8 Sedan 1994','Audi 100 Sedan 1994','Audi 100 Wagon 1994','Audi TT Hatchback 2011','Audi S6 Sedan 2011','Audi S5 Convertible 2012','Audi S5 Coupe 2012','Audi S4 Sedan 2012','Audi S4 Sedan 2007','Audi TT RS Coupe 2012','BMW ActiveHybrid 5 Sedan 2012','BMW 1 Series Convertible 2012','BMW 1 Series Coupe 2012','BMW 3 Series Sedan 2012','BMW 3 Series Wagon 2012','BMW 6 Series Convertible 2007','BMW X5 SUV 2007','BMW X6 SUV 2012','BMW M3 Coupe 2012','BMW M5 Sedan 2010','BMW M6 Convertible 2010','BMW X3 SUV 2012','BMW Z4 Convertible 2012','Bentley Continental Supersports Conv. 
Convertible 2012','Bentley Arnage Sedan 2009','Bentley Mulsanne Sedan 2011','Bentley Continental GT Coupe 2012','Bentley Continental GT Coupe 2007','Bentley Continental Flying Spur Sedan 2007','Bugatti Veyron 16.4 Convertible 2009','Bugatti Veyron 16.4 Coupe 2009','Buick Regal GS 2012','Buick Rainier SUV 2007','Buick Verano Sedan 2012','Buick Enclave SUV 2012','Cadillac CTS-V Sedan 2012','Cadillac SRX SUV 2012','Cadillac Escalade EXT Crew Cab 2007','Chevrolet Silverado 1500 Hybrid Crew Cab 2012','Chevrolet Corvette Convertible 2012','Chevrolet Corvette ZR1 2012','Chevrolet Corvette Ron Fellows Edition Z06 2007','Chevrolet Traverse SUV 2012','Chevrolet Camaro Convertible 2012','Chevrolet HHR SS 2010','Chevrolet Impala Sedan 2007','Chevrolet Tahoe Hybrid SUV 2012','Chevrolet Sonic Sedan 2012','Chevrolet Express Cargo Van 2007','Chevrolet Avalanche Crew Cab 2012','Chevrolet Cobalt SS 2010','Chevrolet Malibu Hybrid Sedan 2010','Chevrolet TrailBlazer SS 2009','Chevrolet Silverado 2500HD Regular Cab 2012','Chevrolet Silverado 1500 Classic Extended Cab 2007','Chevrolet Express Van 2007','Chevrolet Monte Carlo Coupe 2007','Chevrolet Malibu Sedan 2007','Chevrolet Silverado 1500 Extended Cab 2012','Chevrolet Silverado 1500 Regular Cab 2012','Chrysler Aspen SUV 2009','Chrysler Sebring Convertible 2010','Chrysler Town and Country Minivan 2012','Chrysler 300 SRT-8 2010','Chrysler Crossfire Convertible 2008','Chrysler PT Cruiser Convertible 2008','Daewoo Nubira Wagon 2002','Dodge Caliber Wagon 2012','Dodge Caliber Wagon 2007','Dodge Caravan Minivan 1997','Dodge Ram Pickup 3500 Crew Cab 2010','Dodge Ram Pickup 3500 Quad Cab 2009','Dodge Sprinter Cargo Van 2009','Dodge Journey SUV 2012','Dodge Dakota Crew Cab 2010','Dodge Dakota Club Cab 2007','Dodge Magnum Wagon 2008','Dodge Challenger SRT8 2011','Dodge Durango SUV 2012','Dodge Durango SUV 2007','Dodge Charger Sedan 2012','Dodge Charger SRT-8 2009','Eagle Talon Hatchback 1998','FIAT 500 Abarth 2012','FIAT 500 Convertible 
2012','Ferrari FF Coupe 2012','Ferrari California Convertible 2012','Ferrari 458 Italia Convertible 2012','Ferrari 458 Italia Coupe 2012','Fisker Karma Sedan 2012','Ford F-450 Super Duty Crew Cab 2012','Ford Mustang Convertible 2007','Ford Freestar Minivan 2007','Ford Expedition EL SUV 2009','Ford Edge SUV 2012','Ford Ranger SuperCab 2011','Ford GT Coupe 2006','Ford F-150 Regular Cab 2012','Ford F-150 Regular Cab 2007','Ford Focus Sedan 2007','Ford E-Series Wagon Van 2012','Ford Fiesta Sedan 2012','GMC Terrain SUV 2012','GMC Savana Van 2012','GMC Yukon Hybrid SUV 2012','GMC Acadia SUV 2012','GMC Canyon Extended Cab 2012','Geo Metro Convertible 1993','HUMMER H3T Crew Cab 2010','HUMMER H2 SUT Crew Cab 2009','Honda Odyssey Minivan 2012','Honda Odyssey Minivan 2007','Honda Accord Coupe 2012','Honda Accord Sedan 2012','Hyundai Veloster Hatchback 2012','Hyundai Santa Fe SUV 2012','Hyundai Tucson SUV 2012','Hyundai Veracruz SUV 2012','Hyundai Sonata Hybrid Sedan 2012','Hyundai Elantra Sedan 2007','Hyundai Accent Sedan 2012','Hyundai Genesis Sedan 2012','Hyundai Sonata Sedan 2012','Hyundai Elantra Touring Hatchback 2012','Hyundai Azera Sedan 2012','Infiniti G Coupe IPL 2012','Infiniti QX56 SUV 2011','Isuzu Ascender SUV 2008','Jaguar XK XKR 2012','Jeep Patriot SUV 2012','Jeep Wrangler SUV 2012','Jeep Liberty SUV 2012','Jeep Grand Cherokee SUV 2012','Jeep Compass SUV 2012','Lamborghini Reventon Coupe 2008','Lamborghini Aventador Coupe 2012','Lamborghini Gallardo LP 570-4 Superleggera 2012','Lamborghini Diablo Coupe 2001','Land Rover Range Rover SUV 2012','Land Rover LR2 SUV 2012','Lincoln Town Car Sedan 2011','MINI Cooper Roadster Convertible 2012','Maybach Landaulet Convertible 2012','Mazda Tribute SUV 2011','McLaren MP4-12C Coupe 2012','Mercedes-Benz 300-Class Convertible 1993','Mercedes-Benz C-Class Sedan 2012','Mercedes-Benz SL-Class Coupe 2009','Mercedes-Benz E-Class Sedan 2012','Mercedes-Benz S-Class Sedan 2012','Mercedes-Benz Sprinter Van 2012','Mitsubishi Lancer 
def read_image_label(file):
    """Read one per-class split file into a ``{image_name: label}`` dict.

    Each non-blank line is expected to hold an image name followed by an
    integer label, separated by whitespace; the first field is the name and
    the last field is the label.

    Args:
        file: path of the split file to read.

    Returns:
        dict mapping image name (str) to label (int).

    Raises:
        ValueError: if the last field of a non-blank line is not an integer.
    """
    print('[dataset] read ' + file)
    labels = dict()
    with open(file, 'r') as f:
        for line in f:
            # Split on any run of whitespace so that blank lines and trailing
            # spaces before the newline no longer break the int() conversion.
            fields = line.split()
            if not fields:
                continue
            labels[fields[0]] = int(fields[-1])
    return labels
def read_object_labels_csv(file, header=True):
    """Load ``(name, labels)`` pairs from a label csv file.

    Args:
        file: path of the csv file; every data row is an image name followed
            by one numeric label per category.
        header: when truthy, the first row is treated as a header and skipped.

    Returns:
        list of ``(name, labels)`` tuples where ``labels`` is a float32
        ``torch`` tensor. The number of label columns is taken from the first
        data row and reused for the following rows.
    """
    print('[dataset] read', file)
    samples = []
    width = 0
    with open(file, 'r') as handle:
        for index, fields in enumerate(csv.reader(handle)):
            if index == 0 and header:
                continue  # header row carries the column names only
            if width == 0:
                width = len(fields) - 1
            name = fields[0]
            values = np.asarray(fields[1:width + 1]).astype(np.float32)
            samples.append((name, torch.from_numpy(values)))
    return samples
def write_rois_pt(path_rois, imagename):
    """Convert per-image RoI text files into serialized float tensors.

    For every name in ``imagename`` this reads ``<path_rois>/<name>.txt`` with
    ``numpy.loadtxt`` and stores the result next to it as ``<name>.pt``.

    Args:
        path_rois: directory holding the ``.txt`` files; the ``.pt`` files are
            written into the same directory.
        imagename: iterable of image names (without extension).

    The saved tensor is always 2-D (one row per RoI).  A file containing a
    single RoI would otherwise load as a 1-D array and break every consumer
    that slices columns with ``rois[:, k]``.
    """
    print('[dataset] transfer rois.txt to rois.pt file')
    for path in imagename:
        txt_file = os.path.join(path_rois, path + '.txt')
        # ndmin=2 keeps the (num_rois, num_columns) layout for one-row files.
        boxes = np.loadtxt(txt_file, dtype=float, ndmin=2)
        rois = torch.from_numpy(boxes).type(torch.FloatTensor)
        torch.save(rois, os.path.join(path_rois, path + '.pt'))
class RoIPoolFunction(Function):
    """Autograd function wrapping the compiled RoI max-pooling extension.

    Args:
        pooled_height: number of output bins along the height of each RoI.
        pooled_width: number of output bins along the width of each RoI.
        spatial_scale: factor applied to RoI coordinates to map them onto the
            feature map.
    """

    def __init__(self, pooled_height, pooled_width, spatial_scale):
        self.pooled_width = int(pooled_width)
        self.pooled_height = int(pooled_height)
        self.spatial_scale = float(spatial_scale)
        # State recorded by forward() and consumed by backward().
        self.output = None
        self.argmax = None
        self.rois = None
        self.feature_size = None

    def forward(self, features, rois):
        """Pool every RoI of ``features`` into a fixed-size grid.

        Args:
            features: 4-D tensor unpacked as (batch, channels, height, width).
            rois: 2-D tensor with one RoI per row.

        Returns:
            Tensor of shape (num_rois, channels, pooled_height, pooled_width).
        """
        batch_size, num_channels, data_height, data_width = features.size()
        num_rois = rois.size()[0]
        output = torch.zeros(num_rois, num_channels, self.pooled_height, self.pooled_width)
        argmax = torch.IntTensor(num_rois, num_channels, self.pooled_height, self.pooled_width).zero_()

        if not features.is_cuda:
            # The CPU routine walks the raw storage in (N, H, W, C) order.
            # permute() only rewrites strides, so the data must be physically
            # rearranged with contiguous() before it is handed over.
            _features = features.permute(0, 2, 3, 1).contiguous()
            roi_pooling.roi_pooling_forward(self.pooled_height, self.pooled_width, self.spatial_scale,
                                            _features, rois, output)
        else:
            output = output.cuda()
            argmax = argmax.cuda()

            roi_pooling.roi_pooling_forward_cuda(self.pooled_height, self.pooled_width, self.spatial_scale,
                                                 features, rois, output, argmax)

        self.output = output
        self.argmax = argmax
        self.rois = rois
        self.feature_size = features.size()

        return output

    def backward(self, grad_output):
        """Route ``grad_output`` back to the feature map (CUDA only).

        Returns:
            ``(grad_input, None)``; the RoIs receive no gradient.
        """
        # Only a CUDA backward kernel exists, and forward() must have run.
        assert(self.feature_size is not None and grad_output.is_cuda)

        batch_size, num_channels, data_height, data_width = self.feature_size

        grad_input = torch.zeros(batch_size, num_channels, data_height, data_width).cuda()
        roi_pooling.roi_pooling_backward_cuda(self.pooled_height, self.pooled_width, self.spatial_scale,
                                              grad_output, self.rois, grad_input, self.argmax)

        return grad_input, None
class RoIPool(nn.Module):
    """Pure-Python RoI max pooling.

    Args:
        pooled_height: number of output bins along the height of each RoI.
        pooled_width: number of output bins along the width of each RoI.
        spatial_scale: factor applied to RoI coordinates to map them onto the
            feature map.
    """

    def __init__(self, pooled_height, pooled_width, spatial_scale):
        super(RoIPool, self).__init__()
        self.pooled_width = int(pooled_width)
        self.pooled_height = int(pooled_height)
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois):
        """Max-pool each RoI into a ``pooled_height x pooled_width`` grid.

        Args:
            features: 4-D tensor unpacked as (batch, channels, height, width).
            rois: 2-D tensor whose rows are read as
                ``[batch_index, start_w, start_h, end_w, end_h]``.

        Returns:
            Tensor of shape (num_rois, channels, pooled_height, pooled_width)
            living on the same device and with the same dtype as ``features``.
            Bins that fall completely outside the feature map are zero.
        """
        batch_size, num_channels, data_height, data_width = features.size()
        num_rois = rois.size()[0]
        # Allocate next to the input instead of forcing CUDA, so the module
        # also works on CPU-only machines.
        outputs = Variable(features.data.new(
            num_rois, num_channels, self.pooled_height, self.pooled_width).zero_())

        for roi_ind, roi in enumerate(rois):
            roi_values = roi.data.cpu().numpy()
            batch_ind = int(roi_values[0])
            roi_start_w, roi_start_h, roi_end_w, roi_end_h = np.round(
                roi_values[1:] * self.spatial_scale).astype(int)
            # Malformed RoIs are forced to be at least 1x1.
            roi_width = max(roi_end_w - roi_start_w + 1, 1)
            roi_height = max(roi_end_h - roi_start_h + 1, 1)
            bin_size_w = float(roi_width) / float(self.pooled_width)
            bin_size_h = float(roi_height) / float(self.pooled_height)
            data = features[batch_ind]

            for ph in range(self.pooled_height):
                hstart = int(np.floor(ph * bin_size_h))
                hend = int(np.ceil((ph + 1) * bin_size_h))
                # Shift by the RoI origin and clip to the feature map.
                hstart = min(data_height, max(0, hstart + roi_start_h))
                hend = min(data_height, max(0, hend + roi_start_h))
                for pw in range(self.pooled_width):
                    wstart = int(np.floor(pw * bin_size_w))
                    wend = int(np.ceil((pw + 1) * bin_size_w))
                    wstart = min(data_width, max(0, wstart + roi_start_w))
                    wend = min(data_width, max(0, wend + roi_start_w))

                    if hend <= hstart or wend <= wstart:
                        outputs[roi_ind, :, ph, pw] = 0
                        continue
                    # Flatten the window per channel and reduce once; this
                    # does not depend on the keepdim default of torch.max.
                    window = data[:, hstart:hend, wstart:wend].contiguous()
                    outputs[roi_ind, :, ph, pw] = window.view(
                        num_channels, -1).max(1)[0].view(-1)

        return outputs
// RoI max-pooling forward kernel.
//
// Every loop iteration produces one element (n, c, ph, pw) of the pooled
// output: the maximum of the feature-map window that bin (ph, pw) of RoI n
// covers in channel c.  bottom_rois holds 5 floats per RoI, read as
// [batch_index, start_w, start_h, end_w, end_h].  argmax_data (optional, may
// be NULL) receives the index of the winning input element, or -1 when the
// bin is empty.
__global__ void ROIPoolForward(const int nthreads, const float* bottom_data,
    const float spatial_scale, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const float* bottom_rois, float* top_data, int* argmax_data)
{
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {
        // (n, c, ph, pw) is an element in the pooled output
        int n = index;
        int pw = n % pooled_width;
        n /= pooled_width;
        int ph = n % pooled_height;
        n /= pooled_height;
        int c = n % channels;
        n /= channels;

        // Use per-iteration offset pointers.  Advancing the kernel parameters
        // themselves would accumulate across iterations of the strided loop
        // and read the wrong RoI / image on any iteration after the first.
        const float* offset_bottom_rois = bottom_rois + n * 5;
        int roi_batch_ind = offset_bottom_rois[0];
        int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
        int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
        int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
        int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

        // Force malformed ROIs to be 1x1
        int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
        int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
        float bin_size_h = (float)(roi_height) / (float)(pooled_height);
        float bin_size_w = (float)(roi_width) / (float)(pooled_width);

        int hstart = (int)(floor((float)(ph) * bin_size_h));
        int wstart = (int)(floor((float)(pw) * bin_size_w));
        int hend = (int)(ceil((float)(ph + 1) * bin_size_h));
        int wend = (int)(ceil((float)(pw + 1) * bin_size_w));

        // Add roi offsets and clip to input boundaries
        hstart = fminf(fmaxf(hstart + roi_start_h, 0), height);
        hend = fminf(fmaxf(hend + roi_start_h, 0), height);
        wstart = fminf(fmaxf(wstart + roi_start_w, 0), width);
        wend = fminf(fmaxf(wend + roi_start_w, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Define an empty pooling region to be zero
        float maxval = is_empty ? 0 : -FLT_MAX;
        // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
        int maxidx = -1;
        const float* offset_bottom_data =
            bottom_data + roi_batch_ind * channels * height * width;
        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                // Index inside one image laid out as (channels, height, width).
                int bottom_index = (c * height + h) * width + w;
                if (offset_bottom_data[bottom_index] > maxval) {
                    maxval = offset_bottom_data[bottom_index];
                    maxidx = bottom_index;
                }
            }
        }
        top_data[index] = maxval;
        if (argmax_data != NULL)
            argmax_data[index] = maxidx;
    }
}


// Launches ROIPoolForward on `stream` with one thread per output element
// (num_rois * channels * pooled_height * pooled_width).  Prints the CUDA
// error and terminates the process if the launch fails; returns 1 otherwise.
int ROIPoolForwardLaucher(
    const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* top_data, int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    const int output_size = num_rois * pooled_height * pooled_width * channels;
    cudaError_t err;


    ROIPoolForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, bottom_data, spatial_scale, height, width, channels, pooled_height,
      pooled_width, bottom_rois, top_data, argmax_data);

    err = cudaGetLastError();
    if(cudaSuccess != err)
    {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}
// RoI max-pooling backward kernel.
//
// Every loop iteration computes the gradient of one input element
// (n, c, h, w): it sums top_diff over all pooled outputs whose recorded
// argmax equals this element's flat index.  bottom_rois holds 5 floats per
// RoI, read as [batch_index, start_w, start_h, end_w, end_h].
//
// NOTE(review): the argmax values are compared against `index`, which
// includes the batch offset n.  Whether the stored argmax carries that offset
// is decided by the forward kernel, which is not part of this block; the two
// only trivially agree when n == 0 -- confirm before using batch sizes > 1.
__global__ void ROIPoolBackward(const int nthreads, const float* top_diff,
    const int* argmax_data, const int num_rois, const float spatial_scale,
    const int height, const int width, const int channels,
    const int pooled_height, const int pooled_width, float* bottom_diff,
    const float* bottom_rois) {
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {

        // (n, c, h, w) is an element of the input gradient (bottom_diff)
        int n = index;
        int w = n % width;
        n /= width;
        int h = n % height;
        n /= height;
        int c = n % channels;
        n /= channels;

        float gradient = 0;
        // Accumulate gradient over all ROIs that pooled this element
        for (int roi_n = 0; roi_n < num_rois; ++roi_n)
        {
            const float* offset_bottom_rois = bottom_rois + roi_n * 5;
            int roi_batch_ind = offset_bottom_rois[0];
            // Skip if ROI's batch index doesn't match n
            if (n != roi_batch_ind) {
                continue;
            }

            int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
            int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
            int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
            int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

            // Skip if ROI doesn't include (h, w)
            const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                                 h >= roi_start_h && h <= roi_end_h);
            if (!in_roi) {
                continue;
            }

            // Start of this RoI's slice in the pooled output / argmax arrays.
            int offset = roi_n * pooled_height * pooled_width * channels;
            const float* offset_top_diff = top_diff + offset;
            const int* offset_argmax_data = argmax_data + offset;

            // Compute feasible set of pooled units that could have pooled
            // this bottom unit

            // Force malformed ROIs to be 1x1
            int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
            int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);

            float bin_size_h = (float)(roi_height) / (float)(pooled_height);
            float bin_size_w = (float)(roi_width) / (float)(pooled_width);

            int phstart = floor((float)(h - roi_start_h) / bin_size_h);
            int phend = ceil((float)(h - roi_start_h + 1) / bin_size_h);
            int pwstart = floor((float)(w - roi_start_w) / bin_size_w);
            int pwend = ceil((float)(w - roi_start_w + 1) / bin_size_w);

            // Clip the candidate bin range to the pooled grid.
            phstart = fminf(fmaxf(phstart, 0), pooled_height);
            phend = fminf(fmaxf(phend, 0), pooled_height);
            pwstart = fminf(fmaxf(pwstart, 0), pooled_width);
            pwend = fminf(fmaxf(pwend, 0), pooled_width);

            for (int ph = phstart; ph < phend; ++ph) {
                for (int pw = pwstart; pw < pwend; ++pw) {
                    // Only the bin whose maximum came from this element
                    // passes its gradient back.
                    if (offset_argmax_data[(c * pooled_height + ph) * pooled_width + pw] == index)
                    {
                        gradient += offset_top_diff[(c * pooled_height + ph) * pooled_width + pw];
                    }
                }
            }
        }
        bottom_diff[index] = gradient;
    }
}

// Launches ROIPoolBackward on `stream` with one thread per input element
// (batch_size * channels * height * width).  Prints the CUDA error and
// terminates the process if the launch fails; returns 1 otherwise.
int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* bottom_diff, const int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    const int output_size = batch_size * height * width * channels;
    cudaError_t err;

    ROIPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, top_diff, argmax_data, num_rois, spatial_scale, height, width, channels, pooled_height,
      pooled_width, bottom_diff, bottom_rois);

    err = cudaGetLastError();
    if(cudaSuccess != err)
    {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}
-------------------------------------------------------------------------------- /lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JingyunLiang/PFNet-FGVC/a1dcf1ddd7427b4a907ab126653dcad505599cc4/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o -------------------------------------------------------------------------------- /lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _ROI_POOLING_KERNEL 2 | #define _ROI_POOLING_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | int ROIPoolForwardLaucher( 9 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 10 | const int width, const int channels, const int pooled_height, 11 | const int pooled_width, const float* bottom_rois, 12 | float* top_data, int* argmax_data, cudaStream_t stream); 13 | 14 | 15 | int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 16 | const int height, const int width, const int channels, const int pooled_height, 17 | const int pooled_width, const float* bottom_rois, 18 | float* bottom_diff, const int* argmax_data, cudaStream_t stream); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | 26 | -------------------------------------------------------------------------------- /lib/layer_utils/roi_pooling/src/roi_pooling.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 5 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output) 6 | { 7 | // Grab the input tensor 8 | float * data_flat = THFloatTensor_data(features); 9 | float * rois_flat = 
THFloatTensor_data(rois); 10 | 11 | float * output_flat = THFloatTensor_data(output); 12 | 13 | // Number of ROIs 14 | int num_rois = THFloatTensor_size(rois, 0); 15 | int size_rois = THFloatTensor_size(rois, 1); 16 | // batch size 17 | int batch_size = THFloatTensor_size(features, 0); 18 | if(batch_size != 1) 19 | { 20 | return 0; 21 | } 22 | // data height 23 | int data_height = THFloatTensor_size(features, 1); 24 | // data width 25 | int data_width = THFloatTensor_size(features, 2); 26 | // Number of channels 27 | int num_channels = THFloatTensor_size(features, 3); 28 | 29 | // Set all element of the output tensor to -inf. 30 | THFloatStorage_fill(THFloatTensor_storage(output), -1); 31 | 32 | // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R 33 | int index_roi = 0; 34 | int index_output = 0; 35 | int n; 36 | for (n = 0; n < num_rois; ++n) 37 | { 38 | int roi_batch_ind = rois_flat[index_roi + 0]; 39 | int roi_start_w = round(rois_flat[index_roi + 1] * spatial_scale); 40 | int roi_start_h = round(rois_flat[index_roi + 2] * spatial_scale); 41 | int roi_end_w = round(rois_flat[index_roi + 3] * spatial_scale); 42 | int roi_end_h = round(rois_flat[index_roi + 4] * spatial_scale); 43 | // CHECK_GE(roi_batch_ind, 0); 44 | // CHECK_LT(roi_batch_ind, batch_size); 45 | 46 | int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1); 47 | int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1); 48 | float bin_size_h = (float)(roi_height) / (float)(pooled_height); 49 | float bin_size_w = (float)(roi_width) / (float)(pooled_width); 50 | 51 | int index_data = roi_batch_ind * data_height * data_width * num_channels; 52 | const int output_area = pooled_width * pooled_height; 53 | 54 | int c, ph, pw; 55 | for (ph = 0; ph < pooled_height; ++ph) 56 | { 57 | for (pw = 0; pw < pooled_width; ++pw) 58 | { 59 | int hstart = (floor((float)(ph) * bin_size_h)); 60 | int wstart = (floor((float)(pw) * bin_size_w)); 61 | int hend = (ceil((float)(ph + 1) * bin_size_h)); 62 | int wend = 
(ceil((float)(pw + 1) * bin_size_w)); 63 | 64 | hstart = fminf(fmaxf(hstart + roi_start_h, 0), data_height); 65 | hend = fminf(fmaxf(hend + roi_start_h, 0), data_height); 66 | wstart = fminf(fmaxf(wstart + roi_start_w, 0), data_width); 67 | wend = fminf(fmaxf(wend + roi_start_w, 0), data_width); 68 | 69 | const int pool_index = index_output + (ph * pooled_width + pw); 70 | int is_empty = (hend <= hstart) || (wend <= wstart); 71 | if (is_empty) 72 | { 73 | for (c = 0; c < num_channels * output_area; c += output_area) 74 | { 75 | output_flat[pool_index + c] = 0; 76 | } 77 | } 78 | else 79 | { 80 | int h, w, c; 81 | for (h = hstart; h < hend; ++h) 82 | { 83 | for (w = wstart; w < wend; ++w) 84 | { 85 | for (c = 0; c < num_channels; ++c) 86 | { 87 | const int index = (h * data_width + w) * num_channels + c; 88 | if (data_flat[index_data + index] > output_flat[pool_index + c * output_area]) 89 | { 90 | output_flat[pool_index + c * output_area] = data_flat[index_data + index]; 91 | } 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | 99 | // Increment ROI index 100 | index_roi += size_rois; 101 | index_output += pooled_height * pooled_width * num_channels; 102 | } 103 | return 1; 104 | } -------------------------------------------------------------------------------- /lib/layer_utils/roi_pooling/src/roi_pooling.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); -------------------------------------------------------------------------------- /lib/layer_utils/roi_pooling/src/roi_pooling_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda/roi_pooling_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 8 | THCudaTensor * 
features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | int * argmax_flat = THCudaIntTensor_data(state, argmax); 16 | 17 | // Number of ROIs 18 | int num_rois = THCudaTensor_size(state, rois, 0); 19 | int size_rois = THCudaTensor_size(state, rois, 1); 20 | if (size_rois != 5) 21 | { 22 | return 0; 23 | } 24 | 25 | // batch size 26 | int batch_size = THCudaTensor_size(state, features, 0); 27 | if (batch_size != 1) 28 | { 29 | return 0; 30 | } 31 | // data height 32 | int data_height = THCudaTensor_size(state, features, 2); 33 | // data width 34 | int data_width = THCudaTensor_size(state, features, 3); 35 | // Number of channels 36 | int num_channels = THCudaTensor_size(state, features, 1); 37 | 38 | cudaStream_t stream = THCState_getCurrentStream(state); 39 | 40 | ROIPoolForwardLaucher( 41 | data_flat, spatial_scale, num_rois, data_height, 42 | data_width, num_channels, pooled_height, 43 | pooled_width, rois_flat, 44 | output_flat, argmax_flat, stream); 45 | 46 | return 1; 47 | } 48 | 49 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 50 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax) 51 | { 52 | // Grab the input tensor 53 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 54 | float * rois_flat = THCudaTensor_data(state, rois); 55 | 56 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 57 | int * argmax_flat = THCudaIntTensor_data(state, argmax); 58 | 59 | // Number of ROIs 60 | int num_rois = THCudaTensor_size(state, rois, 0); 61 | int size_rois = THCudaTensor_size(state, rois, 1); 62 | if (size_rois != 5) 63 | { 64 | return 0; 65 | } 66 | 67 | // batch size 68 | int batch_size = 
THCudaTensor_size(state, bottom_grad, 0); 69 | if (batch_size != 1) 70 | { 71 | return 0; 72 | } 73 | // data height 74 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 75 | // data width 76 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 77 | // Number of channels 78 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 79 | 80 | cudaStream_t stream = THCState_getCurrentStream(state); 81 | ROIPoolBackwardLaucher( 82 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 83 | data_width, num_channels, pooled_height, 84 | pooled_width, rois_flat, 85 | bottom_grad_flat, argmax_flat, stream); 86 | 87 | return 1; 88 | } -------------------------------------------------------------------------------- /lib/layer_utils/roi_pooling/src/roi_pooling_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax); 3 | 4 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax); -------------------------------------------------------------------------------- /lib/transforms_with_rois.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import math 4 | import random 5 | from PIL import Image, ImageOps 6 | try: 7 | import accimage 8 | except ImportError: 9 | accimage = None 10 | import numpy as np 11 | import numbers 12 | import types 13 | import collections 14 | 15 | 16 | class Compose(object): 17 | """Composes several transforms together. 18 | 19 | Args: 20 | transforms (list of ``Transform`` objects): list of transforms to compose. 
class ToTensor(object):
    """Turn a ``PIL.Image`` or ``numpy.ndarray`` into a tensor.

    An H x W x C input is returned in C x H x W layout.  numpy input and
    byte-valued image data are converted to float and divided by 255.
    """

    def __call__(self, pic):
        """
        Args:
            pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.

        Returns:
            Tensor: Converted image.
        """
        # numpy input: move the channel axis first, then rescale
        if isinstance(pic, np.ndarray):
            channels_first = torch.from_numpy(np.transpose(pic, (2, 0, 1)))
            return channels_first.float().div(255)

        # accimage input: copied through a float32 buffer, returned as is
        if accimage is not None and isinstance(pic, accimage.Image):
            buffer = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
            pic.copyto(buffer)
            return torch.from_numpy(buffer)

        # PIL input: the integer modes keep their integer dtype, every other
        # mode is read as raw bytes
        mode = pic.mode
        if mode == 'I':
            img = torch.from_numpy(np.array(pic, np.int32, copy=False))
        elif mode == 'I;16':
            img = torch.from_numpy(np.array(pic, np.int16, copy=False))
        else:
            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))

        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
        special_channel_counts = {'YCbCr': 3, 'I;16': 1}
        nchannel = special_channel_counts.get(mode, len(mode))

        height = pic.size[1]
        width = pic.size[0]
        # HWC -> CHW
        img = img.view(height, width, nchannel).permute(2, 0, 1).contiguous()
        if not isinstance(img, torch.ByteTensor):
            return img
        return img.float().div(255)
class Normalize(object):
    """Normalize a tensor image channel by channel.

    Every channel is transformed in place as
    ``channel = (channel - mean) / std``.

    Args:
        mean (sequence): per-channel means.
        std (sequence): per-channel standard deviations.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor image of size (C, H, W) to be normalized.

        Returns:
            Tensor: the same tensor object, normalized in place.
        """
        channel_count = len(tensor)
        for channel, (shift, scale) in enumerate(zip(self.mean, self.std)):
            # Stop at the shortest of tensor channels / mean / std.
            if channel >= channel_count:
                break
            plane = tensor[channel]
            plane.sub_(shift)
            plane.div_(scale)
        return tensor
181 | i.e, if height > width, then image will be rescaled to 182 | (size * height / width, size) 183 | interpolation (int, optional): Desired interpolation. Default is 184 | ``PIL.Image.BILINEAR`` 185 | """ 186 | 187 | def __init__(self, size, interpolation=Image.BILINEAR, scaleheight=None): 188 | assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2) 189 | self.size = size 190 | self.interpolation = interpolation 191 | self.scaleheight = scaleheight 192 | 193 | def __call__(self, img, rois): 194 | """ 195 | Args: 196 | img (PIL.Image): Image to be scaled. 197 | 198 | Returns: 199 | PIL.Image: Rescaled image. 200 | """ 201 | if self.scaleheight is not None: 202 | for attempt in range(10): 203 | oh = self.scaleheight[random.randint(0,len(self.scaleheight)-1)] 204 | ow = int(img.size[0]/img.size[1]*oh) 205 | 206 | if oh<=700 & ow <=700: 207 | return img.resize((ow, oh), self.interpolation), ResizeRois(img.size, (ow, oh), rois) 208 | 209 | ow = 650#700 210 | oh = int(img.size[1]/img.size[0]*ow) 211 | return img.resize((ow, oh), self.interpolation), ResizeRois(img.size, (ow, oh), rois) 212 | # for attempt in range(10): 213 | # if img.size[0] 0: 342 | img = ImageOps.expand(img, border=self.padding, fill=0) 343 | raise NotImplementedError 344 | 345 | 346 | w, h = img.size 347 | th, tw = self.size 348 | if w == tw and h == th: 349 | return img,rois 350 | 351 | x1 = random.randint(0, w - tw) 352 | y1 = random.randint(0, h - th) 353 | return img.crop((x1, y1, x1 + tw, y1 + th)), RemoveOuterRois((x1, y1, x1 + tw, y1 + th), rois) 354 | 355 | 356 | class RandomHorizontalFlip(object): 357 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 358 | 359 | def __call__(self, img, rois): 360 | """ 361 | Args: 362 | img (PIL.Image): Image to be flipped. 363 | 364 | Returns: 365 | PIL.Image: Randomly flipped image. 
366 | """ 367 | if random.random() < 0.5: 368 | rois[:,[1,3]] = img.size[0] + 1 - rois[:,[3,1]]; 369 | return img.transpose(Image.FLIP_LEFT_RIGHT), rois 370 | return img, rois 371 | 372 | class RandomSizedCrop(object): 373 | """Crop the given PIL.Image to random size and aspect ratio. 374 | 375 | A crop of random size of (0.08 to 1.0) of the original size and a random 376 | aspect ratio of 3/4 to 4/3 of the original aspect ratio is made. This crop 377 | is finally resized to given size. 378 | This is popularly used to train the Inception networks. 379 | 380 | Args: 381 | size: size of the smaller edge 382 | interpolation: Default: PIL.Image.BILINEAR 383 | """ 384 | 385 | def __init__(self, size, interpolation=Image.BILINEAR): 386 | self.size = size 387 | self.interpolation = interpolation 388 | 389 | def __call__(self, img,rois): 390 | for attempt in range(10): 391 | area = img.size[0] * img.size[1] 392 | target_area = random.uniform(0.08, 1.0) * area 393 | aspect_ratio = random.uniform(3. / 4, 4. / 3) 394 | 395 | w = int(round(math.sqrt(target_area * aspect_ratio))) 396 | h = int(round(math.sqrt(target_area / aspect_ratio))) 397 | 398 | if random.random() < 0.5: 399 | w, h = h, w 400 | 401 | if w <= img.size[0] and h <= img.size[1]: 402 | x1 = random.randint(0, img.size[0] - w) 403 | y1 = random.randint(0, img.size[1] - h) 404 | 405 | img = img.crop((x1, y1, x1 + w, y1 + h)) 406 | rois = RemoveOuterRois((x1, y1, x1 + w, y1 + h), rois) 407 | assert(img.size == (w, h)) 408 | 409 | return img.resize((self.size, self.size), self.interpolation), ResizeRois(img.size, (self.size,self.size),rois) 410 | 411 | # Fallback 412 | scale = Scale(self.size, interpolation=self.interpolation) 413 | crop = CenterCrop(self.size) 414 | 415 | img,rois = scale(img,rois) 416 | return crop(img,rois) 417 | 418 | def RemoveOuterRois(crop, rois):# remove rois out of bounding and move to new coordinate, e.g. 
def RemoveOuterRois(crop, rois):
    """Clip rois to a crop window and move them into the crop's coordinates.

    Meant to be called immediately after the image itself has been cropped.

    Args:
        crop: ``(x1, y1, x2, y2)`` crop box in the coordinates of the
            un-cropped image (x is horizontal, y is vertical).
        rois: 2-D tensor with one roi per row, laid out as
            ``[index, x1, y1, x2, y2]``. It is modified in place.

    Returns:
        The same ``rois`` tensor, with the top-left corners clamped to a
        minimum of 1 and the bottom-right corners clamped to the crop's
        width / height.
    """
    x1, y1, x2, y2 = crop

    # Shift every coordinate by the crop origin, then clamp to the crop.
    # The result might be slightly inaccurate due to crop interpolation.
    rois[:, 1] = torch.clamp(rois[:, 1] - x1 + 1, min=1)
    rois[:, 2] = torch.clamp(rois[:, 2] - y1 + 1, min=1)
    rois[:, 3] = torch.clamp(rois[:, 3] - x1 + 1, max=x2 - x1)
    # Fix: the bottom edge is a vertical coordinate, so it is shifted by y1
    # (it used to be shifted by x1, which misplaced every roi's bottom edge
    # whenever the crop offsets differed).
    rois[:, 4] = torch.clamp(rois[:, 4] - y1 + 1, max=y2 - y1)

    return rois


def _size_to_width_height(size):
    """Return ``(width, height)`` for a scalar (square) or 2-sequence size."""
    if isinstance(size, numbers.Number):
        return int(size), int(size)
    width, height = size
    return width, height


def ResizeRois(sizeIn, sizeOut, rois):
    """Rescale rois so they follow an image resize from sizeIn to sizeOut.

    Meant to be called immediately after the image itself has been resized.

    Args:
        sizeIn: size of the image before resizing, either ``(width, height)``
            or a single number for a square image.
        sizeOut: size of the image after resizing, same conventions.
        rois: 2-D tensor with one roi per row, laid out as
            ``[index, x1, y1, x2, y2]`` (x is horizontal, y is vertical).
            It is modified in place.

    Returns:
        The same ``rois`` tensor with rounded, rescaled coordinates; top-left
        corners are clamped to a minimum of 1 and bottom-right corners to the
        output width / height.
    """
    inw, inh = _size_to_width_height(sizeIn)
    outw, outh = _size_to_width_height(sizeOut)

    # Box centre and width/height relative to the input image.
    bxr = (rois[:, 1] + rois[:, 3]) / 2 / inw
    byr = (rois[:, 2] + rois[:, 4]) / 2 / inh
    bwr = (rois[:, 3] - rois[:, 1]) / inw
    bhr = (rois[:, 4] - rois[:, 2]) / inh

    # The same quantities expressed in output-image pixels.
    bxnew = outw * bxr
    bynew = outh * byr
    bwnew = outw * bwr
    bhnew = outh * bhr

    rois[:, 1] = torch.clamp(torch.round(bxnew - bwnew / 2), min=1)
    rois[:, 2] = torch.clamp(torch.round(bynew - bhnew / 2), min=1)
    rois[:, 3] = torch.clamp(torch.round(bxnew + bwnew / 2), max=outw)
    # Fix: the bottom edge is centre + half the *height* (bhnew); it used to
    # add half of the centre coordinate (bynew) instead.
    rois[:, 4] = torch.clamp(torch.round(bynew + bhnew / 2), max=outh)

    return rois
idx = 1; 91 | for j=1:length(ks) 92 | k = ks(j); % Segmentation threshold k 93 | minSize = k; % We set minSize = k 94 | for n = 1:length(colorTypes) 95 | colorType = colorTypes{n}; 96 | tic; 97 | [boxesTT blobIndIm blobBoxes hierarchy priorityTT] = Image2HierarchicalGrouping(im, sigma, k, minSize, colorType, simFunctionHandles); 98 | totalTime = totalTime + toc; 99 | idx = idx + 1; 100 | 101 | boxesT = [boxesT ;boxesTT]; 102 | priorityT = [priorityT ; priorityTT]; 103 | end 104 | end 105 | 106 | priority = priorityT; % Concatenate priorities 107 | 108 | % Do pseudo random sorting as in paper 109 | priority = priority .* rand(size(priority)); 110 | [priority sortIds] = sort(priority, 'ascend'); 111 | boxesT = boxesT(sortIds,:); 112 | 113 | % add by Michael 114 | boxScores{i} = priority; 115 | boxes{i} = boxesT/imageScale; % Concatenate boxes from all hierarchies 116 | 117 | end 118 | fprintf('\n'); 119 | 120 | %% 121 | tic 122 | for i=1:length(boxes) 123 | [boxes{i} boxScores{i}] = FilterBoxesWidth(boxes{i}, minBoxWidth, boxScores{i}); 124 | [boxes{i} boxScores{i}]= BoxRemoveDuplicates(boxes{i}, boxScores{i}); 125 | end 126 | totalTime = totalTime + toc; 127 | 128 | imdb.images.boxes = boxes; 129 | imdb.images.boxScores = boxScores; 130 | 131 | imdb.images.size = imsize(:,1:2) ; 132 | 133 | fprintf('Time per image: %.2f\nNow evaluating the boxes on Cub...\n', totalTime ./ length(imdb.images.name)); 134 | 135 | % %% 136 | % [boxAbo boxMabo boScores avgNumBoxes] = BoxAverageBestOverlap(gtBoxes, gtImIds, boxes); 137 | % 138 | % fprintf('Mean Average Best Overlap for the box-based locations: %.3f\n', boxMabo); -------------------------------------------------------------------------------- /part proposal/Car_get_database_SSW.m: -------------------------------------------------------------------------------- 1 | function imdb = cars_get_database_SSW(varargin) 2 | % Modified from 2015 Tsung-Yu Lin, Aruni RoyChowdhury, Subhransu Maji. 
3 | % used to prepare the car-196 dataset of imdb.mat for MatCovNet. 4 | % imdb.images.boxes stores the proposed rois. 5 | 6 | 7 | carsDir = '/raid/L/Fine-grained Dataset/Cars-196'; 8 | useCropped = false; 9 | ifval = true; 10 | 11 | if useCropped 12 | imdb.imageDir = fullfile(carsDir, 'images_cropped') ; 13 | else 14 | imdb.imageDir = fullfile(carsDir); 15 | end 16 | 17 | imdb.maskDir = fullfile(carsDir, 'masks'); % doesn't exist 18 | imdb.sets = {'train', 'val', 'test'}; 19 | 20 | load(fullfile(carsDir, 'cars_annos')); 21 | 22 | % Class names 23 | imdb.classes.name = class_names'; 24 | 25 | 26 | N = numel(annotations); 27 | 28 | imdb.images.name = cell(N, 1); 29 | imdb.images.id = 1:N; 30 | imdb.images.label = zeros(1,N); 31 | imdb.images.bounds = zeros(4, N); 32 | imdb.images.set = 3.*ones(1, N); 33 | imdb.images.difficult = false(1, N) ; 34 | 35 | % Image names 36 | for i=1:numel(annotations) 37 | 38 | imdb.images.name{i} = annotations(i).relative_im_path; 39 | 40 | % Class labels 41 | imdb.images.label(i) = annotations(i).class; 42 | 43 | % Bounding boxes 44 | 45 | imdb.images.bounds(:,i) = round([annotations(i).bbox_x1 annotations(i).bbox_y1 annotations(i).bbox_x2 annotations(i).bbox_y2]'); 46 | 47 | % Image sets 48 | if(~annotations(i).test) 49 | imdb.images.set(i) = 1; 50 | end 51 | 52 | 53 | end 54 | 55 | % Class labels 56 | % modified by Michael 57 | classLabel = imdb.images.label ; 58 | imdb.images.label = -ones(numel(imdb.classes.name),numel(classLabel)); 59 | for i=1:numel(classLabel) 60 | imdb.images.label(classLabel(i),i)=1; 61 | end 62 | 63 | % Image size, update it in CubProposalSSW_par.m 64 | imdb.images.size = [] ; 65 | 66 | % Image size 67 | imdb.images.size = [] ; 68 | 69 | % add image files to imdb 70 | % imdb.images.image = vl_imreadjpeg(strcat([imdb.imageDir filesep], imdb.images.name) , 'NumThreads', 8 ); 71 | 72 | 73 | if(ifval) 74 | 75 | trainSize = numel(find(imdb.images.set==1)); 76 | validSize = round(trainSize/3); 77 | 78 | trainIdx = 
find(imdb.images.set==1); 79 | 80 | % set 1/3 of train set to validation 81 | valIdx = trainIdx(randperm(trainSize, validSize)); 82 | imdb.images.set(valIdx) = 2; 83 | 84 | end 85 | 86 | 87 | imdb.meta.classes = imdb.classes.name ; 88 | imdb.meta.inUse = true(1,numel(imdb.meta.classes)) ; 89 | 90 | 91 | % add by Michael 92 | % calculate proposals using SSW 93 | addpath('SelectiveSearchCodeIJCV'); 94 | addpath(fullfile('SelectiveSearchCodeIJCV', 'Dependencies')); 95 | imdb = CarProposalSSW_par(imdb, carsDir) ; 96 | 97 | save('data/Car/car_imdb.mat','-struct', 'imdb', '-v7.3'); 98 | 99 | -------------------------------------------------------------------------------- /part proposal/Readme.md: -------------------------------------------------------------------------------- 1 | ## This directory is used to generate rois for PFNet. 2 | 3 | 4 | 1, download [Selective Search Window](https://koen.me/research/selectivesearch/) and extract it here. It is a Matlab toolbox for SSW. 5 | 6 | 2, run `Car_get_database_SSW.m` to get a `car_imdb.mat` file for the dataset, which contains image data, rois and other metadata. Please make sure you have moved the images to a suitable directory. Note that `CarProposalSSW_par.m` is called internally to generate the rois. 7 | 8 | 3, `roisWarpperforPytorch_generatetxt.m` uses `car_imdb.mat` to generate the `.txt` files for PyTorch. Generated rois of CUB-200-2011, Stanford Cars and FGVC-Aircraft are [provided](https://drive.google.com/open?id=18DWMrK2WVEMGzRdMpgqgNiRbWOTtRwnP). Here is an example: 9 | ``` 10 | 0 2 2 1024 768 11 | 0 194 76 336 258 12 | 0 218 2 638 458 13 | 0 2 16 1024 454 14 | 0 638 466 792 580 15 | 0 2 318 1024 768 16 | 0 652 404 1024 768 17 | ``` 18 | Each line represents a proposed bounding box. The five fields of `0 2 2 1024 768` are the identifier, x1 (horizontal), y1 (vertical), x2 and y2, respectively. 
19 | 20 | -------------------------------------------------------------------------------- /part proposal/roisWarpperforPytorch_generatetxt.m: -------------------------------------------------------------------------------- 1 | imdb = load('data/Car/car_imdb.mat') ; 2 | 3 | % ------------------------------------------------------------------------- 4 | 5 | maxNum = 500 ; 6 | for i=1:numel(imdb.images.name) 7 | bbox = imdb.images.boxes{i};% height width 8 | imsize = imdb.images.size(i,:) ; 9 | 10 | isGood = (bbox(:,3)-bbox(:,1))>20 & (bbox(:,4)-bbox(:,2))>20; 11 | bbox = bbox(isGood,:); 12 | 13 | % remove duplicate ones in 14*14 14 | [dummy, uniqueIdx] = unique(round(bbox/16), 'rows', 'first'); 15 | uniqueIdx = sort(uniqueIdx); 16 | bbox = bbox(uniqueIdx,:); 17 | 18 | % limit number for training 19 | if 1%imdb.images.set(i)~=3 20 | nB = min(size(bbox,1),maxNum); 21 | else 22 | nB = size(bbox,1); 23 | end 24 | 25 | imdb.images.boxes{i} = bbox(1:nB,:); 26 | i 27 | end 28 | 29 | mkdir('car_ims') ; 30 | 31 | 32 | parfor i = 1:numel(imdb.images.name) 33 | rois_ = imdb.images.boxes{i} ;% y1(vertical) x1(horizonal) y2 x2 34 | rois = [zeros(size(rois_,1),1) rois_(:,2) rois_(:,1) rois_(:,4) rois_(:,3)] ;% input (x1,y1,x2,y2) 35 | dlmwrite(fullfile([imdb.images.name{i}(1:end-4) '.txt']),rois,' ') ; 36 | i 37 | end 38 | -------------------------------------------------------------------------------- /pic/PFNet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JingyunLiang/PFNet-FGVC/a1dcf1ddd7427b4a907ab126653dcad505599cc4/pic/PFNet.jpg -------------------------------------------------------------------------------- /pic/dog_loss_acc1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JingyunLiang/PFNet-FGVC/a1dcf1ddd7427b4a907ab126653dcad505599cc4/pic/dog_loss_acc1.png 
--------------------------------------------------------------------------------