├── LICENSE ├── README.md ├── ShuffleNetV2.py ├── TestAccuracy.py ├── TrainModel.py ├── augmentations.py ├── croppingDataset.py ├── croppingModel.py ├── demo_eval.py ├── mobilenetv2.py ├── rod_align ├── __init__.pyc ├── _ext │ ├── __init__.pyc │ └── rod_align │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ └── _rod_align.so ├── build.py ├── functions │ ├── __init__.pyc │ ├── rod_align.py │ └── rod_align.pyc ├── make.sh ├── modules │ ├── __init__.pyc │ ├── rod_align.py │ └── rod_align.pyc └── src │ ├── rod_align.c │ ├── rod_align.h │ ├── rod_align_cuda.c │ ├── rod_align_cuda.h │ ├── rod_align_kernel.cu │ ├── rod_align_kernel.cu.o │ └── rod_align_kernel.h ├── roi_align ├── __init__.pyc ├── __pycache__ │ └── __init__.cpython-35.pyc ├── _ext │ ├── __init__.pyc │ ├── __pycache__ │ │ └── __init__.cpython-35.pyc │ └── roi_align │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── __pycache__ │ │ └── __init__.cpython-35.pyc │ │ └── _roi_align.so ├── build.py ├── functions │ ├── __init__.pyc │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── roi_align.cpython-35.pyc │ ├── roi_align.py │ └── roi_align.pyc ├── make.sh ├── modules │ ├── __init__.pyc │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── roi_align.cpython-35.pyc │ ├── roi_align.py │ └── roi_align.pyc └── src │ ├── roi_align.c │ ├── roi_align.h │ ├── roi_align_cuda.c │ ├── roi_align_cuda.h │ ├── roi_align_kernel.cu │ ├── roi_align_kernel.cu.o │ └── roi_align_kernel.h ├── runTrainTest.sh └── thop ├── __init__.py ├── __init__.pyc ├── count_hooks.py ├── count_hooks.pyc ├── profile.py ├── profile.pyc └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Max deGroot, Ellis Brown 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grid-Anchor-based-Image-Cropping-Pytorch 2 | The extension of this work has been accepted by TPAMI. Please read the [paper](https://www4.comp.polyu.edu.hk/~cslzhang/paper/GAIC-PAMI.pdf) for details. 3 | 4 | 5 | ### Requirements 6 | python 2.7, pytorch 0.4.1, numpy, cv2, scipy. 7 | 8 | ### Usage 9 | 10 | 1. 
Download the source code, the datasets [[conference version](https://drive.google.com/file/d/1KhmyjoimsQVXqPnLjKZiU4iXNKNyyxqW/view?usp=sharing)], [[journal version](https://drive.google.com/file/d/1tDdQqDe8dMoMIVi9Z0WWI5vtRViy01nR/view?usp=sharing)] and the pretrained models [[conference version](https://drive.google.com/file/d/1OvLT_ul17zCK4ljAi4myGAgA50PmLy3Y/view?usp=sharing)] [[journal version](https://drive.google.com/file/d/1KWYQdL6R5hmOC9toTymbMORZDThpiEW4/view?usp=sharing)] 11 | 12 | 2. Run ``TrainModel.py`` to train a new model on our dataset or run ``demo_eval.py`` to test the pretrained model on any images. 13 | 14 | 3. To change the aspect ratio of generated crops, please change the ``generate_bboxes`` function in ``croppingDataset.py`` (line 115). 15 | 16 | ### Annotation software 17 | The executable annotation software can be found [here](https://github.com/lld533/Grid-Anchor-based-Image-Cropping-Pytorch). 18 | 19 | ### Other implementations 20 | 1. [PyTorch 1.0 or later](https://github.com/lld533/Grid-Anchor-based-Image-Cropping-Pytorch) 21 | 2. [Matlab (conference version)](https://github.com/HuiZeng/Grid-Anchor-based-Image-Cropping) 22 | 23 | 24 | ### Citation 25 | ``` 26 | @inproceedings{zhang2019deep, 27 | title={Reliable and Efficient Image Cropping: A Grid Anchor based Approach}, 28 | author={Zeng, Hui and Li, Lida and Cao, Zisheng and Zhang, Lei}, 29 | booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, 30 | year={2019} 31 | } 32 | @article{zeng2020cropping, 33 | title={Grid Anchor based Image Cropping: A New Benchmark and An Efficient Model}, 34 | author={Zeng, Hui and Li, Lida and Cao, Zisheng and Zhang, Lei}, 35 | journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 36 | volume={}, 37 | number={}, 38 | pages={}, 39 | year={2020}, 40 | publisher={IEEE} 41 | } 42 | ``` 43 | -------------------------------------------------------------------------------- /ShuffleNetV2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from collections import OrderedDict 6 | from torch.nn import init 7 | import math 8 | 9 | def conv_bn(inp, oup, stride): 10 | return nn.Sequential( 11 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 12 | nn.BatchNorm2d(oup), 13 | nn.ReLU(inplace=True) 14 | ) 15 | 16 | 17 | def conv_1x1_bn(inp, oup): 18 | return nn.Sequential( 19 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 20 | nn.BatchNorm2d(oup), 21 | nn.ReLU(inplace=True) 22 | ) 23 | 24 | def channel_shuffle(x, groups): 25 | batchsize, num_channels, height, width = x.data.size() 26 | 27 | channels_per_group = num_channels // groups 28 | 29 | # reshape 30 | x = x.view(batchsize, groups, 31 | channels_per_group, height, width) 32 | 33 | x = torch.transpose(x, 1, 2).contiguous() 34 | 35 | # flatten 36 | x = x.view(batchsize, -1, height, width) 37 | 38 | return x 39 | 40 | class InvertedResidual(nn.Module): 41 | def __init__(self, inp, oup, stride, benchmodel): 42 | super(InvertedResidual, self).__init__() 43 | self.benchmodel = benchmodel 44 | self.stride = stride 45 | assert stride in [1, 2] 46 | 47 | oup_inc = oup//2 48 | 49 | if self.benchmodel == 1: 50 | #assert inp == oup_inc 51 | self.banch2 = nn.Sequential( 52 | # pw 53 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 54 | nn.BatchNorm2d(oup_inc), 55 | nn.ReLU(inplace=True), 56 | # dw 57 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc,
bias=False), 58 | nn.BatchNorm2d(oup_inc), 59 | # pw-linear 60 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 61 | nn.BatchNorm2d(oup_inc), 62 | nn.ReLU(inplace=True), 63 | ) 64 | else: 65 | self.banch1 = nn.Sequential( 66 | # dw 67 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 68 | nn.BatchNorm2d(inp), 69 | # pw-linear 70 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 71 | nn.BatchNorm2d(oup_inc), 72 | nn.ReLU(inplace=True), 73 | ) 74 | 75 | self.banch2 = nn.Sequential( 76 | # pw 77 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 78 | nn.BatchNorm2d(oup_inc), 79 | nn.ReLU(inplace=True), 80 | # dw 81 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), 82 | nn.BatchNorm2d(oup_inc), 83 | # pw-linear 84 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 85 | nn.BatchNorm2d(oup_inc), 86 | nn.ReLU(inplace=True), 87 | ) 88 | 89 | @staticmethod 90 | def _concat(x, out): 91 | # concatenate along channel axis 92 | return torch.cat((x, out), 1) 93 | 94 | def forward(self, x): 95 | if 1==self.benchmodel: 96 | x1 = x[:, :(x.shape[1]//2), :, :] 97 | x2 = x[:, (x.shape[1]//2):, :, :] 98 | out = self._concat(x1, self.banch2(x2)) 99 | elif 2==self.benchmodel: 100 | out = self._concat(self.banch1(x), self.banch2(x)) 101 | 102 | return channel_shuffle(out, 2) 103 | 104 | 105 | class ShuffleNetV2(nn.Module): 106 | def __init__(self, n_class=1000, input_size=224, width_mult=1.): 107 | super(ShuffleNetV2, self).__init__() 108 | 109 | assert input_size % 32 == 0 110 | 111 | self.stage_repeats = [4, 8, 4] 112 | # index 0 is invalid and should never be called. 113 | # only used for indexing convenience. 114 | if width_mult == 0.5: 115 | self.stage_out_channels = [-1, 24, 48, 96, 192, 1024] 116 | elif width_mult == 1.0: 117 | self.stage_out_channels = [-1, 24, 116, 232, 464, 1024] 118 | elif width_mult == 1.5: 119 | self.stage_out_channels = [-1, 24, 176, 352, 704, 1024] 120 | elif width_mult == 2.0: 121 | self.stage_out_channels = [-1, 24, 224, 488, 976, 2048] 122 | else: 123 | raise ValueError( 124 | "width_mult {} is not supported; " 125 | "choose one of 0.5, 1.0, 1.5 or 2.0".format(width_mult)) 126 | 127 | # building first layer 128 | input_channel = self.stage_out_channels[1] 129 | self.conv1 = conv_bn(3, input_channel, 2) 130 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 131 | 132 | self.features = [] 133 | # building inverted residual blocks 134 | for idxstage in range(len(self.stage_repeats)): 135 | numrepeat = self.stage_repeats[idxstage] 136 | output_channel = self.stage_out_channels[idxstage+2] 137 | for i in range(numrepeat): 138 | if i == 0: 139 | #inp, oup, stride, benchmodel): 140 | self.features.append(InvertedResidual(input_channel, output_channel, 2, 2)) 141 | else: 142 | self.features.append(InvertedResidual(input_channel, output_channel, 1, 1)) 143 | input_channel = output_channel 144 | 145 | 146 | # make it nn.Sequential 147 | self.features = nn.Sequential(*self.features) 148 | 149 | # building last several layers 150 | self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1]) 151 | self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size/32))) 152 | 153 | # building classifier 154 | self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class)) 155 | 156 | def forward(self, x): 157 | x = self.conv1(x) 158 | x = self.maxpool(x) 159 | x = self.features(x) 160 | x = self.conv_last(x) 161 | x = self.globalpool(x) 162 | x = x.view(-1, self.stage_out_channels[-1]) 163 | x = self.classifier(x) 164 | return x 165
| 166 | def shufflenetv2(width_mult=1.): 167 | model = ShuffleNetV2(width_mult=width_mult) 168 | return model 169 | 170 | if __name__ == "__main__": 171 | """Testing 172 | """ 173 | model = ShuffleNetV2() 174 | print(model) 175 | -------------------------------------------------------------------------------- /TestAccuracy.py: -------------------------------------------------------------------------------- 1 | from croppingDataset import GAICD 2 | from croppingModel import build_crop_model 3 | import time 4 | import math 5 | import sys 6 | import torch 7 | from torch.autograd import Variable 8 | import torch.backends.cudnn as cudnn 9 | import torch.utils.data as data 10 | import argparse 11 | from scipy.stats import spearmanr, pearsonr 12 | 13 | parser = argparse.ArgumentParser( 14 | description='Grid anchor based image cropping: evaluation on the GAICD test set') 15 | parser.add_argument('--dataset_root', default='dataset/GAIC/', help='Dataset root directory path') 16 | parser.add_argument('--image_size', default=256, type=int, help='Input image size (shorter side after resizing)') 17 | parser.add_argument('--batch_size', default=1, type=int, help='Batch size for evaluation') 18 | parser.add_argument('--num_workers', default=0, type=int, help='Number of workers used in dataloading') 19 | parser.add_argument('--cuda', default=True, help='Use CUDA to evaluate the model') 20 | parser.add_argument('--net_path', default='weights/ablation/cropping/mobilenetv2/downsample4_multi_Aug1_Align9_Cdim8/23_0.625_0.583_0.553_0.525_0.785_0.762_0.748_0.723_0.783_0.806.pth_____', 21 | help='Path of the checkpoint to evaluate') 22 | args = parser.parse_args() 23 | 24 | if torch.cuda.is_available(): 25 | if args.cuda: 26 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 27 | if not args.cuda: 28 | print("WARNING: It looks like you have a CUDA device, but aren't " + 29 | "using CUDA.\nRun with --cuda for optimal evaluation speed.") 30 | torch.set_default_tensor_type('torch.FloatTensor') 31 | else: 32 | torch.set_default_tensor_type('torch.FloatTensor') 33 | 34 | 35 | data_loader = data.DataLoader(GAICD(image_size=args.image_size, dataset_dir=args.dataset_root, set='test'), args.batch_size, num_workers=args.num_workers, shuffle=False) 36 | 37 | def test(): 38 | 39 | net = build_crop_model(scale='multi', alignsize=9, reddim=8, loadweight=True, model='mobilenetv2', downsample=4) 40 | 41 | net.load_state_dict(torch.load(args.net_path)) 42 | 43 | if args.cuda: 44 | net = torch.nn.DataParallel(net,device_ids=[0]) 45 | torch.backends.cudnn.deterministic = True 46 | torch.backends.cudnn.benchmark = False 47 | net = net.cuda() 48 | 49 | net.eval() 50 | 51 | acc4_5 = [] 52 | acc4_10 = [] 53 | wacc4_5 = [] 54 | wacc4_10 = [] 55 | srcc = [] 56 | pcc = [] 57 | for n in range(4): 58 | acc4_5.append(0) 59 | acc4_10.append(0) 60 | wacc4_5.append(0) 61 | wacc4_10.append(0) 62 | 63 | for id, sample in enumerate(data_loader): 64 | image = sample['image'] 65 | bboxs = sample['bbox'] 66 | MOS = sample['MOS'] 67 | 68 | roi = [] 69 | 70 | for idx in range(0,len(bboxs['xmin'])): 71 | roi.append((0, bboxs['xmin'][idx],bboxs['ymin'][idx],bboxs['xmax'][idx],bboxs['ymax'][idx])) 72 | 73 | if args.cuda: 74 | image = Variable(image.cuda()) 75 | roi = Variable(torch.Tensor(roi)) 76 | else: 77 | image = Variable(image) 78 | roi = Variable(torch.Tensor(roi)) 79 | 80 | t0 = time.time() 81 | out = net(image,roi) 82 | t1 = time.time() 83 | print('timer: %.4f sec.'
% (t1 - t0)) 84 | 85 | id_MOS = sorted(range(len(MOS)), key=lambda k: MOS[k], reverse=True) 86 | id_out = sorted(range(len(out)), key=lambda k: out[k], reverse=True) 87 | 88 | rank_of_returned_crop = [] 89 | for k in range(4): 90 | rank_of_returned_crop.append(id_MOS.index(id_out[k])) 91 | 92 | for k in range(4): 93 | temp_acc_4_5 = 0.0 94 | temp_acc_4_10 = 0.0 95 | for j in range(k+1): 96 | if MOS[id_out[j]] >= MOS[id_MOS[4]]: 97 | temp_acc_4_5 += 1.0 98 | if MOS[id_out[j]] >= MOS[id_MOS[9]]: 99 | temp_acc_4_10 += 1.0 100 | acc4_5[k] += temp_acc_4_5 / (k+1.0) 101 | acc4_10[k] += temp_acc_4_10 / (k+1.0) 102 | 103 | for k in range(4): 104 | temp_wacc_4_5 = 0.0 105 | temp_wacc_4_10 = 0.0 106 | temp_rank_of_returned_crop = rank_of_returned_crop[:(k+1)] 107 | temp_rank_of_returned_crop.sort() 108 | for j in range(k+1): 109 | if temp_rank_of_returned_crop[j] <= 4: 110 | temp_wacc_4_5 += 1.0 * math.exp(-0.2*(temp_rank_of_returned_crop[j]-j)) 111 | if temp_rank_of_returned_crop[j] <= 9: 112 | temp_wacc_4_10 += 1.0 * math.exp(-0.1*(temp_rank_of_returned_crop[j]-j)) 113 | wacc4_5[k] += temp_wacc_4_5 / (k+1.0) 114 | wacc4_10[k] += temp_wacc_4_10 / (k+1.0) 115 | 116 | 117 | MOS_arr = [] 118 | out = torch.squeeze(out).cpu().detach().numpy() 119 | for k in range(len(MOS)): 120 | MOS_arr.append(MOS[k].numpy()[0]) 121 | srcc.append(spearmanr(MOS_arr,out)[0]) 122 | pcc.append(pearsonr(MOS_arr,out)[0]) 123 | 124 | 125 | for k in range(4): 126 | acc4_5[k] = acc4_5[k] / 200.0 127 | acc4_10[k] = acc4_10[k] / 200.0 128 | wacc4_5[k] = wacc4_5[k] / 200.0 129 | wacc4_10[k] = wacc4_10[k] / 200.0 130 | 131 | avg_srcc = sum(srcc) / 200.0 132 | avg_pcc = sum(pcc) / 200.0 133 | 134 | sys.stdout.write('[%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f]\n' % (acc4_5[0],acc4_5[1],acc4_5[2],acc4_5[3],acc4_10[0],acc4_10[1],acc4_10[2],acc4_10[3])) 135 | sys.stdout.write('[%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f]\n' % (wacc4_5[0],wacc4_5[1],wacc4_5[2],wacc4_5[3],wacc4_10[0],wacc4_10[1],wacc4_10[2],wacc4_10[3])) 136 | sys.stdout.write('[Avg SRCC: %.3f] [Avg PCC: %.3f]\n' % (avg_srcc,avg_pcc)) 137 | 138 | 139 | if __name__ == '__main__': 140 | test() 141 | -------------------------------------------------------------------------------- /TrainModel.py: -------------------------------------------------------------------------------- 1 | from croppingModel import build_crop_model 2 | from croppingDataset import GAICD 3 | import os 4 | import sys 5 | import time 6 | import math 7 | import torch 8 | from torch.autograd import Variable 9 | import torch.optim as optim 10 | import torch.utils.data as data 11 | import argparse 12 | import numpy as np 13 | import random 14 | from scipy.stats import spearmanr, pearsonr 15 | 16 | SEED = 0 17 | torch.manual_seed(SEED) 18 | np.random.seed(SEED) 19 | random.seed(SEED) 20 | 21 | parser = argparse.ArgumentParser(description='Grid anchor based image cropping') 22 | parser.add_argument('--dataset_root', default='dataset/GAIC/', help='Dataset root directory path') 23 | parser.add_argument('--base_model', default='mobilenetv2', help='Pretrained base model') 24 | parser.add_argument('--scale', default='multi', type=str, help='choose single or multi scale') 25 | parser.add_argument('--downsample', default=4, type=int, help='downsampling times of the backbone (4 or 5)') 26 | parser.add_argument('--augmentation', default=1, type=int, help='whether to use data augmentation (0 or 1)') 27 | parser.add_argument('--image_size', default=256, type=int, help='Input image size (shorter side after resizing)') 28 | parser.add_argument('--align_size', default=9,
type=int, help='Spatial size of RoIAlign and RoDAlign') 29 | parser.add_argument('--reduced_dim', default=8, type=int, help='Reduced channel dimension of the aligned features') 30 | parser.add_argument('--batch_size', default=1, type=int, help='Batch size for training') 31 | parser.add_argument('--resume', default=None, type=str, help='Checkpoint state_dict file to resume training from') 32 | parser.add_argument('--start_iter', default=0, type=int, help='Resume training at this iter') 33 | parser.add_argument('--num_workers', default=0, type=int, help='Number of workers used in dataloading') 34 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, help='initial learning rate') 35 | parser.add_argument('--save_folder', default='weights/ablation/cropping/', help='Directory for saving checkpoint models') 36 | args = parser.parse_args() 37 | 38 | args.save_folder = args.save_folder + args.base_model + '/' + 'downsample' + str(args.downsample) + '_' + args.scale + '_Aug' + str(args.augmentation) + '_Align' +str(args.align_size) + '_Cdim'+str(args.reduced_dim) 39 | 40 | if not os.path.exists(args.save_folder): 41 | os.makedirs(args.save_folder) 42 | 43 | cuda = True if torch.cuda.is_available() else False 44 | 45 | if cuda: 46 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 47 | else: 48 | torch.set_default_tensor_type('torch.FloatTensor') 49 | 50 | 51 | data_loader_train = data.DataLoader(GAICD(image_size=args.image_size, dataset_dir=args.dataset_root, set='train', augmentation=args.augmentation), 52 | batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, worker_init_fn=random.seed(SEED)) 53 | 54 | data_loader_test = data.DataLoader(GAICD(image_size=args.image_size, dataset_dir=args.dataset_root, set='test'), 55 | batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False) 56 | 57 | net = build_crop_model(scale=args.scale, alignsize=args.align_size, reddim=args.reduced_dim, loadweight=True, model=args.base_model, downsample=args.downsample) 58 | 59 | # fix the batch normalization in mobilenet and shufflenet because batchsize = 1 60 | net.eval() 61 | 62 | if cuda: 63 | net = torch.nn.DataParallel(net,device_ids=[0]) 64 | torch.backends.cudnn.deterministic = True 65 | torch.backends.cudnn.benchmark = False 66 | #cudnn.benchmark = True 67 | net = net.cuda() 68 | 69 | optimizer = optim.Adam(net.parameters(), lr=args.lr) 70 | 71 | def test(): 72 | acc4_5 = [] 73 | acc4_10 = [] 74 | wacc4_5 = [] 75 | wacc4_10 = [] 76 | srcc = [] 77 | pcc = [] 78 | total_loss = 0 79 | avg_loss = 0 80 | for n in range(4): 81 | acc4_5.append(0) 82 | acc4_10.append(0) 83 | wacc4_5.append(0) 84 | wacc4_10.append(0) 85 | 86 | for id, sample in enumerate(data_loader_test): 87 | image = sample['image'] 88 | bboxs = sample['bbox'] 89 | MOS = sample['MOS'] 90 | 91 | roi = [] 92 | 93 | for idx in range(0,len(bboxs['xmin'])): 94 | roi.append((0, bboxs['xmin'][idx],bboxs['ymin'][idx],bboxs['xmax'][idx],bboxs['ymax'][idx])) 95 | 96 | if cuda: 97 | image = Variable(image.cuda()) 98 | roi = Variable(torch.Tensor(roi)) 99 | else: 100 | image = Variable(image) 101 | roi = Variable(torch.Tensor(roi)) 102 | 103 | #t0 = time.time() 104 | out = net(image,roi) 105 | loss = torch.nn.SmoothL1Loss(reduction='elementwise_mean')(out.squeeze(), torch.Tensor(MOS)) 106 | total_loss += loss.item() 107 | avg_loss = total_loss / (id+1) 108 | 109 | id_MOS = sorted(range(len(MOS)), key=lambda k: MOS[k], reverse = True) 110 | id_out = sorted(range(len(out)), key=lambda k: out[k], reverse = True)
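# The block below implements the GAICD ranking metrics: acc4_5[k] and
# acc4_10[k] count how many of the top-(k+1) crops returned by the model score
# at least as well as the 5th / 10th best annotated crop (MOS[id_MOS[4]] /
# MOS[id_MOS[9]]), normalized by k+1 and later averaged over the 200 test
# images; the weighted variants further down additionally discount each
# returned crop by exp(-0.2*(rank-j)) or exp(-0.1*(rank-j)) when it is ranked
# below its ground-truth position.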
111 | for k in range(4): 112 | temp_acc_4_5 = 0.0 113 | temp_acc_4_10 = 0.0 114 | for j in range(k+1): 115 | if MOS[id_out[j]] >= MOS[id_MOS[4]]: 116 | temp_acc_4_5 += 1.0 117 | if MOS[id_out[j]] >= MOS[id_MOS[9]]: 118 | temp_acc_4_10 += 1.0 119 | acc4_5[k] += temp_acc_4_5 / (k+1.0) 120 | acc4_10[k] += temp_acc_4_10 / (k+1.0) 121 | 122 | rank_of_returned_crop = [] 123 | for k in range(4): 124 | rank_of_returned_crop.append(id_MOS.index(id_out[k])) 125 | 126 | for k in range(4): 127 | temp_wacc_4_5 = 0.0 128 | temp_wacc_4_10 = 0.0 129 | temp_rank_of_returned_crop = rank_of_returned_crop[:(k+1)] 130 | temp_rank_of_returned_crop.sort() 131 | for j in range(k+1): 132 | if temp_rank_of_returned_crop[j] <= 4: 133 | temp_wacc_4_5 += 1.0 * math.exp(-0.2*(temp_rank_of_returned_crop[j]-j)) 134 | if temp_rank_of_returned_crop[j] <= 9: 135 | temp_wacc_4_10 += 1.0 * math.exp(-0.1*(temp_rank_of_returned_crop[j]-j)) 136 | wacc4_5[k] += temp_wacc_4_5 / (k+1.0) 137 | wacc4_10[k] += temp_wacc_4_10 / (k+1.0) 138 | 139 | MOS_arr = [] 140 | out = torch.squeeze(out).cpu().detach().numpy() 141 | for k in range(len(MOS)): 142 | MOS_arr.append(MOS[k].numpy()[0]) 143 | srcc.append(spearmanr(MOS_arr,out)[0]) 144 | pcc.append(pearsonr(MOS_arr,out)[0]) 145 | 146 | #t1 = time.time() 147 | 148 | #print('timer: %.4f sec.' % (t1 - t0)) 149 | for k in range(4): 150 | acc4_5[k] = acc4_5[k] / 200.0 151 | acc4_10[k] = acc4_10[k] / 200.0 152 | wacc4_5[k] = wacc4_5[k] / 200.0 153 | wacc4_10[k] = wacc4_10[k] / 200.0 154 | 155 | avg_srcc = sum(srcc) / 200.0 156 | avg_pcc = sum(pcc) / 200.0 157 | 158 | 159 | return acc4_5, acc4_10, avg_srcc, avg_pcc, avg_loss, wacc4_5, wacc4_10 160 | 161 | 162 | def train(): 163 | 164 | for epoch in range(0, 80): 165 | total_loss = 0 166 | for id, sample in enumerate(data_loader_train): 167 | 168 | image = sample['image'] 169 | bboxs = sample['bbox'] 170 | 171 | roi = [] 172 | MOS = [] 173 | 174 | random_ID = range(0,len(bboxs['xmin'])) 175 | random.shuffle(random_ID) 176 | 177 | for idx in random_ID[:64]: 178 | roi.append((0, bboxs['xmin'][idx],bboxs['ymin'][idx],bboxs['xmax'][idx],bboxs['ymax'][idx])) 179 | MOS.append(sample['MOS'][idx]) 180 | 181 | if cuda: 182 | image = Variable(image.cuda()) 183 | else: 184 | image = Variable(image) 185 | roi = Variable(torch.Tensor(roi)) 186 | MOS = torch.Tensor(MOS) 187 | 188 | 189 | # forward 190 | 191 | out = net(image,roi) 192 | loss = torch.nn.SmoothL1Loss(reduction='elementwise_mean')(out.squeeze(), MOS) 193 | total_loss += loss.item() 194 | avg_loss = total_loss / (id+1) 195 | 196 | # backprop 197 | optimizer.zero_grad() 198 | loss.backward() 199 | optimizer.step() 200 | 201 | sys.stdout.write('\r[Epoch %d/%d] [Batch %d/%d] [Train Loss: %.4f]' % (epoch, 79, id, len(data_loader_train), avg_loss)) 202 | 203 | acc4_5, acc4_10, avg_srcc, avg_pcc, test_avg_loss, wacc4_5, wacc4_10 = test() 204 | sys.stdout.write('[Test Loss: %.4f] [%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f] [SRCC: %.3f] [PCC: %.3f]\n' % (test_avg_loss,acc4_5[0],acc4_5[1],acc4_5[2],acc4_5[3],acc4_10[0],acc4_10[1],acc4_10[2],acc4_10[3],avg_srcc,avg_pcc)) 205 | sys.stdout.write('[%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f]\n' % (wacc4_5[0],wacc4_5[1],wacc4_5[2],wacc4_5[3],wacc4_10[0],wacc4_10[1],wacc4_10[2],wacc4_10[3])) 206 | torch.save((net.module if cuda else net).state_dict(), args.save_folder + '/' + repr(epoch) + '_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f' % (acc4_5[0],acc4_5[1],acc4_5[2],acc4_5[3],acc4_10[0],acc4_10[1],acc4_10[2],acc4_10[3],avg_srcc,avg_pcc) + '.pth') 207 | 208 |
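# Example invocation (a sketch; it assumes the GAIC dataset has been unpacked
# to dataset/GAIC/ and that the ImageNet-pretrained backbone weights are
# available under pretrained_model/, as expected by croppingModel.py):
#
#   python TrainModel.py --base_model mobilenetv2 --scale multi --downsample 4 \
#       --augmentation 1 --align_size 9 --reduced_dim 8 --lr 1e-4
#
# One checkpoint is saved per epoch into args.save_folder, with the test
# accuracies encoded in the file name.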
209 | if __name__ == '__main__': 210 | train() 211 | -------------------------------------------------------------------------------- /augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | import cv2 4 | import numpy as np 5 | import types 6 | from numpy import random 7 | 8 | 9 | def intersect(box_a, box_b): 10 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 11 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 12 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 13 | return inter[:, 0] * inter[:, 1] 14 | 15 | 16 | def jaccard_numpy(box_a, box_b): 17 | 18 | inter = intersect(box_a, box_b) 19 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 20 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 21 | area_b = ((box_b[2]-box_b[0]) * 22 | (box_b[3]-box_b[1])) # [A,B] 23 | union = area_a + area_b - inter 24 | return inter / union # [A,B] 25 | 26 | 27 | class Compose(object): 28 | """Composes several augmentations together. 29 | Args: 30 | transforms (List[Transform]): list of transforms to compose. 31 | Example: 32 | augmentations.Compose([transforms.CenterCrop(10), transforms.ToTensor(),]) 33 | """ 34 | 35 | def __init__(self, transforms): 36 | self.transforms = transforms 37 | 38 | def __call__(self, img, boxes=None, labels=None): 39 | for t in self.transforms: 40 | img, boxes, labels = t(img, boxes, labels) 41 | return img, boxes, labels 42 | 43 | 44 | class Lambda(object): 45 | """Applies a lambda as a transform.""" 46 | 47 | def __init__(self, lambd): 48 | assert isinstance(lambd, types.LambdaType) 49 | self.lambd = lambd 50 | 51 | def __call__(self, img, boxes=None, labels=None): 52 | return self.lambd(img, boxes, labels) 53 | 54 | 55 | class ConvertFromInts(object): 56 | def __call__(self, image, boxes=None, labels=None): 57 | return image.astype(np.float32), boxes, labels 58 | 59 | 60 | class SubtractMeans(object): 61 | def __init__(self, mean): 62 | self.mean = np.array(mean, dtype=np.float32) 63 | 64 | def __call__(self, image, boxes=None, labels=None): 65 | image = image.astype(np.float32) 66 | image -= self.mean 67 | return image.astype(np.float32), boxes, labels 68 | 69 | 70 | class ToAbsoluteCoords(object): 71 | def __call__(self, image, boxes=None, labels=None): 72 | height, width, channels = image.shape 73 | boxes[:, 0] *= width 74 | boxes[:, 2] *= width 75 | boxes[:, 1] *= height 76 | boxes[:, 3] *= height 77 | 78 | return image, boxes, labels 79 | 80 | 81 | class ToPercentCoords(object): 82 | def __call__(self, image, boxes=None, labels=None): 83 | height, width, channels = image.shape 84 | boxes[:, 0] /= width 85 | boxes[:, 2] /= width 86 | boxes[:, 1] /= height 87 | boxes[:, 3] /= height 88 | 89 | return image, boxes, labels 90 | 91 | 92 | class Resize(object): 93 | def __init__(self, size=300): 94 | self.size = size 95 | 96 | def __call__(self, image, boxes=None, labels=None): 97 | image = cv2.resize(image, (self.size, 98 | self.size)) 99 | return image, boxes, labels 100 | 101 | 102 | class RandomSaturation(object): 103 | def __init__(self, lower=0.5, upper=1.5): 104 | self.lower = lower 105 | self.upper = upper 106 | assert self.upper >= self.lower, "saturation upper must be >= lower." 107 | assert self.lower >= 0, "saturation lower must be non-negative."
108 | 109 | def __call__(self, image, boxes=None, labels=None): 110 | if random.randint(2): 111 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 112 | 113 | return image, boxes, labels 114 | 115 | 116 | class RandomHue(object): 117 | def __init__(self, delta=18.0): 118 | assert delta >= 0.0 and delta <= 360.0 119 | self.delta = delta 120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 124 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 125 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 126 | return image, boxes, labels 127 | 128 | 129 | class RandomLightingNoise(object): 130 | def __init__(self): 131 | self.perms = ((0, 1, 2), (0, 2, 1), 132 | (1, 0, 2), (1, 2, 0), 133 | (2, 0, 1), (2, 1, 0)) 134 | 135 | def __call__(self, image, boxes=None, labels=None): 136 | if random.randint(2): 137 | swap = self.perms[random.randint(len(self.perms))] 138 | shuffle = SwapChannels(swap) # shuffle channels 139 | image = shuffle(image) 140 | return image, boxes, labels 141 | 142 | 143 | class ConvertColor(object): 144 | def __init__(self, current='BGR', transform='HSV'): 145 | self.transform = transform 146 | self.current = current 147 | 148 | def __call__(self, image, boxes=None, labels=None): 149 | if self.current == 'BGR' and self.transform == 'HSV': 150 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 151 | elif self.current == 'HSV' and self.transform == 'BGR': 152 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 153 | else: 154 | raise NotImplementedError 155 | return image, boxes, labels 156 | 157 | 158 | class RandomContrast(object): 159 | def __init__(self, lower=0.5, upper=1.5): 160 | self.lower = lower 161 | self.upper = upper 162 | assert self.upper >= self.lower, "contrast upper must be >= lower." 163 | assert self.lower >= 0, "contrast lower must be non-negative." 164 | 165 | # expects float image 166 | def __call__(self, image, boxes=None, labels=None): 167 | if random.randint(2): 168 | alpha = random.uniform(self.lower, self.upper) 169 | image *= alpha 170 | return image, boxes, labels 171 | 172 | 173 | class RandomBrightness(object): 174 | def __init__(self, delta=32): 175 | assert delta >= 0.0 176 | assert delta <= 255.0 177 | self.delta = delta 178 | 179 | def __call__(self, image, boxes=None, labels=None): 180 | if random.randint(2): 181 | delta = random.uniform(-self.delta, self.delta) 182 | image += delta 183 | return image, boxes, labels 184 | 185 | 186 | class ToCV2Image(object): 187 | def __call__(self, tensor, boxes=None, labels=None): 188 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 189 | 190 | 191 | class ToTensor(object): 192 | def __call__(self, cvimage, boxes=None, labels=None): 193 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 194 | 195 | 196 | class RandomSampleCrop(object): 197 | """Crop 198 | Arguments: 199 | img (Image): the image being input during training 200 | boxes (Tensor): the original bounding boxes in pt form 201 | labels (Tensor): the class labels for each bbox 202 | mode (float tuple): the min and max jaccard overlaps 203 | Return: 204 | (img, boxes, classes) 205 | img (Image): the cropped image 206 | boxes (Tensor): the adjusted bounding boxes in pt form 207 | labels (Tensor): the class labels for each bbox 208 | """ 209 | def __init__(self): 210 | self.sample_options = ( 211 | # using entire original input image 212 | None, 213 | # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9 214 | (0.1, None), 215 | (0.3, None), 216 | (0.7, None), 217 | (0.9, None), 218 | # randomly sample a patch 219 | (None, None), 220 | ) 221 | 222 | def __call__(self, image, boxes=None, labels=None): 223 | height, width, _ = image.shape 224 | while True: 225 | # randomly choose a mode 226 | mode = random.choice(self.sample_options) 227 | if mode is None: 228 | return image, boxes, labels 229 | 230 | min_iou, max_iou = mode 231 | if min_iou is None: 232 | min_iou = float('-inf') 233 | if max_iou is None: 234 | max_iou = float('inf') 235 | 236 | # max trials (50) 237 | for _ in range(50): 238 | current_image = image 239 | 240 | w = random.uniform(0.3 * width, width) 241 | h = random.uniform(0.3 * height, height) 242 | 243 | # aspect ratio constraint b/t .5 & 2 244 | if h / w < 0.5 or h / w > 2: 245 | continue 246 | 247 | left = random.uniform(width - w) 248 | top = random.uniform(height - h) 249 | 250 | # convert to integer rect x1,y1,x2,y2 251 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 252 | 253 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 254 | overlap = jaccard_numpy(boxes, rect) 255 | 256 | # is min and max overlap constraint satisfied? if not try again 257 | if overlap.min() < min_iou or overlap.max() > max_iou: 258 | continue 259 | 260 | # cut the crop from the image 261 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 262 | :] 263 | 264 | # keep overlap with gt box IF center in sampled patch 265 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 266 | 267 | # mask in all gt boxes whose centers are below and to the right of the crop's top-left corner 268 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 269 | 270 | # mask in all gt boxes whose centers are above and to the left of the crop's bottom-right corner 271 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 272 | 273 | # mask where both m1 and m2 are true 274 | mask = m1 * m2 275 | 276 | # have any valid boxes?
try again if not 277 | if not mask.any(): 278 | continue 279 | 280 | # take only matching gt boxes 281 | current_boxes = boxes[mask, :].copy() 282 | 283 | # take only matching gt labels 284 | current_labels = labels[mask] 285 | 286 | # should we use the box left and top corner or the crop's 287 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 288 | rect[:2]) 289 | # adjust to crop (by subtracting crop's left,top) 290 | current_boxes[:, :2] -= rect[:2] 291 | 292 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 293 | rect[2:]) 294 | # adjust to crop (by subtracting crop's left,top) 295 | current_boxes[:, 2:] -= rect[:2] 296 | 297 | return current_image, current_boxes, current_labels 298 | 299 | 300 | class Expand(object): 301 | def __init__(self, mean): 302 | self.mean = mean 303 | 304 | def __call__(self, image, boxes, labels): 305 | if random.randint(2): 306 | return image, boxes, labels 307 | 308 | height, width, depth = image.shape 309 | ratio = random.uniform(1, 4) 310 | left = random.uniform(0, width*ratio - width) 311 | top = random.uniform(0, height*ratio - height) 312 | 313 | expand_image = np.zeros( 314 | (int(height*ratio), int(width*ratio), depth), 315 | dtype=image.dtype) 316 | expand_image[:, :, :] = self.mean 317 | expand_image[int(top):int(top + height), 318 | int(left):int(left + width)] = image 319 | image = expand_image 320 | 321 | boxes = boxes.copy() 322 | boxes[:, :2] += (int(left), int(top)) 323 | boxes[:, 2:] += (int(left), int(top)) 324 | 325 | return image, boxes, labels 326 | 327 | 328 | class RandomMirror(object): 329 | def __call__(self, image, annotations, classes): 330 | _, width, _ = image.shape 331 | if random.randint(2): 332 | image = image[:, ::-1] 333 | for i in range(len(annotations)): 334 | xmin, xmax = annotations[i][1], annotations[i][3] 335 | annotations[i][1], annotations[i][3] = width - xmax, width - xmin 336 | return image, annotations, classes 337 | 338 | 339 | class SwapChannels(object): 340 | """Transforms a tensorized image by swapping the channels in the order 341 | specified in the swap tuple.
342 | Args: 343 | swaps (int triple): final order of channels 344 | eg: (2, 1, 0) 345 | """ 346 | 347 | def __init__(self, swaps): 348 | self.swaps = swaps 349 | 350 | def __call__(self, image): 351 | """ 352 | Args: 353 | image (Tensor): image tensor to be transformed 354 | Return: 355 | a tensor with channels swapped according to swap 356 | """ 357 | # if torch.is_tensor(image): 358 | # image = image.data.cpu().numpy() 359 | # else: 360 | # image = np.array(image) 361 | image = image[:, :, self.swaps] 362 | return image 363 | 364 | 365 | class PhotometricDistort(object): 366 | def __init__(self): 367 | self.pd = [ 368 | RandomContrast(), 369 | ConvertColor(transform='HSV'), 370 | RandomSaturation(), 371 | RandomHue(), 372 | ConvertColor(current='HSV', transform='BGR'), 373 | RandomContrast() 374 | ] 375 | self.rand_brightness = RandomBrightness() 376 | self.rand_light_noise = RandomLightingNoise() 377 | 378 | def __call__(self, image, boxes, labels): 379 | im = image.copy() 380 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 381 | if random.randint(2): 382 | distort = Compose(self.pd[:-1]) 383 | else: 384 | distort = Compose(self.pd[1:]) 385 | return distort(im, boxes, labels) 386 | 387 | 388 | class CropAugmentation(object): 389 | def __init__(self): 390 | self.augment = Compose([ 391 | ConvertFromInts(), 392 | PhotometricDistort(), 393 | RandomMirror() 394 | ]) 395 | 396 | def __call__(self, img, annotations): 397 | image, annotations, label = self.augment(img, annotations) 398 | return image, annotations 399 | -------------------------------------------------------------------------------- /croppingDataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.utils.data as data 3 | import cv2 4 | import math 5 | import numpy as np 6 | from augmentations import CropAugmentation 7 | 8 | MOS_MEAN = 2.95 9 | MOS_STD = 0.8 10 | RGB_MEAN = (0.485, 0.456, 0.406) 11 | RGB_STD = (0.229, 0.224, 0.225) 12 | 13 | 14 | class TransformFunction(object): 15 | 16 | def __call__(self, sample,image_size): 17 | image, annotations = sample['image'], sample['annotations'] 18 | 19 | scale = image_size / min(image.shape[:2]) 20 | h = round(image.shape[0] * scale / 32.0) * 32 21 | w = round(image.shape[1] * scale / 32.0) * 32 22 | resized_image = cv2.resize(image,(int(w),int(h))) / 256.0 23 | rgb_mean = np.array(RGB_MEAN, dtype=np.float32) 24 | rgb_std = np.array(RGB_STD, dtype=np.float32) 25 | resized_image = resized_image.astype(np.float32) 26 | resized_image -= rgb_mean 27 | resized_image = resized_image / rgb_std 28 | 29 | scale_height = float(resized_image.shape[0]) / image.shape[0] 30 | scale_width = float(resized_image.shape[1]) / image.shape[1] 31 | 32 | transformed_bbox = {} 33 | transformed_bbox['xmin'] = [] 34 | transformed_bbox['ymin'] = [] 35 | transformed_bbox['xmax'] = [] 36 | transformed_bbox['ymax'] = [] 37 | MOS = [] 38 | for annotation in annotations: 39 | transformed_bbox['xmin'].append(math.floor(float(annotation[1]) * scale_width)) 40 | transformed_bbox['ymin'].append(math.floor(float(annotation[0]) * scale_height)) 41 | transformed_bbox['xmax'].append(math.ceil(float(annotation[3]) * scale_width)) 42 | transformed_bbox['ymax'].append(math.ceil(float(annotation[2]) * scale_height)) 43 | 44 | MOS.append((float(annotation[-1]) - MOS_MEAN) / MOS_STD) 45 | 46 | resized_image = resized_image.transpose((2, 0, 1)) 47 | return {'image': resized_image, 'bbox': transformed_bbox, 'MOS': MOS} 48 | 49 | class 
GAICD(data.Dataset): 50 | 51 | def __init__(self, image_size=256, dataset_dir='dataset/GAIC/', set = 'train', 52 | transform=TransformFunction(), augmentation=False): 53 | self.image_size = float(image_size) 54 | self.dataset_dir = dataset_dir 55 | self.set = set 56 | image_lists = os.listdir(self.dataset_dir + 'images/' + set) 57 | self._imgpath = list() 58 | self._annopath = list() 59 | for image in image_lists: 60 | self._imgpath.append(os.path.join(self.dataset_dir, 'images', set, image)) 61 | self._annopath.append(os.path.join(self.dataset_dir, 'annotations', set, image[:-3]+"txt")) 62 | self.transform = transform 63 | if augmentation: 64 | self.augmentation = CropAugmentation() 65 | else: 66 | self.augmentation = None 67 | 68 | 69 | def __getitem__(self, idx): 70 | image = cv2.imread(self._imgpath[idx]) 71 | 72 | with open(self._annopath[idx],'r') as fid: 73 | annotations_txt = fid.readlines() 74 | 75 | annotations = list() 76 | for annotation in annotations_txt: 77 | annotation_split = annotation.split() 78 | if float(annotation_split[4]) != -2: 79 | annotations.append([float(annotation_split[0]),float(annotation_split[1]),float(annotation_split[2]),float(annotation_split[3]),float(annotation_split[4])]) 80 | 81 | if self.augmentation: 82 | image, annotations = self.augmentation(image, annotations) 83 | 84 | # to rgb 85 | image = image[:, :, (2, 1, 0)] 86 | 87 | sample = {'image': image, 'annotations': annotations} 88 | 89 | if self.transform: 90 | sample = self.transform(sample,self.image_size) 91 | 92 | return sample 93 | 94 | def __len__(self): 95 | return len(self._imgpath) 96 | 97 | 98 | class TransformFunctionTest(object): 99 | 100 | def __call__(self, image, image_size): 101 | 102 | scale = image_size / min(image.shape[:2]) 103 | h = round(image.shape[0] * scale / 32.0) * 32 104 | w = round(image.shape[1] * scale / 32.0) * 32 105 | resized_image = cv2.resize(image,(int(w),int(h))) / 256.0 106 | rgb_mean = np.array(RGB_MEAN, dtype=np.float32) 107 | rgb_std = np.array(RGB_STD, dtype=np.float32) 108 | resized_image = resized_image.astype(np.float32) 109 | resized_image -= rgb_mean 110 | resized_image = resized_image / rgb_std 111 | 112 | scale_height = image.shape[0] / float(resized_image.shape[0]) 113 | scale_width = image.shape[1] / float(resized_image.shape[1]) 114 | 115 | bboxes = generate_bboxes(resized_image) 116 | 117 | transformed_bbox = {} 118 | transformed_bbox['xmin'] = [] 119 | transformed_bbox['ymin'] = [] 120 | transformed_bbox['xmax'] = [] 121 | transformed_bbox['ymax'] = [] 122 | source_bboxes = list() 123 | 124 | for bbox in bboxes: 125 | source_bboxes.append([round(bbox[0] * scale_height),round(bbox[1] * scale_width),round(bbox[2] * scale_height),round(bbox[3] * scale_width)]) 126 | transformed_bbox['xmin'].append(bbox[1]) 127 | transformed_bbox['ymin'].append(bbox[0]) 128 | transformed_bbox['xmax'].append(bbox[3]) 129 | transformed_bbox['ymax'].append(bbox[2]) 130 | 131 | resized_image = resized_image.transpose((2, 0, 1)) 132 | return resized_image,transformed_bbox,source_bboxes 133 | 134 | 135 | def generate_bboxes(image): 136 | 137 | bins = 12.0 138 | h = image.shape[0] 139 | w = image.shape[1] 140 | step_h = h / bins 141 | step_w = w / bins 142 | annotations = list() 143 | for x1 in range(0,4): 144 | for y1 in range(0,4): 145 | for x2 in range(8,12): 146 | for y2 in range(8,12): 147 | if (x2-x1)*(y2-y1)>0.4999*bins*bins and (y2-y1)*step_w/(x2-x1)/step_h>0.5 and (y2-y1)*step_w/(x2-x1)/step_h<2.0: 148 | 
annotations.append([float(step_h*(0.5+x1)),float(step_w*(0.5+y1)),float(step_h*(0.5+x2)),float(step_w*(0.5+y2))]) 149 | 150 | return annotations 151 | 152 | def generate_bboxes_16_9(image): 153 | 154 | h = image.shape[0] 155 | w = image.shape[1] 156 | h_step = 9 157 | w_step = 16 158 | annotations = list() 159 | for i in range(14,30): 160 | out_h = h_step*i 161 | out_w = w_step*i 162 | if out_h < h and out_w < w and out_h*out_w>0.4*h*w: 163 | for w_start in range(0,w-out_w,w_step): 164 | for h_start in range(0,h-out_h,h_step): 165 | annotations.append([float(h_start),float(w_start),float(h_start+out_h-1),float(w_start+out_w-1)]) 166 | return annotations 167 | 168 | def generate_bboxes_4_3(image): 169 | 170 | h = image.shape[0] 171 | w = image.shape[1] 172 | h_step = 12 173 | w_step = 16 174 | annotations = list() 175 | for i in range(14,30): 176 | out_h = h_step*i 177 | out_w = w_step*i 178 | if out_h < h and out_w < w and out_h*out_w>0.4*h*w: 179 | for w_start in range(0,w-out_w,w_step): 180 | for h_start in range(0,h-out_h,h_step): 181 | annotations.append([float(h_start),float(w_start),float(h_start+out_h-1),float(w_start+out_w-1)]) 182 | return annotations 183 | 184 | def generate_bboxes_1_1(image): 185 | 186 | h = image.shape[0] 187 | w = image.shape[1] 188 | h_step = 12 189 | w_step = 12 190 | annotations = list() 191 | for i in range(14,30): 192 | out_h = h_step*i 193 | out_w = w_step*i 194 | if out_h < h and out_w < w and out_h*out_w>0.4*h*w: 195 | for w_start in range(0,w-out_w,w_step): 196 | for h_start in range(0,h-out_h,h_step): 197 | annotations.append([float(h_start),float(w_start),float(h_start+out_h-1),float(w_start+out_w-1)]) 198 | return annotations 199 | 200 | class setup_test_dataset(data.Dataset): 201 | 202 | def __init__(self, image_size=256.0,dataset_dir='testsetDir', transform=TransformFunctionTest()): 203 | self.image_size = float(image_size) 204 | self.dataset_dir = dataset_dir 205 | image_lists = os.listdir(self.dataset_dir) 206 | self._imgpath = list() 207 | self._annopath = list() 208 | for image in image_lists: 209 | self._imgpath.append(os.path.join(self.dataset_dir, image)) 210 | self.transform = transform 211 | 212 | 213 | def __getitem__(self, idx): 214 | image = cv2.imread(self._imgpath[idx]) 215 | 216 | # to rgb 217 | image = image[:, :, (2, 1, 0)] 218 | 219 | if self.transform: 220 | resized_image,transformed_bbox,source_bboxes = self.transform(image,self.image_size) 221 | 222 | sample = {'imgpath': self._imgpath[idx], 'image': image, 'resized_image': resized_image, 'tbboxes':transformed_bbox , 'sourceboxes': source_bboxes} 223 | 224 | return sample 225 | 226 | def __len__(self): 227 | return len(self._imgpath) 228 | 229 | -------------------------------------------------------------------------------- /croppingModel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | from roi_align.modules.roi_align import RoIAlignAvg, RoIAlign 5 | from rod_align.modules.rod_align import RoDAlignAvg, RoDAlign 6 | import torch.nn.init as init 7 | from ShuffleNetV2 import shufflenetv2 8 | from mobilenetv2 import MobileNetV2 9 | from thop import profile 10 | 11 | 12 | class vgg_base(nn.Module): 13 | 14 | def __init__(self, loadweights=True, downsample=4): 15 | super(vgg_base, self).__init__() 16 | 17 | vgg = models.vgg16(pretrained=True) 18 | 19 | if downsample == 4: 20 | self.feature = nn.Sequential(vgg.features[:-1]) 21 | elif downsample == 5: 22 | 
self.feature = nn.Sequential(vgg.features) 23 | 24 | self.feature3 = nn.Sequential(vgg.features[:23]) 25 | self.feature4 = nn.Sequential(vgg.features[23:30]) 26 | self.feature5 = nn.Sequential(vgg.features[30:]) 27 | 28 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 29 | 30 | def forward(self, x): 31 | #return self.feature(x) 32 | f3 = self.feature3(x) 33 | f4 = self.feature4(f3) 34 | f5 = self.feature5(f4) 35 | return f3, f4, f5 36 | 37 | class resnet50_base(nn.Module): 38 | 39 | def __init__(self, loadweights=True, downsample=4): 40 | super(resnet50_base, self).__init__() 41 | 42 | resnet50 = models.resnet50(pretrained=True) 43 | 44 | self.feature3 = nn.Sequential(resnet50.conv1,resnet50.bn1,resnet50.relu,resnet50.maxpool,resnet50.layer1,resnet50.layer2) 45 | self.feature4 = nn.Sequential(resnet50.layer3) 46 | self.feature5 = nn.Sequential(resnet50.layer4) 47 | 48 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 49 | 50 | def forward(self, x): 51 | #return self.feature(x) 52 | f3 = self.feature3(x) 53 | f4 = self.feature4(f3) 54 | f5 = self.feature5(f4) 55 | return f3, f4, f5 56 | 57 | 58 | class mobilenetv2_base(nn.Module): 59 | 60 | def __init__(self, loadweights=True, downsample=4, model_path='pretrained_model/mobilenetv2_1.0-0c6065bc.pth'): 61 | super(mobilenetv2_base, self).__init__() 62 | 63 | model = MobileNetV2(width_mult=1.0) 64 | 65 | if loadweights: 66 | model.load_state_dict(torch.load(model_path)) 67 | 68 | #if downsample == 4: 69 | # self.feature = nn.Sequential(model.features[:14]) 70 | #elif downsample == 5: 71 | # self.feature = nn.Sequential(model.features) 72 | 73 | self.feature3 = nn.Sequential(model.features[:7]) 74 | self.feature4 = nn.Sequential(model.features[7:14]) 75 | self.feature5 = nn.Sequential(model.features[14:]) 76 | 77 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 78 | 79 | def forward(self, x): 80 | #return self.feature(x) 81 | f3 = self.feature3(x) 82 | f4 = self.feature4(f3) 83 | f5 = self.feature5(f4) 84 | return f3, f4, f5 85 | 86 | 87 | class shufflenetv2_base(nn.Module): 88 | 89 | def __init__(self, loadweights=True, downsample=4, model_path='pretrained_model/shufflenetv2_x1_69.402_88.374.pth.tar'): 90 | super(shufflenetv2_base, self).__init__() 91 | 92 | model = shufflenetv2(width_mult=1.0) 93 | 94 | if loadweights: 95 | model.load_state_dict(torch.load(model_path)) 96 | 97 | self.feature3 = nn.Sequential(model.conv1, model.maxpool, model.features[:4]) 98 | self.feature4 = nn.Sequential(model.features[4:12]) 99 | self.feature5 = nn.Sequential(model.features[12:]) 100 | 101 | #if downsample == 4: 102 | # self.feature = nn.Sequential(model.conv1, model.maxpool, model.features[:12]) 103 | #elif downsample == 5: 104 | # self.feature = nn.Sequential(model.conv1, model.maxpool, model.features) 105 | 106 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 107 | 108 | def forward(self, x): 109 | #return self.feature(x) 110 | f3 = self.feature3(x) 111 | f4 = self.feature4(f3) 112 | f5 = self.feature5(f4) 113 | return f3, f4, f5 114 | 115 | 116 | def fc_layers(reddim = 32, alignsize = 8): 117 | conv1 = nn.Sequential(nn.Conv2d(reddim, 768, kernel_size=alignsize, padding=0),nn.ReLU(inplace=True)) 118 | #conv1 = nn.Sequential(nn.Conv2d(reddim, 768, kernel_size=3, padding=1, stride=2),nn.ReLU(inplace=True), 119 | # nn.Conv2d(768, reddim, kernel_size=1, padding=0),nn.ReLU(inplace=True), 120 | # nn.Conv2d(reddim, 768, kernel_size=3, padding=1,stride=2),nn.ReLU(inplace=True), 121 | 
# nn.Conv2d(768, reddim, kernel_size=1, padding=0),nn.ReLU(inplace=True), 122 | # nn.Conv2d(reddim, 768, kernel_size=3, padding=0,stride=1),nn.ReLU(inplace=True)) 123 | #conv1 = nn.Sequential(nn.Conv2d(reddim, 768, kernel_size=5, padding=2, stride=2),nn.ReLU(inplace=True), 124 | # nn.Conv2d(768, reddim, kernel_size=1, padding=0),nn.ReLU(inplace=True), 125 | # nn.Conv2d(reddim, 768, kernel_size=5, padding=0,stride=1),nn.ReLU(inplace=True)) 126 | conv2 = nn.Sequential(nn.Conv2d(768, 128, kernel_size=1),nn.ReLU(inplace=True)) 127 | dropout = nn.Dropout(p=0.5)  # referenced in the Sequential below, so it must stay defined 128 | conv3 = nn.Conv2d(128, 1, kernel_size=1) 129 | layers = nn.Sequential(conv1, conv2, dropout, conv3) 130 | return layers 131 | 132 | 133 | class crop_model_single_scale(nn.Module): 134 | 135 | def __init__(self, alignsize = 8, reddim = 8, loadweight = True, model = None, downsample=4): 136 | super(crop_model_single_scale, self).__init__() 137 | 138 | if model == 'shufflenetv2': 139 | self.Feat_ext = shufflenetv2_base(loadweight,downsample) 140 | if downsample == 4: 141 | self.DimRed = nn.Conv2d(232, reddim, kernel_size=1, padding=0) 142 | else: 143 | self.DimRed = nn.Conv2d(464, reddim, kernel_size=1, padding=0) 144 | elif model == 'mobilenetv2': 145 | self.Feat_ext = mobilenetv2_base(loadweight,downsample) 146 | if downsample == 4: 147 | self.DimRed = nn.Conv2d(96, reddim, kernel_size=1, padding=0) 148 | else: 149 | self.DimRed = nn.Conv2d(320, reddim, kernel_size=1, padding=0) 150 | elif model == 'vgg16': 151 | self.Feat_ext = vgg_base(loadweight,downsample) 152 | self.DimRed = nn.Conv2d(512, reddim, kernel_size=1, padding=0) 153 | elif model == 'resnet50': 154 | self.Feat_ext = resnet50_base(loadweight,downsample) 155 | self.DimRed = nn.Conv2d(1024, reddim, kernel_size=1, padding=0) 156 | 157 | self.RoIAlign = RoIAlignAvg(alignsize, alignsize, 1.0/2**downsample) 158 | self.RoDAlign = RoDAlignAvg(alignsize, alignsize, 1.0/2**downsample) 159 | self.FC_layers = fc_layers(reddim*2, alignsize) 160 | 161 | #flops, params = profile(self.FC_layers, input_size=(1,reddim*2,9,9)) 162 | 163 | def forward(self, im_data, boxes): 164 | 165 | f3,base_feat,f5 = self.Feat_ext(im_data) 166 | red_feat = self.DimRed(base_feat) 167 | RoI_feat = self.RoIAlign(red_feat, boxes) 168 | RoD_feat = self.RoDAlign(red_feat, boxes) 169 | final_feat = torch.cat((RoI_feat, RoD_feat), 1) 170 | prediction = self.FC_layers(final_feat) 171 | return prediction 172 | 173 | def _init_weights(self): 174 | print('Initializing weights...') 175 | self.DimRed.apply(weights_init) 176 | self.FC_layers.apply(weights_init) 177 | 178 | 179 | class crop_model_multi_scale_individual(nn.Module): 180 | 181 | def __init__(self, alignsize = 8, reddim = 32, loadweight = True, model = None, downsample = 4): 182 | super(crop_model_multi_scale_individual, self).__init__() 183 | 184 | if model == 'shufflenetv2': 185 | self.Feat_ext1 = shufflenetv2_base(loadweight,downsample) 186 | self.Feat_ext2 = shufflenetv2_base(loadweight,downsample) 187 | self.Feat_ext3 = shufflenetv2_base(loadweight,downsample) 188 | self.DimRed = nn.Conv2d(232, reddim, kernel_size=1, padding=0) 189 | elif model == 'mobilenetv2': 190 | self.Feat_ext1 = mobilenetv2_base(loadweight,downsample) 191 | self.Feat_ext2 = mobilenetv2_base(loadweight,downsample) 192 | self.Feat_ext3 = mobilenetv2_base(loadweight,downsample) 193 | self.DimRed = nn.Conv2d(96, reddim, kernel_size=1, padding=0) 194 | elif model == 'vgg16': 195 | self.Feat_ext1 = vgg_base(loadweight,downsample) 196 | self.Feat_ext2 =
vgg_base(loadweight,downsample) 197 | self.Feat_ext3 = vgg_base(loadweight,downsample) 198 | self.DimRed = nn.Conv2d(512, reddim, kernel_size=1, padding=0) 199 | 200 | self.downsample2 = nn.UpsamplingBilinear2d(scale_factor=1.0/2.0) 201 | self.upsample2 = nn.UpsamplingBilinear2d(scale_factor=2.0) 202 | self.RoIAlign = RoIAlignAvg(alignsize, alignsize, 1.0/2**downsample) 203 | self.RoDAlign = RoDAlignAvg(alignsize, alignsize, 1.0/2**downsample) 204 | self.FC_layers = fc_layers(reddim*2, alignsize) 205 | 206 | def forward(self, im_data, boxes): 207 | 208 | _, base_feat, _ = self.Feat_ext1(im_data)  # backbones return (f3, f4, f5); use the mid-level map 209 | 210 | up_im = self.upsample2(im_data) 211 | _, up_feat, _ = self.Feat_ext2(up_im) 212 | up_feat = self.downsample2(up_feat) 213 | 214 | down_im = self.downsample2(im_data) 215 | _, down_feat, _ = self.Feat_ext3(down_im) 216 | down_feat = self.upsample2(down_feat) 217 | 218 | #cat_feat = torch.cat((base_feat,up_feat,down_feat),1) 219 | cat_feat = 0.5*base_feat + 0.35*up_feat + 0.15*down_feat 220 | red_feat = self.DimRed(cat_feat) 221 | RoI_feat = self.RoIAlign(red_feat, boxes) 222 | RoD_feat = self.RoDAlign(red_feat, boxes) 223 | final_feat = torch.cat((RoI_feat, RoD_feat), 1) 224 | prediction = self.FC_layers(final_feat) 225 | return prediction 226 | 227 | def _init_weights(self): 228 | print('Initializing weights...') 229 | self.DimRed.apply(weights_init) 230 | self.FC_layers.apply(weights_init) 231 | 232 | class crop_model_multi_scale_shared(nn.Module): 233 | 234 | def __init__(self, alignsize = 8, reddim = 32, loadweight = True, model = None, downsample = 4): 235 | super(crop_model_multi_scale_shared, self).__init__() 236 | 237 | if model == 'shufflenetv2': 238 | self.Feat_ext = shufflenetv2_base(loadweight,downsample) 239 | self.DimRed = nn.Conv2d(812, reddim, kernel_size=1, padding=0) 240 | elif model == 'mobilenetv2': 241 | self.Feat_ext = mobilenetv2_base(loadweight,downsample) 242 | self.DimRed = nn.Conv2d(448, reddim, kernel_size=1, padding=0) 243 | elif model == 'vgg16': 244 | self.Feat_ext = vgg_base(loadweight,downsample) 245 | self.DimRed = nn.Conv2d(1536, reddim, kernel_size=1, padding=0) 246 | elif model == 'resnet50': 247 | self.Feat_ext = resnet50_base(loadweight,downsample) 248 | self.DimRed = nn.Conv2d(3584, reddim, kernel_size=1, padding=0) 249 | 250 | self.downsample2 = nn.UpsamplingBilinear2d(scale_factor=1.0/2.0) 251 | self.upsample2 = nn.UpsamplingBilinear2d(scale_factor=2.0) 252 | self.RoIAlign = RoIAlignAvg(alignsize, alignsize, 1.0/2**downsample) 253 | self.RoDAlign = RoDAlignAvg(alignsize, alignsize, 1.0/2**downsample) 254 | self.FC_layers = fc_layers(reddim*2, alignsize) 255 | 256 | 257 | def forward(self, im_data, boxes): 258 | 259 | #base_feat = self.Feat_ext(im_data) 260 | 261 | #up_im = self.upsample2(im_data) 262 | #up_feat = self.Feat_ext(up_im) 263 | #up_feat = self.downsample2(up_feat) 264 | 265 | #down_im = self.downsample2(im_data) 266 | #down_feat = self.Feat_ext(down_im) 267 | #down_feat = self.upsample2(down_feat) 268 | 269 | f3,f4,f5 = self.Feat_ext(im_data) 270 | cat_feat = torch.cat((self.downsample2(f3),f4,0.5*self.upsample2(f5)),1) 271 | 272 | #cat_feat = torch.cat((base_feat,up_feat,down_feat),1) 273 | #cat_feat = base_feat + 0.35*up_feat + 0.15*down_feat 274 | red_feat = self.DimRed(cat_feat) 275 | RoI_feat = self.RoIAlign(red_feat, boxes) 276 | RoD_feat = self.RoDAlign(red_feat, boxes) 277 | final_feat = torch.cat((RoI_feat, RoD_feat), 1) 278 | prediction = self.FC_layers(final_feat) 279 | return prediction 280 | 281 | def _init_weights(self): 282 |
print('Initializing weights...') 283 | self.DimRed.apply(weights_init) 284 | self.FC_layers.apply(weights_init) 285 | 286 | def xavier(param): 287 | init.xavier_uniform_(param) 288 | 289 | 290 | def weights_init(m): 291 | if isinstance(m, nn.Conv2d): 292 | xavier(m.weight.data) 293 | m.bias.data.zero_() 294 | 295 | 296 | def build_crop_model(scale='single', alignsize=8, reddim=32, loadweight=True, model=None, downsample=4): 297 | 298 | if scale=='single': 299 | return crop_model_single_scale(alignsize, reddim, loadweight, model, downsample) 300 | elif scale=='multi': 301 | return crop_model_multi_scale_shared(alignsize, reddim, loadweight, model, downsample) 302 | 303 | 304 | 305 | -------------------------------------------------------------------------------- /demo_eval.py: -------------------------------------------------------------------------------- 1 | from croppingModel import build_crop_model 2 | from croppingDataset import setup_test_dataset 3 | import os 4 | import torch 5 | import cv2 6 | from torch.autograd import Variable 7 | import torch.backends.cudnn as cudnn 8 | import torch.utils.data as data 9 | import argparse 10 | import time 11 | 12 | 13 | def str2bool(v): 14 | return v.lower() in ("yes", "true", "t", "1") 15 | 16 | 17 | parser = argparse.ArgumentParser( 18 | description='Grid anchor based image cropping With Pytorch') 19 | parser.add_argument('--input_dir', default='dataset/GAIC/images/test', 20 | help='root directory path of testing images') 21 | parser.add_argument('--output_dir', default='dataset/test_result', 22 | help='directory for saving the cropping results') 23 | parser.add_argument('--batch_size', default=1, type=int, 24 | help='Batch size for inference') 25 | parser.add_argument('--num_workers', default=0, type=int, 26 | help='Number of workers used in dataloading') 27 | parser.add_argument('--cuda', default=True, type=str2bool, 28 | help='Use CUDA to run the model') 29 | parser.add_argument('--net_path', default='pretrained_model/mobilenet_0.625_0.583_0.553_0.525_0.785_0.762_0.748_0.723_0.783_0.806.pth', 30 | help='Path of the pretrained checkpoint') 31 | args = parser.parse_args() 32 | 33 | if not os.path.exists(args.output_dir): 34 | os.makedirs(args.output_dir) 35 | 36 | if torch.cuda.is_available(): 37 | if args.cuda: 38 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 39 | if not args.cuda: 40 | print("WARNING: It looks like you have a CUDA device, but aren't " + 41 | "using CUDA.\nRun with --cuda for optimal inference speed.") 42 | torch.set_default_tensor_type('torch.FloatTensor') 43 | 44 | else: 45 | torch.set_default_tensor_type('torch.FloatTensor') 46 | 47 | dataset = setup_test_dataset(dataset_dir = args.input_dir) 48 | 49 | 50 | def test(): 51 | 52 | net = build_crop_model(scale='multi', alignsize=9, reddim=8, loadweight=True, model='mobilenetv2',downsample=4) 53 | net.load_state_dict(torch.load(args.net_path)) 54 | net.eval() 55 | 56 | if args.cuda: 57 | net = torch.nn.DataParallel(net,device_ids=[0]) 58 | cudnn.benchmark = True 59 | net = net.cuda() 60 | 61 | 62 | data_loader = data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers,shuffle=False,pin_memory=True) 63 | 64 | for id, sample in enumerate(data_loader): 65 | imgpath = sample['imgpath'] 66 | image = sample['image'] 67 | bboxes = sample['sourceboxes'] 68 | resized_image = sample['resized_image'] 69 | tbboxes = sample['tbboxes'] 70 | 71 | if len(tbboxes['xmin'])==0: 72 | continue 73 | 74 | roi = [] 75 | 76 | for idx in range(0,len(tbboxes['xmin'])): 77 |
roi.append((0, tbboxes['xmin'][idx],tbboxes['ymin'][idx],tbboxes['xmax'][idx],tbboxes['ymax'][idx])) 78 | 79 | if args.cuda: 80 | resized_image = Variable(resized_image.cuda()) 81 | roi = Variable(torch.Tensor(roi)) 82 | else: 83 | resized_image = Variable(resized_image) 84 | roi = Variable(torch.Tensor(roi)) 85 | 86 | 87 | t0 = time.time() 88 | for r in range(0,100): 89 | out = net(resized_image,roi) 90 | t1 = time.time() 91 | print('timer: %.4f sec.' % (t1 - t0)) 92 | 93 | out = net(resized_image,roi) 94 | 95 | id_out = sorted(range(len(out)), key=lambda k: out[k], reverse = True) 96 | image = image.cpu().numpy().squeeze(0) 97 | 98 | for i in range(4): 99 | top1_box = bboxes[id_out[i]] 100 | top1_box = [top1_box[0].numpy()[0],top1_box[1].numpy()[0],top1_box[2].numpy()[0],top1_box[3].numpy()[0]] 101 | top1_crop = image[int(top1_box[0]):int(top1_box[2]),int(top1_box[1]):int(top1_box[3])] 102 | imgname = imgpath[0].split('/')[-1] 103 | cv2.imwrite(args.output_dir + '/' + imgname[:-4] + '_' +str(i) + imgname[-4:],top1_crop[:,:,(2, 1, 0)]) 104 | 105 | 106 | if __name__ == '__main__': 107 | test() 108 | -------------------------------------------------------------------------------- /mobilenetv2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Creates a MobileNetV2 Model as defined in: 3 | Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. (2018). 4 | MobileNetV2: Inverted Residuals and Linear Bottlenecks 5 | arXiv preprint arXiv:1801.04381. 6 | import from https://github.com/tonylins/pytorch-mobilenet-v2 7 | """ 8 | 9 | import torch.nn as nn 10 | import math 11 | 12 | __all__ = ['mobilenetv2'] 13 | 14 | 15 | def _make_divisible(v, divisor, min_value=None): 16 | """ 17 | This function is taken from the original tf repo. 18 | It ensures that all layers have a channel number that is divisible by 8 19 | It can be seen here: 20 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 21 | :param v: 22 | :param divisor: 23 | :param min_value: 24 | :return: 25 | """ 26 | if min_value is None: 27 | min_value = divisor 28 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 29 | # Make sure that round down does not go down by more than 10%. 
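# (Illustrative note, not part of the original source.) Worked example of the
# check below: _make_divisible(27, 8) first rounds to int(27 + 4) // 8 * 8 = 24,
# and since 24 < 0.9 * 27 = 24.3 the branch adds one divisor, returning 32;
# _make_divisible(30, 8) rounds straight to 32 and is returned unchanged.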
30 | if new_v < 0.9 * v: 31 | new_v += divisor 32 | return new_v 33 | 34 | 35 | def conv_3x3_bn(inp, oup, stride): 36 | return nn.Sequential( 37 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 38 | nn.BatchNorm2d(oup), 39 | nn.ReLU6(inplace=True) 40 | ) 41 | 42 | 43 | def conv_1x1_bn(inp, oup): 44 | return nn.Sequential( 45 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 46 | nn.BatchNorm2d(oup), 47 | nn.ReLU6(inplace=True) 48 | ) 49 | 50 | 51 | class InvertedResidual(nn.Module): 52 | def __init__(self, inp, oup, stride, expand_ratio): 53 | super(InvertedResidual, self).__init__() 54 | assert stride in [1, 2] 55 | 56 | hidden_dim = int(round(inp * expand_ratio)) 57 | self.identity = stride == 1 and inp == oup 58 | 59 | if expand_ratio == 1: 60 | self.conv = nn.Sequential( 61 | # dw 62 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 63 | nn.BatchNorm2d(hidden_dim), 64 | nn.ReLU6(inplace=True), 65 | # pw-linear 66 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 67 | nn.BatchNorm2d(oup), 68 | ) 69 | else: 70 | self.conv = nn.Sequential( 71 | # pw 72 | nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), 73 | nn.BatchNorm2d(hidden_dim), 74 | nn.ReLU6(inplace=True), 75 | # dw 76 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 77 | nn.BatchNorm2d(hidden_dim), 78 | nn.ReLU6(inplace=True), 79 | # pw-linear 80 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 81 | nn.BatchNorm2d(oup), 82 | ) 83 | 84 | def forward(self, x): 85 | if self.identity: 86 | return x + self.conv(x) 87 | else: 88 | return self.conv(x) 89 | 90 | 91 | class MobileNetV2(nn.Module): 92 | def __init__(self, num_classes=1000, input_size=224, width_mult=1.): 93 | super(MobileNetV2, self).__init__() 94 | # setting of inverted residual blocks 95 | self.cfgs = [ 96 | # t, c, n, s 97 | [1, 16, 1, 1], 98 | [6, 24, 2, 2], 99 | [6, 32, 3, 2], 100 | [6, 64, 4, 2], 101 | [6, 96, 3, 1], 102 | [6, 160, 3, 2], 103 | [6, 320, 1, 1], 104 | ] 105 | 106 | # building first layer 107 | assert input_size % 32 == 0 108 | input_channel = _make_divisible(32 * width_mult, 8) 109 | layers = [conv_3x3_bn(3, input_channel, 2)] 110 | # building inverted residual blocks 111 | block = InvertedResidual 112 | for t, c, n, s in self.cfgs: 113 | output_channel = _make_divisible(c * width_mult, 8) 114 | layers.append(block(input_channel, output_channel, s, t)) 115 | input_channel = output_channel 116 | for i in range(1, n): 117 | layers.append(block(input_channel, output_channel, 1, t)) 118 | input_channel = output_channel 119 | self.features = nn.Sequential(*layers) 120 | # building last several layers 121 | output_channel = _make_divisible(1280 * width_mult, 8) if width_mult > 1.0 else 1280 122 | self.conv = conv_1x1_bn(input_channel, output_channel) 123 | self.avgpool = nn.AvgPool2d(input_size // 32, stride=1) 124 | self.classifier = nn.Linear(output_channel, num_classes) 125 | 126 | self._initialize_weights() 127 | 128 | def forward(self, x): 129 | x = self.features(x) 130 | x = self.conv(x) 131 | x = self.avgpool(x) 132 | x = x.view(x.size(0), -1) 133 | x = self.classifier(x) 134 | return x 135 | 136 | def _initialize_weights(self): 137 | for m in self.modules(): 138 | if isinstance(m, nn.Conv2d): 139 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 140 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 141 | if m.bias is not None: 142 | m.bias.data.zero_() 143 | elif isinstance(m, nn.BatchNorm2d): 144 | m.weight.data.fill_(1) 145 | m.bias.data.zero_() 146 | elif isinstance(m, nn.Linear): 147 | n = m.weight.size(1) 148 | m.weight.data.normal_(0, 0.01) 149 | m.bias.data.zero_() 150 | 151 | def mobilenetv2(**kwargs): 152 | """ 153 | Constructs a MobileNet V2 model 154 | """ 155 | return MobileNetV2(**kwargs) 156 | 157 | -------------------------------------------------------------------------------- /rod_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/__init__.pyc -------------------------------------------------------------------------------- /rod_align/_ext/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/_ext/__init__.pyc -------------------------------------------------------------------------------- /rod_align/_ext/rod_align/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._rod_align import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /rod_align/_ext/rod_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/_ext/rod_align/__init__.pyc -------------------------------------------------------------------------------- /rod_align/_ext/rod_align/_rod_align.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/_ext/rod_align/_rod_align.so -------------------------------------------------------------------------------- /rod_align/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 | 6 | sources = ['src/rod_align.c'] 7 | headers = ['src/rod_align.h'] 8 | extra_objects = [] 9 | #sources = [] 10 | #headers = [] 11 | defines = [] 12 | with_cuda = False 13 | 14 | this_file = os.path.dirname(os.path.realpath(__file__)) 15 | print(this_file) 16 | 17 | if torch.cuda.is_available(): 18 | print('Including CUDA code.') 19 | sources += ['src/rod_align_cuda.c'] 20 | headers += ['src/rod_align_cuda.h'] 21 | defines += [('WITH_CUDA', None)] 22 | with_cuda = True 23 | 24 | extra_objects = ['src/rod_align_kernel.cu.o'] 25 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 26 | 27 | ffi = create_extension( 28 | '_ext.rod_align', 29 | headers=headers, 30 | sources=sources, 31 | define_macros=defines, 32 | relative_to=__file__, 33 | with_cuda=with_cuda, 34 | 
extra_objects=extra_objects 35 | ) 36 | 37 | if __name__ == '__main__': 38 | ffi.build() 39 | -------------------------------------------------------------------------------- /rod_align/functions/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/functions/__init__.pyc -------------------------------------------------------------------------------- /rod_align/functions/rod_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from .._ext import rod_align 4 | 5 | 6 | # TODO use save_for_backward instead 7 | class RoDAlignFunction(Function): 8 | def __init__(self, aligned_height, aligned_width, spatial_scale): 9 | self.aligned_width = int(aligned_width) 10 | self.aligned_height = int(aligned_height) 11 | self.spatial_scale = float(spatial_scale) 12 | self.rois = None 13 | self.feature_size = None 14 | 15 | def forward(self, features, rois): 16 | self.rois = rois 17 | self.feature_size = features.size() 18 | 19 | batch_size, num_channels, data_height, data_width = features.size() 20 | num_rois = rois.size(0) 21 | 22 | output = features.new(num_rois, num_channels, self.aligned_height, self.aligned_width).zero_() 23 | if features.is_cuda: 24 | rod_align.rod_align_forward_cuda(self.aligned_height, 25 | self.aligned_width, 26 | self.spatial_scale, features, 27 | rois, output) 28 | else: 29 | rod_align.rod_align_forward(self.aligned_height, 30 | self.aligned_width, 31 | self.spatial_scale, features, 32 | rois, output) 33 | # raise NotImplementedError 34 | 35 | return output 36 | 37 | def backward(self, grad_output): 38 | assert(self.feature_size is not None and grad_output.is_cuda) 39 | 40 | batch_size, num_channels, data_height, data_width = self.feature_size 41 | 42 | grad_input = self.rois.new(batch_size, num_channels, data_height, 43 | data_width).zero_() 44 | rod_align.rod_align_backward_cuda(self.aligned_height, 45 | self.aligned_width, 46 | self.spatial_scale, grad_output, 47 | self.rois, grad_input) 48 | 49 | # print grad_input 50 | 51 | return grad_input, None 52 | -------------------------------------------------------------------------------- /rod_align/functions/rod_align.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/functions/rod_align.pyc -------------------------------------------------------------------------------- /rod_align/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling my_lib kernels by nvcc..." 
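# Note (added, not in the original script): -arch=sm_52 on the next numbered
# line targets compute capability 5.2 (Maxwell-era GPUs), an assumption baked
# into this build. If compilation or kernel launch fails on your hardware,
# substitute the value for your GPU, e.g. sm_61 for Pascal or sm_70 for Volta.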
7 | nvcc -c -o rod_align_kernel.cu.o rod_align_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /rod_align/modules/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/modules/__init__.pyc -------------------------------------------------------------------------------- /rod_align/modules/rod_align.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from torch.nn.functional import avg_pool2d, max_pool2d 3 | from ..functions.rod_align import RoDAlignFunction 4 | 5 | 6 | class RoDAlign(Module): 7 | def __init__(self, aligned_height, aligned_width, spatial_scale): 8 | super(RoDAlign, self).__init__() 9 | 10 | self.aligned_width = int(aligned_width) 11 | self.aligned_height = int(aligned_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | return RoDAlignFunction(self.aligned_height, self.aligned_width, 16 | self.spatial_scale)(features, rois) 17 | 18 | class RoDAlignAvg(Module): 19 | def __init__(self, aligned_height, aligned_width, spatial_scale): 20 | super(RoDAlignAvg, self).__init__() 21 | 22 | self.aligned_width = int(aligned_width) 23 | self.aligned_height = int(aligned_height) 24 | self.spatial_scale = float(spatial_scale) 25 | 26 | def forward(self, features, rois): 27 | x = RoDAlignFunction(self.aligned_height+1, self.aligned_width+1, 28 | self.spatial_scale)(features, rois) 29 | return avg_pool2d(x, kernel_size=2, stride=1) 30 | 31 | class RoDAlignMax(Module): 32 | def __init__(self, aligned_height, aligned_width, spatial_scale): 33 | super(RoDAlignMax, self).__init__() 34 | 35 | self.aligned_width = int(aligned_width) 36 | self.aligned_height = int(aligned_height) 37 | self.spatial_scale = float(spatial_scale) 38 | 39 | def forward(self, features, rois): 40 | x = RoDAlignFunction(self.aligned_height+1, self.aligned_width+1, 41 | self.spatial_scale)(features, rois) 42 | return max_pool2d(x, kernel_size=2, stride=1) 43 | -------------------------------------------------------------------------------- /rod_align/modules/rod_align.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/modules/rod_align.pyc -------------------------------------------------------------------------------- /rod_align/src/rod_align.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | void RODAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 7 | const int height, const int width, const int channels, 8 | const int aligned_height, const int aligned_width, const float * bottom_rois, 9 | float* top_data); 10 | 11 | void RODAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 12 | const int height, const int width, const int channels, 13 | const int aligned_height, const int aligned_width, const float * bottom_rois, 14 | float* top_data); 15 | 16 | int rod_align_forward(int aligned_height, int aligned_width, float spatial_scale, 17 | THFloatTensor * features, THFloatTensor * 
rois, THFloatTensor * output) 18 | { 19 | //Grab the input tensor 20 | float * data_flat = THFloatTensor_data(features); 21 | float * rois_flat = THFloatTensor_data(rois); 22 | 23 | float * output_flat = THFloatTensor_data(output); 24 | 25 | // Number of ROIs 26 | int num_rois = THFloatTensor_size(rois, 0); 27 | int size_rois = THFloatTensor_size(rois, 1); 28 | if (size_rois != 5) 29 | { 30 | return 0; 31 | } 32 | 33 | // data height 34 | int data_height = THFloatTensor_size(features, 2); 35 | // data width 36 | int data_width = THFloatTensor_size(features, 3); 37 | // Number of channels 38 | int num_channels = THFloatTensor_size(features, 1); 39 | 40 | // do ROIAlignForward 41 | RODAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width, num_channels, 42 | aligned_height, aligned_width, rois_flat, output_flat); 43 | 44 | return 1; 45 | } 46 | 47 | int rod_align_backward(int aligned_height, int aligned_width, float spatial_scale, 48 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad) 49 | { 50 | //Grab the input tensor 51 | float * top_grad_flat = THFloatTensor_data(top_grad); 52 | float * rois_flat = THFloatTensor_data(rois); 53 | 54 | float * bottom_grad_flat = THFloatTensor_data(bottom_grad); 55 | 56 | // Number of ROIs 57 | int num_rois = THFloatTensor_size(rois, 0); 58 | int size_rois = THFloatTensor_size(rois, 1); 59 | if (size_rois != 5) 60 | { 61 | return 0; 62 | } 63 | 64 | // batch size 65 | // int batch_size = THFloatTensor_size(bottom_grad, 0); 66 | // data height 67 | int data_height = THFloatTensor_size(bottom_grad, 2); 68 | // data width 69 | int data_width = THFloatTensor_size(bottom_grad, 3); 70 | // Number of channels 71 | int num_channels = THFloatTensor_size(bottom_grad, 1); 72 | 73 | // do ROIAlignBackward 74 | RODAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height, 75 | data_width, num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat); 76 | 77 | return 1; 78 | } 79 | 80 | void RODAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 81 | const int height, const int width, const int channels, 82 | const int aligned_height, const int aligned_width, const float * bottom_rois, 83 | float* top_data) 84 | { 85 | const int output_size = num_rois * aligned_height * aligned_width * channels; 86 | 87 | int idx = 0; 88 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 89 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 90 | for (idx = 0; idx < output_size; ++idx) 91 | { 92 | // (n, c, ph, pw) is an element in the aligned output 93 | int pw = idx % aligned_width; 94 | int ph = (idx / aligned_width) % aligned_height; 95 | int c = (idx / aligned_width / aligned_height) % channels; 96 | int n = idx / aligned_width / aligned_height / channels; 97 | 98 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 99 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 100 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 101 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 102 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 103 | 104 | 105 | float h = (float)(ph) * bin_size_h; 106 | float w = (float)(pw) * bin_size_w; 107 | 108 | int hstart = fminf(floor(h), height - 2); 109 | int wstart = fminf(floor(w), width - 2); 110 | 111 | int img_start = roi_batch_ind * channels * height * width; 112 | 113 | // bilinear interpolation 114 | if (h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= 
roi_end_w){ 115 | top_data[idx] = 0.; 116 | } else { 117 | float h_ratio = h - (float)(hstart); 118 | float w_ratio = w - (float)(wstart); 119 | int upleft = img_start + (c * height + hstart) * width + wstart; 120 | int upright = upleft + 1; 121 | int downleft = upleft + width; 122 | int downright = downleft + 1; 123 | 124 | top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 125 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 126 | + bottom_data[downleft] * h_ratio * (1. - w_ratio) 127 | + bottom_data[downright] * h_ratio * w_ratio; 128 | } 129 | } 130 | } 131 | 132 | void RODAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 133 | const int height, const int width, const int channels, 134 | const int aligned_height, const int aligned_width, const float * bottom_rois, 135 | float* bottom_diff) 136 | { 137 | const int output_size = num_rois * aligned_height * aligned_width * channels; 138 | 139 | int idx = 0; 140 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 141 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 142 | for (idx = 0; idx < output_size; ++idx) 143 | { 144 | // (n, c, ph, pw) is an element in the aligned output 145 | int pw = idx % aligned_width; 146 | int ph = (idx / aligned_width) % aligned_height; 147 | int c = (idx / aligned_width / aligned_height) % channels; 148 | int n = idx / aligned_width / aligned_height / channels; 149 | 150 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 151 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 152 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 153 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 154 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 155 | 156 | float h = (float)(ph) * bin_size_h; 157 | float w = (float)(pw) * bin_size_w; 158 | 159 | int hstart = fminf(floor(h), height - 2); 160 | int wstart = fminf(floor(w), width - 2); 161 | 162 | int img_start = roi_batch_ind * channels * height * width; 163 | 164 | // bilinear interpolation 165 | if (!(h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= roi_end_w)) { 166 | float h_ratio = h - (float)(hstart); 167 | float w_ratio = w - (float)(wstart); 168 | int upleft = img_start + (c * height + hstart) * width + wstart; 169 | int upright = upleft + 1; 170 | int downleft = upleft + width; 171 | int downright = downleft + 1; 172 | 173 | bottom_diff[upleft] += top_diff[idx] * (1. - h_ratio) * (1. - w_ratio); 174 | bottom_diff[upright] += top_diff[idx] * (1. - h_ratio) * w_ratio; 175 | bottom_diff[downleft] += top_diff[idx] * h_ratio * (1. 
- w_ratio); 176 | bottom_diff[downright] += top_diff[idx] * h_ratio * w_ratio; 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /rod_align/src/rod_align.h: -------------------------------------------------------------------------------- 1 | int rod_align_forward(int aligned_height, int aligned_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); 3 | 4 | int rod_align_backward(int aligned_height, int aligned_width, float spatial_scale, 5 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "rod_align_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int rod_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 16 | // Number of ROIs 17 | int num_rois = THCudaTensor_size(state, rois, 0); 18 | int size_rois = THCudaTensor_size(state, rois, 1); 19 | if (size_rois != 5) 20 | { 21 | return 0; 22 | } 23 | 24 | // data height 25 | int data_height = THCudaTensor_size(state, features, 2); 26 | // data width 27 | int data_width = THCudaTensor_size(state, features, 3); 28 | // Number of channels 29 | int num_channels = THCudaTensor_size(state, features, 1); 30 | 31 | cudaStream_t stream = THCState_getCurrentStream(state); 32 | 33 | RODAlignForwardLaucher( 34 | data_flat, spatial_scale, num_rois, data_height, 35 | data_width, num_channels, aligned_height, 36 | aligned_width, rois_flat, 37 | output_flat, stream); 38 | 39 | return 1; 40 | } 41 | 42 | int rod_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad) 44 | { 45 | // Grab the input tensor 46 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 47 | float * rois_flat = THCudaTensor_data(state, rois); 48 | 49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois = THCudaTensor_size(state, rois, 1); 54 | if (size_rois != 5) 55 | { 56 | return 0; 57 | } 58 | 59 | // batch size 60 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 61 | // data height 62 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 63 | // data width 64 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 65 | // Number of channels 66 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 67 | 68 | cudaStream_t stream = THCState_getCurrentStream(state); 69 | RODAlignBackwardLaucher( 70 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 71 | data_width, num_channels, aligned_height, 72 | aligned_width, rois_flat, 73 | bottom_grad_flat, stream); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_cuda.h: -------------------------------------------------------------------------------- 1 | int rod_align_forward_cuda(int 
aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int rod_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_kernel.cu: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include "rod_align_kernel.h" 9 | 10 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 12 | i += blockDim.x * gridDim.x) 13 | 14 | 15 | __global__ void RODAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width, 16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) { 17 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 18 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 19 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 20 | // (n, c, ph, pw) is an element in the aligned output 21 | // int n = index; 22 | // int pw = n % aligned_width; 23 | // n /= aligned_width; 24 | // int ph = n % aligned_height; 25 | // n /= aligned_height; 26 | // int c = n % channels; 27 | // n /= channels; 28 | 29 | int pw = index % aligned_width; 30 | int ph = (index / aligned_width) % aligned_height; 31 | int c = (index / aligned_width / aligned_height) % channels; 32 | int n = index / aligned_width / aligned_height / channels; 33 | 34 | // bottom_rois += n * 5; 35 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 36 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 37 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 38 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 39 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 40 | 41 | 42 | float h = (float)(ph) * bin_size_h; 43 | float w = (float)(pw) * bin_size_w; 44 | 45 | int hstart = fminf(floor(h), height - 2); 46 | int wstart = fminf(floor(w), width - 2); 47 | 48 | int img_start = roi_batch_ind * channels * height * width; 49 | 50 | // bilinear interpolation 51 | if (h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= roi_end_w){ 52 | top_data[index] = 0.; 53 | } else { 54 | float h_ratio = h - (float)(hstart); 55 | float w_ratio = w - (float)(wstart); 56 | int upleft = img_start + (c * height + hstart) * width + wstart; 57 | int upright = upleft + 1; 58 | int downleft = upleft + width; 59 | int downright = downleft + 1; 60 | 61 | top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 62 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 63 | + bottom_data[downleft] * h_ratio * (1. 
- w_ratio) 64 | + bottom_data[downright] * h_ratio * w_ratio; 65 | } 66 | } 67 | } 68 | 69 | 70 | int RODAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width, 71 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) { 72 | const int kThreadsPerBlock = 1024; 73 | const int output_size = num_rois * aligned_height * aligned_width * channels; 74 | cudaError_t err; 75 | 76 | 77 | RODAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 78 | output_size, bottom_data, spatial_scale, height, width, channels, 79 | aligned_height, aligned_width, bottom_rois, top_data); 80 | 81 | err = cudaGetLastError(); 82 | if(cudaSuccess != err) { 83 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 84 | exit( -1 ); 85 | } 86 | 87 | return 1; 88 | } 89 | 90 | 91 | __global__ void RODAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width, 92 | const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) { 93 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 94 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 95 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 96 | 97 | // (n, c, ph, pw) is an element in the aligned output 98 | int pw = index % aligned_width; 99 | int ph = (index / aligned_width) % aligned_height; 100 | int c = (index / aligned_width / aligned_height) % channels; 101 | int n = index / aligned_width / aligned_height / channels; 102 | 103 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 104 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 105 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 106 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 107 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 108 | 109 | 110 | float h = (float)(ph) * bin_size_h; 111 | float w = (float)(pw) * bin_size_w; 112 | 113 | int hstart = fminf(floor(h), height - 2); 114 | int wstart = fminf(floor(w), width - 2); 115 | 116 | int img_start = roi_batch_ind * channels * height * width; 117 | 118 | // bilinear interpolation 119 | if (!(h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= roi_end_w)) { 120 | float h_ratio = h - (float)(hstart); 121 | float w_ratio = w - (float)(wstart); 122 | int upleft = img_start + (c * height + hstart) * width + wstart; 123 | int upright = upleft + 1; 124 | int downleft = upleft + width; 125 | int downright = downleft + 1; 126 | 127 | atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio)); 128 | atomicAdd(bottom_diff + upright, top_diff[index] * (1. 
- h_ratio) * w_ratio); 129 | atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio)); 130 | atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio); 131 | } 132 | } 133 | } 134 | 135 | int RODAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width, 136 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) { 137 | const int kThreadsPerBlock = 1024; 138 | const int output_size = num_rois * aligned_height * aligned_width * channels; 139 | cudaError_t err; 140 | 141 | RODAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 142 | output_size, top_diff, spatial_scale, height, width, channels, 143 | aligned_height, aligned_width, bottom_diff, bottom_rois); 144 | 145 | err = cudaGetLastError(); 146 | if(cudaSuccess != err) { 147 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 148 | exit( -1 ); 149 | } 150 | 151 | return 1; 152 | } 153 | 154 | 155 | #ifdef __cplusplus 156 | } 157 | #endif 158 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_kernel.cu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/src/rod_align_kernel.cu.o -------------------------------------------------------------------------------- /rod_align/src/rod_align_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _ROD_ALIGN_KERNEL 2 | #define _ROD_ALIGN_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | __global__ void RODAlignForward(const int nthreads, const float* bottom_data, 9 | const float spatial_scale, const int height, const int width, 10 | const int channels, const int aligned_height, const int aligned_width, 11 | const float* bottom_rois, float* top_data); 12 | 13 | int RODAlignForwardLaucher( 14 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 15 | const int width, const int channels, const int aligned_height, 16 | const int aligned_width, const float* bottom_rois, 17 | float* top_data, cudaStream_t stream); 18 | 19 | __global__ void RODAlignBackward(const int nthreads, const float* top_diff, 20 | const float spatial_scale, const int height, const int width, 21 | const int channels, const int aligned_height, const int aligned_width, 22 | float* bottom_diff, const float* bottom_rois); 23 | 24 | int RODAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 25 | const int height, const int width, const int channels, const int aligned_height, 26 | const int aligned_width, const float* bottom_rois, 27 | float* bottom_diff, cudaStream_t stream); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /roi_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/__init__.pyc -------------------------------------------------------------------------------- 
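Before the roi_align listing continues: the rod_align sources above implement RoD ("region of discard") pooling, the counterpart of RoI pooling used by this model. A fixed aligned_height x aligned_width grid is sampled over the *entire* feature map, samples falling inside the ROI are zeroed, and the rest are bilinearly interpolated, so the op summarizes everything a candidate crop would throw away. The sketch below is added purely as a reading aid for the C/CUDA code; it assumes a single image, one (xmin, ymin, xmax, ymax) box, and a recent PyTorch with `grid_sample`/`align_corners` (the shipped `_rod_align.so` remains the authoritative implementation for the PyTorch 0.4 build).

```python
import torch
import torch.nn.functional as F

def rod_align_reference(features, box, out_size, spatial_scale):
    """Pure-PyTorch sketch of RODAlignForwardCpu (illustration only).

    features: (1, C, H, W) feature map; box: (xmin, ymin, xmax, ymax)
    in input-image coordinates, scaled to feature coordinates below.
    """
    _, _, h, w = features.shape
    x0, y0, x1, y1 = [float(v) * spatial_scale for v in box]

    # Sample positions over the whole map, mirroring
    # h = ph * bin_size_h with bin_size_h = (H - 1.001) / (out_size - 1).
    ys = torch.linspace(0, h - 1.001, out_size)
    xs = torch.linspace(0, w - 1.001, out_size)
    gy, gx = torch.meshgrid(ys, xs, indexing='ij')

    # grid_sample expects coordinates normalized to [-1, 1].
    grid = torch.stack((gx / (w - 1) * 2 - 1, gy / (h - 1) * 2 - 1), dim=-1)
    out = F.grid_sample(features, grid.unsqueeze(0), align_corners=True)

    # Discard (zero out) every sample that falls inside the box.
    inside = (gy >= y0) & (gy <= y1) & (gx >= x0) & (gx <= x1)
    return out.masked_fill(inside.view(1, 1, out_size, out_size), 0.)
```

For the real module, see rod_align/modules/rod_align.py above: RoDAlignAvg wraps the compiled kernels, samples an (alignsize+1) x (alignsize+1) grid, then average-pools it down to alignsize x alignsize.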
/roi_align/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/_ext/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/__init__.pyc -------------------------------------------------------------------------------- /roi_align/_ext/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_align import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/roi_align/__init__.pyc -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/roi_align/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/_roi_align.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/roi_align/_roi_align.so -------------------------------------------------------------------------------- /roi_align/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 | 6 | sources = ['src/roi_align.c'] 7 | headers = ['src/roi_align.h'] 8 | extra_objects = [] 9 | #sources = [] 10 | #headers = [] 11 | defines = [] 12 | with_cuda = False 13 | 14 | this_file = os.path.dirname(os.path.realpath(__file__)) 15 | print(this_file) 16 | 17 | if torch.cuda.is_available(): 18 | print('Including CUDA code.') 19 | sources += ['src/roi_align_cuda.c'] 20 | headers += ['src/roi_align_cuda.h'] 21 | defines += [('WITH_CUDA', None)] 22 | with_cuda = True 23 | 
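# Note (added, not in the original script): the create_extension call below
# comes from torch.utils.ffi, which only exists in the PyTorch 0.4.x line this
# repository targets; PyTorch >= 1.0 removed it, so this build script fails
# there. For newer PyTorch, use the "PyTorch 1.0 or later" port linked in the
# README, or rewrite the binding with torch.utils.cpp_extension.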
24 | extra_objects = ['src/roi_align_kernel.cu.o'] 25 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 26 | 27 | ffi = create_extension( 28 | '_ext.roi_align', 29 | headers=headers, 30 | sources=sources, 31 | define_macros=defines, 32 | relative_to=__file__, 33 | with_cuda=with_cuda, 34 | extra_objects=extra_objects 35 | ) 36 | 37 | if __name__ == '__main__': 38 | ffi.build() 39 | -------------------------------------------------------------------------------- /roi_align/functions/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/__init__.pyc -------------------------------------------------------------------------------- /roi_align/functions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/functions/__pycache__/roi_align.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/__pycache__/roi_align.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/functions/roi_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from .._ext import roi_align 4 | 5 | 6 | # TODO use save_for_backward instead 7 | class RoIAlignFunction(Function): 8 | def __init__(self, aligned_height, aligned_width, spatial_scale): 9 | self.aligned_width = int(aligned_width) 10 | self.aligned_height = int(aligned_height) 11 | self.spatial_scale = float(spatial_scale) 12 | self.rois = None 13 | self.feature_size = None 14 | 15 | def forward(self, features, rois): 16 | self.rois = rois 17 | self.feature_size = features.size() 18 | 19 | batch_size, num_channels, data_height, data_width = features.size() 20 | num_rois = rois.size(0) 21 | 22 | output = features.new(num_rois, num_channels, self.aligned_height, self.aligned_width).zero_() 23 | if features.is_cuda: 24 | roi_align.roi_align_forward_cuda(self.aligned_height, 25 | self.aligned_width, 26 | self.spatial_scale, features, 27 | rois, output) 28 | else: 29 | roi_align.roi_align_forward(self.aligned_height, 30 | self.aligned_width, 31 | self.spatial_scale, features, 32 | rois, output) 33 | # raise NotImplementedError 34 | 35 | return output 36 | 37 | def backward(self, grad_output): 38 | assert(self.feature_size is not None and grad_output.is_cuda) 39 | 40 | batch_size, num_channels, data_height, data_width = self.feature_size 41 | 42 | grad_input = self.rois.new(batch_size, num_channels, data_height, 43 | data_width).zero_() 44 | roi_align.roi_align_backward_cuda(self.aligned_height, 45 | self.aligned_width, 46 | self.spatial_scale, grad_output, 47 | self.rois, grad_input) 48 | 49 | # print grad_input 50 | 51 | return grad_input, None 52 | -------------------------------------------------------------------------------- /roi_align/functions/roi_align.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/roi_align.pyc -------------------------------------------------------------------------------- /roi_align/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling my_lib kernels by nvcc..." 7 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /roi_align/modules/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/__init__.pyc -------------------------------------------------------------------------------- /roi_align/modules/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/modules/__pycache__/roi_align.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/__pycache__/roi_align.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/modules/roi_align.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from torch.nn.functional import avg_pool2d, max_pool2d 3 | from ..functions.roi_align import RoIAlignFunction 4 | 5 | 6 | class RoIAlign(Module): 7 | def __init__(self, aligned_height, aligned_width, spatial_scale): 8 | super(RoIAlign, self).__init__() 9 | 10 | self.aligned_width = int(aligned_width) 11 | self.aligned_height = int(aligned_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | return RoIAlignFunction(self.aligned_height, self.aligned_width, 16 | self.spatial_scale)(features, rois) 17 | 18 | class RoIAlignAvg(Module): 19 | def __init__(self, aligned_height, aligned_width, spatial_scale): 20 | super(RoIAlignAvg, self).__init__() 21 | 22 | self.aligned_width = int(aligned_width) 23 | self.aligned_height = int(aligned_height) 24 | self.spatial_scale = float(spatial_scale) 25 | 26 | def forward(self, features, rois): 27 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 28 | self.spatial_scale)(features, rois) 29 | return avg_pool2d(x, kernel_size=2, stride=1) 30 | 31 | class RoIAlignMax(Module): 32 | def __init__(self, aligned_height, aligned_width, spatial_scale): 33 | super(RoIAlignMax, self).__init__() 34 | 35 | self.aligned_width = int(aligned_width) 36 | self.aligned_height = int(aligned_height) 37 | self.spatial_scale = float(spatial_scale) 38 | 39 | def forward(self, features, rois): 40 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 41 | 
self.spatial_scale)(features, rois) 42 | return max_pool2d(x, kernel_size=2, stride=1) 43 | -------------------------------------------------------------------------------- /roi_align/modules/roi_align.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/roi_align.pyc -------------------------------------------------------------------------------- /roi_align/src/roi_align.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 7 | const int height, const int width, const int channels, 8 | const int aligned_height, const int aligned_width, const float * bottom_rois, 9 | float* top_data); 10 | 11 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 12 | const int height, const int width, const int channels, 13 | const int aligned_height, const int aligned_width, const float * bottom_rois, 14 | float* top_data); 15 | 16 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale, 17 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output) 18 | { 19 | //Grab the input tensor 20 | float * data_flat = THFloatTensor_data(features); 21 | float * rois_flat = THFloatTensor_data(rois); 22 | 23 | float * output_flat = THFloatTensor_data(output); 24 | 25 | // Number of ROIs 26 | int num_rois = THFloatTensor_size(rois, 0); 27 | int size_rois = THFloatTensor_size(rois, 1); 28 | if (size_rois != 5) 29 | { 30 | return 0; 31 | } 32 | 33 | // data height 34 | int data_height = THFloatTensor_size(features, 2); 35 | // data width 36 | int data_width = THFloatTensor_size(features, 3); 37 | // Number of channels 38 | int num_channels = THFloatTensor_size(features, 1); 39 | 40 | // do ROIAlignForward 41 | ROIAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width, num_channels, 42 | aligned_height, aligned_width, rois_flat, output_flat); 43 | 44 | return 1; 45 | } 46 | 47 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale, 48 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad) 49 | { 50 | //Grab the input tensor 51 | float * top_grad_flat = THFloatTensor_data(top_grad); 52 | float * rois_flat = THFloatTensor_data(rois); 53 | 54 | float * bottom_grad_flat = THFloatTensor_data(bottom_grad); 55 | 56 | // Number of ROIs 57 | int num_rois = THFloatTensor_size(rois, 0); 58 | int size_rois = THFloatTensor_size(rois, 1); 59 | if (size_rois != 5) 60 | { 61 | return 0; 62 | } 63 | 64 | // batch size 65 | // int batch_size = THFloatTensor_size(bottom_grad, 0); 66 | // data height 67 | int data_height = THFloatTensor_size(bottom_grad, 2); 68 | // data width 69 | int data_width = THFloatTensor_size(bottom_grad, 3); 70 | // Number of channels 71 | int num_channels = THFloatTensor_size(bottom_grad, 1); 72 | 73 | // do ROIAlignBackward 74 | ROIAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height, 75 | data_width, num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat); 76 | 77 | return 1; 78 | } 79 | 80 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 81 | const int height, const int width, const int channels, 82 | const int aligned_height, 
const int aligned_width, const float * bottom_rois, 83 | float* top_data) 84 | { 85 | const int output_size = num_rois * aligned_height * aligned_width * channels; 86 | 87 | int idx = 0; 88 | for (idx = 0; idx < output_size; ++idx) 89 | { 90 | // (n, c, ph, pw) is an element in the aligned output 91 | int pw = idx % aligned_width; 92 | int ph = (idx / aligned_width) % aligned_height; 93 | int c = (idx / aligned_width / aligned_height) % channels; 94 | int n = idx / aligned_width / aligned_height / channels; 95 | 96 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 97 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 98 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 99 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 100 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 101 | 102 | // Force malformed ROI to be 1x1 103 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 104 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 105 | float bin_size_h = roi_height / (aligned_height - 1.); 106 | float bin_size_w = roi_width / (aligned_width - 1.); 107 | 108 | float h = (float)(ph) * bin_size_h + roi_start_h; 109 | float w = (float)(pw) * bin_size_w + roi_start_w; 110 | 111 | int hstart = fminf(floor(h), height - 2); 112 | int wstart = fminf(floor(w), width - 2); 113 | 114 | int img_start = roi_batch_ind * channels * height * width; 115 | 116 | // bilinear interpolation 117 | if (h < 0 || h >= height || w < 0 || w >= width) 118 | { 119 | top_data[idx] = 0.; 120 | } 121 | else 122 | { 123 | float h_ratio = h - (float)(hstart); 124 | float w_ratio = w - (float)(wstart); 125 | int upleft = img_start + (c * height + hstart) * width + wstart; 126 | int upright = upleft + 1; 127 | int downleft = upleft + width; 128 | int downright = downleft + 1; 129 | 130 | top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 131 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 132 | + bottom_data[downleft] * h_ratio * (1. 
- w_ratio) 133 | + bottom_data[downright] * h_ratio * w_ratio; 134 | } 135 | } 136 | } 137 | 138 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 139 | const int height, const int width, const int channels, 140 | const int aligned_height, const int aligned_width, const float * bottom_rois, 141 | float* bottom_diff) 142 | { 143 | const int output_size = num_rois * aligned_height * aligned_width * channels; 144 | 145 | int idx = 0; 146 | for (idx = 0; idx < output_size; ++idx) 147 | { 148 | // (n, c, ph, pw) is an element in the aligned output 149 | int pw = idx % aligned_width; 150 | int ph = (idx / aligned_width) % aligned_height; 151 | int c = (idx / aligned_width / aligned_height) % channels; 152 | int n = idx / aligned_width / aligned_height / channels; 153 | 154 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 155 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 156 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 157 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 158 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 159 | 160 | // Force malformed ROI to be 1x1 161 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 162 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 163 | float bin_size_h = roi_height / (aligned_height - 1.); 164 | float bin_size_w = roi_width / (aligned_width - 1.); 165 | 166 | float h = (float)(ph) * bin_size_h + roi_start_h; 167 | float w = (float)(pw) * bin_size_w + roi_start_w; 168 | 169 | int hstart = fminf(floor(h), height - 2); 170 | int wstart = fminf(floor(w), width - 2); 171 | 172 | int img_start = roi_batch_ind * channels * height * width; 173 | 174 | // bilinear interpolation: only in-bounds sample points receive gradient (mirrors the CUDA kernel) 175 | if (!(h < 0 || h >= height || w < 0 || w >= width)) 176 | { 177 | float h_ratio = h - (float)(hstart); 178 | float w_ratio = w - (float)(wstart); 179 | int upleft = img_start + (c * height + hstart) * width + wstart; 180 | int upright = upleft + 1; 181 | int downleft = upleft + width; 182 | int downright = downleft + 1; 183 | 184 | bottom_diff[upleft] += top_diff[idx] * (1. - h_ratio) * (1. - w_ratio); 185 | bottom_diff[upright] += top_diff[idx] * (1. - h_ratio) * w_ratio; 186 | bottom_diff[downleft] += top_diff[idx] * h_ratio * (1. 
- w_ratio); 187 | bottom_diff[downright] += top_diff[idx] * h_ratio * w_ratio; 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /roi_align/src/roi_align.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); 3 | 4 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale, 5 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /roi_align/src/roi_align_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "roi_align_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 16 | // Number of ROIs 17 | int num_rois = THCudaTensor_size(state, rois, 0); 18 | int size_rois = THCudaTensor_size(state, rois, 1); 19 | if (size_rois != 5) 20 | { 21 | return 0; 22 | } 23 | 24 | // data height 25 | int data_height = THCudaTensor_size(state, features, 2); 26 | // data width 27 | int data_width = THCudaTensor_size(state, features, 3); 28 | // Number of channels 29 | int num_channels = THCudaTensor_size(state, features, 1); 30 | 31 | cudaStream_t stream = THCState_getCurrentStream(state); 32 | 33 | ROIAlignForwardLaucher( 34 | data_flat, spatial_scale, num_rois, data_height, 35 | data_width, num_channels, aligned_height, 36 | aligned_width, rois_flat, 37 | output_flat, stream); 38 | 39 | return 1; 40 | } 41 | 42 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad) 44 | { 45 | // Grab the input tensor 46 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 47 | float * rois_flat = THCudaTensor_data(state, rois); 48 | 49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois = THCudaTensor_size(state, rois, 1); 54 | if (size_rois != 5) 55 | { 56 | return 0; 57 | } 58 | 59 | // batch size 60 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 61 | // data height 62 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 63 | // data width 64 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 65 | // Number of channels 66 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 67 | 68 | cudaStream_t stream = THCState_getCurrentStream(state); 69 | ROIAlignBackwardLaucher( 70 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 71 | data_width, num_channels, aligned_height, 72 | aligned_width, rois_flat, 73 | bottom_grad_flat, stream); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /roi_align/src/roi_align_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward_cuda(int 
aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /roi_align/src/roi_align_kernel.cu: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include "roi_align_kernel.h" 9 | 10 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 12 | i += blockDim.x * gridDim.x) 13 | 14 | 15 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width, 16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) { 17 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 18 | // (n, c, ph, pw) is an element in the aligned output 19 | // int n = index; 20 | // int pw = n % aligned_width; 21 | // n /= aligned_width; 22 | // int ph = n % aligned_height; 23 | // n /= aligned_height; 24 | // int c = n % channels; 25 | // n /= channels; 26 | 27 | int pw = index % aligned_width; 28 | int ph = (index / aligned_width) % aligned_height; 29 | int c = (index / aligned_width / aligned_height) % channels; 30 | int n = index / aligned_width / aligned_height / channels; 31 | 32 | // bottom_rois += n * 5; 33 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 34 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 35 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 36 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 37 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 38 | 39 | // Force malformed ROIs to be 1x1 40 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 41 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 42 | float bin_size_h = roi_height / (aligned_height - 1.); 43 | float bin_size_w = roi_width / (aligned_width - 1.); 44 | 45 | float h = (float)(ph) * bin_size_h + roi_start_h; 46 | float w = (float)(pw) * bin_size_w + roi_start_w; 47 | 48 | int hstart = fminf(floor(h), height - 2); 49 | int wstart = fminf(floor(w), width - 2); 50 | 51 | int img_start = roi_batch_ind * channels * height * width; 52 | 53 | // bilinear interpolation 54 | if (h < 0 || h >= height || w < 0 || w >= width) { 55 | top_data[index] = 0.; 56 | } else { 57 | float h_ratio = h - (float)(hstart); 58 | float w_ratio = w - (float)(wstart); 59 | int upleft = img_start + (c * height + hstart) * width + wstart; 60 | int upright = upleft + 1; 61 | int downleft = upleft + width; 62 | int downright = downleft + 1; 63 | 64 | top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 65 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 66 | + bottom_data[downleft] * h_ratio * (1. 
67 |                             + bottom_data[downright] * h_ratio * w_ratio;
68 |         }
69 |     }
70 | }
71 | 
72 | 
73 | int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width,
74 |                            const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) {
75 |     const int kThreadsPerBlock = 1024;
76 |     const int output_size = num_rois * aligned_height * aligned_width * channels;
77 |     cudaError_t err;
78 | 
79 | 
80 |     ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
81 |         output_size, bottom_data, spatial_scale, height, width, channels,
82 |         aligned_height, aligned_width, bottom_rois, top_data);
83 | 
84 |     err = cudaGetLastError();
85 |     if(cudaSuccess != err) {
86 |         fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
87 |         exit( -1 );
88 |     }
89 | 
90 |     return 1;
91 | }
92 | 
93 | 
94 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width,
95 |                                  const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) {
96 |     CUDA_1D_KERNEL_LOOP(index, nthreads) {
97 | 
98 |         // (n, c, ph, pw) is an element in the aligned output
99 |         int pw = index % aligned_width;
100 |         int ph = (index / aligned_width) % aligned_height;
101 |         int c = (index / aligned_width / aligned_height) % channels;
102 |         int n = index / aligned_width / aligned_height / channels;
103 | 
104 |         float roi_batch_ind = bottom_rois[n * 5 + 0];
105 |         float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
106 |         float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
107 |         float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
108 |         float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;
109 |         /* int roi_start_w = round(bottom_rois[1] * spatial_scale); */
110 |         /* int roi_start_h = round(bottom_rois[2] * spatial_scale); */
111 |         /* int roi_end_w = round(bottom_rois[3] * spatial_scale); */
112 |         /* int roi_end_h = round(bottom_rois[4] * spatial_scale); */
113 | 
114 |         // Force malformed ROIs to be 1x1
115 |         float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
116 |         float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
117 |         float bin_size_h = roi_height / (aligned_height - 1.);
118 |         float bin_size_w = roi_width / (aligned_width - 1.);
119 | 
120 |         float h = (float)(ph) * bin_size_h + roi_start_h;
121 |         float w = (float)(pw) * bin_size_w + roi_start_w;
122 | 
123 |         int hstart = fminf(floor(h), height - 2);
124 |         int wstart = fminf(floor(w), width - 2);
125 | 
126 |         int img_start = roi_batch_ind * channels * height * width;
127 | 
128 |         // bilinear interpolation
129 |         if (!(h < 0 || h >= height || w < 0 || w >= width)) {
130 |             float h_ratio = h - (float)(hstart);
131 |             float w_ratio = w - (float)(wstart);
132 |             int upleft = img_start + (c * height + hstart) * width + wstart;
133 |             int upright = upleft + 1;
134 |             int downleft = upleft + width;
135 |             int downright = downleft + 1;
136 | 
137 |             atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio));
138 |             atomicAdd(bottom_diff + upright, top_diff[index] * (1. - h_ratio) * w_ratio);
139 |             atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio));
140 |             atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio);
141 |         }
142 |     }
143 | }
144 | 
145 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width,
146 |                             const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) {
147 |     const int kThreadsPerBlock = 1024;
148 |     const int output_size = num_rois * aligned_height * aligned_width * channels;
149 |     cudaError_t err;
150 | 
151 |     ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
152 |         output_size, top_diff, spatial_scale, height, width, channels,
153 |         aligned_height, aligned_width, bottom_diff, bottom_rois);
154 | 
155 |     err = cudaGetLastError();
156 |     if(cudaSuccess != err) {
157 |         fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
158 |         exit( -1 );
159 |     }
160 | 
161 |     return 1;
162 | }
163 | 
164 | 
165 | #ifdef __cplusplus
166 | }
167 | #endif
168 | 
--------------------------------------------------------------------------------
/roi_align/src/roi_align_kernel.cu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/src/roi_align_kernel.cu.o
--------------------------------------------------------------------------------
/roi_align/src/roi_align_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _ROI_ALIGN_KERNEL
2 | #define _ROI_ALIGN_KERNEL
3 | 
4 | #ifdef __cplusplus
5 | extern "C" {
6 | #endif
7 | 
8 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data,
9 |                                 const float spatial_scale, const int height, const int width,
10 |                                 const int channels, const int aligned_height, const int aligned_width,
11 |                                 const float* bottom_rois, float* top_data);
12 | 
13 | int ROIAlignForwardLaucher(
14 |     const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
15 |     const int width, const int channels, const int aligned_height,
16 |     const int aligned_width, const float* bottom_rois,
17 |     float* top_data, cudaStream_t stream);
18 | 
19 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff,
20 |                                  const float spatial_scale, const int height, const int width,
21 |                                  const int channels, const int aligned_height, const int aligned_width,
22 |                                  float* bottom_diff, const float* bottom_rois);
23 | 
24 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
25 |                             const int height, const int width, const int channels, const int aligned_height,
26 |                             const int aligned_width, const float* bottom_rois,
27 |                             float* bottom_diff, cudaStream_t stream);
28 | 
29 | #ifdef __cplusplus
30 | }
31 | #endif
32 | 
33 | #endif
34 | 
35 | 
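The backward kernel scatters each output gradient back to the same four input pixels with the same bilinear weights the forward pass used, through `atomicAdd` because overlapping ROIs can write to the same location. Since the pure-PyTorch sketch given earlier is built from differentiable ops, autograd performs the equivalent scatter, which gives a cheap cross-check of the CUDA backward (again a sketch; `roi_align_forward_ref` is the hypothetical reference defined above):

```python
import torch

features = torch.randn(1, 2, 8, 8, requires_grad=True)
rois = torch.tensor([[0., 1., 1., 6., 6.]])  # (batch_idx, x1, y1, x2, y2)

out = roi_align_forward_ref(features, rois,
                            aligned_height=3, aligned_width=3, spatial_scale=1.0)
out.sum().backward()

# every sampled location distributes its gradient over a 2x2 neighborhood
print(features.grad.sum())  # tensor(18.)
```

The four bilinear weights of each in-bounds sample sum to one, so backpropagating `sum()` deposits exactly one unit of gradient per output element: 2 channels x 3 x 3 samples = 18 here.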
--------------------------------------------------------------------------------
/runTrainTest.sh:
--------------------------------------------------------------------------------
1 | 
2 | #python TrainModel.py --base_model='vgg16' --scale='single' --downsample=4 --augmentation=0 --align_size=9 --reduced_dim=8
3 | #python TrainModel.py --base_model='vgg16' --scale='single' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
4 | #python TrainModel.py --base_model='vgg16' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
5 | 
6 | #python TrainModel.py --base_model='mobilenetv2' --scale='single' --downsample=4 --augmentation=0 --align_size=9 --reduced_dim=8
7 | #python TrainModel.py --base_model='mobilenetv2' --scale='single' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
8 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
9 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=1
10 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=2
11 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=4
12 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=16
13 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=32
14 | 
15 | #python TrainModel.py --base_model='shufflenetv2' --scale='single' --downsample=4 --augmentation=0 --align_size=9 --reduced_dim=8
16 | #python TrainModel.py --base_model='shufflenetv2' --scale='single' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
17 | python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
18 | 
19 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=1
20 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=2
21 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=4
22 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
23 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=16
24 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=32
25 | 
26 | 
27 | #python TrainModel.py --base_model='resnet50' --scale='single' --downsample=4 --augmentation=1 --align_size=9
28 | #python TrainModel.py --base_model='resnet50' --scale='multi' --downsample=4 --augmentation=1 --align_size=9
29 | 
30 | 
31 | 
32 | 
--------------------------------------------------------------------------------
/thop/__init__.py:
--------------------------------------------------------------------------------
1 | from .profile import profile
--------------------------------------------------------------------------------
/thop/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/thop/__init__.pyc
--------------------------------------------------------------------------------
/thop/count_hooks.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | import torch
4 | import torch.nn as nn
5 | 
6 | multiply_adds = 1
7 | 
8 | 
9 | def count_convNd(m, x, y):
10 |     x = x[0]
11 |     cin = m.in_channels
12 |     # batch_size = x.size(0)
13 | 
14 |     kernel_ops = m.weight[0][0].numel()  # product of all kernel dims; this hook also serves Conv1d/Conv3d
15 |     bias_ops = 1 if m.bias is not None else 0
16 |     ops_per_element = kernel_ops + bias_ops
17 |     output_elements = y.nelement()
18 | 
19 |     # cout x oW x oH
20 |     total_ops = cin * output_elements * ops_per_element // m.groups
21 |     m.total_ops = torch.Tensor([int(total_ops)])
22 | 
23 | 
24 | def count_conv2d(m, x, y):
25 |     x = x[0]
26 | 
27 |     cin = m.in_channels
28 |     cout = m.out_channels
29 |     kh, kw = m.kernel_size
30 |     batch_size = x.size()[0]
31 | 
32 |     out_h = y.size(2)
33 |     out_w = y.size(3)
34 | 
35 |     # ops per output element
36 |     # kernel_mul = kh * kw * cin
37 |     # kernel_add = kh * kw * cin - 1
38 |     kernel_ops = multiply_adds * kh * kw
39 |     bias_ops = 1 if m.bias is not None else 0
40 |     ops_per_element = kernel_ops + bias_ops
41 | 
42 |     # total ops
43 |     # num_out_elements = y.numel()
44 |     output_elements = batch_size * out_w * out_h * cout
45 |     total_ops = output_elements * ops_per_element * cin // m.groups
46 | 
47 |     m.total_ops = torch.Tensor([int(total_ops)])
48 | 
49 | 
50 | def count_convtranspose2d(m, x, y):
51 |     x = x[0]
52 | 
53 |     cin = m.in_channels
54 |     cout = m.out_channels
55 |     kh, kw = m.kernel_size
56 |     # batch_size = x.size()[0]
57 | 
58 |     out_h = y.size(2)
59 |     out_w = y.size(3)
60 | 
61 |     # ops per output element
62 |     # kernel_mul = kh * kw * cin
63 |     # kernel_add = kh * kw * cin - 1
64 |     kernel_ops = multiply_adds * kh * kw * cin // m.groups
65 |     bias_ops = 1 if m.bias is not None else 0
66 |     ops_per_element = kernel_ops + bias_ops
67 | 
68 |     # total ops
69 |     # num_out_elements = y.numel()
70 |     # output_elements = batch_size * out_w * out_h * cout
71 |     ops_per_element = m.weight.nelement()
72 |     output_elements = y.nelement()
73 |     total_ops = output_elements * ops_per_element
74 | 
75 |     m.total_ops = torch.Tensor([int(total_ops)])
76 | 
77 | 
78 | def count_bn(m, x, y):
79 |     x = x[0]
80 | 
81 |     nelements = x.numel()
82 |     # subtract, divide, gamma, beta
83 |     total_ops = 4 * nelements
84 | 
85 |     m.total_ops = torch.Tensor([int(total_ops)])
86 | 
87 | 
88 | def count_relu(m, x, y):
89 |     x = x[0]
90 | 
91 |     nelements = x.numel()
92 |     total_ops = nelements
93 | 
94 |     m.total_ops = torch.Tensor([int(total_ops)])
95 | 
96 | 
97 | def count_softmax(m, x, y):
98 |     x = x[0]
99 | 
100 |     batch_size, nfeatures = x.size()
101 | 
102 |     total_exp = nfeatures
103 |     total_add = nfeatures - 1
104 |     total_div = nfeatures
105 |     total_ops = batch_size * (total_exp + total_add + total_div)
106 | 
107 |     m.total_ops = torch.Tensor([int(total_ops)])
108 | 
109 | 
110 | def count_maxpool(m, x, y):
111 |     kernel_ops = torch.prod(torch.Tensor([m.kernel_size]))
112 |     num_elements = y.numel()
113 |     total_ops = kernel_ops * num_elements
114 | 
115 |     m.total_ops = torch.Tensor([int(total_ops)])
116 | 
117 | 
118 | def count_adap_maxpool(m, x, y):
119 |     kernel = torch.Tensor([(x[0].shape[2:])]) // torch.Tensor(list((m.output_size,))).squeeze()
120 |     kernel_ops = torch.prod(kernel)
121 |     num_elements = y.numel()
122 |     total_ops = kernel_ops * num_elements
123 | 
124 |     m.total_ops = torch.Tensor([int(total_ops)])
125 | 
126 | 
127 | def count_avgpool(m, x, y):
128 |     total_add = torch.prod(torch.Tensor([m.kernel_size]))
129 |     total_div = 1
130 |     kernel_ops = total_add + total_div
131 |     num_elements = y.numel()
132 |     total_ops = kernel_ops * num_elements
133 | 
134 |     m.total_ops = torch.Tensor([int(total_ops)])
135 | 
136 | 
137 | def count_adap_avgpool(m, x, y):
138 |     kernel = torch.Tensor([(x[0].shape[2:])]) // torch.Tensor(list((m.output_size,))).squeeze()
139 |     total_add = torch.prod(kernel)
140 |     total_div = 1
141 |     kernel_ops = total_add + total_div
142 |     num_elements = y.numel()
143 |     total_ops = kernel_ops * num_elements
144 | 
145 |     m.total_ops = torch.Tensor([int(total_ops)])
146 | 
147 | 
148 | def count_linear(m, x, y):
149 |     # per output element
150 |     total_mul = m.in_features
151 |     total_add = m.in_features - 1
152 |     num_elements = y.numel()
153 |     total_ops = (total_mul + total_add) * num_elements
154 | 
155 |     m.total_ops = torch.Tensor([int(total_ops)])
156 | 
--------------------------------------------------------------------------------
/thop/count_hooks.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/thop/count_hooks.pyc
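Each hook writes a `total_ops` tensor onto its module, and the arithmetic is easy to check by hand. A small sketch (the layer and input sizes are arbitrary, chosen only for illustration): for `nn.Conv2d(3, 16, 3, padding=1)` on a `1 x 3 x 32 x 32` input, `count_conv2d` charges `output_elements * (kh * kw + bias_ops) * cin // groups` operations:

```python
import torch
import torch.nn as nn

from thop.count_hooks import count_conv2d

conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)  # bias=True by default
x = torch.randn(1, 3, 32, 32)
y = conv(x)

count_conv2d(conv, (x,), y)  # forward hooks receive the inputs as a tuple

# by hand: (1 * 32 * 32 * 16) outputs, each costing (3*3 + 1) ops, times 3 input channels
expected = (1 * 32 * 32 * 16) * (3 * 3 + 1) * 3 // 1
assert int(conv.total_ops.item()) == expected  # both are 491520
```

Note that `profile` in the next file actually routes `nn.Conv2d` through `count_convNd`, which arrives at the same total for a standard 2-D convolution.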
--------------------------------------------------------------------------------
/thop/profile.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | import torch
4 | import torch.nn as nn
5 | from torch.nn.modules.conv import _ConvNd
6 | 
7 | from .count_hooks import *
8 | 
9 | register_hooks = {
10 |     nn.Conv1d: count_convNd,
11 |     nn.Conv2d: count_convNd,
12 |     nn.Conv3d: count_convNd,
13 |     nn.ConvTranspose2d: count_convtranspose2d,
14 | 
15 |     nn.BatchNorm1d: count_bn,
16 |     nn.BatchNorm2d: count_bn,
17 |     nn.BatchNorm3d: count_bn,
18 | 
19 |     nn.ReLU: count_relu,
20 |     nn.ReLU6: count_relu,
21 |     nn.LeakyReLU: count_relu,
22 | 
23 |     nn.MaxPool1d: count_maxpool,
24 |     nn.MaxPool2d: count_maxpool,
25 |     nn.MaxPool3d: count_maxpool,
26 |     nn.AdaptiveMaxPool1d: count_adap_maxpool,
27 |     nn.AdaptiveMaxPool2d: count_adap_maxpool,
28 |     nn.AdaptiveMaxPool3d: count_adap_maxpool,
29 | 
30 |     nn.AvgPool1d: count_avgpool,
31 |     nn.AvgPool2d: count_avgpool,
32 |     nn.AvgPool3d: count_avgpool,
33 | 
34 |     nn.AdaptiveAvgPool1d: count_adap_avgpool,
35 |     nn.AdaptiveAvgPool2d: count_adap_avgpool,
36 |     nn.AdaptiveAvgPool3d: count_adap_avgpool,
37 |     nn.Linear: count_linear,
38 |     nn.Dropout: None,
39 | }
40 | 
41 | 
42 | def profile(model, input_size, custom_ops={}, device="cpu"):
43 |     handler_collection = []
44 | 
45 |     def add_hooks(m):
46 |         if len(list(m.children())) > 0:
47 |             return
48 | 
49 |         m.register_buffer('total_ops', torch.zeros(1))
50 |         m.register_buffer('total_params', torch.zeros(1))
51 | 
52 |         for p in m.parameters():
53 |             m.total_params += torch.Tensor([p.numel()])
54 | 
55 |         m_type = type(m)
56 |         fn = None
57 | 
58 |         if m_type in custom_ops:
59 |             fn = custom_ops[m_type]
60 |         elif m_type in register_hooks:
61 |             fn = register_hooks[m_type]
62 |         else:
63 |             print("Not implemented for ", m)
64 | 
65 |         if fn is not None:
66 |             print("Register FLOP counter for module %s" % str(m))
67 |             handler = m.register_forward_hook(fn)
68 |             handler_collection.append(handler)
69 | 
70 |     original_device = next(model.parameters()).device
71 |     training = model.training
72 | 
73 |     model.eval().to(device)
74 |     model.apply(add_hooks)
75 | 
76 |     x = torch.zeros(input_size).to(device)
77 |     with torch.no_grad():
78 |         model(x)
79 | 
80 |     total_ops = 0
81 |     total_params = 0
82 |     for m in model.modules():
83 |         if len(list(m.children())) > 0:  # skip for non-leaf module
84 |             continue
85 |         total_ops += m.total_ops
86 |         total_params += m.total_params
87 | 
88 |     total_ops = total_ops.item()
89 |     total_params = total_params.item()
90 | 
91 |     model.train(training).to(original_device)
92 |     for handler in handler_collection:
93 |         handler.remove()
94 | 
95 |     return total_ops, total_params
96 | 
--------------------------------------------------------------------------------
/thop/profile.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/thop/profile.pyc
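`profile` attaches the matching hook from `register_hooks` to every leaf module, runs one dummy forward pass, and sums the per-module `total_ops` / `total_params` buffers; layer types it does not know can be supplied through `custom_ops`. A minimal usage sketch follows — the model and the `count_lppool` hook are made up for illustration, and any callable with the `(module, inputs, output)` forward-hook signature works:

```python
import torch
import torch.nn as nn

from thop import profile
from thop.utils import clever_format

def count_lppool(m, x, y):
    # hypothetical hook for a layer missing from register_hooks:
    # charge one op per output element, like count_relu does
    m.total_ops = torch.Tensor([int(y.numel())])

model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(),
    nn.LPPool2d(2, 2),  # not covered by register_hooks above
)

flops, params = profile(model, (1, 3, 224, 224),
                        custom_ops={nn.LPPool2d: count_lppool})
print(clever_format(flops) + " ops, " + clever_format(params) + " params")
```

Without the `custom_ops` entry, `profile` would print "Not implemented for" the `LPPool2d` module and count zero operations for it, while the rest of the model would still be profiled.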
--------------------------------------------------------------------------------
/thop/utils.py:
--------------------------------------------------------------------------------
1 | 
2 | def clever_format(num, format="%.2f"):
3 |     if num > 1e12:
4 |         return format % (num / 1e12) + "T"
5 |     if num > 1e9:
6 |         return format % (num / 1e9) + "G"
7 |     if num > 1e6:
8 |         return format % (num / 1e6) + "M"
9 |     if num > 1e3:
10 |         return format % (num / 1e3) + "K"
11 |     return format % num  # no suffix for values at or below 1e3
--------------------------------------------------------------------------------
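`clever_format` scales a raw count into a human-readable string. A quick check across magnitudes (the values are arbitrary):

```python
from thop.utils import clever_format

for n in (512, 4.2e3, 8.8e6, 3.1e9, 2.5e12):
    print(clever_format(n))
# 512.00
# 4.20K
# 8.80M
# 3.10G
# 2.50T
```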