├── LICENSE ├── README.md ├── ShuffleNetV2.py ├── TestAccuracy.py ├── TrainModel.py ├── augmentations.py ├── croppingDataset.py ├── croppingModel.py ├── demo_eval.py ├── mobilenetv2.py ├── rod_align ├── __init__.pyc ├── _ext │ ├── __init__.pyc │ └── rod_align │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ └── _rod_align.so ├── build.py ├── functions │ ├── __init__.pyc │ ├── rod_align.py │ └── rod_align.pyc ├── make.sh ├── modules │ ├── __init__.pyc │ ├── rod_align.py │ └── rod_align.pyc └── src │ ├── rod_align.c │ ├── rod_align.h │ ├── rod_align_cuda.c │ ├── rod_align_cuda.h │ ├── rod_align_kernel.cu │ ├── rod_align_kernel.cu.o │ └── rod_align_kernel.h ├── roi_align ├── __init__.pyc ├── __pycache__ │ └── __init__.cpython-35.pyc ├── _ext │ ├── __init__.pyc │ ├── __pycache__ │ │ └── __init__.cpython-35.pyc │ └── roi_align │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── __pycache__ │ │ └── __init__.cpython-35.pyc │ │ └── _roi_align.so ├── build.py ├── functions │ ├── __init__.pyc │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── roi_align.cpython-35.pyc │ ├── roi_align.py │ └── roi_align.pyc ├── make.sh ├── modules │ ├── __init__.pyc │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── roi_align.cpython-35.pyc │ ├── roi_align.py │ └── roi_align.pyc └── src │ ├── roi_align.c │ ├── roi_align.h │ ├── roi_align_cuda.c │ ├── roi_align_cuda.h │ ├── roi_align_kernel.cu │ ├── roi_align_kernel.cu.o │ └── roi_align_kernel.h ├── runTrainTest.sh └── thop ├── __init__.py ├── __init__.pyc ├── count_hooks.py ├── count_hooks.pyc ├── profile.py ├── profile.pyc └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Max deGroot, Ellis Brown 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grid-Anchor-based-Image-Cropping-Pytorch 2 | The extension of this work has been accepted by TPAMI. Please read the [paper](https://www4.comp.polyu.edu.hk/~cslzhang/paper/GAIC-PAMI.pdf) for details. 3 | 4 | 5 | ### Requirements 6 | python 2.7, pytorch 0.4.1, numpy, cv2, scipy. 7 | 8 | ### Usage 9 | 10 | 1. 
Download the source code, the datasets [[conference version](https://drive.google.com/file/d/1KhmyjoimsQVXqPnLjKZiU4iXNKNyyxqW/view?usp=sharing)], [[journal version](https://drive.google.com/file/d/1tDdQqDe8dMoMIVi9Z0WWI5vtRViy01nR/view?usp=sharing)] and the pretrained models [[conference version](https://drive.google.com/file/d/1OvLT_ul17zCK4ljAi4myGAgA50PmLy3Y/view?usp=sharing)] [[journal version](https://drive.google.com/file/d/1KWYQdL6R5hmOC9toTymbMORZDThpiEW4/view?usp=sharing)] 11 | 12 | 2. Run ``TrainModel.py`` to train a new model on our dataset or run ``demo_eval.py`` to test the pretrained model on any images. 13 | 14 | 3. To change the aspect ratio of generated crops, please change the ``generate_bboxes`` function in ``croppingDataset.py`` (line 115). 15 | 16 | ### Annotation software 17 | The executable annotation software can be found [here](https://github.com/lld533/Grid-Anchor-based-Image-Cropping-Pytorch). 18 | 19 | ### Other implementations 20 | 1. [PyTorch 1.0 or later](https://github.com/lld533/Grid-Anchor-based-Image-Cropping-Pytorch) 21 | 2. [Matlab (conference version)](https://github.com/HuiZeng/Grid-Anchor-based-Image-Cropping) 22 | 23 | 24 | ### Citation 25 | ``` 26 | @inproceedings{zhang2019deep, 27 | title={Reliable and Efficient Image Cropping: A Grid Anchor based Approach}, 28 | author={Zeng, Hui and Li, Lida and Cao, Zisheng and Zhang, Lei}, 29 | booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, 30 | year={2019} 31 | } 32 | @article{zeng2020cropping, 33 | title={Grid Anchor based Image Cropping: A New Benchmark and An Efficient Model}, 34 | author={Zeng, Hui and Li, Lida and Cao, Zisheng and Zhang, Lei}, 35 | journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 36 | volume={}, 37 | number={}, 38 | pages={}, 39 | year={2020}, 40 | publisher={IEEE} 41 | } 42 | ``` 43 | -------------------------------------------------------------------------------- /ShuffleNetV2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from collections import OrderedDict 6 | from torch.nn import init 7 | import math 8 | 9 | def conv_bn(inp, oup, stride): 10 | return nn.Sequential( 11 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 12 | nn.BatchNorm2d(oup), 13 | nn.ReLU(inplace=True) 14 | ) 15 | 16 | 17 | def conv_1x1_bn(inp, oup): 18 | return nn.Sequential( 19 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 20 | nn.BatchNorm2d(oup), 21 | nn.ReLU(inplace=True) 22 | ) 23 | 24 | def channel_shuffle(x, groups): 25 | batchsize, num_channels, height, width = x.data.size() 26 | 27 | channels_per_group = num_channels // groups 28 | 29 | # reshape 30 | x = x.view(batchsize, groups, 31 | channels_per_group, height, width) 32 | 33 | x = torch.transpose(x, 1, 2).contiguous() 34 | 35 | # flatten 36 | x = x.view(batchsize, -1, height, width) 37 | 38 | return x 39 | 40 | class InvertedResidual(nn.Module): 41 | def __init__(self, inp, oup, stride, benchmodel): 42 | super(InvertedResidual, self).__init__() 43 | self.benchmodel = benchmodel 44 | self.stride = stride 45 | assert stride in [1, 2] 46 | 47 | oup_inc = oup//2 48 | 49 | if self.benchmodel == 1: 50 | #assert inp == oup_inc 51 | self.banch2 = nn.Sequential( 52 | # pw 53 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 54 | nn.BatchNorm2d(oup_inc), 55 | nn.ReLU(inplace=True), 56 | # dw 57 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc,
bias=False), 58 | nn.BatchNorm2d(oup_inc), 59 | # pw-linear 60 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 61 | nn.BatchNorm2d(oup_inc), 62 | nn.ReLU(inplace=True), 63 | ) 64 | else: 65 | self.banch1 = nn.Sequential( 66 | # dw 67 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 68 | nn.BatchNorm2d(inp), 69 | # pw-linear 70 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 71 | nn.BatchNorm2d(oup_inc), 72 | nn.ReLU(inplace=True), 73 | ) 74 | 75 | self.banch2 = nn.Sequential( 76 | # pw 77 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 78 | nn.BatchNorm2d(oup_inc), 79 | nn.ReLU(inplace=True), 80 | # dw 81 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), 82 | nn.BatchNorm2d(oup_inc), 83 | # pw-linear 84 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 85 | nn.BatchNorm2d(oup_inc), 86 | nn.ReLU(inplace=True), 87 | ) 88 | 89 | @staticmethod 90 | def _concat(x, out): 91 | # concatenate along channel axis 92 | return torch.cat((x, out), 1) 93 | 94 | def forward(self, x): 95 | if 1==self.benchmodel: 96 | x1 = x[:, :(x.shape[1]//2), :, :] 97 | x2 = x[:, (x.shape[1]//2):, :, :] 98 | out = self._concat(x1, self.banch2(x2)) 99 | elif 2==self.benchmodel: 100 | out = self._concat(self.banch1(x), self.banch2(x)) 101 | 102 | return channel_shuffle(out, 2) 103 | 104 | 105 | class ShuffleNetV2(nn.Module): 106 | def __init__(self, n_class=1000, input_size=224, width_mult=1.): 107 | super(ShuffleNetV2, self).__init__() 108 | 109 | assert input_size % 32 == 0 110 | 111 | self.stage_repeats = [4, 8, 4] 112 | # index 0 is invalid and should never be called. 113 | # only used for indexing convenience. 114 | if width_mult == 0.5: 115 | self.stage_out_channels = [-1, 24, 48, 96, 192, 1024] 116 | elif width_mult == 1.0: 117 | self.stage_out_channels = [-1, 24, 116, 232, 464, 1024] 118 | elif width_mult == 1.5: 119 | self.stage_out_channels = [-1, 24, 176, 352, 704, 1024] 120 | elif width_mult == 2.0: 121 | self.stage_out_channels = [-1, 24, 224, 488, 976, 2048] 122 | else: 123 | raise ValueError( 124 | "width_mult {} is not supported; " 125 | "choose one of 0.5, 1.0, 1.5 or 2.0".format(width_mult)) 126 | 127 | # building first layer 128 | input_channel = self.stage_out_channels[1] 129 | self.conv1 = conv_bn(3, input_channel, 2) 130 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 131 | 132 | self.features = [] 133 | # building inverted residual blocks 134 | for idxstage in range(len(self.stage_repeats)): 135 | numrepeat = self.stage_repeats[idxstage] 136 | output_channel = self.stage_out_channels[idxstage+2] 137 | for i in range(numrepeat): 138 | if i == 0: 139 | #inp, oup, stride, benchmodel): 140 | self.features.append(InvertedResidual(input_channel, output_channel, 2, 2)) 141 | else: 142 | self.features.append(InvertedResidual(input_channel, output_channel, 1, 1)) 143 | input_channel = output_channel 144 | 145 | 146 | # make it nn.Sequential 147 | self.features = nn.Sequential(*self.features) 148 | 149 | # building last several layers 150 | self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1]) 151 | self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size/32))) 152 | 153 | # building classifier 154 | self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class)) 155 | 156 | def forward(self, x): 157 | x = self.conv1(x) 158 | x = self.maxpool(x) 159 | x = self.features(x) 160 | x = self.conv_last(x) 161 | x = self.globalpool(x) 162 | x = x.view(-1, self.stage_out_channels[-1]) 163 | x = self.classifier(x) 164 | return x 165
| 166 | def shufflenetv2(width_mult=1.): 167 | model = ShuffleNetV2(width_mult=width_mult) 168 | return model 169 | 170 | if __name__ == "__main__": 171 | """Testing 172 | """ 173 | model = ShuffleNetV2() 174 | print(model) 175 | -------------------------------------------------------------------------------- /TestAccuracy.py: -------------------------------------------------------------------------------- 1 | from croppingDataset import GAICD 2 | from croppingModel import build_crop_model 3 | import time 4 | import math 5 | import sys 6 | import torch 7 | from torch.autograd import Variable 8 | import torch.backends.cudnn as cudnn 9 | import torch.utils.data as data 10 | import argparse 11 | from scipy.stats import spearmanr, pearsonr 12 | 13 | parser = argparse.ArgumentParser( 14 | description='Grid anchor based image cropping: evaluation on the GAICD test set') 15 | parser.add_argument('--dataset_root', default='dataset/GAIC/', help='Dataset root directory path') 16 | parser.add_argument('--image_size', default=256, type=int, help='Input image size (shorter side after resizing)') 17 | parser.add_argument('--batch_size', default=1, type=int, help='Batch size for evaluation') 18 | parser.add_argument('--num_workers', default=0, type=int, help='Number of workers used in dataloading') 19 | parser.add_argument('--cuda', default=True, help='Use CUDA to evaluate the model') 20 | parser.add_argument('--net_path', default='weights/ablation/cropping/mobilenetv2/downsample4_multi_Aug1_Align9_Cdim8/23_0.625_0.583_0.553_0.525_0.785_0.762_0.748_0.723_0.783_0.806.pth_____', 21 | help='Path of the checkpoint to evaluate') 22 | args = parser.parse_args() 23 | 24 | if torch.cuda.is_available(): 25 | if args.cuda: 26 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 27 | if not args.cuda: 28 | print("WARNING: It looks like you have a CUDA device, but aren't " + 29 | "using CUDA.\nRun with --cuda for optimal evaluation speed.") 30 | torch.set_default_tensor_type('torch.FloatTensor') 31 | else: 32 | torch.set_default_tensor_type('torch.FloatTensor') 33 | 34 | 35 | data_loader = data.DataLoader(GAICD(image_size=args.image_size, dataset_dir=args.dataset_root, set='test'), args.batch_size, num_workers=args.num_workers, shuffle=False) 36 | 37 | def test(): 38 | 39 | net = build_crop_model(scale='multi', alignsize=9, reddim=8, loadweight=True, model='mobilenetv2', downsample=4) 40 | 41 | net.load_state_dict(torch.load(args.net_path)) 42 | 43 | if args.cuda: 44 | net = torch.nn.DataParallel(net,device_ids=[0]) 45 | torch.backends.cudnn.deterministic = True 46 | torch.backends.cudnn.benchmark = False 47 | net = net.cuda() 48 | 49 | net.eval() 50 | 51 | acc4_5 = [] 52 | acc4_10 = [] 53 | wacc4_5 = [] 54 | wacc4_10 = [] 55 | srcc = [] 56 | pcc = [] 57 | for n in range(4): 58 | acc4_5.append(0) 59 | acc4_10.append(0) 60 | wacc4_5.append(0) 61 | wacc4_10.append(0) 62 | 63 | for id, sample in enumerate(data_loader): 64 | image = sample['image'] 65 | bboxs = sample['bbox'] 66 | MOS = sample['MOS'] 67 | 68 | roi = [] 69 | 70 | for idx in range(0,len(bboxs['xmin'])): 71 | roi.append((0, bboxs['xmin'][idx],bboxs['ymin'][idx],bboxs['xmax'][idx],bboxs['ymax'][idx])) 72 | 73 | if args.cuda: 74 | image = Variable(image.cuda()) 75 | roi = Variable(torch.Tensor(roi)) 76 | else: 77 | image = Variable(image) 78 | roi = Variable(torch.Tensor(roi)) 79 | 80 | t0 = time.time() 81 | out = net(image,roi) 82 | t1 = time.time() 83 | print('timer: %.4f sec.'
% (t1 - t0)) 84 | 85 | id_MOS = sorted(range(len(MOS)), key=lambda k: MOS[k], reverse=True) 86 | id_out = sorted(range(len(out)), key=lambda k: out[k], reverse=True) 87 | 88 | rank_of_returned_crop = [] 89 | for k in range(4): 90 | rank_of_returned_crop.append(id_MOS.index(id_out[k])) 91 | 92 | for k in range(4): 93 | temp_acc_4_5 = 0.0 94 | temp_acc_4_10 = 0.0 95 | for j in range(k+1): 96 | if MOS[id_out[j]] >= MOS[id_MOS[4]]: 97 | temp_acc_4_5 += 1.0 98 | if MOS[id_out[j]] >= MOS[id_MOS[9]]: 99 | temp_acc_4_10 += 1.0 100 | acc4_5[k] += temp_acc_4_5 / (k+1.0) 101 | acc4_10[k] += temp_acc_4_10 / (k+1.0) 102 | 103 | for k in range(4): 104 | temp_wacc_4_5 = 0.0 105 | temp_wacc_4_10 = 0.0 106 | temp_rank_of_returned_crop = rank_of_returned_crop[:(k+1)] 107 | temp_rank_of_returned_crop.sort() 108 | for j in range(k+1): 109 | if temp_rank_of_returned_crop[j] <= 4: 110 | temp_wacc_4_5 += 1.0 * math.exp(-0.2*(temp_rank_of_returned_crop[j]-j)) 111 | if temp_rank_of_returned_crop[j] <= 9: 112 | temp_wacc_4_10 += 1.0 * math.exp(-0.1*(temp_rank_of_returned_crop[j]-j)) 113 | wacc4_5[k] += temp_wacc_4_5 / (k+1.0) 114 | wacc4_10[k] += temp_wacc_4_10 / (k+1.0) 115 | 116 | 117 | MOS_arr = [] 118 | out = torch.squeeze(out).cpu().detach().numpy() 119 | for k in range(len(MOS)): 120 | MOS_arr.append(MOS[k].numpy()[0]) 121 | srcc.append(spearmanr(MOS_arr,out)[0]) 122 | pcc.append(pearsonr(MOS_arr,out)[0]) 123 | 124 | 125 | for k in range(4): 126 | acc4_5[k] = acc4_5[k] / 200.0 127 | acc4_10[k] = acc4_10[k] / 200.0 128 | wacc4_5[k] = wacc4_5[k] / 200.0 129 | wacc4_10[k] = wacc4_10[k] / 200.0 130 | 131 | avg_srcc = sum(srcc) / 200.0 132 | avg_pcc = sum(pcc) / 200.0 133 | 134 | sys.stdout.write('[%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f]\n' % (acc4_5[0],acc4_5[1],acc4_5[2],acc4_5[3],acc4_10[0],acc4_10[1],acc4_10[2],acc4_10[3])) 135 | sys.stdout.write('[%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f]\n' % (wacc4_5[0],wacc4_5[1],wacc4_5[2],wacc4_5[3],wacc4_10[0],wacc4_10[1],wacc4_10[2],wacc4_10[3])) 136 | sys.stdout.write('[Avg SRCC: %.3f] [Avg PCC: %.3f]\n' % (avg_srcc,avg_pcc)) 137 | 138 | 139 | if __name__ == '__main__': 140 | test() 141 | -------------------------------------------------------------------------------- /TrainModel.py: -------------------------------------------------------------------------------- 1 | from croppingModel import build_crop_model 2 | from croppingDataset import GAICD 3 | import os 4 | import sys 5 | import time 6 | import math 7 | import torch 8 | from torch.autograd import Variable 9 | import torch.optim as optim 10 | import torch.utils.data as data 11 | import argparse 12 | import numpy as np 13 | import random 14 | from scipy.stats import spearmanr, pearsonr 15 | 16 | SEED = 0 17 | torch.manual_seed(SEED) 18 | np.random.seed(SEED) 19 | random.seed(SEED) 20 | 21 | parser = argparse.ArgumentParser(description='Grid anchor based image cropping') 22 | parser.add_argument('--dataset_root', default='dataset/GAIC/', help='Dataset root directory path') 23 | parser.add_argument('--base_model', default='mobilenetv2', help='Pretrained base model') 24 | parser.add_argument('--scale', default='multi', type=str, help='choose single or multi scale') 25 | parser.add_argument('--downsample', default=4, type=int, help='downsampling times of the backbone (4 or 5)') 26 | parser.add_argument('--augmentation', default=1, type=int, help='whether to use data augmentation (0 or 1)') 27 | parser.add_argument('--image_size', default=256, type=int, help='Input image size (shorter side after resizing)') 28 | parser.add_argument('--align_size', default=9,
type=int, help='Spatial size of RoIAlign and RoDAlign') 29 | parser.add_argument('--reduced_dim', default=8, type=int, help='Reduced channel dimension of the aligned features') 30 | parser.add_argument('--batch_size', default=1, type=int, help='Batch size for training') 31 | parser.add_argument('--resume', default=None, type=str, help='Checkpoint state_dict file to resume training from') 32 | parser.add_argument('--start_iter', default=0, type=int, help='Resume training at this iter') 33 | parser.add_argument('--num_workers', default=0, type=int, help='Number of workers used in dataloading') 34 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, help='initial learning rate') 35 | parser.add_argument('--save_folder', default='weights/ablation/cropping/', help='Directory for saving checkpoint models') 36 | args = parser.parse_args() 37 | 38 | args.save_folder = args.save_folder + args.base_model + '/' + 'downsample' + str(args.downsample) + '_' + args.scale + '_Aug' + str(args.augmentation) + '_Align' +str(args.align_size) + '_Cdim'+str(args.reduced_dim) 39 | 40 | if not os.path.exists(args.save_folder): 41 | os.makedirs(args.save_folder) 42 | 43 | cuda = True if torch.cuda.is_available() else False 44 | 45 | if cuda: 46 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 47 | else: 48 | torch.set_default_tensor_type('torch.FloatTensor') 49 | 50 | 51 | data_loader_train = data.DataLoader(GAICD(image_size=args.image_size, dataset_dir=args.dataset_root, set='train', augmentation=args.augmentation), 52 | batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, worker_init_fn=random.seed(SEED)) 53 | 54 | data_loader_test = data.DataLoader(GAICD(image_size=args.image_size, dataset_dir=args.dataset_root, set='test'), 55 | batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False) 56 | 57 | net = build_crop_model(scale=args.scale, alignsize=args.align_size, reddim=args.reduced_dim, loadweight=True, model=args.base_model, downsample=args.downsample) 58 | 59 | # fix the batch normalization in mobilenet and shufflenet because batchsize = 1 60 | net.eval() 61 | 62 | if cuda: 63 | net = torch.nn.DataParallel(net,device_ids=[0]) 64 | torch.backends.cudnn.deterministic = True 65 | torch.backends.cudnn.benchmark = False 66 | #cudnn.benchmark = True 67 | net = net.cuda() 68 | 69 | optimizer = optim.Adam(net.parameters(), lr=args.lr) 70 | 71 | def test(): 72 | acc4_5 = [] 73 | acc4_10 = [] 74 | wacc4_5 = [] 75 | wacc4_10 = [] 76 | srcc = [] 77 | pcc = [] 78 | total_loss = 0 79 | avg_loss = 0 80 | for n in range(4): 81 | acc4_5.append(0) 82 | acc4_10.append(0) 83 | wacc4_5.append(0) 84 | wacc4_10.append(0) 85 | 86 | for id, sample in enumerate(data_loader_test): 87 | image = sample['image'] 88 | bboxs = sample['bbox'] 89 | MOS = sample['MOS'] 90 | 91 | roi = [] 92 | 93 | for idx in range(0,len(bboxs['xmin'])): 94 | roi.append((0, bboxs['xmin'][idx],bboxs['ymin'][idx],bboxs['xmax'][idx],bboxs['ymax'][idx])) 95 | 96 | if cuda: 97 | image = Variable(image.cuda()) 98 | roi = Variable(torch.Tensor(roi)) 99 | else: 100 | image = Variable(image) 101 | roi = Variable(torch.Tensor(roi)) 102 | 103 | #t0 = time.time() 104 | out = net(image,roi) 105 | loss = torch.nn.SmoothL1Loss(reduction='elementwise_mean')(out.squeeze(), torch.Tensor(MOS)) 106 | total_loss += loss.item() 107 | avg_loss = total_loss / (id+1) 108 | 109 | id_MOS = sorted(range(len(MOS)), key=lambda k: MOS[k], reverse = True) 110 | id_out = sorted(range(len(out)), key=lambda k: out[k], reverse = True)
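# The block below implements the GAICD ranking metrics: acc4_5[k] and
# acc4_10[k] count how many of the top-(k+1) crops returned by the model score
# at least as well as the 5th / 10th best annotated crop (MOS[id_MOS[4]] /
# MOS[id_MOS[9]]), normalized by k+1 and later averaged over the 200 test
# images; the weighted variants further down additionally discount each
# returned crop by exp(-0.2*(rank-j)) or exp(-0.1*(rank-j)) when it is ranked
# below its ground-truth position.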
111 | for k in range(4): 112 | temp_acc_4_5 = 0.0 113 | temp_acc_4_10 = 0.0 114 | for j in range(k+1): 115 | if MOS[id_out[j]] >= MOS[id_MOS[4]]: 116 | temp_acc_4_5 += 1.0 117 | if MOS[id_out[j]] >= MOS[id_MOS[9]]: 118 | temp_acc_4_10 += 1.0 119 | acc4_5[k] += temp_acc_4_5 / (k+1.0) 120 | acc4_10[k] += temp_acc_4_10 / (k+1.0) 121 | 122 | rank_of_returned_crop = [] 123 | for k in range(4): 124 | rank_of_returned_crop.append(id_MOS.index(id_out[k])) 125 | 126 | for k in range(4): 127 | temp_wacc_4_5 = 0.0 128 | temp_wacc_4_10 = 0.0 129 | temp_rank_of_returned_crop = rank_of_returned_crop[:(k+1)] 130 | temp_rank_of_returned_crop.sort() 131 | for j in range(k+1): 132 | if temp_rank_of_returned_crop[j] <= 4: 133 | temp_wacc_4_5 += 1.0 * math.exp(-0.2*(temp_rank_of_returned_crop[j]-j)) 134 | if temp_rank_of_returned_crop[j] <= 9: 135 | temp_wacc_4_10 += 1.0 * math.exp(-0.1*(temp_rank_of_returned_crop[j]-j)) 136 | wacc4_5[k] += temp_wacc_4_5 / (k+1.0) 137 | wacc4_10[k] += temp_wacc_4_10 / (k+1.0) 138 | 139 | MOS_arr = [] 140 | out = torch.squeeze(out).cpu().detach().numpy() 141 | for k in range(len(MOS)): 142 | MOS_arr.append(MOS[k].numpy()[0]) 143 | srcc.append(spearmanr(MOS_arr,out)[0]) 144 | pcc.append(pearsonr(MOS_arr,out)[0]) 145 | 146 | #t1 = time.time() 147 | 148 | #print('timer: %.4f sec.' % (t1 - t0)) 149 | for k in range(4): 150 | acc4_5[k] = acc4_5[k] / 200.0 151 | acc4_10[k] = acc4_10[k] / 200.0 152 | wacc4_5[k] = wacc4_5[k] / 200.0 153 | wacc4_10[k] = wacc4_10[k] / 200.0 154 | 155 | avg_srcc = sum(srcc) / 200.0 156 | avg_pcc = sum(pcc) / 200.0 157 | 158 | 159 | return acc4_5, acc4_10, avg_srcc, avg_pcc, avg_loss, wacc4_5, wacc4_10 160 | 161 | 162 | def train(): 163 | 164 | for epoch in range(0, 80): 165 | total_loss = 0 166 | for id, sample in enumerate(data_loader_train): 167 | 168 | image = sample['image'] 169 | bboxs = sample['bbox'] 170 | 171 | roi = [] 172 | MOS = [] 173 | 174 | random_ID = range(0,len(bboxs['xmin'])) 175 | random.shuffle(random_ID) 176 | 177 | for idx in random_ID[:64]: 178 | roi.append((0, bboxs['xmin'][idx],bboxs['ymin'][idx],bboxs['xmax'][idx],bboxs['ymax'][idx])) 179 | MOS.append(sample['MOS'][idx]) 180 | 181 | if cuda: 182 | image = Variable(image.cuda()) 183 | else: 184 | image = Variable(image) 185 | roi = Variable(torch.Tensor(roi)) 186 | MOS = torch.Tensor(MOS) 187 | 188 | 189 | # forward 190 | 191 | out = net(image,roi) 192 | loss = torch.nn.SmoothL1Loss(reduction='elementwise_mean')(out.squeeze(), MOS) 193 | total_loss += loss.item() 194 | avg_loss = total_loss / (id+1) 195 | 196 | # backprop 197 | optimizer.zero_grad() 198 | loss.backward() 199 | optimizer.step() 200 | 201 | sys.stdout.write('\r[Epoch %d/%d] [Batch %d/%d] [Train Loss: %.4f]' % (epoch, 79, id, len(data_loader_train), avg_loss)) 202 | 203 | acc4_5, acc4_10, avg_srcc, avg_pcc, test_avg_loss, wacc4_5, wacc4_10 = test() 204 | sys.stdout.write('[Test Loss: %.4f] [%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f] [SRCC: %.3f] [PCC: %.3f]\n' % (test_avg_loss,acc4_5[0],acc4_5[1],acc4_5[2],acc4_5[3],acc4_10[0],acc4_10[1],acc4_10[2],acc4_10[3],avg_srcc,avg_pcc)) 205 | sys.stdout.write('[%.3f, %.3f, %.3f, %.3f] [%.3f, %.3f, %.3f, %.3f]\n' % (wacc4_5[0],wacc4_5[1],wacc4_5[2],wacc4_5[3],wacc4_10[0],wacc4_10[1],wacc4_10[2],wacc4_10[3])) 206 | torch.save((net.module if cuda else net).state_dict(), args.save_folder + '/' + repr(epoch) + '_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f_%.3f' % (acc4_5[0],acc4_5[1],acc4_5[2],acc4_5[3],acc4_10[0],acc4_10[1],acc4_10[2],acc4_10[3],avg_srcc,avg_pcc) + '.pth') 207 | 208 |
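# Example invocation (a sketch; it assumes the GAIC dataset has been unpacked
# to dataset/GAIC/ and that the ImageNet-pretrained backbone weights are
# available under pretrained_model/, as expected by croppingModel.py):
#
#   python TrainModel.py --base_model mobilenetv2 --scale multi --downsample 4 \
#       --augmentation 1 --align_size 9 --reduced_dim 8 --lr 1e-4
#
# One checkpoint is saved per epoch into args.save_folder, with the test
# accuracies encoded in the file name.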
209 | if __name__ == '__main__': 210 | train() 211 | -------------------------------------------------------------------------------- /augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | import cv2 4 | import numpy as np 5 | import types 6 | from numpy import random 7 | 8 | 9 | def intersect(box_a, box_b): 10 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 11 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 12 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 13 | return inter[:, 0] * inter[:, 1] 14 | 15 | 16 | def jaccard_numpy(box_a, box_b): 17 | 18 | inter = intersect(box_a, box_b) 19 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 20 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 21 | area_b = ((box_b[2]-box_b[0]) * 22 | (box_b[3]-box_b[1])) # [A,B] 23 | union = area_a + area_b - inter 24 | return inter / union # [A,B] 25 | 26 | 27 | class Compose(object): 28 | """Composes several augmentations together. 29 | Args: 30 | transforms (List[Transform]): list of transforms to compose. 31 | Example: 32 | augmentations.Compose([transforms.CenterCrop(10), transforms.ToTensor(),]) 33 | """ 34 | 35 | def __init__(self, transforms): 36 | self.transforms = transforms 37 | 38 | def __call__(self, img, boxes=None, labels=None): 39 | for t in self.transforms: 40 | img, boxes, labels = t(img, boxes, labels) 41 | return img, boxes, labels 42 | 43 | 44 | class Lambda(object): 45 | """Applies a lambda as a transform.""" 46 | 47 | def __init__(self, lambd): 48 | assert isinstance(lambd, types.LambdaType) 49 | self.lambd = lambd 50 | 51 | def __call__(self, img, boxes=None, labels=None): 52 | return self.lambd(img, boxes, labels) 53 | 54 | 55 | class ConvertFromInts(object): 56 | def __call__(self, image, boxes=None, labels=None): 57 | return image.astype(np.float32), boxes, labels 58 | 59 | 60 | class SubtractMeans(object): 61 | def __init__(self, mean): 62 | self.mean = np.array(mean, dtype=np.float32) 63 | 64 | def __call__(self, image, boxes=None, labels=None): 65 | image = image.astype(np.float32) 66 | image -= self.mean 67 | return image.astype(np.float32), boxes, labels 68 | 69 | 70 | class ToAbsoluteCoords(object): 71 | def __call__(self, image, boxes=None, labels=None): 72 | height, width, channels = image.shape 73 | boxes[:, 0] *= width 74 | boxes[:, 2] *= width 75 | boxes[:, 1] *= height 76 | boxes[:, 3] *= height 77 | 78 | return image, boxes, labels 79 | 80 | 81 | class ToPercentCoords(object): 82 | def __call__(self, image, boxes=None, labels=None): 83 | height, width, channels = image.shape 84 | boxes[:, 0] /= width 85 | boxes[:, 2] /= width 86 | boxes[:, 1] /= height 87 | boxes[:, 3] /= height 88 | 89 | return image, boxes, labels 90 | 91 | 92 | class Resize(object): 93 | def __init__(self, size=300): 94 | self.size = size 95 | 96 | def __call__(self, image, boxes=None, labels=None): 97 | image = cv2.resize(image, (self.size, 98 | self.size)) 99 | return image, boxes, labels 100 | 101 | 102 | class RandomSaturation(object): 103 | def __init__(self, lower=0.5, upper=1.5): 104 | self.lower = lower 105 | self.upper = upper 106 | assert self.upper >= self.lower, "saturation upper must be >= lower." 107 | assert self.lower >= 0, "saturation lower must be non-negative."
108 | 109 | def __call__(self, image, boxes=None, labels=None): 110 | if random.randint(2): 111 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 112 | 113 | return image, boxes, labels 114 | 115 | 116 | class RandomHue(object): 117 | def __init__(self, delta=18.0): 118 | assert delta >= 0.0 and delta <= 360.0 119 | self.delta = delta 120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 124 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 125 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 126 | return image, boxes, labels 127 | 128 | 129 | class RandomLightingNoise(object): 130 | def __init__(self): 131 | self.perms = ((0, 1, 2), (0, 2, 1), 132 | (1, 0, 2), (1, 2, 0), 133 | (2, 0, 1), (2, 1, 0)) 134 | 135 | def __call__(self, image, boxes=None, labels=None): 136 | if random.randint(2): 137 | swap = self.perms[random.randint(len(self.perms))] 138 | shuffle = SwapChannels(swap) # shuffle channels 139 | image = shuffle(image) 140 | return image, boxes, labels 141 | 142 | 143 | class ConvertColor(object): 144 | def __init__(self, current='BGR', transform='HSV'): 145 | self.transform = transform 146 | self.current = current 147 | 148 | def __call__(self, image, boxes=None, labels=None): 149 | if self.current == 'BGR' and self.transform == 'HSV': 150 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 151 | elif self.current == 'HSV' and self.transform == 'BGR': 152 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 153 | else: 154 | raise NotImplementedError 155 | return image, boxes, labels 156 | 157 | 158 | class RandomContrast(object): 159 | def __init__(self, lower=0.5, upper=1.5): 160 | self.lower = lower 161 | self.upper = upper 162 | assert self.upper >= self.lower, "contrast upper must be >= lower." 163 | assert self.lower >= 0, "contrast lower must be non-negative." 164 | 165 | # expects float image 166 | def __call__(self, image, boxes=None, labels=None): 167 | if random.randint(2): 168 | alpha = random.uniform(self.lower, self.upper) 169 | image *= alpha 170 | return image, boxes, labels 171 | 172 | 173 | class RandomBrightness(object): 174 | def __init__(self, delta=32): 175 | assert delta >= 0.0 176 | assert delta <= 255.0 177 | self.delta = delta 178 | 179 | def __call__(self, image, boxes=None, labels=None): 180 | if random.randint(2): 181 | delta = random.uniform(-self.delta, self.delta) 182 | image += delta 183 | return image, boxes, labels 184 | 185 | 186 | class ToCV2Image(object): 187 | def __call__(self, tensor, boxes=None, labels=None): 188 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 189 | 190 | 191 | class ToTensor(object): 192 | def __call__(self, cvimage, boxes=None, labels=None): 193 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 194 | 195 | 196 | class RandomSampleCrop(object): 197 | """Crop 198 | Arguments: 199 | img (Image): the image being input during training 200 | boxes (Tensor): the original bounding boxes in pt form 201 | labels (Tensor): the class labels for each bbox 202 | mode (float tuple): the min and max jaccard overlaps 203 | Return: 204 | (img, boxes, classes) 205 | img (Image): the cropped image 206 | boxes (Tensor): the adjusted bounding boxes in pt form 207 | labels (Tensor): the class labels for each bbox 208 | """ 209 | def __init__(self): 210 | self.sample_options = ( 211 | # using entire original input image 212 | None, 213 | # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9 214 | (0.1, None), 215 | (0.3, None), 216 | (0.7, None), 217 | (0.9, None), 218 | # randomly sample a patch 219 | (None, None), 220 | ) 221 | 222 | def __call__(self, image, boxes=None, labels=None): 223 | height, width, _ = image.shape 224 | while True: 225 | # randomly choose a mode 226 | mode = random.choice(self.sample_options) 227 | if mode is None: 228 | return image, boxes, labels 229 | 230 | min_iou, max_iou = mode 231 | if min_iou is None: 232 | min_iou = float('-inf') 233 | if max_iou is None: 234 | max_iou = float('inf') 235 | 236 | # max trials (50) 237 | for _ in range(50): 238 | current_image = image 239 | 240 | w = random.uniform(0.3 * width, width) 241 | h = random.uniform(0.3 * height, height) 242 | 243 | # aspect ratio constraint b/t .5 & 2 244 | if h / w < 0.5 or h / w > 2: 245 | continue 246 | 247 | left = random.uniform(width - w) 248 | top = random.uniform(height - h) 249 | 250 | # convert to integer rect x1,y1,x2,y2 251 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 252 | 253 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 254 | overlap = jaccard_numpy(boxes, rect) 255 | 256 | # is min and max overlap constraint satisfied? if not try again 257 | if overlap.min() < min_iou or overlap.max() > max_iou: 258 | continue 259 | 260 | # cut the crop from the image 261 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 262 | :] 263 | 264 | # keep overlap with gt box IF center in sampled patch 265 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 266 | 267 | # mask in all gt boxes whose centers are below and to the right of the crop's top-left corner 268 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 269 | 270 | # mask in all gt boxes whose centers are above and to the left of the crop's bottom-right corner 271 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 272 | 273 | # mask where both m1 and m2 are true 274 | mask = m1 * m2 275 | 276 | # have any valid boxes?
try again if not 277 | if not mask.any(): 278 | continue 279 | 280 | # take only matching gt boxes 281 | current_boxes = boxes[mask, :].copy() 282 | 283 | # take only matching gt labels 284 | current_labels = labels[mask] 285 | 286 | # should we use the box left and top corner or the crop's 287 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 288 | rect[:2]) 289 | # adjust to crop (by subtracting crop's left,top) 290 | current_boxes[:, :2] -= rect[:2] 291 | 292 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 293 | rect[2:]) 294 | # adjust to crop (by subtracting crop's left,top) 295 | current_boxes[:, 2:] -= rect[:2] 296 | 297 | return current_image, current_boxes, current_labels 298 | 299 | 300 | class Expand(object): 301 | def __init__(self, mean): 302 | self.mean = mean 303 | 304 | def __call__(self, image, boxes, labels): 305 | if random.randint(2): 306 | return image, boxes, labels 307 | 308 | height, width, depth = image.shape 309 | ratio = random.uniform(1, 4) 310 | left = random.uniform(0, width*ratio - width) 311 | top = random.uniform(0, height*ratio - height) 312 | 313 | expand_image = np.zeros( 314 | (int(height*ratio), int(width*ratio), depth), 315 | dtype=image.dtype) 316 | expand_image[:, :, :] = self.mean 317 | expand_image[int(top):int(top + height), 318 | int(left):int(left + width)] = image 319 | image = expand_image 320 | 321 | boxes = boxes.copy() 322 | boxes[:, :2] += (int(left), int(top)) 323 | boxes[:, 2:] += (int(left), int(top)) 324 | 325 | return image, boxes, labels 326 | 327 | 328 | class RandomMirror(object): 329 | def __call__(self, image, annotations, classes): 330 | _, width, _ = image.shape 331 | if random.randint(2): 332 | image = image[:, ::-1] 333 | for i in range(len(annotations)): 334 | xmin, xmax = annotations[i][1], annotations[i][3] 335 | annotations[i][1], annotations[i][3] = width - xmax, width - xmin 336 | return image, annotations, classes 337 | 338 | 339 | class SwapChannels(object): 340 | """Transforms a tensorized image by swapping the channels in the order 341 | specified in the swap tuple.
342 | Args: 343 | swaps (int triple): final order of channels 344 | eg: (2, 1, 0) 345 | """ 346 | 347 | def __init__(self, swaps): 348 | self.swaps = swaps 349 | 350 | def __call__(self, image): 351 | """ 352 | Args: 353 | image (Tensor): image tensor to be transformed 354 | Return: 355 | a tensor with channels swapped according to swap 356 | """ 357 | # if torch.is_tensor(image): 358 | # image = image.data.cpu().numpy() 359 | # else: 360 | # image = np.array(image) 361 | image = image[:, :, self.swaps] 362 | return image 363 | 364 | 365 | class PhotometricDistort(object): 366 | def __init__(self): 367 | self.pd = [ 368 | RandomContrast(), 369 | ConvertColor(transform='HSV'), 370 | RandomSaturation(), 371 | RandomHue(), 372 | ConvertColor(current='HSV', transform='BGR'), 373 | RandomContrast() 374 | ] 375 | self.rand_brightness = RandomBrightness() 376 | self.rand_light_noise = RandomLightingNoise() 377 | 378 | def __call__(self, image, boxes, labels): 379 | im = image.copy() 380 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 381 | if random.randint(2): 382 | distort = Compose(self.pd[:-1]) 383 | else: 384 | distort = Compose(self.pd[1:]) 385 | return distort(im, boxes, labels) 386 | 387 | 388 | class CropAugmentation(object): 389 | def __init__(self): 390 | self.augment = Compose([ 391 | ConvertFromInts(), 392 | PhotometricDistort(), 393 | RandomMirror() 394 | ]) 395 | 396 | def __call__(self, img, annotations): 397 | image, annotations, label = self.augment(img, annotations) 398 | return image, annotations 399 | -------------------------------------------------------------------------------- /croppingDataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.utils.data as data 3 | import cv2 4 | import math 5 | import numpy as np 6 | from augmentations import CropAugmentation 7 | 8 | MOS_MEAN = 2.95 9 | MOS_STD = 0.8 10 | RGB_MEAN = (0.485, 0.456, 0.406) 11 | RGB_STD = (0.229, 0.224, 0.225) 12 | 13 | 14 | class TransformFunction(object): 15 | 16 | def __call__(self, sample,image_size): 17 | image, annotations = sample['image'], sample['annotations'] 18 | 19 | scale = image_size / min(image.shape[:2]) 20 | h = round(image.shape[0] * scale / 32.0) * 32 21 | w = round(image.shape[1] * scale / 32.0) * 32 22 | resized_image = cv2.resize(image,(int(w),int(h))) / 256.0 23 | rgb_mean = np.array(RGB_MEAN, dtype=np.float32) 24 | rgb_std = np.array(RGB_STD, dtype=np.float32) 25 | resized_image = resized_image.astype(np.float32) 26 | resized_image -= rgb_mean 27 | resized_image = resized_image / rgb_std 28 | 29 | scale_height = float(resized_image.shape[0]) / image.shape[0] 30 | scale_width = float(resized_image.shape[1]) / image.shape[1] 31 | 32 | transformed_bbox = {} 33 | transformed_bbox['xmin'] = [] 34 | transformed_bbox['ymin'] = [] 35 | transformed_bbox['xmax'] = [] 36 | transformed_bbox['ymax'] = [] 37 | MOS = [] 38 | for annotation in annotations: 39 | transformed_bbox['xmin'].append(math.floor(float(annotation[1]) * scale_width)) 40 | transformed_bbox['ymin'].append(math.floor(float(annotation[0]) * scale_height)) 41 | transformed_bbox['xmax'].append(math.ceil(float(annotation[3]) * scale_width)) 42 | transformed_bbox['ymax'].append(math.ceil(float(annotation[2]) * scale_height)) 43 | 44 | MOS.append((float(annotation[-1]) - MOS_MEAN) / MOS_STD) 45 | 46 | resized_image = resized_image.transpose((2, 0, 1)) 47 | return {'image': resized_image, 'bbox': transformed_bbox, 'MOS': MOS} 48 | 49 | class 
GAICD(data.Dataset): 50 | 51 | def __init__(self, image_size=256, dataset_dir='dataset/GAIC/', set = 'train', 52 | transform=TransformFunction(), augmentation=False): 53 | self.image_size = float(image_size) 54 | self.dataset_dir = dataset_dir 55 | self.set = set 56 | image_lists = os.listdir(self.dataset_dir + 'images/' + set) 57 | self._imgpath = list() 58 | self._annopath = list() 59 | for image in image_lists: 60 | self._imgpath.append(os.path.join(self.dataset_dir, 'images', set, image)) 61 | self._annopath.append(os.path.join(self.dataset_dir, 'annotations', set, image[:-3]+"txt")) 62 | self.transform = transform 63 | if augmentation: 64 | self.augmentation = CropAugmentation() 65 | else: 66 | self.augmentation = None 67 | 68 | 69 | def __getitem__(self, idx): 70 | image = cv2.imread(self._imgpath[idx]) 71 | 72 | with open(self._annopath[idx],'r') as fid: 73 | annotations_txt = fid.readlines() 74 | 75 | annotations = list() 76 | for annotation in annotations_txt: 77 | annotation_split = annotation.split() 78 | if float(annotation_split[4]) != -2: 79 | annotations.append([float(annotation_split[0]),float(annotation_split[1]),float(annotation_split[2]),float(annotation_split[3]),float(annotation_split[4])]) 80 | 81 | if self.augmentation: 82 | image, annotations = self.augmentation(image, annotations) 83 | 84 | # to rgb 85 | image = image[:, :, (2, 1, 0)] 86 | 87 | sample = {'image': image, 'annotations': annotations} 88 | 89 | if self.transform: 90 | sample = self.transform(sample,self.image_size) 91 | 92 | return sample 93 | 94 | def __len__(self): 95 | return len(self._imgpath) 96 | 97 | 98 | class TransformFunctionTest(object): 99 | 100 | def __call__(self, image, image_size): 101 | 102 | scale = image_size / min(image.shape[:2]) 103 | h = round(image.shape[0] * scale / 32.0) * 32 104 | w = round(image.shape[1] * scale / 32.0) * 32 105 | resized_image = cv2.resize(image,(int(w),int(h))) / 256.0 106 | rgb_mean = np.array(RGB_MEAN, dtype=np.float32) 107 | rgb_std = np.array(RGB_STD, dtype=np.float32) 108 | resized_image = resized_image.astype(np.float32) 109 | resized_image -= rgb_mean 110 | resized_image = resized_image / rgb_std 111 | 112 | scale_height = image.shape[0] / float(resized_image.shape[0]) 113 | scale_width = image.shape[1] / float(resized_image.shape[1]) 114 | 115 | bboxes = generate_bboxes(resized_image) 116 | 117 | transformed_bbox = {} 118 | transformed_bbox['xmin'] = [] 119 | transformed_bbox['ymin'] = [] 120 | transformed_bbox['xmax'] = [] 121 | transformed_bbox['ymax'] = [] 122 | source_bboxes = list() 123 | 124 | for bbox in bboxes: 125 | source_bboxes.append([round(bbox[0] * scale_height),round(bbox[1] * scale_width),round(bbox[2] * scale_height),round(bbox[3] * scale_width)]) 126 | transformed_bbox['xmin'].append(bbox[1]) 127 | transformed_bbox['ymin'].append(bbox[0]) 128 | transformed_bbox['xmax'].append(bbox[3]) 129 | transformed_bbox['ymax'].append(bbox[2]) 130 | 131 | resized_image = resized_image.transpose((2, 0, 1)) 132 | return resized_image,transformed_bbox,source_bboxes 133 | 134 | 135 | def generate_bboxes(image): 136 | 137 | bins = 12.0 138 | h = image.shape[0] 139 | w = image.shape[1] 140 | step_h = h / bins 141 | step_w = w / bins 142 | annotations = list() 143 | for x1 in range(0,4): 144 | for y1 in range(0,4): 145 | for x2 in range(8,12): 146 | for y2 in range(8,12): 147 | if (x2-x1)*(y2-y1)>0.4999*bins*bins and (y2-y1)*step_w/(x2-x1)/step_h>0.5 and (y2-y1)*step_w/(x2-x1)/step_h<2.0: 148 | 
annotations.append([float(step_h*(0.5+x1)),float(step_w*(0.5+y1)),float(step_h*(0.5+x2)),float(step_w*(0.5+y2))]) 149 | 150 | return annotations 151 | 152 | def generate_bboxes_16_9(image): 153 | 154 | h = image.shape[0] 155 | w = image.shape[1] 156 | h_step = 9 157 | w_step = 16 158 | annotations = list() 159 | for i in range(14,30): 160 | out_h = h_step*i 161 | out_w = w_step*i 162 | if out_h < h and out_w < w and out_h*out_w>0.4*h*w: 163 | for w_start in range(0,w-out_w,w_step): 164 | for h_start in range(0,h-out_h,h_step): 165 | annotations.append([float(h_start),float(w_start),float(h_start+out_h-1),float(w_start+out_w-1)]) 166 | return annotations 167 | 168 | def generate_bboxes_4_3(image): 169 | 170 | h = image.shape[0] 171 | w = image.shape[1] 172 | h_step = 12 173 | w_step = 16 174 | annotations = list() 175 | for i in range(14,30): 176 | out_h = h_step*i 177 | out_w = w_step*i 178 | if out_h < h and out_w < w and out_h*out_w>0.4*h*w: 179 | for w_start in range(0,w-out_w,w_step): 180 | for h_start in range(0,h-out_h,h_step): 181 | annotations.append([float(h_start),float(w_start),float(h_start+out_h-1),float(w_start+out_w-1)]) 182 | return annotations 183 | 184 | def generate_bboxes_1_1(image): 185 | 186 | h = image.shape[0] 187 | w = image.shape[1] 188 | h_step = 12 189 | w_step = 12 190 | annotations = list() 191 | for i in range(14,30): 192 | out_h = h_step*i 193 | out_w = w_step*i 194 | if out_h < h and out_w < w and out_h*out_w>0.4*h*w: 195 | for w_start in range(0,w-out_w,w_step): 196 | for h_start in range(0,h-out_h,h_step): 197 | annotations.append([float(h_start),float(w_start),float(h_start+out_h-1),float(w_start+out_w-1)]) 198 | return annotations 199 | 200 | class setup_test_dataset(data.Dataset): 201 | 202 | def __init__(self, image_size=256.0,dataset_dir='testsetDir', transform=TransformFunctionTest()): 203 | self.image_size = float(image_size) 204 | self.dataset_dir = dataset_dir 205 | image_lists = os.listdir(self.dataset_dir) 206 | self._imgpath = list() 207 | self._annopath = list() 208 | for image in image_lists: 209 | self._imgpath.append(os.path.join(self.dataset_dir, image)) 210 | self.transform = transform 211 | 212 | 213 | def __getitem__(self, idx): 214 | image = cv2.imread(self._imgpath[idx]) 215 | 216 | # to rgb 217 | image = image[:, :, (2, 1, 0)] 218 | 219 | if self.transform: 220 | resized_image,transformed_bbox,source_bboxes = self.transform(image,self.image_size) 221 | 222 | sample = {'imgpath': self._imgpath[idx], 'image': image, 'resized_image': resized_image, 'tbboxes':transformed_bbox , 'sourceboxes': source_bboxes} 223 | 224 | return sample 225 | 226 | def __len__(self): 227 | return len(self._imgpath) 228 | 229 | -------------------------------------------------------------------------------- /croppingModel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | from roi_align.modules.roi_align import RoIAlignAvg, RoIAlign 5 | from rod_align.modules.rod_align import RoDAlignAvg, RoDAlign 6 | import torch.nn.init as init 7 | from ShuffleNetV2 import shufflenetv2 8 | from mobilenetv2 import MobileNetV2 9 | from thop import profile 10 | 11 | 12 | class vgg_base(nn.Module): 13 | 14 | def __init__(self, loadweights=True, downsample=4): 15 | super(vgg_base, self).__init__() 16 | 17 | vgg = models.vgg16(pretrained=True) 18 | 19 | if downsample == 4: 20 | self.feature = nn.Sequential(vgg.features[:-1]) 21 | elif downsample == 5: 22 | 
self.feature = nn.Sequential(vgg.features) 23 | 24 | self.feature3 = nn.Sequential(vgg.features[:23]) 25 | self.feature4 = nn.Sequential(vgg.features[23:30]) 26 | self.feature5 = nn.Sequential(vgg.features[30:]) 27 | 28 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 29 | 30 | def forward(self, x): 31 | #return self.feature(x) 32 | f3 = self.feature3(x) 33 | f4 = self.feature4(f3) 34 | f5 = self.feature5(f4) 35 | return f3, f4, f5 36 | 37 | class resnet50_base(nn.Module): 38 | 39 | def __init__(self, loadweights=True, downsample=4): 40 | super(resnet50_base, self).__init__() 41 | 42 | resnet50 = models.resnet50(pretrained=True) 43 | 44 | self.feature3 = nn.Sequential(resnet50.conv1,resnet50.bn1,resnet50.relu,resnet50.maxpool,resnet50.layer1,resnet50.layer2) 45 | self.feature4 = nn.Sequential(resnet50.layer3) 46 | self.feature5 = nn.Sequential(resnet50.layer4) 47 | 48 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 49 | 50 | def forward(self, x): 51 | #return self.feature(x) 52 | f3 = self.feature3(x) 53 | f4 = self.feature4(f3) 54 | f5 = self.feature5(f4) 55 | return f3, f4, f5 56 | 57 | 58 | class mobilenetv2_base(nn.Module): 59 | 60 | def __init__(self, loadweights=True, downsample=4, model_path='pretrained_model/mobilenetv2_1.0-0c6065bc.pth'): 61 | super(mobilenetv2_base, self).__init__() 62 | 63 | model = MobileNetV2(width_mult=1.0) 64 | 65 | if loadweights: 66 | model.load_state_dict(torch.load(model_path)) 67 | 68 | #if downsample == 4: 69 | # self.feature = nn.Sequential(model.features[:14]) 70 | #elif downsample == 5: 71 | # self.feature = nn.Sequential(model.features) 72 | 73 | self.feature3 = nn.Sequential(model.features[:7]) 74 | self.feature4 = nn.Sequential(model.features[7:14]) 75 | self.feature5 = nn.Sequential(model.features[14:]) 76 | 77 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 78 | 79 | def forward(self, x): 80 | #return self.feature(x) 81 | f3 = self.feature3(x) 82 | f4 = self.feature4(f3) 83 | f5 = self.feature5(f4) 84 | return f3, f4, f5 85 | 86 | 87 | class shufflenetv2_base(nn.Module): 88 | 89 | def __init__(self, loadweights=True, downsample=4, model_path='pretrained_model/shufflenetv2_x1_69.402_88.374.pth.tar'): 90 | super(shufflenetv2_base, self).__init__() 91 | 92 | model = shufflenetv2(width_mult=1.0) 93 | 94 | if loadweights: 95 | model.load_state_dict(torch.load(model_path)) 96 | 97 | self.feature3 = nn.Sequential(model.conv1, model.maxpool, model.features[:4]) 98 | self.feature4 = nn.Sequential(model.features[4:12]) 99 | self.feature5 = nn.Sequential(model.features[12:]) 100 | 101 | #if downsample == 4: 102 | # self.feature = nn.Sequential(model.conv1, model.maxpool, model.features[:12]) 103 | #elif downsample == 5: 104 | # self.feature = nn.Sequential(model.conv1, model.maxpool, model.features) 105 | 106 | #flops, params = profile(self.feature, input_size=(1, 3, 256,256)) 107 | 108 | def forward(self, x): 109 | #return self.feature(x) 110 | f3 = self.feature3(x) 111 | f4 = self.feature4(f3) 112 | f5 = self.feature5(f4) 113 | return f3, f4, f5 114 | 115 | 116 | def fc_layers(reddim = 32, alignsize = 8): 117 | conv1 = nn.Sequential(nn.Conv2d(reddim, 768, kernel_size=alignsize, padding=0),nn.ReLU(inplace=True)) 118 | #conv1 = nn.Sequential(nn.Conv2d(reddim, 768, kernel_size=3, padding=1, stride=2),nn.ReLU(inplace=True), 119 | # nn.Conv2d(768, reddim, kernel_size=1, padding=0),nn.ReLU(inplace=True), 120 | # nn.Conv2d(reddim, 768, kernel_size=3, padding=1,stride=2),nn.ReLU(inplace=True), 121 | 
# nn.Conv2d(768, reddim, kernel_size=1, padding=0),nn.ReLU(inplace=True), 122 | # nn.Conv2d(reddim, 768, kernel_size=3, padding=0,stride=1),nn.ReLU(inplace=True)) 123 | #conv1 = nn.Sequential(nn.Conv2d(reddim, 768, kernel_size=5, padding=2, stride=2),nn.ReLU(inplace=True), 124 | # nn.Conv2d(768, reddim, kernel_size=1, padding=0),nn.ReLU(inplace=True), 125 | # nn.Conv2d(reddim, 768, kernel_size=5, padding=0,stride=1),nn.ReLU(inplace=True)) 126 | conv2 = nn.Sequential(nn.Conv2d(768, 128, kernel_size=1),nn.ReLU(inplace=True)) 127 | dropout = nn.Dropout(p=0.5)  # referenced in the Sequential below, so it must stay defined 128 | conv3 = nn.Conv2d(128, 1, kernel_size=1) 129 | layers = nn.Sequential(conv1, conv2, dropout, conv3) 130 | return layers 131 | 132 | 133 | class crop_model_single_scale(nn.Module): 134 | 135 | def __init__(self, alignsize = 8, reddim = 8, loadweight = True, model = None, downsample=4): 136 | super(crop_model_single_scale, self).__init__() 137 | 138 | if model == 'shufflenetv2': 139 | self.Feat_ext = shufflenetv2_base(loadweight,downsample) 140 | if downsample == 4: 141 | self.DimRed = nn.Conv2d(232, reddim, kernel_size=1, padding=0) 142 | else: 143 | self.DimRed = nn.Conv2d(464, reddim, kernel_size=1, padding=0) 144 | elif model == 'mobilenetv2': 145 | self.Feat_ext = mobilenetv2_base(loadweight,downsample) 146 | if downsample == 4: 147 | self.DimRed = nn.Conv2d(96, reddim, kernel_size=1, padding=0) 148 | else: 149 | self.DimRed = nn.Conv2d(320, reddim, kernel_size=1, padding=0) 150 | elif model == 'vgg16': 151 | self.Feat_ext = vgg_base(loadweight,downsample) 152 | self.DimRed = nn.Conv2d(512, reddim, kernel_size=1, padding=0) 153 | elif model == 'resnet50': 154 | self.Feat_ext = resnet50_base(loadweight,downsample) 155 | self.DimRed = nn.Conv2d(1024, reddim, kernel_size=1, padding=0) 156 | 157 | self.RoIAlign = RoIAlignAvg(alignsize, alignsize, 1.0/2**downsample) 158 | self.RoDAlign = RoDAlignAvg(alignsize, alignsize, 1.0/2**downsample) 159 | self.FC_layers = fc_layers(reddim*2, alignsize) 160 | 161 | #flops, params = profile(self.FC_layers, input_size=(1,reddim*2,9,9)) 162 | 163 | def forward(self, im_data, boxes): 164 | 165 | f3,base_feat,f5 = self.Feat_ext(im_data) 166 | red_feat = self.DimRed(base_feat) 167 | RoI_feat = self.RoIAlign(red_feat, boxes) 168 | RoD_feat = self.RoDAlign(red_feat, boxes) 169 | final_feat = torch.cat((RoI_feat, RoD_feat), 1) 170 | prediction = self.FC_layers(final_feat) 171 | return prediction 172 | 173 | def _init_weights(self): 174 | print('Initializing weights...') 175 | self.DimRed.apply(weights_init) 176 | self.FC_layers.apply(weights_init) 177 | 178 | 179 | class crop_model_multi_scale_individual(nn.Module): 180 | 181 | def __init__(self, alignsize = 8, reddim = 32, loadweight = True, model = None, downsample = 4): 182 | super(crop_model_multi_scale_individual, self).__init__() 183 | 184 | if model == 'shufflenetv2': 185 | self.Feat_ext1 = shufflenetv2_base(loadweight,downsample) 186 | self.Feat_ext2 = shufflenetv2_base(loadweight,downsample) 187 | self.Feat_ext3 = shufflenetv2_base(loadweight,downsample) 188 | self.DimRed = nn.Conv2d(232, reddim, kernel_size=1, padding=0) 189 | elif model == 'mobilenetv2': 190 | self.Feat_ext1 = mobilenetv2_base(loadweight,downsample) 191 | self.Feat_ext2 = mobilenetv2_base(loadweight,downsample) 192 | self.Feat_ext3 = mobilenetv2_base(loadweight,downsample) 193 | self.DimRed = nn.Conv2d(96, reddim, kernel_size=1, padding=0) 194 | elif model == 'vgg16': 195 | self.Feat_ext1 = vgg_base(loadweight,downsample) 196 | self.Feat_ext2 =
vgg_base(loadweight,downsample) 197 | self.Feat_ext3 = vgg_base(loadweight,downsample) 198 | self.DimRed = nn.Conv2d(512, reddim, kernel_size=1, padding=0) 199 | 200 | self.downsample2 = nn.UpsamplingBilinear2d(scale_factor=1.0/2.0) 201 | self.upsample2 = nn.UpsamplingBilinear2d(scale_factor=2.0) 202 | self.RoIAlign = RoIAlignAvg(alignsize, alignsize, 1.0/2**downsample) 203 | self.RoDAlign = RoDAlignAvg(alignsize, alignsize, 1.0/2**downsample) 204 | self.FC_layers = fc_layers(reddim*2, alignsize) 205 | 206 | def forward(self, im_data, boxes): 207 | 208 | _, base_feat, _ = self.Feat_ext1(im_data)  # backbones return (f3, f4, f5); use the mid-level map 209 | 210 | up_im = self.upsample2(im_data) 211 | _, up_feat, _ = self.Feat_ext2(up_im) 212 | up_feat = self.downsample2(up_feat) 213 | 214 | down_im = self.downsample2(im_data) 215 | _, down_feat, _ = self.Feat_ext3(down_im) 216 | down_feat = self.upsample2(down_feat) 217 | 218 | #cat_feat = torch.cat((base_feat,up_feat,down_feat),1) 219 | cat_feat = 0.5*base_feat + 0.35*up_feat + 0.15*down_feat 220 | red_feat = self.DimRed(cat_feat) 221 | RoI_feat = self.RoIAlign(red_feat, boxes) 222 | RoD_feat = self.RoDAlign(red_feat, boxes) 223 | final_feat = torch.cat((RoI_feat, RoD_feat), 1) 224 | prediction = self.FC_layers(final_feat) 225 | return prediction 226 | 227 | def _init_weights(self): 228 | print('Initializing weights...') 229 | self.DimRed.apply(weights_init) 230 | self.FC_layers.apply(weights_init) 231 | 232 | class crop_model_multi_scale_shared(nn.Module): 233 | 234 | def __init__(self, alignsize = 8, reddim = 32, loadweight = True, model = None, downsample = 4): 235 | super(crop_model_multi_scale_shared, self).__init__() 236 | 237 | if model == 'shufflenetv2': 238 | self.Feat_ext = shufflenetv2_base(loadweight,downsample) 239 | self.DimRed = nn.Conv2d(812, reddim, kernel_size=1, padding=0) 240 | elif model == 'mobilenetv2': 241 | self.Feat_ext = mobilenetv2_base(loadweight,downsample) 242 | self.DimRed = nn.Conv2d(448, reddim, kernel_size=1, padding=0) 243 | elif model == 'vgg16': 244 | self.Feat_ext = vgg_base(loadweight,downsample) 245 | self.DimRed = nn.Conv2d(1536, reddim, kernel_size=1, padding=0) 246 | elif model == 'resnet50': 247 | self.Feat_ext = resnet50_base(loadweight,downsample) 248 | self.DimRed = nn.Conv2d(3584, reddim, kernel_size=1, padding=0) 249 | 250 | self.downsample2 = nn.UpsamplingBilinear2d(scale_factor=1.0/2.0) 251 | self.upsample2 = nn.UpsamplingBilinear2d(scale_factor=2.0) 252 | self.RoIAlign = RoIAlignAvg(alignsize, alignsize, 1.0/2**downsample) 253 | self.RoDAlign = RoDAlignAvg(alignsize, alignsize, 1.0/2**downsample) 254 | self.FC_layers = fc_layers(reddim*2, alignsize) 255 | 256 | 257 | def forward(self, im_data, boxes): 258 | 259 | #base_feat = self.Feat_ext(im_data) 260 | 261 | #up_im = self.upsample2(im_data) 262 | #up_feat = self.Feat_ext(up_im) 263 | #up_feat = self.downsample2(up_feat) 264 | 265 | #down_im = self.downsample2(im_data) 266 | #down_feat = self.Feat_ext(down_im) 267 | #down_feat = self.upsample2(down_feat) 268 | 269 | f3,f4,f5 = self.Feat_ext(im_data) 270 | cat_feat = torch.cat((self.downsample2(f3),f4,0.5*self.upsample2(f5)),1) 271 | 272 | #cat_feat = torch.cat((base_feat,up_feat,down_feat),1) 273 | #cat_feat = base_feat + 0.35*up_feat + 0.15*down_feat 274 | red_feat = self.DimRed(cat_feat) 275 | RoI_feat = self.RoIAlign(red_feat, boxes) 276 | RoD_feat = self.RoDAlign(red_feat, boxes) 277 | final_feat = torch.cat((RoI_feat, RoD_feat), 1) 278 | prediction = self.FC_layers(final_feat) 279 | return prediction 280 | 281 | def _init_weights(self): 282 |
print('Initializing weights...') 283 | self.DimRed.apply(weights_init) 284 | self.FC_layers.apply(weights_init) 285 | 286 | def xavier(param): 287 | init.xavier_uniform_(param) 288 | 289 | 290 | def weights_init(m): 291 | if isinstance(m, nn.Conv2d): 292 | xavier(m.weight.data) 293 | m.bias.data.zero_() 294 | 295 | 296 | def build_crop_model(scale='single', alignsize=8, reddim=32, loadweight=True, model=None, downsample=4): 297 | 298 | if scale=='single': 299 | return crop_model_single_scale(alignsize, reddim, loadweight, model, downsample) 300 | elif scale=='multi': 301 | return crop_model_multi_scale_shared(alignsize, reddim, loadweight, model, downsample) 302 | 303 | 304 | 305 | -------------------------------------------------------------------------------- /demo_eval.py: -------------------------------------------------------------------------------- 1 | from croppingModel import build_crop_model 2 | from croppingDataset import setup_test_dataset 3 | import os 4 | import torch 5 | import cv2 6 | from torch.autograd import Variable 7 | import torch.backends.cudnn as cudnn 8 | import torch.utils.data as data 9 | import argparse 10 | import time 11 | 12 | 13 | def str2bool(v): 14 | return v.lower() in ("yes", "true", "t", "1") 15 | 16 | 17 | parser = argparse.ArgumentParser( 18 | description='Grid anchor based image cropping With Pytorch') 19 | parser.add_argument('--input_dir', default='dataset/GAIC/images/test', 20 | help='root directory path of testing images') 21 | parser.add_argument('--output_dir', default='dataset/test_result', 22 | help='directory for saving the cropping results') 23 | parser.add_argument('--batch_size', default=1, type=int, 24 | help='Batch size for inference') 25 | parser.add_argument('--num_workers', default=0, type=int, 26 | help='Number of workers used in dataloading') 27 | parser.add_argument('--cuda', default=True, type=str2bool, 28 | help='Use CUDA to run the model') 29 | parser.add_argument('--net_path', default='pretrained_model/mobilenet_0.625_0.583_0.553_0.525_0.785_0.762_0.748_0.723_0.783_0.806.pth', 30 | help='Path of the pretrained checkpoint') 31 | args = parser.parse_args() 32 | 33 | if not os.path.exists(args.output_dir): 34 | os.makedirs(args.output_dir) 35 | 36 | if torch.cuda.is_available(): 37 | if args.cuda: 38 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 39 | if not args.cuda: 40 | print("WARNING: It looks like you have a CUDA device, but aren't " + 41 | "using CUDA.\nRun with --cuda for optimal inference speed.") 42 | torch.set_default_tensor_type('torch.FloatTensor') 43 | 44 | else: 45 | torch.set_default_tensor_type('torch.FloatTensor') 46 | 47 | dataset = setup_test_dataset(dataset_dir = args.input_dir) 48 | 49 | 50 | def test(): 51 | 52 | net = build_crop_model(scale='multi', alignsize=9, reddim=8, loadweight=True, model='mobilenetv2',downsample=4) 53 | net.load_state_dict(torch.load(args.net_path)) 54 | net.eval() 55 | 56 | if args.cuda: 57 | net = torch.nn.DataParallel(net,device_ids=[0]) 58 | cudnn.benchmark = True 59 | net = net.cuda() 60 | 61 | 62 | data_loader = data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers,shuffle=False,pin_memory=True) 63 | 64 | for id, sample in enumerate(data_loader): 65 | imgpath = sample['imgpath'] 66 | image = sample['image'] 67 | bboxes = sample['sourceboxes'] 68 | resized_image = sample['resized_image'] 69 | tbboxes = sample['tbboxes'] 70 | 71 | if len(tbboxes['xmin'])==0: 72 | continue 73 | 74 | roi = [] 75 | 76 | for idx in range(0,len(tbboxes['xmin'])): 77 |
roi.append((0, tbboxes['xmin'][idx],tbboxes['ymin'][idx],tbboxes['xmax'][idx],tbboxes['ymax'][idx])) 78 | 79 | if args.cuda: 80 | resized_image = Variable(resized_image.cuda()) 81 | roi = Variable(torch.Tensor(roi)) 82 | else: 83 | resized_image = Variable(resized_image) 84 | roi = Variable(torch.Tensor(roi)) 85 | 86 | 87 | t0 = time.time() 88 | for r in range(0,100): 89 | out = net(resized_image,roi) 90 | t1 = time.time() 91 | print('timer: %.4f sec.' % (t1 - t0)) 92 | 93 | out = net(resized_image,roi) 94 | 95 | id_out = sorted(range(len(out)), key=lambda k: out[k], reverse = True) 96 | image = image.cpu().numpy().squeeze(0) 97 | 98 | for i in range(4): 99 | top1_box = bboxes[id_out[i]] 100 | top1_box = [top1_box[0].numpy()[0],top1_box[1].numpy()[0],top1_box[2].numpy()[0],top1_box[3].numpy()[0]] 101 | top1_crop = image[int(top1_box[0]):int(top1_box[2]),int(top1_box[1]):int(top1_box[3])] 102 | imgname = imgpath[0].split('/')[-1] 103 | cv2.imwrite(args.output_dir + '/' + imgname[:-4] + '_' +str(i) + imgname[-4:],top1_crop[:,:,(2, 1, 0)]) 104 | 105 | 106 | if __name__ == '__main__': 107 | test() 108 | -------------------------------------------------------------------------------- /mobilenetv2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Creates a MobileNetV2 Model as defined in: 3 | Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. (2018). 4 | MobileNetV2: Inverted Residuals and Linear Bottlenecks 5 | arXiv preprint arXiv:1801.04381. 6 | import from https://github.com/tonylins/pytorch-mobilenet-v2 7 | """ 8 | 9 | import torch.nn as nn 10 | import math 11 | 12 | __all__ = ['mobilenetv2'] 13 | 14 | 15 | def _make_divisible(v, divisor, min_value=None): 16 | """ 17 | This function is taken from the original tf repo. 18 | It ensures that all layers have a channel number that is divisible by 8 19 | It can be seen here: 20 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 21 | :param v: 22 | :param divisor: 23 | :param min_value: 24 | :return: 25 | """ 26 | if min_value is None: 27 | min_value = divisor 28 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 29 | # Make sure that round down does not go down by more than 10%. 
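# (Illustrative note, not part of the original source.) Worked example of the
# check below: _make_divisible(27, 8) first rounds to int(27 + 4) // 8 * 8 = 24,
# and since 24 < 0.9 * 27 = 24.3 the branch adds one divisor, returning 32;
# _make_divisible(30, 8) rounds straight to 32 and is returned unchanged.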
30 | if new_v < 0.9 * v: 31 | new_v += divisor 32 | return new_v 33 | 34 | 35 | def conv_3x3_bn(inp, oup, stride): 36 | return nn.Sequential( 37 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 38 | nn.BatchNorm2d(oup), 39 | nn.ReLU6(inplace=True) 40 | ) 41 | 42 | 43 | def conv_1x1_bn(inp, oup): 44 | return nn.Sequential( 45 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 46 | nn.BatchNorm2d(oup), 47 | nn.ReLU6(inplace=True) 48 | ) 49 | 50 | 51 | class InvertedResidual(nn.Module): 52 | def __init__(self, inp, oup, stride, expand_ratio): 53 | super(InvertedResidual, self).__init__() 54 | assert stride in [1, 2] 55 | 56 | hidden_dim = int(round(inp * expand_ratio)) 57 | self.identity = stride == 1 and inp == oup 58 | 59 | if expand_ratio == 1: 60 | self.conv = nn.Sequential( 61 | # dw 62 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 63 | nn.BatchNorm2d(hidden_dim), 64 | nn.ReLU6(inplace=True), 65 | # pw-linear 66 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 67 | nn.BatchNorm2d(oup), 68 | ) 69 | else: 70 | self.conv = nn.Sequential( 71 | # pw 72 | nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), 73 | nn.BatchNorm2d(hidden_dim), 74 | nn.ReLU6(inplace=True), 75 | # dw 76 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 77 | nn.BatchNorm2d(hidden_dim), 78 | nn.ReLU6(inplace=True), 79 | # pw-linear 80 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 81 | nn.BatchNorm2d(oup), 82 | ) 83 | 84 | def forward(self, x): 85 | if self.identity: 86 | return x + self.conv(x) 87 | else: 88 | return self.conv(x) 89 | 90 | 91 | class MobileNetV2(nn.Module): 92 | def __init__(self, num_classes=1000, input_size=224, width_mult=1.): 93 | super(MobileNetV2, self).__init__() 94 | # setting of inverted residual blocks 95 | self.cfgs = [ 96 | # t, c, n, s 97 | [1, 16, 1, 1], 98 | [6, 24, 2, 2], 99 | [6, 32, 3, 2], 100 | [6, 64, 4, 2], 101 | [6, 96, 3, 1], 102 | [6, 160, 3, 2], 103 | [6, 320, 1, 1], 104 | ] 105 | 106 | # building first layer 107 | assert input_size % 32 == 0 108 | input_channel = _make_divisible(32 * width_mult, 8) 109 | layers = [conv_3x3_bn(3, input_channel, 2)] 110 | # building inverted residual blocks 111 | block = InvertedResidual 112 | for t, c, n, s in self.cfgs: 113 | output_channel = _make_divisible(c * width_mult, 8) 114 | layers.append(block(input_channel, output_channel, s, t)) 115 | input_channel = output_channel 116 | for i in range(1, n): 117 | layers.append(block(input_channel, output_channel, 1, t)) 118 | input_channel = output_channel 119 | self.features = nn.Sequential(*layers) 120 | # building last several layers 121 | output_channel = _make_divisible(1280 * width_mult, 8) if width_mult > 1.0 else 1280 122 | self.conv = conv_1x1_bn(input_channel, output_channel) 123 | self.avgpool = nn.AvgPool2d(input_size // 32, stride=1) 124 | self.classifier = nn.Linear(output_channel, num_classes) 125 | 126 | self._initialize_weights() 127 | 128 | def forward(self, x): 129 | x = self.features(x) 130 | x = self.conv(x) 131 | x = self.avgpool(x) 132 | x = x.view(x.size(0), -1) 133 | x = self.classifier(x) 134 | return x 135 | 136 | def _initialize_weights(self): 137 | for m in self.modules(): 138 | if isinstance(m, nn.Conv2d): 139 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 140 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 141 | if m.bias is not None: 142 | m.bias.data.zero_() 143 | elif isinstance(m, nn.BatchNorm2d): 144 | m.weight.data.fill_(1) 145 | m.bias.data.zero_() 146 | elif isinstance(m, nn.Linear): 147 | n = m.weight.size(1) 148 | m.weight.data.normal_(0, 0.01) 149 | m.bias.data.zero_() 150 | 151 | def mobilenetv2(**kwargs): 152 | """ 153 | Constructs a MobileNet V2 model 154 | """ 155 | return MobileNetV2(**kwargs) 156 | 157 | -------------------------------------------------------------------------------- /rod_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/__init__.pyc -------------------------------------------------------------------------------- /rod_align/_ext/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/_ext/__init__.pyc -------------------------------------------------------------------------------- /rod_align/_ext/rod_align/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._rod_align import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /rod_align/_ext/rod_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/_ext/rod_align/__init__.pyc -------------------------------------------------------------------------------- /rod_align/_ext/rod_align/_rod_align.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/_ext/rod_align/_rod_align.so -------------------------------------------------------------------------------- /rod_align/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 | 6 | sources = ['src/rod_align.c'] 7 | headers = ['src/rod_align.h'] 8 | extra_objects = [] 9 | #sources = [] 10 | #headers = [] 11 | defines = [] 12 | with_cuda = False 13 | 14 | this_file = os.path.dirname(os.path.realpath(__file__)) 15 | print(this_file) 16 | 17 | if torch.cuda.is_available(): 18 | print('Including CUDA code.') 19 | sources += ['src/rod_align_cuda.c'] 20 | headers += ['src/rod_align_cuda.h'] 21 | defines += [('WITH_CUDA', None)] 22 | with_cuda = True 23 | 24 | extra_objects = ['src/rod_align_kernel.cu.o'] 25 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 26 | 27 | ffi = create_extension( 28 | '_ext.rod_align', 29 | headers=headers, 30 | sources=sources, 31 | define_macros=defines, 32 | relative_to=__file__, 33 | with_cuda=with_cuda, 34 | 
extra_objects=extra_objects 35 | ) 36 | 37 | if __name__ == '__main__': 38 | ffi.build() 39 | -------------------------------------------------------------------------------- /rod_align/functions/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/functions/__init__.pyc -------------------------------------------------------------------------------- /rod_align/functions/rod_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from .._ext import rod_align 4 | 5 | 6 | # TODO use save_for_backward instead 7 | class RoDAlignFunction(Function): 8 | def __init__(self, aligned_height, aligned_width, spatial_scale): 9 | self.aligned_width = int(aligned_width) 10 | self.aligned_height = int(aligned_height) 11 | self.spatial_scale = float(spatial_scale) 12 | self.rois = None 13 | self.feature_size = None 14 | 15 | def forward(self, features, rois): 16 | self.rois = rois 17 | self.feature_size = features.size() 18 | 19 | batch_size, num_channels, data_height, data_width = features.size() 20 | num_rois = rois.size(0) 21 | 22 | output = features.new(num_rois, num_channels, self.aligned_height, self.aligned_width).zero_() 23 | if features.is_cuda: 24 | rod_align.rod_align_forward_cuda(self.aligned_height, 25 | self.aligned_width, 26 | self.spatial_scale, features, 27 | rois, output) 28 | else: 29 | rod_align.rod_align_forward(self.aligned_height, 30 | self.aligned_width, 31 | self.spatial_scale, features, 32 | rois, output) 33 | # raise NotImplementedError 34 | 35 | return output 36 | 37 | def backward(self, grad_output): 38 | assert(self.feature_size is not None and grad_output.is_cuda) 39 | 40 | batch_size, num_channels, data_height, data_width = self.feature_size 41 | 42 | grad_input = self.rois.new(batch_size, num_channels, data_height, 43 | data_width).zero_() 44 | rod_align.rod_align_backward_cuda(self.aligned_height, 45 | self.aligned_width, 46 | self.spatial_scale, grad_output, 47 | self.rois, grad_input) 48 | 49 | # print grad_input 50 | 51 | return grad_input, None 52 | -------------------------------------------------------------------------------- /rod_align/functions/rod_align.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/functions/rod_align.pyc -------------------------------------------------------------------------------- /rod_align/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling my_lib kernels by nvcc..." 
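# Note (added, not in the original script): -arch=sm_52 on the next numbered
# line targets compute capability 5.2 (Maxwell-era GPUs), an assumption baked
# into this build. If compilation or kernel launch fails on your hardware,
# substitute the value for your GPU, e.g. sm_61 for Pascal or sm_70 for Volta.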
7 | nvcc -c -o rod_align_kernel.cu.o rod_align_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /rod_align/modules/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/modules/__init__.pyc -------------------------------------------------------------------------------- /rod_align/modules/rod_align.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from torch.nn.functional import avg_pool2d, max_pool2d 3 | from ..functions.rod_align import RoDAlignFunction 4 | 5 | 6 | class RoDAlign(Module): 7 | def __init__(self, aligned_height, aligned_width, spatial_scale): 8 | super(RoDAlign, self).__init__() 9 | 10 | self.aligned_width = int(aligned_width) 11 | self.aligned_height = int(aligned_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | return RoDAlignFunction(self.aligned_height, self.aligned_width, 16 | self.spatial_scale)(features, rois) 17 | 18 | class RoDAlignAvg(Module): 19 | def __init__(self, aligned_height, aligned_width, spatial_scale): 20 | super(RoDAlignAvg, self).__init__() 21 | 22 | self.aligned_width = int(aligned_width) 23 | self.aligned_height = int(aligned_height) 24 | self.spatial_scale = float(spatial_scale) 25 | 26 | def forward(self, features, rois): 27 | x = RoDAlignFunction(self.aligned_height+1, self.aligned_width+1, 28 | self.spatial_scale)(features, rois) 29 | return avg_pool2d(x, kernel_size=2, stride=1) 30 | 31 | class RoDAlignMax(Module): 32 | def __init__(self, aligned_height, aligned_width, spatial_scale): 33 | super(RoDAlignMax, self).__init__() 34 | 35 | self.aligned_width = int(aligned_width) 36 | self.aligned_height = int(aligned_height) 37 | self.spatial_scale = float(spatial_scale) 38 | 39 | def forward(self, features, rois): 40 | x = RoDAlignFunction(self.aligned_height+1, self.aligned_width+1, 41 | self.spatial_scale)(features, rois) 42 | return max_pool2d(x, kernel_size=2, stride=1) 43 | -------------------------------------------------------------------------------- /rod_align/modules/rod_align.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/modules/rod_align.pyc -------------------------------------------------------------------------------- /rod_align/src/rod_align.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | void RODAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 7 | const int height, const int width, const int channels, 8 | const int aligned_height, const int aligned_width, const float * bottom_rois, 9 | float* top_data); 10 | 11 | void RODAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 12 | const int height, const int width, const int channels, 13 | const int aligned_height, const int aligned_width, const float * bottom_rois, 14 | float* top_data); 15 | 16 | int rod_align_forward(int aligned_height, int aligned_width, float spatial_scale, 17 | THFloatTensor * features, THFloatTensor * 
rois, THFloatTensor * output) 18 | { 19 | //Grab the input tensor 20 | float * data_flat = THFloatTensor_data(features); 21 | float * rois_flat = THFloatTensor_data(rois); 22 | 23 | float * output_flat = THFloatTensor_data(output); 24 | 25 | // Number of ROIs 26 | int num_rois = THFloatTensor_size(rois, 0); 27 | int size_rois = THFloatTensor_size(rois, 1); 28 | if (size_rois != 5) 29 | { 30 | return 0; 31 | } 32 | 33 | // data height 34 | int data_height = THFloatTensor_size(features, 2); 35 | // data width 36 | int data_width = THFloatTensor_size(features, 3); 37 | // Number of channels 38 | int num_channels = THFloatTensor_size(features, 1); 39 | 40 | // do ROIAlignForward 41 | RODAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width, num_channels, 42 | aligned_height, aligned_width, rois_flat, output_flat); 43 | 44 | return 1; 45 | } 46 | 47 | int rod_align_backward(int aligned_height, int aligned_width, float spatial_scale, 48 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad) 49 | { 50 | //Grab the input tensor 51 | float * top_grad_flat = THFloatTensor_data(top_grad); 52 | float * rois_flat = THFloatTensor_data(rois); 53 | 54 | float * bottom_grad_flat = THFloatTensor_data(bottom_grad); 55 | 56 | // Number of ROIs 57 | int num_rois = THFloatTensor_size(rois, 0); 58 | int size_rois = THFloatTensor_size(rois, 1); 59 | if (size_rois != 5) 60 | { 61 | return 0; 62 | } 63 | 64 | // batch size 65 | // int batch_size = THFloatTensor_size(bottom_grad, 0); 66 | // data height 67 | int data_height = THFloatTensor_size(bottom_grad, 2); 68 | // data width 69 | int data_width = THFloatTensor_size(bottom_grad, 3); 70 | // Number of channels 71 | int num_channels = THFloatTensor_size(bottom_grad, 1); 72 | 73 | // do ROIAlignBackward 74 | RODAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height, 75 | data_width, num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat); 76 | 77 | return 1; 78 | } 79 | 80 | void RODAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 81 | const int height, const int width, const int channels, 82 | const int aligned_height, const int aligned_width, const float * bottom_rois, 83 | float* top_data) 84 | { 85 | const int output_size = num_rois * aligned_height * aligned_width * channels; 86 | 87 | int idx = 0; 88 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 89 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 90 | for (idx = 0; idx < output_size; ++idx) 91 | { 92 | // (n, c, ph, pw) is an element in the aligned output 93 | int pw = idx % aligned_width; 94 | int ph = (idx / aligned_width) % aligned_height; 95 | int c = (idx / aligned_width / aligned_height) % channels; 96 | int n = idx / aligned_width / aligned_height / channels; 97 | 98 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 99 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 100 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 101 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 102 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 103 | 104 | 105 | float h = (float)(ph) * bin_size_h; 106 | float w = (float)(pw) * bin_size_w; 107 | 108 | int hstart = fminf(floor(h), height - 2); 109 | int wstart = fminf(floor(w), width - 2); 110 | 111 | int img_start = roi_batch_ind * channels * height * width; 112 | 113 | // bilinear interpolation 114 | if (h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= 
roi_end_w){ 115 | top_data[idx] = 0.; 116 | } else { 117 | float h_ratio = h - (float)(hstart); 118 | float w_ratio = w - (float)(wstart); 119 | int upleft = img_start + (c * height + hstart) * width + wstart; 120 | int upright = upleft + 1; 121 | int downleft = upleft + width; 122 | int downright = downleft + 1; 123 | 124 | top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 125 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 126 | + bottom_data[downleft] * h_ratio * (1. - w_ratio) 127 | + bottom_data[downright] * h_ratio * w_ratio; 128 | } 129 | } 130 | } 131 | 132 | void RODAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 133 | const int height, const int width, const int channels, 134 | const int aligned_height, const int aligned_width, const float * bottom_rois, 135 | float* bottom_diff) 136 | { 137 | const int output_size = num_rois * aligned_height * aligned_width * channels; 138 | 139 | int idx = 0; 140 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 141 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 142 | for (idx = 0; idx < output_size; ++idx) 143 | { 144 | // (n, c, ph, pw) is an element in the aligned output 145 | int pw = idx % aligned_width; 146 | int ph = (idx / aligned_width) % aligned_height; 147 | int c = (idx / aligned_width / aligned_height) % channels; 148 | int n = idx / aligned_width / aligned_height / channels; 149 | 150 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 151 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 152 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 153 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 154 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 155 | 156 | float h = (float)(ph) * bin_size_h; 157 | float w = (float)(pw) * bin_size_w; 158 | 159 | int hstart = fminf(floor(h), height - 2); 160 | int wstart = fminf(floor(w), width - 2); 161 | 162 | int img_start = roi_batch_ind * channels * height * width; 163 | 164 | // bilinear interpolation 165 | if (!(h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= roi_end_w)) { 166 | float h_ratio = h - (float)(hstart); 167 | float w_ratio = w - (float)(wstart); 168 | int upleft = img_start + (c * height + hstart) * width + wstart; 169 | int upright = upleft + 1; 170 | int downleft = upleft + width; 171 | int downright = downleft + 1; 172 | 173 | bottom_diff[upleft] += top_diff[idx] * (1. - h_ratio) * (1. - w_ratio); 174 | bottom_diff[upright] += top_diff[idx] * (1. - h_ratio) * w_ratio; 175 | bottom_diff[downleft] += top_diff[idx] * h_ratio * (1. 
- w_ratio); 176 | bottom_diff[downright] += top_diff[idx] * h_ratio * w_ratio; 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /rod_align/src/rod_align.h: -------------------------------------------------------------------------------- 1 | int rod_align_forward(int aligned_height, int aligned_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); 3 | 4 | int rod_align_backward(int aligned_height, int aligned_width, float spatial_scale, 5 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "rod_align_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int rod_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 16 | // Number of ROIs 17 | int num_rois = THCudaTensor_size(state, rois, 0); 18 | int size_rois = THCudaTensor_size(state, rois, 1); 19 | if (size_rois != 5) 20 | { 21 | return 0; 22 | } 23 | 24 | // data height 25 | int data_height = THCudaTensor_size(state, features, 2); 26 | // data width 27 | int data_width = THCudaTensor_size(state, features, 3); 28 | // Number of channels 29 | int num_channels = THCudaTensor_size(state, features, 1); 30 | 31 | cudaStream_t stream = THCState_getCurrentStream(state); 32 | 33 | RODAlignForwardLaucher( 34 | data_flat, spatial_scale, num_rois, data_height, 35 | data_width, num_channels, aligned_height, 36 | aligned_width, rois_flat, 37 | output_flat, stream); 38 | 39 | return 1; 40 | } 41 | 42 | int rod_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad) 44 | { 45 | // Grab the input tensor 46 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 47 | float * rois_flat = THCudaTensor_data(state, rois); 48 | 49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois = THCudaTensor_size(state, rois, 1); 54 | if (size_rois != 5) 55 | { 56 | return 0; 57 | } 58 | 59 | // batch size 60 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 61 | // data height 62 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 63 | // data width 64 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 65 | // Number of channels 66 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 67 | 68 | cudaStream_t stream = THCState_getCurrentStream(state); 69 | RODAlignBackwardLaucher( 70 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 71 | data_width, num_channels, aligned_height, 72 | aligned_width, rois_flat, 73 | bottom_grad_flat, stream); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_cuda.h: -------------------------------------------------------------------------------- 1 | int rod_align_forward_cuda(int 
aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int rod_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_kernel.cu: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include "rod_align_kernel.h" 9 | 10 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 12 | i += blockDim.x * gridDim.x) 13 | 14 | 15 | __global__ void RODAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width, 16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) { 17 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 18 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 19 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 20 | // (n, c, ph, pw) is an element in the aligned output 21 | // int n = index; 22 | // int pw = n % aligned_width; 23 | // n /= aligned_width; 24 | // int ph = n % aligned_height; 25 | // n /= aligned_height; 26 | // int c = n % channels; 27 | // n /= channels; 28 | 29 | int pw = index % aligned_width; 30 | int ph = (index / aligned_width) % aligned_height; 31 | int c = (index / aligned_width / aligned_height) % channels; 32 | int n = index / aligned_width / aligned_height / channels; 33 | 34 | // bottom_rois += n * 5; 35 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 36 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 37 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 38 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 39 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 40 | 41 | 42 | float h = (float)(ph) * bin_size_h; 43 | float w = (float)(pw) * bin_size_w; 44 | 45 | int hstart = fminf(floor(h), height - 2); 46 | int wstart = fminf(floor(w), width - 2); 47 | 48 | int img_start = roi_batch_ind * channels * height * width; 49 | 50 | // bilinear interpolation 51 | if (h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= roi_end_w){ 52 | top_data[index] = 0.; 53 | } else { 54 | float h_ratio = h - (float)(hstart); 55 | float w_ratio = w - (float)(wstart); 56 | int upleft = img_start + (c * height + hstart) * width + wstart; 57 | int upright = upleft + 1; 58 | int downleft = upleft + width; 59 | int downright = downleft + 1; 60 | 61 | top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 62 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 63 | + bottom_data[downleft] * h_ratio * (1. 
- w_ratio) 64 | + bottom_data[downright] * h_ratio * w_ratio; 65 | } 66 | } 67 | } 68 | 69 | 70 | int RODAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width, 71 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) { 72 | const int kThreadsPerBlock = 1024; 73 | const int output_size = num_rois * aligned_height * aligned_width * channels; 74 | cudaError_t err; 75 | 76 | 77 | RODAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 78 | output_size, bottom_data, spatial_scale, height, width, channels, 79 | aligned_height, aligned_width, bottom_rois, top_data); 80 | 81 | err = cudaGetLastError(); 82 | if(cudaSuccess != err) { 83 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 84 | exit( -1 ); 85 | } 86 | 87 | return 1; 88 | } 89 | 90 | 91 | __global__ void RODAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width, 92 | const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) { 93 | float bin_size_h = (float)(height - 1.001) / (aligned_height - 1.); 94 | float bin_size_w = (float)(width - 1.001) / (aligned_width - 1.); 95 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 96 | 97 | // (n, c, ph, pw) is an element in the aligned output 98 | int pw = index % aligned_width; 99 | int ph = (index / aligned_width) % aligned_height; 100 | int c = (index / aligned_width / aligned_height) % channels; 101 | int n = index / aligned_width / aligned_height / channels; 102 | 103 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 104 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 105 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 106 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 107 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 108 | 109 | 110 | float h = (float)(ph) * bin_size_h; 111 | float w = (float)(pw) * bin_size_w; 112 | 113 | int hstart = fminf(floor(h), height - 2); 114 | int wstart = fminf(floor(w), width - 2); 115 | 116 | int img_start = roi_batch_ind * channels * height * width; 117 | 118 | // bilinear interpolation 119 | if (!(h >= roi_start_h && h <= roi_end_h && w >= roi_start_w && w <= roi_end_w)) { 120 | float h_ratio = h - (float)(hstart); 121 | float w_ratio = w - (float)(wstart); 122 | int upleft = img_start + (c * height + hstart) * width + wstart; 123 | int upright = upleft + 1; 124 | int downleft = upleft + width; 125 | int downright = downleft + 1; 126 | 127 | atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio)); 128 | atomicAdd(bottom_diff + upright, top_diff[index] * (1. 
- h_ratio) * w_ratio); 129 | atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio)); 130 | atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio); 131 | } 132 | } 133 | } 134 | 135 | int RODAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width, 136 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) { 137 | const int kThreadsPerBlock = 1024; 138 | const int output_size = num_rois * aligned_height * aligned_width * channels; 139 | cudaError_t err; 140 | 141 | RODAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 142 | output_size, top_diff, spatial_scale, height, width, channels, 143 | aligned_height, aligned_width, bottom_diff, bottom_rois); 144 | 145 | err = cudaGetLastError(); 146 | if(cudaSuccess != err) { 147 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 148 | exit( -1 ); 149 | } 150 | 151 | return 1; 152 | } 153 | 154 | 155 | #ifdef __cplusplus 156 | } 157 | #endif 158 | -------------------------------------------------------------------------------- /rod_align/src/rod_align_kernel.cu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/rod_align/src/rod_align_kernel.cu.o -------------------------------------------------------------------------------- /rod_align/src/rod_align_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _ROD_ALIGN_KERNEL 2 | #define _ROD_ALIGN_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | __global__ void RODAlignForward(const int nthreads, const float* bottom_data, 9 | const float spatial_scale, const int height, const int width, 10 | const int channels, const int aligned_height, const int aligned_width, 11 | const float* bottom_rois, float* top_data); 12 | 13 | int RODAlignForwardLaucher( 14 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 15 | const int width, const int channels, const int aligned_height, 16 | const int aligned_width, const float* bottom_rois, 17 | float* top_data, cudaStream_t stream); 18 | 19 | __global__ void RODAlignBackward(const int nthreads, const float* top_diff, 20 | const float spatial_scale, const int height, const int width, 21 | const int channels, const int aligned_height, const int aligned_width, 22 | float* bottom_diff, const float* bottom_rois); 23 | 24 | int RODAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 25 | const int height, const int width, const int channels, const int aligned_height, 26 | const int aligned_width, const float* bottom_rois, 27 | float* bottom_diff, cudaStream_t stream); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /roi_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/__init__.pyc -------------------------------------------------------------------------------- 
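Before the roi_align listing continues: the rod_align sources above implement RoD ("region of discard") pooling, the counterpart of RoI pooling used by this model. A fixed aligned_height x aligned_width grid is sampled over the *entire* feature map, samples falling inside the ROI are zeroed, and the rest are bilinearly interpolated, so the op summarizes everything a candidate crop would throw away. The sketch below is added purely as a reading aid for the C/CUDA code; it assumes a single image, one (xmin, ymin, xmax, ymax) box, and a recent PyTorch with `grid_sample`/`align_corners` (the shipped `_rod_align.so` remains the authoritative implementation for the PyTorch 0.4 build).

```python
import torch
import torch.nn.functional as F

def rod_align_reference(features, box, out_size, spatial_scale):
    """Pure-PyTorch sketch of RODAlignForwardCpu (illustration only).

    features: (1, C, H, W) feature map; box: (xmin, ymin, xmax, ymax)
    in input-image coordinates, scaled to feature coordinates below.
    """
    _, _, h, w = features.shape
    x0, y0, x1, y1 = [float(v) * spatial_scale for v in box]

    # Sample positions over the whole map, mirroring
    # h = ph * bin_size_h with bin_size_h = (H - 1.001) / (out_size - 1).
    ys = torch.linspace(0, h - 1.001, out_size)
    xs = torch.linspace(0, w - 1.001, out_size)
    gy, gx = torch.meshgrid(ys, xs, indexing='ij')

    # grid_sample expects coordinates normalized to [-1, 1].
    grid = torch.stack((gx / (w - 1) * 2 - 1, gy / (h - 1) * 2 - 1), dim=-1)
    out = F.grid_sample(features, grid.unsqueeze(0), align_corners=True)

    # Discard (zero out) every sample that falls inside the box.
    inside = (gy >= y0) & (gy <= y1) & (gx >= x0) & (gx <= x1)
    return out.masked_fill(inside.view(1, 1, out_size, out_size), 0.)
```

For the real module, see rod_align/modules/rod_align.py above: RoDAlignAvg wraps the compiled kernels, samples an (alignsize+1) x (alignsize+1) grid, then average-pools it down to alignsize x alignsize.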
/roi_align/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/_ext/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/__init__.pyc -------------------------------------------------------------------------------- /roi_align/_ext/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_align import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/roi_align/__init__.pyc -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/roi_align/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/_ext/roi_align/_roi_align.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/_ext/roi_align/_roi_align.so -------------------------------------------------------------------------------- /roi_align/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 | 6 | sources = ['src/roi_align.c'] 7 | headers = ['src/roi_align.h'] 8 | extra_objects = [] 9 | #sources = [] 10 | #headers = [] 11 | defines = [] 12 | with_cuda = False 13 | 14 | this_file = os.path.dirname(os.path.realpath(__file__)) 15 | print(this_file) 16 | 17 | if torch.cuda.is_available(): 18 | print('Including CUDA code.') 19 | sources += ['src/roi_align_cuda.c'] 20 | headers += ['src/roi_align_cuda.h'] 21 | defines += [('WITH_CUDA', None)] 22 | with_cuda = True 23 | 
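# Note (added, not in the original script): the create_extension call below
# comes from torch.utils.ffi, which only exists in the PyTorch 0.4.x line this
# repository targets; PyTorch >= 1.0 removed it, so this build script fails
# there. For newer PyTorch, use the "PyTorch 1.0 or later" port linked in the
# README, or rewrite the binding with torch.utils.cpp_extension.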
24 | extra_objects = ['src/roi_align_kernel.cu.o'] 25 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 26 | 27 | ffi = create_extension( 28 | '_ext.roi_align', 29 | headers=headers, 30 | sources=sources, 31 | define_macros=defines, 32 | relative_to=__file__, 33 | with_cuda=with_cuda, 34 | extra_objects=extra_objects 35 | ) 36 | 37 | if __name__ == '__main__': 38 | ffi.build() 39 | -------------------------------------------------------------------------------- /roi_align/functions/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/__init__.pyc -------------------------------------------------------------------------------- /roi_align/functions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/functions/__pycache__/roi_align.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/__pycache__/roi_align.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/functions/roi_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from .._ext import roi_align 4 | 5 | 6 | # TODO use save_for_backward instead 7 | class RoIAlignFunction(Function): 8 | def __init__(self, aligned_height, aligned_width, spatial_scale): 9 | self.aligned_width = int(aligned_width) 10 | self.aligned_height = int(aligned_height) 11 | self.spatial_scale = float(spatial_scale) 12 | self.rois = None 13 | self.feature_size = None 14 | 15 | def forward(self, features, rois): 16 | self.rois = rois 17 | self.feature_size = features.size() 18 | 19 | batch_size, num_channels, data_height, data_width = features.size() 20 | num_rois = rois.size(0) 21 | 22 | output = features.new(num_rois, num_channels, self.aligned_height, self.aligned_width).zero_() 23 | if features.is_cuda: 24 | roi_align.roi_align_forward_cuda(self.aligned_height, 25 | self.aligned_width, 26 | self.spatial_scale, features, 27 | rois, output) 28 | else: 29 | roi_align.roi_align_forward(self.aligned_height, 30 | self.aligned_width, 31 | self.spatial_scale, features, 32 | rois, output) 33 | # raise NotImplementedError 34 | 35 | return output 36 | 37 | def backward(self, grad_output): 38 | assert(self.feature_size is not None and grad_output.is_cuda) 39 | 40 | batch_size, num_channels, data_height, data_width = self.feature_size 41 | 42 | grad_input = self.rois.new(batch_size, num_channels, data_height, 43 | data_width).zero_() 44 | roi_align.roi_align_backward_cuda(self.aligned_height, 45 | self.aligned_width, 46 | self.spatial_scale, grad_output, 47 | self.rois, grad_input) 48 | 49 | # print grad_input 50 | 51 | return grad_input, None 52 | -------------------------------------------------------------------------------- /roi_align/functions/roi_align.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/functions/roi_align.pyc -------------------------------------------------------------------------------- /roi_align/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling my_lib kernels by nvcc..." 7 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /roi_align/modules/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/__init__.pyc -------------------------------------------------------------------------------- /roi_align/modules/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/modules/__pycache__/roi_align.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/__pycache__/roi_align.cpython-35.pyc -------------------------------------------------------------------------------- /roi_align/modules/roi_align.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from torch.nn.functional import avg_pool2d, max_pool2d 3 | from ..functions.roi_align import RoIAlignFunction 4 | 5 | 6 | class RoIAlign(Module): 7 | def __init__(self, aligned_height, aligned_width, spatial_scale): 8 | super(RoIAlign, self).__init__() 9 | 10 | self.aligned_width = int(aligned_width) 11 | self.aligned_height = int(aligned_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | return RoIAlignFunction(self.aligned_height, self.aligned_width, 16 | self.spatial_scale)(features, rois) 17 | 18 | class RoIAlignAvg(Module): 19 | def __init__(self, aligned_height, aligned_width, spatial_scale): 20 | super(RoIAlignAvg, self).__init__() 21 | 22 | self.aligned_width = int(aligned_width) 23 | self.aligned_height = int(aligned_height) 24 | self.spatial_scale = float(spatial_scale) 25 | 26 | def forward(self, features, rois): 27 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 28 | self.spatial_scale)(features, rois) 29 | return avg_pool2d(x, kernel_size=2, stride=1) 30 | 31 | class RoIAlignMax(Module): 32 | def __init__(self, aligned_height, aligned_width, spatial_scale): 33 | super(RoIAlignMax, self).__init__() 34 | 35 | self.aligned_width = int(aligned_width) 36 | self.aligned_height = int(aligned_height) 37 | self.spatial_scale = float(spatial_scale) 38 | 39 | def forward(self, features, rois): 40 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 41 | 
self.spatial_scale)(features, rois) 42 | return max_pool2d(x, kernel_size=2, stride=1) 43 | -------------------------------------------------------------------------------- /roi_align/modules/roi_align.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/modules/roi_align.pyc -------------------------------------------------------------------------------- /roi_align/src/roi_align.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 7 | const int height, const int width, const int channels, 8 | const int aligned_height, const int aligned_width, const float * bottom_rois, 9 | float* top_data); 10 | 11 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 12 | const int height, const int width, const int channels, 13 | const int aligned_height, const int aligned_width, const float * bottom_rois, 14 | float* top_data); 15 | 16 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale, 17 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output) 18 | { 19 | //Grab the input tensor 20 | float * data_flat = THFloatTensor_data(features); 21 | float * rois_flat = THFloatTensor_data(rois); 22 | 23 | float * output_flat = THFloatTensor_data(output); 24 | 25 | // Number of ROIs 26 | int num_rois = THFloatTensor_size(rois, 0); 27 | int size_rois = THFloatTensor_size(rois, 1); 28 | if (size_rois != 5) 29 | { 30 | return 0; 31 | } 32 | 33 | // data height 34 | int data_height = THFloatTensor_size(features, 2); 35 | // data width 36 | int data_width = THFloatTensor_size(features, 3); 37 | // Number of channels 38 | int num_channels = THFloatTensor_size(features, 1); 39 | 40 | // do ROIAlignForward 41 | ROIAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width, num_channels, 42 | aligned_height, aligned_width, rois_flat, output_flat); 43 | 44 | return 1; 45 | } 46 | 47 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale, 48 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad) 49 | { 50 | //Grab the input tensor 51 | float * top_grad_flat = THFloatTensor_data(top_grad); 52 | float * rois_flat = THFloatTensor_data(rois); 53 | 54 | float * bottom_grad_flat = THFloatTensor_data(bottom_grad); 55 | 56 | // Number of ROIs 57 | int num_rois = THFloatTensor_size(rois, 0); 58 | int size_rois = THFloatTensor_size(rois, 1); 59 | if (size_rois != 5) 60 | { 61 | return 0; 62 | } 63 | 64 | // batch size 65 | // int batch_size = THFloatTensor_size(bottom_grad, 0); 66 | // data height 67 | int data_height = THFloatTensor_size(bottom_grad, 2); 68 | // data width 69 | int data_width = THFloatTensor_size(bottom_grad, 3); 70 | // Number of channels 71 | int num_channels = THFloatTensor_size(bottom_grad, 1); 72 | 73 | // do ROIAlignBackward 74 | ROIAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height, 75 | data_width, num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat); 76 | 77 | return 1; 78 | } 79 | 80 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 81 | const int height, const int width, const int channels, 82 | const int aligned_height, 
const int aligned_width, const float * bottom_rois, 83 | float* top_data) 84 | { 85 | const int output_size = num_rois * aligned_height * aligned_width * channels; 86 | 87 | int idx = 0; 88 | for (idx = 0; idx < output_size; ++idx) 89 | { 90 | // (n, c, ph, pw) is an element in the aligned output 91 | int pw = idx % aligned_width; 92 | int ph = (idx / aligned_width) % aligned_height; 93 | int c = (idx / aligned_width / aligned_height) % channels; 94 | int n = idx / aligned_width / aligned_height / channels; 95 | 96 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 97 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 98 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 99 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 100 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 101 | 102 | // Force malformed ROI to be 1x1 103 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 104 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 105 | float bin_size_h = roi_height / (aligned_height - 1.); 106 | float bin_size_w = roi_width / (aligned_width - 1.); 107 | 108 | float h = (float)(ph) * bin_size_h + roi_start_h; 109 | float w = (float)(pw) * bin_size_w + roi_start_w; 110 | 111 | int hstart = fminf(floor(h), height - 2); 112 | int wstart = fminf(floor(w), width - 2); 113 | 114 | int img_start = roi_batch_ind * channels * height * width; 115 | 116 | // bilinear interpolation 117 | if (h < 0 || h >= height || w < 0 || w >= width) 118 | { 119 | top_data[idx] = 0.; 120 | } 121 | else 122 | { 123 | float h_ratio = h - (float)(hstart); 124 | float w_ratio = w - (float)(wstart); 125 | int upleft = img_start + (c * height + hstart) * width + wstart; 126 | int upright = upleft + 1; 127 | int downleft = upleft + width; 128 | int downright = downleft + 1; 129 | 130 | top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 131 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 132 | + bottom_data[downleft] * h_ratio * (1. 
- w_ratio) 133 | + bottom_data[downright] * h_ratio * w_ratio; 134 | } 135 | } 136 | } 137 | 138 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 139 | const int height, const int width, const int channels, 140 | const int aligned_height, const int aligned_width, const float * bottom_rois, 141 | float* bottom_diff) 142 | { 143 | const int output_size = num_rois * aligned_height * aligned_width * channels; 144 | 145 | int idx = 0; 146 | for (idx = 0; idx < output_size; ++idx) 147 | { 148 | // (n, c, ph, pw) is an element in the aligned output 149 | int pw = idx % aligned_width; 150 | int ph = (idx / aligned_width) % aligned_height; 151 | int c = (idx / aligned_width / aligned_height) % channels; 152 | int n = idx / aligned_width / aligned_height / channels; 153 | 154 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 155 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 156 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 157 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 158 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 159 | 160 | // Force malformed ROI to be 1x1 161 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 162 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 163 | float bin_size_h = roi_height / (aligned_height - 1.); 164 | float bin_size_w = roi_width / (aligned_width - 1.); 165 | 166 | float h = (float)(ph) * bin_size_h + roi_start_h; 167 | float w = (float)(pw) * bin_size_w + roi_start_w; 168 | 169 | int hstart = fminf(floor(h), height - 2); 170 | int wstart = fminf(floor(w), width - 2); 171 | 172 | int img_start = roi_batch_ind * channels * height * width; 173 | 174 | // bilinear interpolation: only in-bounds sample points receive gradient (mirrors the CUDA kernel) 175 | if (!(h < 0 || h >= height || w < 0 || w >= width)) 176 | { 177 | float h_ratio = h - (float)(hstart); 178 | float w_ratio = w - (float)(wstart); 179 | int upleft = img_start + (c * height + hstart) * width + wstart; 180 | int upright = upleft + 1; 181 | int downleft = upleft + width; 182 | int downright = downleft + 1; 183 | 184 | bottom_diff[upleft] += top_diff[idx] * (1. - h_ratio) * (1. - w_ratio); 185 | bottom_diff[upright] += top_diff[idx] * (1. - h_ratio) * w_ratio; 186 | bottom_diff[downleft] += top_diff[idx] * h_ratio * (1. 
- w_ratio); 187 | bottom_diff[downright] += top_diff[idx] * h_ratio * w_ratio; 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /roi_align/src/roi_align.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); 3 | 4 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale, 5 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /roi_align/src/roi_align_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "roi_align_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 16 | // Number of ROIs 17 | int num_rois = THCudaTensor_size(state, rois, 0); 18 | int size_rois = THCudaTensor_size(state, rois, 1); 19 | if (size_rois != 5) 20 | { 21 | return 0; 22 | } 23 | 24 | // data height 25 | int data_height = THCudaTensor_size(state, features, 2); 26 | // data width 27 | int data_width = THCudaTensor_size(state, features, 3); 28 | // Number of channels 29 | int num_channels = THCudaTensor_size(state, features, 1); 30 | 31 | cudaStream_t stream = THCState_getCurrentStream(state); 32 | 33 | ROIAlignForwardLaucher( 34 | data_flat, spatial_scale, num_rois, data_height, 35 | data_width, num_channels, aligned_height, 36 | aligned_width, rois_flat, 37 | output_flat, stream); 38 | 39 | return 1; 40 | } 41 | 42 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad) 44 | { 45 | // Grab the input tensor 46 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 47 | float * rois_flat = THCudaTensor_data(state, rois); 48 | 49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois = THCudaTensor_size(state, rois, 1); 54 | if (size_rois != 5) 55 | { 56 | return 0; 57 | } 58 | 59 | // batch size 60 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 61 | // data height 62 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 63 | // data width 64 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 65 | // Number of channels 66 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 67 | 68 | cudaStream_t stream = THCState_getCurrentStream(state); 69 | ROIAlignBackwardLaucher( 70 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 71 | data_width, num_channels, aligned_height, 72 | aligned_width, rois_flat, 73 | bottom_grad_flat, stream); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /roi_align/src/roi_align_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward_cuda(int 
aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /roi_align/src/roi_align_kernel.cu: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include "roi_align_kernel.h" 9 | 10 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 12 | i += blockDim.x * gridDim.x) 13 | 14 | 15 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width, 16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) { 17 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 18 | // (n, c, ph, pw) is an element in the aligned output 19 | // int n = index; 20 | // int pw = n % aligned_width; 21 | // n /= aligned_width; 22 | // int ph = n % aligned_height; 23 | // n /= aligned_height; 24 | // int c = n % channels; 25 | // n /= channels; 26 | 27 | int pw = index % aligned_width; 28 | int ph = (index / aligned_width) % aligned_height; 29 | int c = (index / aligned_width / aligned_height) % channels; 30 | int n = index / aligned_width / aligned_height / channels; 31 | 32 | // bottom_rois += n * 5; 33 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 34 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 35 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 36 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 37 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 38 | 39 | // Force malformed ROIs to be 1x1 40 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 41 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 42 | float bin_size_h = roi_height / (aligned_height - 1.); 43 | float bin_size_w = roi_width / (aligned_width - 1.); 44 | 45 | float h = (float)(ph) * bin_size_h + roi_start_h; 46 | float w = (float)(pw) * bin_size_w + roi_start_w; 47 | 48 | int hstart = fminf(floor(h), height - 2); 49 | int wstart = fminf(floor(w), width - 2); 50 | 51 | int img_start = roi_batch_ind * channels * height * width; 52 | 53 | // bilinear interpolation 54 | if (h < 0 || h >= height || w < 0 || w >= width) { 55 | top_data[index] = 0.; 56 | } else { 57 | float h_ratio = h - (float)(hstart); 58 | float w_ratio = w - (float)(wstart); 59 | int upleft = img_start + (c * height + hstart) * width + wstart; 60 | int upright = upleft + 1; 61 | int downleft = upleft + width; 62 | int downright = downleft + 1; 63 | 64 | top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 65 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 66 | + bottom_data[downleft] * h_ratio * (1. 
67 |                             + bottom_data[downright] * h_ratio * w_ratio;
68 |         }
69 |     }
70 | }
71 | 
72 | 
73 | int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width,
74 |                            const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) {
75 |     const int kThreadsPerBlock = 1024;
76 |     const int output_size = num_rois * aligned_height * aligned_width * channels;
77 |     cudaError_t err;
78 | 
79 | 
80 |     ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
81 |         output_size, bottom_data, spatial_scale, height, width, channels,
82 |         aligned_height, aligned_width, bottom_rois, top_data);
83 | 
84 |     err = cudaGetLastError();
85 |     if(cudaSuccess != err) {
86 |         fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
87 |         exit( -1 );
88 |     }
89 | 
90 |     return 1;
91 | }
92 | 
93 | 
94 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width,
95 |                                  const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) {
96 |     CUDA_1D_KERNEL_LOOP(index, nthreads) {
97 | 
98 |         // (n, c, ph, pw) is an element in the aligned output
99 |         int pw = index % aligned_width;
100 |         int ph = (index / aligned_width) % aligned_height;
101 |         int c = (index / aligned_width / aligned_height) % channels;
102 |         int n = index / aligned_width / aligned_height / channels;
103 | 
104 |         float roi_batch_ind = bottom_rois[n * 5 + 0];
105 |         float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
106 |         float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
107 |         float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
108 |         float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;
109 |         /* int roi_start_w = round(bottom_rois[1] * spatial_scale); */
110 |         /* int roi_start_h = round(bottom_rois[2] * spatial_scale); */
111 |         /* int roi_end_w = round(bottom_rois[3] * spatial_scale); */
112 |         /* int roi_end_h = round(bottom_rois[4] * spatial_scale); */
113 | 
114 |         // Force malformed ROIs to be 1x1
115 |         float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
116 |         float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
117 |         float bin_size_h = roi_height / (aligned_height - 1.);
118 |         float bin_size_w = roi_width / (aligned_width - 1.);
119 | 
120 |         float h = (float)(ph) * bin_size_h + roi_start_h;
121 |         float w = (float)(pw) * bin_size_w + roi_start_w;
122 | 
123 |         int hstart = fminf(floor(h), height - 2);
124 |         int wstart = fminf(floor(w), width - 2);
125 | 
126 |         int img_start = roi_batch_ind * channels * height * width;
127 | 
128 |         // bilinear interpolation
129 |         if (!(h < 0 || h >= height || w < 0 || w >= width)) {
130 |             float h_ratio = h - (float)(hstart);
131 |             float w_ratio = w - (float)(wstart);
132 |             int upleft = img_start + (c * height + hstart) * width + wstart;
133 |             int upright = upleft + 1;
134 |             int downleft = upleft + width;
135 |             int downright = downleft + 1;
136 | 
137 |             atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio));
138 |             atomicAdd(bottom_diff + upright, top_diff[index] * (1. - h_ratio) * w_ratio);
139 |             atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio));
140 |             atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio);
141 |         }
142 |     }
143 | }
144 | 
145 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width,
146 |                             const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) {
147 |     const int kThreadsPerBlock = 1024;
148 |     const int output_size = num_rois * aligned_height * aligned_width * channels;
149 |     cudaError_t err;
150 | 
151 |     ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
152 |         output_size, top_diff, spatial_scale, height, width, channels,
153 |         aligned_height, aligned_width, bottom_diff, bottom_rois);
154 | 
155 |     err = cudaGetLastError();
156 |     if(cudaSuccess != err) {
157 |         fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
158 |         exit( -1 );
159 |     }
160 | 
161 |     return 1;
162 | }
163 | 
164 | 
165 | #ifdef __cplusplus
166 | }
167 | #endif
168 | 
--------------------------------------------------------------------------------
/roi_align/src/roi_align_kernel.cu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/roi_align/src/roi_align_kernel.cu.o
--------------------------------------------------------------------------------
/roi_align/src/roi_align_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _ROI_ALIGN_KERNEL
2 | #define _ROI_ALIGN_KERNEL
3 | 
4 | #ifdef __cplusplus
5 | extern "C" {
6 | #endif
7 | 
8 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data,
9 |                                 const float spatial_scale, const int height, const int width,
10 |                                 const int channels, const int aligned_height, const int aligned_width,
11 |                                 const float* bottom_rois, float* top_data);
12 | 
13 | int ROIAlignForwardLaucher(
14 |     const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
15 |     const int width, const int channels, const int aligned_height,
16 |     const int aligned_width, const float* bottom_rois,
17 |     float* top_data, cudaStream_t stream);
18 | 
19 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff,
20 |                                  const float spatial_scale, const int height, const int width,
21 |                                  const int channels, const int aligned_height, const int aligned_width,
22 |                                  float* bottom_diff, const float* bottom_rois);
23 | 
24 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
25 |                             const int height, const int width, const int channels, const int aligned_height,
26 |                             const int aligned_width, const float* bottom_rois,
27 |                             float* bottom_diff, cudaStream_t stream);
28 | 
29 | #ifdef __cplusplus
30 | }
31 | #endif
32 | 
33 | #endif
34 | 
35 | 
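The backward kernel scatters each output gradient back to the same four input pixels with the same bilinear weights the forward pass used, through `atomicAdd` because overlapping ROIs can write to the same location. Since the pure-PyTorch sketch given earlier is built from differentiable ops, autograd performs the equivalent scatter, which gives a cheap cross-check of the CUDA backward (again a sketch; `roi_align_forward_ref` is the hypothetical reference defined above):

```python
import torch

features = torch.randn(1, 2, 8, 8, requires_grad=True)
rois = torch.tensor([[0., 1., 1., 6., 6.]])  # (batch_idx, x1, y1, x2, y2)

out = roi_align_forward_ref(features, rois,
                            aligned_height=3, aligned_width=3, spatial_scale=1.0)
out.sum().backward()

# every sampled location distributes its gradient over a 2x2 neighborhood
print(features.grad.sum())  # tensor(18.)
```

The four bilinear weights of each in-bounds sample sum to one, so backpropagating `sum()` deposits exactly one unit of gradient per output element: 2 channels x 3 x 3 samples = 18 here.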
--------------------------------------------------------------------------------
/runTrainTest.sh:
--------------------------------------------------------------------------------
1 | 
2 | #python TrainModel.py --base_model='vgg16' --scale='single' --downsample=4 --augmentation=0 --align_size=9 --reduced_dim=8
3 | #python TrainModel.py --base_model='vgg16' --scale='single' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
4 | #python TrainModel.py --base_model='vgg16' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
5 | 
6 | #python TrainModel.py --base_model='mobilenetv2' --scale='single' --downsample=4 --augmentation=0 --align_size=9 --reduced_dim=8
7 | #python TrainModel.py --base_model='mobilenetv2' --scale='single' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
8 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
9 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=1
10 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=2
11 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=4
12 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=16
13 | #python TrainModel.py --base_model='mobilenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=32
14 | 
15 | #python TrainModel.py --base_model='shufflenetv2' --scale='single' --downsample=4 --augmentation=0 --align_size=9 --reduced_dim=8
16 | #python TrainModel.py --base_model='shufflenetv2' --scale='single' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
17 | python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
18 | 
19 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=1
20 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=2
21 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=4
22 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=8
23 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=16
24 | #python TrainModel.py --base_model='shufflenetv2' --scale='multi' --downsample=4 --augmentation=1 --align_size=9 --reduced_dim=32
25 | 
26 | 
27 | #python TrainModel.py --base_model='resnet50' --scale='single' --downsample=4 --augmentation=1 --align_size=9
28 | #python TrainModel.py --base_model='resnet50' --scale='multi' --downsample=4 --augmentation=1 --align_size=9
29 | 
30 | 
31 | 
32 | 
--------------------------------------------------------------------------------
/thop/__init__.py:
--------------------------------------------------------------------------------
1 | from .profile import profile
--------------------------------------------------------------------------------
/thop/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/thop/__init__.pyc
--------------------------------------------------------------------------------
/thop/count_hooks.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | import torch
4 | import torch.nn as nn
5 | 
6 | multiply_adds = 1
7 | 
8 | 
9 | def count_convNd(m, x, y):
10 |     x = x[0]
11 |     cin = m.in_channels
12 |     # batch_size = x.size(0)
13 | 
14 |     kernel_ops = m.weight[0][0].numel()  # product of all kernel dims; this hook also serves Conv1d/Conv3d
15 |     bias_ops = 1 if m.bias is not None else 0
16 |     ops_per_element = kernel_ops + bias_ops
17 |     output_elements = y.nelement()
18 | 
19 |     # cout x oW x oH
20 |     total_ops = cin * output_elements * ops_per_element // m.groups
21 |     m.total_ops = torch.Tensor([int(total_ops)])
22 | 
23 | 
24 | def count_conv2d(m, x, y):
25 |     x = x[0]
26 | 
27 |     cin = m.in_channels
28 |     cout = m.out_channels
29 |     kh, kw = m.kernel_size
30 |     batch_size = x.size()[0]
31 | 
32 |     out_h = y.size(2)
33 |     out_w = y.size(3)
34 | 
35 |     # ops per output element
36 |     # kernel_mul = kh * kw * cin
37 |     # kernel_add = kh * kw * cin - 1
38 |     kernel_ops = multiply_adds * kh * kw
39 |     bias_ops = 1 if m.bias is not None else 0
40 |     ops_per_element = kernel_ops + bias_ops
41 | 
42 |     # total ops
43 |     # num_out_elements = y.numel()
44 |     output_elements = batch_size * out_w * out_h * cout
45 |     total_ops = output_elements * ops_per_element * cin // m.groups
46 | 
47 |     m.total_ops = torch.Tensor([int(total_ops)])
48 | 
49 | 
50 | def count_convtranspose2d(m, x, y):
51 |     x = x[0]
52 | 
53 |     cin = m.in_channels
54 |     cout = m.out_channels
55 |     kh, kw = m.kernel_size
56 |     # batch_size = x.size()[0]
57 | 
58 |     out_h = y.size(2)
59 |     out_w = y.size(3)
60 | 
61 |     # ops per output element
62 |     # kernel_mul = kh * kw * cin
63 |     # kernel_add = kh * kw * cin - 1
64 |     kernel_ops = multiply_adds * kh * kw * cin // m.groups
65 |     bias_ops = 1 if m.bias is not None else 0
66 |     ops_per_element = kernel_ops + bias_ops
67 | 
68 |     # total ops
69 |     # num_out_elements = y.numel()
70 |     # output_elements = batch_size * out_w * out_h * cout
71 |     ops_per_element = m.weight.nelement()
72 |     output_elements = y.nelement()
73 |     total_ops = output_elements * ops_per_element
74 | 
75 |     m.total_ops = torch.Tensor([int(total_ops)])
76 | 
77 | 
78 | def count_bn(m, x, y):
79 |     x = x[0]
80 | 
81 |     nelements = x.numel()
82 |     # subtract, divide, gamma, beta
83 |     total_ops = 4 * nelements
84 | 
85 |     m.total_ops = torch.Tensor([int(total_ops)])
86 | 
87 | 
88 | def count_relu(m, x, y):
89 |     x = x[0]
90 | 
91 |     nelements = x.numel()
92 |     total_ops = nelements
93 | 
94 |     m.total_ops = torch.Tensor([int(total_ops)])
95 | 
96 | 
97 | def count_softmax(m, x, y):
98 |     x = x[0]
99 | 
100 |     batch_size, nfeatures = x.size()
101 | 
102 |     total_exp = nfeatures
103 |     total_add = nfeatures - 1
104 |     total_div = nfeatures
105 |     total_ops = batch_size * (total_exp + total_add + total_div)
106 | 
107 |     m.total_ops = torch.Tensor([int(total_ops)])
108 | 
109 | 
110 | def count_maxpool(m, x, y):
111 |     kernel_ops = torch.prod(torch.Tensor([m.kernel_size]))
112 |     num_elements = y.numel()
113 |     total_ops = kernel_ops * num_elements
114 | 
115 |     m.total_ops = torch.Tensor([int(total_ops)])
116 | 
117 | 
118 | def count_adap_maxpool(m, x, y):
119 |     kernel = torch.Tensor([(x[0].shape[2:])]) // torch.Tensor(list((m.output_size,))).squeeze()
120 |     kernel_ops = torch.prod(kernel)
121 |     num_elements = y.numel()
122 |     total_ops = kernel_ops * num_elements
123 | 
124 |     m.total_ops = torch.Tensor([int(total_ops)])
125 | 
126 | 
127 | def count_avgpool(m, x, y):
128 |     total_add = torch.prod(torch.Tensor([m.kernel_size]))
129 |     total_div = 1
130 |     kernel_ops = total_add + total_div
131 |     num_elements = y.numel()
132 |     total_ops = kernel_ops * num_elements
133 | 
134 |     m.total_ops = torch.Tensor([int(total_ops)])
135 | 
136 | 
137 | def count_adap_avgpool(m, x, y):
138 |     kernel = torch.Tensor([(x[0].shape[2:])]) // torch.Tensor(list((m.output_size,))).squeeze()
139 |     total_add = torch.prod(kernel)
140 |     total_div = 1
141 |     kernel_ops = total_add + total_div
142 |     num_elements = y.numel()
143 |     total_ops = kernel_ops * num_elements
144 | 
145 |     m.total_ops = torch.Tensor([int(total_ops)])
146 | 
147 | 
148 | def count_linear(m, x, y):
149 |     # per output element
150 |     total_mul = m.in_features
151 |     total_add = m.in_features - 1
152 |     num_elements = y.numel()
153 |     total_ops = (total_mul + total_add) * num_elements
154 | 
155 |     m.total_ops = torch.Tensor([int(total_ops)])
156 | 
--------------------------------------------------------------------------------
/thop/count_hooks.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/thop/count_hooks.pyc
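Each hook writes a `total_ops` tensor onto its module, and the arithmetic is easy to check by hand. A small sketch (the layer and input sizes are arbitrary, chosen only for illustration): for `nn.Conv2d(3, 16, 3, padding=1)` on a `1 x 3 x 32 x 32` input, `count_conv2d` charges `output_elements * (kh * kw + bias_ops) * cin // groups` operations:

```python
import torch
import torch.nn as nn

from thop.count_hooks import count_conv2d

conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)  # bias=True by default
x = torch.randn(1, 3, 32, 32)
y = conv(x)

count_conv2d(conv, (x,), y)  # forward hooks receive the inputs as a tuple

# by hand: (1 * 32 * 32 * 16) outputs, each costing (3*3 + 1) ops, times 3 input channels
expected = (1 * 32 * 32 * 16) * (3 * 3 + 1) * 3 // 1
assert int(conv.total_ops.item()) == expected  # both are 491520
```

Note that `profile` in the next file actually routes `nn.Conv2d` through `count_convNd`, which arrives at the same total for a standard 2-D convolution.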
--------------------------------------------------------------------------------
/thop/profile.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | import torch
4 | import torch.nn as nn
5 | from torch.nn.modules.conv import _ConvNd
6 | 
7 | from .count_hooks import *
8 | 
9 | register_hooks = {
10 |     nn.Conv1d: count_convNd,
11 |     nn.Conv2d: count_convNd,
12 |     nn.Conv3d: count_convNd,
13 |     nn.ConvTranspose2d: count_convtranspose2d,
14 | 
15 |     nn.BatchNorm1d: count_bn,
16 |     nn.BatchNorm2d: count_bn,
17 |     nn.BatchNorm3d: count_bn,
18 | 
19 |     nn.ReLU: count_relu,
20 |     nn.ReLU6: count_relu,
21 |     nn.LeakyReLU: count_relu,
22 | 
23 |     nn.MaxPool1d: count_maxpool,
24 |     nn.MaxPool2d: count_maxpool,
25 |     nn.MaxPool3d: count_maxpool,
26 |     nn.AdaptiveMaxPool1d: count_adap_maxpool,
27 |     nn.AdaptiveMaxPool2d: count_adap_maxpool,
28 |     nn.AdaptiveMaxPool3d: count_adap_maxpool,
29 | 
30 |     nn.AvgPool1d: count_avgpool,
31 |     nn.AvgPool2d: count_avgpool,
32 |     nn.AvgPool3d: count_avgpool,
33 | 
34 |     nn.AdaptiveAvgPool1d: count_adap_avgpool,
35 |     nn.AdaptiveAvgPool2d: count_adap_avgpool,
36 |     nn.AdaptiveAvgPool3d: count_adap_avgpool,
37 |     nn.Linear: count_linear,
38 |     nn.Dropout: None,
39 | }
40 | 
41 | 
42 | def profile(model, input_size, custom_ops={}, device="cpu"):
43 |     handler_collection = []
44 | 
45 |     def add_hooks(m):
46 |         if len(list(m.children())) > 0:
47 |             return
48 | 
49 |         m.register_buffer('total_ops', torch.zeros(1))
50 |         m.register_buffer('total_params', torch.zeros(1))
51 | 
52 |         for p in m.parameters():
53 |             m.total_params += torch.Tensor([p.numel()])
54 | 
55 |         m_type = type(m)
56 |         fn = None
57 | 
58 |         if m_type in custom_ops:
59 |             fn = custom_ops[m_type]
60 |         elif m_type in register_hooks:
61 |             fn = register_hooks[m_type]
62 |         else:
63 |             print("Not implemented for ", m)
64 | 
65 |         if fn is not None:
66 |             print("Register FLOP counter for module %s" % str(m))
67 |             handler = m.register_forward_hook(fn)
68 |             handler_collection.append(handler)
69 | 
70 |     original_device = next(model.parameters()).device
71 |     training = model.training
72 | 
73 |     model.eval().to(device)
74 |     model.apply(add_hooks)
75 | 
76 |     x = torch.zeros(input_size).to(device)
77 |     with torch.no_grad():
78 |         model(x)
79 | 
80 |     total_ops = 0
81 |     total_params = 0
82 |     for m in model.modules():
83 |         if len(list(m.children())) > 0:  # skip for non-leaf module
84 |             continue
85 |         total_ops += m.total_ops
86 |         total_params += m.total_params
87 | 
88 |     total_ops = total_ops.item()
89 |     total_params = total_params.item()
90 | 
91 |     model.train(training).to(original_device)
92 |     for handler in handler_collection:
93 |         handler.remove()
94 | 
95 |     return total_ops, total_params
96 | 
--------------------------------------------------------------------------------
/thop/profile.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HuiZeng/Grid-Anchor-based-Image-Cropping-Pytorch/344e86fb54b4b083d83c405aa63d7fbeeed4fcdc/thop/profile.pyc
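`profile` attaches the matching hook from `register_hooks` to every leaf module, runs one dummy forward pass, and sums the per-module `total_ops` / `total_params` buffers; layer types it does not know can be supplied through `custom_ops`. A minimal usage sketch follows — the model and the `count_lppool` hook are made up for illustration, and any callable with the `(module, inputs, output)` forward-hook signature works:

```python
import torch
import torch.nn as nn

from thop import profile
from thop.utils import clever_format

def count_lppool(m, x, y):
    # hypothetical hook for a layer missing from register_hooks:
    # charge one op per output element, like count_relu does
    m.total_ops = torch.Tensor([int(y.numel())])

model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(),
    nn.LPPool2d(2, 2),  # not covered by register_hooks above
)

flops, params = profile(model, (1, 3, 224, 224),
                        custom_ops={nn.LPPool2d: count_lppool})
print(clever_format(flops) + " ops, " + clever_format(params) + " params")
```

Without the `custom_ops` entry, `profile` would print "Not implemented for" the `LPPool2d` module and count zero operations for it, while the rest of the model would still be profiled.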
--------------------------------------------------------------------------------
/thop/utils.py:
--------------------------------------------------------------------------------
1 | 
2 | def clever_format(num, format="%.2f"):
3 |     if num > 1e12:
4 |         return format % (num / 1e12) + "T"
5 |     if num > 1e9:
6 |         return format % (num / 1e9) + "G"
7 |     if num > 1e6:
8 |         return format % (num / 1e6) + "M"
9 |     if num > 1e3:
10 |         return format % (num / 1e3) + "K"
11 |     return format % num  # no suffix for values at or below 1e3
--------------------------------------------------------------------------------
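`clever_format` scales a raw count into a human-readable string. A quick check across magnitudes (the values are arbitrary):

```python
from thop.utils import clever_format

for n in (512, 4.2e3, 8.8e6, 3.1e9, 2.5e12):
    print(clever_format(n))
# 512.00
# 4.20K
# 8.80M
# 3.10G
# 2.50T
```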