├── data
    ├── __init__.py
    ├── dataset.py
    ├── voc_dataset.py
    └── util.py
├── model
    ├── utils
    │   ├── __init__.py
    │   ├── roi_cupy.py
    │   ├── bbox_tools.py
    │   └── creator_tool.py
    ├── __init__.py
    ├── roi_module.py
    ├── faster_rcnn_vgg16.py
    ├── region_proposal_network.py
    └── faster_rcnn.py
├── requirements.txt
├── utils
    ├── __init__.py
    ├── array_tool.py
    ├── config.py
    ├── vis_tool.py
    └── eval_tool.py
├── misc
    └── convert_caffe_pretrain.py
├── LICENSE
├── train.py
├── README.MD
└── trainer.py


/data/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .faster_rcnn_vgg16 import FasterRCNNVGG16
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | visdom
 2 | # torchvision
 3 | scikit-image
 4 | tqdm
 5 | fire
 6 | pprint
 7 | matplotlib
 8 | ipdb
 9 | cython
10 | git+https://github.com/pytorch/tnt.git@master
11 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | #    Copyright 2017 cy
 2 | # 
 3 | #    Licensed under the Apache License, Version 2.0 (the "License");
 4 | #    you may not use this file except in compliance with the License.
 5 | #    You may obtain a copy of the License at
 6 | # 
 7 | #        http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | #    Unless required by applicable law or agreed to in writing, software
10 | #    distributed under the License is distributed on an "AS IS" BASIS,
11 | #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | #    See the License for the specific language governing permissions and
13 | #    limitations under the License.
14 | 
15 | 


--------------------------------------------------------------------------------
/misc/convert_caffe_pretrain.py:
--------------------------------------------------------------------------------
 1 | # code from ruotian luo
 2 | # https://github.com/ruotianluo/pytorch-faster-rcnn
 3 | import torch
 4 | from torch.utils.model_zoo import load_url
 5 | from torchvision import models
 6 | 
 7 | sd = load_url("https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg16-00b39a1b.pth")
 8 | sd['classifier.0.weight'] = sd['classifier.1.weight']
 9 | sd['classifier.0.bias'] = sd['classifier.1.bias']
10 | del sd['classifier.1.weight']
11 | del sd['classifier.1.bias']
12 | 
13 | sd['classifier.3.weight'] = sd['classifier.4.weight']
14 | sd['classifier.3.bias'] = sd['classifier.4.bias']
15 | del sd['classifier.4.weight']
16 | del sd['classifier.4.bias']
17 | 
18 | 
19 | # speicify the path to save
20 | torch.save(sd, "vgg16_caffe.pth")


--------------------------------------------------------------------------------
/utils/array_tool.py:
--------------------------------------------------------------------------------
 1 | """
 2 | tools to convert specified type
 3 | """
 4 | import torch as t
 5 | import numpy as np
 6 | 
 7 | def tonumpy(data):
 8 |     if isinstance(data, np.ndarray):
 9 |         return data
10 |     if isinstance(data, t._C._TensorBase) and (data.requires_grad==False):
11 |         return data.cpu().numpy()
12 |     if isinstance(data, t.autograd.Variable):
13 |         return tonumpy(data.data)
14 | 
15 | 
16 | def totensor(data, cuda=True):
17 |     if isinstance(data, np.ndarray):
18 |         tensor = t.from_numpy(data)
19 |     if isinstance(data, t._C._TensorBase) and (data.requires_grad==False):
20 |         tensor = data
21 |     if isinstance(data, t.autograd.Variable):
22 |         tensor = data.data
23 |     if cuda:
24 |         tensor = tensor.cuda()
25 |     return tensor
26 | 
27 | 
28 | def tovariable(data):
29 |     if isinstance(data, np.ndarray):
30 |         return tovariable(totensor(data))
31 |     if isinstance(data, t._C._TensorBase) and (data.requires_grad==False):
32 |         return t.autograd.Variable(data)
33 |     if isinstance(data, t.autograd.Variable):
34 |         return data
35 |     else:
36 |         raise ValueError("UnKnow data type: %s, input should be {np.ndarray,Tensor,Variable}" %type(data))
37 | 
38 | 
39 | def scalar(data):
40 |     if isinstance(data, np.ndarray):
41 |         return data.reshape(1)[0]
42 |     if isinstance(data, t._C._TensorBase) and (data.requires_grad==False):
43 |         return data.view(1)[0]
44 |     if isinstance(data, t.autograd.Variable):
45 |         return data.data.view(1)[0]
46 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2017 Yun Chen
 4 | 
 5 | Original works by:
 6 | --------------------------------------------------------
 7 | chainer/chainercv
 8 | Copyright (c) 2017 Yusuke Niitani
 9 | Licensed under The MIT License
10 | https://github.com/chainer/chainercv/blob/master/LICENSE
11 | --------------------------------------------------------
12 | Faster R-CNN
13 | Copyright (c) 2015 Microsoft
14 | Licensed under The MIT License
15 | https://github.com/rbgirshick/py-faster-rcnn/blob/master/LICENSE
16 | --------------------------------------------------------
17 | 
18 | Permission is hereby granted, free of charge, to any person obtaining a copy
19 | of this software and associated documentation files (the "Software"), to deal
20 | in the Software without restriction, including without limitation the rights
21 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22 | copies of the Software, and to permit persons to whom the Software is
23 | furnished to do so, subject to the following conditions:
24 | 
25 | The above copyright notice and this permission notice shall be included in
26 | all copies or substantial portions of the Software.
27 | 
28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
34 | THE SOFTWARE.


--------------------------------------------------------------------------------
/utils/config.py:
--------------------------------------------------------------------------------
 1 | from pprint import pprint
 2 | 
 3 | 
 4 | # Default Configs for training
 5 | # NOTE that, config items could be overwriten by passing argument through command line.
 6 | # e.g. --voc-data-dir='./data/'
 7 | 
 8 | 
 9 | class Config:
10 |     
11 |     # probability threshold when using px and py to generate predicting box
12 |     prob_thre = 0.5
13 | 
14 |     # data
15 |     # voc_data_dir = '/home/cy/.chainer/dataset/pfnet/chainercv/voc/VOCdevkit/VOC2007/'
16 |     # voc_data_dir = '/home/cvdev/Faster-RCNN-LocNet/VOCdevkit/VOC2007_traindev'
17 |     voc_data_dir = '/home/cvdev/Faster-RCNN-LocNet/question_dataset'
18 | 
19 |     min_size = 600  # image resize
20 |     max_size = 1000 # image resize
21 |     num_workers = 8
22 |     test_num_workers = 8
23 | 
24 |     # sigma for l1_smooth_loss
25 |     rpn_sigma = 3.
26 |     roi_sigma = 1e-5
27 | 
28 |     # param for optimizer
29 |     # 0.0005 in origin paper but 0.0001 in tf-faster-rcnn
30 |     weight_decay = 0.0005
31 |     lr_decay = 0.1  # 1e-3 -> 1e-4
32 |     lr = 1e-4
33 | 
34 | 
35 |     # visualization
36 |     env = 'faster-rcnn'  # visdom env
37 |     port = 8097
38 |     plot_every = 40  # vis every N iter
39 | 
40 |     # preset
41 |     data = 'voc'
42 |     pretrained_model = 'vgg16'
43 | 
44 |     # training
45 |     epoch = 50
46 | 
47 | 
48 |     use_adam = False # Use Adam optimizer
49 |     use_chainer = False # try match everything as chainer
50 |     use_drop = False # use dropout in RoIHead
51 |     # debug
52 |     debug_file = '/tmp/debugf'
53 | 
54 |     test_num = 10000
55 |     # model
56 |     load_path = "/home/cvdev/Faster-RCNN-LocNet/checkpoints/best_model"
57 | 
58 |     caffe_pretrain = False # use caffe pretrained model instead of torchvision
59 |     caffe_pretrain_path = 'checkpoints/vgg16-caffe.pth'
60 | 
61 |     def _parse(self, kwargs):
62 |         state_dict = self._state_dict()
63 |         for k, v in kwargs.items():
64 |             if k not in state_dict:
65 |                 raise ValueError('UnKnown Option: "--%s"' % k)
66 |             setattr(self, k, v)
67 | 
68 |         print('======user config========')
69 |         pprint(self._state_dict())
70 |         print('==========end============')
71 | 
72 |     def _state_dict(self):
73 |         return {k: getattr(self, k) for k, _ in Config.__dict__.items() \
74 |                 if not k.startswith('_')}
75 | 
76 | 
77 | opt = Config()
78 | 


--------------------------------------------------------------------------------
/data/dataset.py:
--------------------------------------------------------------------------------
  1 | import torch as t
  2 | from .voc_dataset import VOCBboxDataset
  3 | from skimage import transform as sktsf
  4 | from torchvision import transforms as tvtsf
  5 | from . import util
  6 | import numpy as np
  7 | from utils.config import opt
  8 | 
  9 | 
 10 | def inverse_normalize(img):
 11 |     if opt.caffe_pretrain:
 12 |         img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1))
 13 |         return img[::-1, :, :]
 14 |     # approximate un-normalize for visualize
 15 |     return (img * 0.225 + 0.45).clip(min=0, max=1) * 255
 16 | 
 17 | 
 18 | def pytorch_normalze(img):
 19 |     """
 20 |     https://github.com/pytorch/vision/issues/223
 21 |     return appr -1~1 RGB
 22 |     """
 23 |     normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
 24 |                                 std=[0.229, 0.224, 0.225])
 25 |     img = normalize(t.from_numpy(img))
 26 |     return img.numpy()
 27 | 
 28 | 
 29 | def caffe_normalize(img):
 30 |     """
 31 |     return appr -125-125 BGR
 32 |     """
 33 |     img = img[[2, 1, 0], :, :]  # RGB-BGR
 34 |     img = img * 255
 35 |     mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
 36 |     img = (img - mean).astype(np.float32, copy=True)
 37 |     return img
 38 | 
 39 | 
 40 | def preprocess(img, min_size=600, max_size=1000):
 41 |     """Preprocess an image for feature extraction.
 42 | 
 43 |     The length of the shorter edge is scaled to :obj:`self.min_size`.
 44 |     After the scaling, if the length of the longer edge is longer than
 45 |     :param min_size:
 46 |     :obj:`self.max_size`, the image is scaled to fit the longer edge
 47 |     to :obj:`self.max_size`.
 48 | 
 49 |     After resizing the image, the image is subtracted by a mean image value
 50 |     :obj:`self.mean`.
 51 | 
 52 |     Args:
 53 |         img (~numpy.ndarray): An image. This is in CHW and RGB format.
 54 |             The range of its value is :math:`[0, 255]`.
 55 | 
 56 |     Returns:
 57 |         ~numpy.ndarray: A preprocessed image.
 58 | 
 59 |     """
 60 |     C, H, W = img.shape
 61 |     scale1 = min_size / min(H, W)
 62 |     scale2 = max_size / max(H, W)
 63 |     scale = min(scale1, scale2)
 64 |     img = img / 255.
 65 |     img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect')
 66 |     # both the longer and shorter should be less than
 67 |     # max_size and min_size
 68 |     
 69 |     if opt.caffe_pretrain:
 70 |         normalize = caffe_normalize
 71 |     else:
 72 |         normalize = pytorch_normalze
 73 |     return normalize(img)
 74 | 
 75 | 
 76 | class Transform(object):
 77 |     '''
 78 |     This is a self-defined transform class.
 79 |     '''
 80 | 
 81 |     def __init__(self, min_size=600, max_size=1000):
 82 |         self.min_size = min_size
 83 |         self.max_size = max_size
 84 | 
 85 |     def __call__(self, in_data):
 86 |         img, bbox, label = in_data
 87 |         _, H, W = img.shape
 88 |         img = preprocess(img, self.min_size, self.max_size)
 89 |         _, o_H, o_W = img.shape
 90 |         scale = o_H / H
 91 |         bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
 92 | 
 93 |         # horizontally flip
 94 |         img, params = util.random_flip(
 95 |             img, x_random=True, return_param=True)
 96 |         bbox = util.flip_bbox(
 97 |             bbox, (o_H, o_W), x_flip=params['x_flip'])
 98 | 
 99 |         return img, bbox, label, scale
100 | 
101 | 
102 | class Dataset:
103 |     def __init__(self, opt):
104 |         self.opt = opt
105 |         self.db = VOCBboxDataset(opt.voc_data_dir)
106 |         self.tsf = Transform(opt.min_size, opt.max_size)
107 | 
108 |     def __getitem__(self, idx):
109 |         ori_img, bbox, label, difficult = self.db.get_example(idx)
110 | 
111 |         img, bbox, label, scale = self.tsf((ori_img, bbox, label))
112 |         # TODO: check whose stride is negative to fix this instead copy all
113 |         # some of the strides of a given numpy array are negative.
114 |         return img.copy(), bbox.copy(), label.copy(), scale
115 | 
116 |     def __len__(self):
117 |         return len(self.db)
118 | 
119 | 
120 | class TestDataset:
121 |     def __init__(self, opt, split='test', use_difficult=True):
122 |         self.opt = opt
123 |         self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult)
124 | 
125 |     def __getitem__(self, idx):
126 |         ori_img, bbox, label, difficult = self.db.get_example(idx)
127 |         img = preprocess(ori_img)
128 |         return img, ori_img.shape[1:], bbox, label, difficult  # the original shape of the image is passed.
129 | 
130 |     def __len__(self):
131 |         return len(self.db)
132 | 


--------------------------------------------------------------------------------
/model/roi_module.py:
--------------------------------------------------------------------------------
  1 | from collections import namedtuple
  2 | from string import Template
  3 | 
  4 | import cupy, torch
  5 | import cupy as cp
  6 | import torch as t
  7 | from torch.autograd import Function
  8 | 
  9 | from model.utils.roi_cupy import kernel_backward, kernel_forward
 10 | 
 11 | 
 12 | Stream = namedtuple('Stream', ['ptr'])
 13 | 
 14 | 
 15 | @cupy.util.memoize(for_each_device=True)
 16 | def load_kernel(kernel_name, code, **kwargs):
 17 |     cp.cuda.runtime.free(0)
 18 |     code = Template(code).substitute(**kwargs)
 19 |     kernel_code = cupy.cuda.compile_with_cache(code)
 20 |     return kernel_code.get_function(kernel_name)
 21 | 
 22 | 
 23 | CUDA_NUM_THREADS = 1024
 24 | 
 25 | 
 26 | def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
 27 |     return (N + K - 1) // K
 28 | 
 29 | 
 30 | class RoI(Function):
 31 |     """
 32 |     NOTE：only CUDA-compatible
 33 |     """
 34 | 
 35 |     def __init__(self, outh, outw, spatial_scale):
 36 |         self.forward_fn = load_kernel('roi_forward', kernel_forward)
 37 |         self.backward_fn = load_kernel('roi_backward', kernel_backward)
 38 |         self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale
 39 | 
 40 |     def forward(self, x, rois):
 41 |         # NOTE: MAKE SURE input is contiguous too
 42 |         x = x.contiguous()
 43 |         rois = rois.contiguous()
 44 |         self.in_size = B, C, H, W = x.size()
 45 |         self.N = N = rois.size(0)
 46 |         output = t.zeros(N, C, self.outh, self.outw).cuda()
 47 |         self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda()
 48 |         self.rois = rois
 49 |         args = [x.data_ptr(), rois.data_ptr(),
 50 |                 output.data_ptr(),
 51 |                 self.argmax_data.data_ptr(),
 52 |                 self.spatial_scale, C, H, W,
 53 |                 self.outh, self.outw,
 54 |                 output.numel()]
 55 |         stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
 56 |         self.forward_fn(args=args,
 57 |                         block=(CUDA_NUM_THREADS, 1, 1),
 58 |                         grid=(GET_BLOCKS(output.numel()), 1, 1),
 59 |                         stream=stream)
 60 |         return output
 61 | 
 62 |     def backward(self, grad_output):
 63 |         ##NOTE: IMPORTANT CONTIGUOUS
 64 |         # TODO: input
 65 |         grad_output = grad_output.contiguous()
 66 |         B, C, H, W = self.in_size
 67 |         grad_input = t.zeros(self.in_size).cuda()
 68 |         stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
 69 |         args = [grad_output.data_ptr(),
 70 |                 self.argmax_data.data_ptr(),
 71 |                 self.rois.data_ptr(),
 72 |                 grad_input.data_ptr(),
 73 |                 self.N, self.spatial_scale, C, H, W, self.outh, self.outw,
 74 |                 grad_input.numel()]
 75 |         self.backward_fn(args=args,
 76 |                          block=(CUDA_NUM_THREADS, 1, 1),
 77 |                          grid=(GET_BLOCKS(grad_input.numel()), 1, 1),
 78 |                          stream=stream
 79 |                          )
 80 |         return grad_input, None
 81 | 
 82 | 
 83 | class RoIPooling2D(t.nn.Module):
 84 | 
 85 |     def __init__(self, outh, outw, spatial_scale):
 86 |         super(RoIPooling2D, self).__init__()
 87 |         self.RoI = RoI(outh, outw, spatial_scale)
 88 | 
 89 |     def forward(self, x, rois):
 90 |         return self.RoI(x, rois)
 91 | 
 92 | 
 93 | def test_roi_module():
 94 |     ## fake data###
 95 |     B, N, C, H, W, PH, PW = 2, 8, 4, 32, 32, 7, 7
 96 | 
 97 |     bottom_data = t.randn(B, C, H, W).cuda()
 98 |     bottom_rois = t.randn(N, 5)
 99 |     bottom_rois[:int(N / 2), 0] = 0
100 |     bottom_rois[int(N / 2):, 0] = 1
101 |     bottom_rois[:, 1:] = (t.rand(N, 4) * 100).float()
102 |     bottom_rois = bottom_rois.cuda()
103 |     spatial_scale = 1. / 16
104 |     outh, outw = PH, PW
105 | 
106 |     # pytorch version
107 |     module = RoIPooling2D(outh, outw, spatial_scale)
108 |     x = t.autograd.Variable(bottom_data, requires_grad=True)
109 |     rois = t.autograd.Variable(bottom_rois)
110 |     output = module(x, rois)
111 |     output.sum().backward()
112 | 
113 |     def t2c(variable):
114 |         npa = variable.data.cpu().numpy()
115 |         return cp.array(npa)
116 | 
117 |     def test_eq(variable, array, info):
118 |         cc = cp.asnumpy(array)
119 |         neq = (cc != variable.data.cpu().numpy())
120 |         assert neq.sum() == 0, 'test failed: %s' % info
121 | 
122 |     # chainer version,if you're going to run this
123 |     # pip install chainer 
124 |     import chainer.functions as F
125 |     from chainer import Variable
126 |     x_cn = Variable(t2c(x))
127 | 
128 |     o_cn = F.roi_pooling_2d(x_cn, t2c(rois), outh, outw, spatial_scale)
129 |     test_eq(output, o_cn.array, 'forward')
130 |     F.sum(o_cn).backward()
131 |     test_eq(x.grad, x_cn.grad, 'backward')
132 |     print('test pass')
133 | 


--------------------------------------------------------------------------------
/data/voc_dataset.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import xml.etree.ElementTree as ET
  3 | 
  4 | import numpy as np
  5 | 
  6 | from .util import read_image
  7 | 
  8 | 
  9 | 
 10 | class VOCBboxDataset:
 11 |     """Bounding box dataset for PASCAL `VOC`_.
 12 | 
 13 |     .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/
 14 | 
 15 |     The index corresponds to each image.
 16 | 
 17 |     When queried by an index, if :obj:`return_difficult == False`,
 18 |     this dataset returns a corresponding
 19 |     :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels.
 20 |     This is the default behaviour.
 21 |     If :obj:`return_difficult == True`, this dataset returns corresponding
 22 |     :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array
 23 |     that indicates whether bounding boxes are labeled as difficult or not.
 24 | 
 25 |     The bounding boxes are packed into a two dimensional tensor of shape
 26 |     :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
 27 |     the image. The second axis represents attributes of the bounding box.
 28 |     They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the
 29 |     four attributes are coordinates of the top left and the bottom right
 30 |     vertices.
 31 | 
 32 |     The labels are packed into a one dimensional tensor of shape :math:`(R,)`.
 33 |     :math:`R` is the number of bounding boxes in the image.
 34 |     The class name of the label :math:`l` is :math:`l` th element of
 35 |     :obj:`VOC_BBOX_LABEL_NAMES`.
 36 | 
 37 |     The array :obj:`difficult` is a one dimensional boolean array of shape
 38 |     :math:`(R,)`. :math:`R` is the number of bounding boxes in the image.
 39 |     If :obj:`use_difficult` is :obj:`False`, this array is
 40 |     a boolean array with all :obj:`False`.
 41 | 
 42 |     The type of the image, the bounding boxes and the labels are as follows.
 43 | 
 44 |     * :obj:`img.dtype == numpy.float32`
 45 |     * :obj:`bbox.dtype == numpy.float32`
 46 |     * :obj:`label.dtype == numpy.int32`
 47 |     * :obj:`difficult.dtype == numpy.bool`
 48 | 
 49 |     Args:
 50 |         data_dir (string): Path to the root of the training data. 
 51 |             i.e. "/data/image/voc/VOCdevkit/VOC2007/"
 52 |         split ({'train', 'val', 'trainval', 'test'}): Select a split of the
 53 |             dataset. :obj:`test` split is only available for
 54 |             2007 dataset.
 55 |         year ({'2007', '2012'}): Use a dataset prepared for a challenge
 56 |             held in :obj:`year`.
 57 |         use_difficult (bool): If :obj:`True`, use images that are labeled as
 58 |             difficult in the original annotation.
 59 |         return_difficult (bool): If :obj:`True`, this dataset returns
 60 |             a boolean array
 61 |             that indicates whether bounding boxes are labeled as difficult
 62 |             or not. The default value is :obj:`False`.
 63 | 
 64 |     """
 65 | 
 66 |     def __init__(self, data_dir, split='trainval',
 67 |                  use_difficult=False, return_difficult=False,
 68 |                  ):
 69 | 
 70 |         # if split not in ['train', 'trainval', 'val']:
 71 |         #     if not (split == 'test' and year == '2007'):
 72 |         #         warnings.warn(
 73 |         #             'please pick split from \'train\', \'trainval\', \'val\''
 74 |         #             'for 2012 dataset. For 2007 dataset, you can pick \'test\''
 75 |         #             ' in addition to the above mentioned splits.'
 76 |         #         )
 77 |         id_list_file = os.path.join(
 78 |             data_dir, 'ImageSets/Main/{0}.txt'.format(split))
 79 | 
 80 |         self.ids = [id_.strip() for id_ in open(id_list_file)]
 81 |         self.data_dir = data_dir
 82 |         self.use_difficult = use_difficult
 83 |         self.return_difficult = return_difficult
 84 |         self.label_names = VOC_BBOX_LABEL_NAMES
 85 | 
 86 |     def __len__(self):
 87 |         return len(self.ids)
 88 | 
 89 |     def get_example(self, i):
 90 |         """Returns the i-th example.
 91 | 
 92 |         Returns a color image and bounding boxes. The image is in CHW format.
 93 |         The returned image is RGB.
 94 | 
 95 |         Args:
 96 |             i (int): The index of the example.
 97 | 
 98 |         Returns:
 99 |             tuple of an image and bounding boxes
100 | 
101 |         """
102 |         id_ = self.ids[i]
103 |         anno = ET.parse(
104 |             os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
105 |         bbox = list()
106 |         label = list()
107 |         difficult = list()
108 |         for obj in anno.findall('object'):
109 |             
110 |             # when in not using difficult split, and the object is
111 |             # difficult, skipt it.
112 |             # if not self.use_difficult and int(obj.find('difficult').text) == 1:
113 |             #     continue
114 | 
115 |             # difficult.append(int(obj.find('difficult').text))
116 |             # difficulty all set to 0 when using question dataset
117 |             difficult.append(0)
118 | 
119 |             bndbox_anno = obj.find('bndbox')
120 |             # subtract 1 to make pixel indexes 0-based
121 |             bbox.append([
122 |                 int(bndbox_anno.find(tag).text) - 1
123 |                 for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
124 |             name = obj.find('name').text.lower().strip()
125 |             label.append(VOC_BBOX_LABEL_NAMES.index(name))
126 |         bbox = np.stack(bbox).astype(np.float32)
127 |         label = np.stack(label).astype(np.int32)
128 |         # When `use_difficult==False`, all elements in `difficult` are False.
129 |         difficult = np.array(difficult, dtype=np.bool).astype(np.uint8)  # PyTorch don't support np.bool
130 | 
131 |         # Load a image
132 |         # img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
133 |         img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpeg')
134 |         img = read_image(img_file, color=True)
135 | 
136 |         # if self.return_difficult:
137 |         #     return img, bbox, label, difficult
138 |         return img, bbox, label, difficult
139 | 
140 |     __getitem__ = get_example
141 | 
142 | 
143 | # VOC_BBOX_LABEL_NAMES = (
144 | #     'aeroplane',
145 | #     'bicycle',
146 | #     'bird',
147 | #     'boat',
148 | #     'bottle',
149 | #     'bus',
150 | #     'car',
151 | #     'cat',
152 | #     'chair',
153 | #     'cow',
154 | #     'diningtable',
155 | #     'dog',
156 | #     'horse',
157 | #     'motorbike',
158 | #     'person',
159 | #     'pottedplant',
160 | #     'sheep',
161 | #     'sofa',
162 | #     'train',
163 | #     'tvmonitor')
164 | 
165 | VOC_BBOX_LABEL_NAMES = ('text')


--------------------------------------------------------------------------------
/model/utils/roi_cupy.py:
--------------------------------------------------------------------------------
  1 | kernel_forward = '''
  2 |     extern "C"
  3 |     __global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois,
  4 |                 float* top_data, int* argmax_data,
  5 |                 const double spatial_scale,const int channels,const int height, 
  6 |                 const int width, const int pooled_height, 
  7 |                 const int pooled_width,const int NN
  8 |     ){
  9 |         
 10 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 11 |     if(idx>=NN)
 12 |         return;
 13 |     const int pw = idx % pooled_width;
 14 |     const int ph = (idx / pooled_width) % pooled_height;
 15 |     const int c = (idx / pooled_width / pooled_height) % channels;
 16 |     int num = idx / pooled_width / pooled_height / channels;
 17 |     const int roi_batch_ind = bottom_rois[num * 5 + 0];
 18 |     const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale);
 19 |     const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale);
 20 |     const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale);
 21 |     const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale);
 22 |     // Force malformed ROIs to be 1x1
 23 |     const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
 24 |     const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
 25 |     const float bin_size_h = static_cast<float>(roi_height)
 26 |                     / static_cast<float>(pooled_height);
 27 |     const float bin_size_w = static_cast<float>(roi_width)
 28 |                     / static_cast<float>(pooled_width);
 29 | 
 30 |     int hstart = static_cast<int>(floor(static_cast<float>(ph)
 31 |                                     * bin_size_h));
 32 |         int wstart = static_cast<int>(floor(static_cast<float>(pw)
 33 |                                     * bin_size_w));
 34 |     int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
 35 |                                 * bin_size_h));
 36 |         int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
 37 |                                 * bin_size_w));
 38 | 
 39 |     // Add roi offsets and clip to input boundaries
 40 |     hstart = min(max(hstart + roi_start_h, 0), height);
 41 |     hend = min(max(hend + roi_start_h, 0), height);
 42 |     wstart = min(max(wstart + roi_start_w, 0), width);
 43 |     wend = min(max(wend + roi_start_w, 0), width);
 44 |     bool is_empty = (hend <= hstart) || (wend <= wstart);
 45 | 
 46 |     // Define an empty pooling region to be zero
 47 |     float maxval = is_empty ? 0 : -1E+37;
 48 |     // If nothing is pooled, argmax=-1 causes nothing to be backprop'd
 49 |     int maxidx = -1;
 50 |     const int data_offset = (roi_batch_ind * channels + c) * height * width;
 51 |     for (int h = hstart; h < hend; ++h) {
 52 |         for (int w = wstart; w < wend; ++w) {
 53 |             int bottom_index = h * width + w;
 54 |             if (bottom_data[data_offset + bottom_index] > maxval) {
 55 |                 maxval = bottom_data[data_offset + bottom_index];
 56 |                 maxidx = bottom_index;
 57 |             }
 58 |         }
 59 |     }
 60 |     top_data[idx]=maxval;
 61 |     argmax_data[idx]=maxidx;
 62 |     }
 63 | '''
 64 | kernel_backward = '''
 65 |     extern "C"
 66 |     __global__ void roi_backward(const float* const top_diff,
 67 |          const int* const argmax_data,const float* const bottom_rois,
 68 |          float* bottom_diff, const int num_rois,
 69 |          const double spatial_scale, int channels,
 70 |          int height, int width, int pooled_height,
 71 |           int pooled_width,const int NN)
 72 |     {
 73 | 
 74 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 75 |     ////Importtan >= instead of >
 76 |     if(idx>=NN)
 77 |         return;
 78 |     int w = idx % width;
 79 |     int h = (idx / width) % height;
 80 |     int c = (idx/ (width * height)) % channels;
 81 |     int num = idx / (width * height * channels);
 82 | 
 83 |     float gradient = 0;
 84 |     // Accumulate gradient over all ROIs that pooled this element
 85 |     for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
 86 |         // Skip if ROI's batch index doesn't match num
 87 |         if (num != static_cast<int>(bottom_rois[roi_n * 5])) {
 88 |             continue;
 89 |         }
 90 | 
 91 |         int roi_start_w = round(bottom_rois[roi_n * 5 + 1]
 92 |                                 * spatial_scale);
 93 |         int roi_start_h = round(bottom_rois[roi_n * 5 + 2]
 94 |                                 * spatial_scale);
 95 |         int roi_end_w = round(bottom_rois[roi_n * 5 + 3]
 96 |                                 * spatial_scale);
 97 |         int roi_end_h = round(bottom_rois[roi_n * 5 + 4]
 98 |                                 * spatial_scale);
 99 | 
100 |         // Skip if ROI doesn't include (h, w)
101 |         const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
102 |                                 h >= roi_start_h && h <= roi_end_h);
103 |         if (!in_roi) {
104 |             continue;
105 |         }
106 | 
107 |         int offset = (roi_n * channels + c) * pooled_height
108 |                         * pooled_width;
109 | 
110 |         // Compute feasible set of pooled units that could have pooled
111 |         // this bottom unit
112 | 
113 |         // Force malformed ROIs to be 1x1
114 |         int roi_width = max(roi_end_w - roi_start_w + 1, 1);
115 |         int roi_height = max(roi_end_h - roi_start_h + 1, 1);
116 | 
117 |         float bin_size_h = static_cast<float>(roi_height)
118 |                         / static_cast<float>(pooled_height);
119 |         float bin_size_w = static_cast<float>(roi_width)
120 |                         / static_cast<float>(pooled_width);
121 | 
122 |         int phstart = floor(static_cast<float>(h - roi_start_h)
123 |                             / bin_size_h);
124 |         int phend = ceil(static_cast<float>(h - roi_start_h + 1)
125 |                             / bin_size_h);
126 |         int pwstart = floor(static_cast<float>(w - roi_start_w)
127 |                             / bin_size_w);
128 |         int pwend = ceil(static_cast<float>(w - roi_start_w + 1)
129 |                             / bin_size_w);
130 | 
131 |         phstart = min(max(phstart, 0), pooled_height);
132 |         phend = min(max(phend, 0), pooled_height);
133 |         pwstart = min(max(pwstart, 0), pooled_width);
134 |         pwend = min(max(pwend, 0), pooled_width);
135 |         for (int ph = phstart; ph < phend; ++ph) {
136 |             for (int pw = pwstart; pw < pwend; ++pw) {
137 |                 int index_ = ph * pooled_width + pw + offset;
138 |                 if (argmax_data[index_] == (h * width + w)) {
139 |                     gradient += top_diff[index_];
140 |                 }
141 |             }
142 |         }
143 |     }
144 |     bottom_diff[idx] = gradient;
145 |     }
146 | '''
147 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import numpy as np
  4 | import ipdb
  5 | import matplotlib
  6 | from tqdm import tqdm
  7 | 
  8 | from utils.config import opt
  9 | from data.dataset import Dataset, TestDataset, inverse_normalize
 10 | from model import FasterRCNNVGG16
 11 | from torch.autograd import Variable
 12 | from torch.utils import data as data_
 13 | from trainer import FasterRCNNTrainer
 14 | from utils import array_tool as at
 15 | from utils.vis_tool import visdom_bbox
 16 | from utils.eval_tool import eval_detection_voc
 17 | 
 18 | from model.utils.bbox_tools import bbox_iou
 19 | 
 20 | 
 21 | import resource
 22 | 
 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 24 | 
 25 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
 26 | resource.setrlimit(resource.RLIMIT_NOFILE, (20480, rlimit[1]))
 27 | 
 28 | matplotlib.use('agg')
 29 | 
 30 | 
 31 | def eval(dataloader, faster_rcnn, test_num=10000, prob_thre=0.7):
 32 |     pred_bboxes, pred_labels, pred_scores = list(), list(), list()
 33 |     gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
 34 |     for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):
 35 |         # imgs here are reshaped images
 36 |         # sizes here are the original shape of images
 37 |         sizes = [sizes[0][0], sizes[1][0]]
 38 |         pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes], prob_thre=prob_thre)
 39 |         
 40 |         gt_bboxes += list(gt_bboxes_.numpy())
 41 |         gt_labels += list(gt_labels_.numpy())
 42 |         gt_difficults += list(gt_difficults_.numpy())
 43 |         
 44 |         pred_bboxes += pred_bboxes_
 45 |         pred_labels += pred_labels_
 46 |         pred_scores += pred_scores_
 47 |         if ii == test_num: break
 48 | 
 49 |     result = eval_detection_voc(
 50 |         pred_bboxes, pred_labels, pred_scores,
 51 |         gt_bboxes, gt_labels, gt_difficults,
 52 |         use_07_metric=True)
 53 |     return result
 54 | 
 55 | 
 56 | def train(**kwargs):
 57 |     opt._parse(kwargs)
 58 | 
 59 |     dataset = Dataset(opt)
 60 | 
 61 |     # data
 62 |     print('load data')
 63 |     dataloader = data_.DataLoader(dataset, 
 64 |                                   batch_size=1, 
 65 |                                   shuffle=False, 
 66 |                                   # pin_memory=True,
 67 |                                   num_workers=opt.num_workers)
 68 |     testset = TestDataset(opt)
 69 |     test_dataloader = data_.DataLoader(testset,
 70 |                                        batch_size=1,
 71 |                                        num_workers=opt.test_num_workers,
 72 |                                        shuffle=False, 
 73 |                                        pin_memory=True
 74 |                                        )
 75 | 
 76 |     # model and trainer
 77 |     faster_rcnn = FasterRCNNVGG16()
 78 |     print('model construct completed')
 79 | 
 80 |     trainer = FasterRCNNTrainer(faster_rcnn).cuda()
 81 | 
 82 |     if opt.load_path:
 83 |         trainer.load(opt.load_path)
 84 |         print('load pretrained model from %s' % opt.load_path)
 85 | 
 86 |     trainer.vis.text(dataset.db.label_names, win='labels')
 87 |     best_map = 0
 88 |     lr_ = opt.lr
 89 |     for epoch in range(opt.epoch):
 90 |         trainer.reset_meters()
 91 |         for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
 92 |             scale = at.scalar(scale)
 93 |             img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
 94 |             img, bbox, label = Variable(img), Variable(bbox), Variable(label)
 95 | 
 96 |             # print(label)
 97 |             
 98 |             # all the input data for one training are : img, bbox, label, scale
 99 |             trainer.train_step(img, bbox, label, scale)
100 |             # training code stop here.
101 | 
102 | 
103 |             if (ii + 1) % opt.plot_every == 0:
104 |                 if os.path.exists(opt.debug_file):
105 |                     ipdb.set_trace()
106 | 
107 |                 # plot loss
108 |                 trainer.vis.plot_many(trainer.get_meter_data())
109 | 
110 |                 # plot groud truth bboxes
111 |                 ori_img_ = inverse_normalize(at.tonumpy(img[0]))
112 |                 gt_img = visdom_bbox(ori_img_,
113 |                                      at.tonumpy(bbox_[0]),
114 |                                      at.tonumpy(label_[0]))
115 |                 trainer.vis.img('gt_img', gt_img)
116 | 
117 |                 # plot predicti bboxes
118 |                 _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
119 | 
120 |                 pred_img = visdom_bbox(ori_img_,
121 |                                        at.tonumpy(_bboxes[0]),
122 |                                        at.tonumpy(_labels[0]).reshape(-1),
123 |                                        at.tonumpy(_scores[0]))
124 |                 trainer.vis.img('pred_img', pred_img)
125 | 
126 |                 # rpn confusion matrix(meter)
127 |                 trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
128 |                 # roi confusion matrix
129 |                 trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
130 |         
131 |         # use the test dataset to eval
132 |         eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num, prob_thre=opt.prob_thre)
133 | 
134 |         print("eval_result", eval_result)
135 | 
136 |         if eval_result['map'] > best_map:
137 |             best_map = eval_result['map']
138 |             best_path = trainer.save(best_map=best_map)
139 |         if epoch == 9:
140 |             trainer.load(best_path)
141 |             trainer.faster_rcnn.scale_lr(opt.lr_decay)
142 |             lr_ = lr_ * opt.lr_decay
143 | 
144 |         trainer.vis.plot('test_map', eval_result['map'])
145 |         log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
146 |                                                   str(eval_result['map']),
147 |                                                   str(trainer.get_meter_data()))
148 |         trainer.vis.log(log_info)
149 |         if epoch == 13: 
150 |             break
151 | 
152 | 
153 | 
154 | def eval_prob_thre(**kwargs):
155 |     '''
156 |     Use the best trained model to find out the best prob_thre, \
157 |     which is used when generating prediction box using px and py.
158 |     '''
159 |     opt._parse(kwargs)
160 | 
161 |     testset = TestDataset(opt)
162 |     test_dataloader = data_.DataLoader(testset,
163 |                                        batch_size=1,
164 |                                        num_workers=opt.test_num_workers,
165 |                                        shuffle=False, 
166 |                                        pin_memory=True
167 |                                        )
168 | 
169 |     # model and trainer
170 |     faster_rcnn = FasterRCNNVGG16()
171 |     print('model construct completed')
172 | 
173 |     trainer = FasterRCNNTrainer(faster_rcnn).cuda()
174 | 
175 |     if opt.load_path:
176 |         trainer.load(opt.load_path)
177 |         print('load pretrained model from %s' % opt.load_path)
178 | 
179 |     best_map = 0
180 |     best_prob_thre = 0
181 |     
182 |     for prob_thre in np.linspace(0.3,0.9,7):
183 |         
184 |         # use the test dataset to eval
185 |         eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num, prob_thre=prob_thre)
186 |         print("eval_result", eval_result)
187 |         if eval_result['map'] > best_map:
188 |             best_map = eval_result['map']
189 |             best_prob_thre = prob_thre
190 | 
191 |     print("best_map is ", best_map)
192 |     print("best prob_thre is ", best_prob_thre)
193 | 
194 | 
195 | 
196 | 
197 | if __name__ == '__main__':
198 |     import fire
199 | 
200 |     fire.Fire()
201 | 


--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
  1 | # Improved Localization Accuracy by LocNet for Faster R-CNN 
  2 | 
  3 | ## 1. Introduction
  4 | 
  5 | ![](http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-LocNet-FasterRCNN.001.jpeg)
  6 | 
  7 | This project is a Simplified Faster R-CNN **improved by LocNet** (**Loc-Faster-RCNN** for short) implementation based on [Faster R-CNN by chenyuntc](https://github.com/chenyuntc/simple-faster-rcnn-pytorch). It aims in:
  8 | 
  9 | - Improve the localization accuracy of Faster R-CNN by using LocNet in the Fast R-CNN part.
 10 | - The first public implementation of the [original paper](https://ieeexplore.ieee.org/abstract/document/8270086/). The author of the paper didn't release their version.
 11 | - Match the performance reported in original paper.
 12 | 
 13 | And it has the following features:
 14 | 
 15 | - It can be run as pure Python code, no more build affair. (cuda code moves to cupy, Cython acceleration are optional)
 16 | 
 17 | This implementation is slightlly different from the original paper:
 18 | 
 19 | - Skip pooling is not used here. Informations from conv5_3 layer(the feature map of original Faster R-CNN) is enough for my task, so skip pooling is droped in this repo. What's more, with the advent of new methods like [Feature Pyramid Networks](https://arxiv.org/abs/1612.03144), skip pooling seems to be obsolete :)
 20 | - The RPN net is exactly same as Faster R-CNN, which means only 3X3 conv is applied, rather than 3X3 and 5X5 conv nets in the original paper.
 21 | - Training strategy. The original paper train the RPN and LocNet alternately, but losses of RPN and LocNet are backproped at the same time in this repo.
 22 | 
 23 | **prob_thre** :
 24 | 
 25 | - Hyperparameters in Loc-Faster-RCNN are mostly like Faster R-CNN except for **prob_thre**. 
 26 | - prob_thre is the threshold of probability used when predicting the bounding box, if px or py is greater than prob_thre, this row or column is considered to be part of some object.
 27 | - Different detection tasks may have different appropriate prob_thre to achive best performance. If most objects in the detection task are dense blocks, a higher prob_thre may achive better performance. 
 28 | - You can choose your own prob_thre according to your task characteristics. Use **eval_prob_thre** function in train.py to find out the best prob_thre for your task. Remember to set **load_path** variable in the **utils/config.py** to your best model before calling this function.
 29 | 
 30 | ## 2. Performance
 31 | 
 32 | ### 2.1 Pascal VOC
 33 | 
 34 | Training and test set of Pascal VOC 2007 are used in this repo.
 35 | 
 36 | #### 2.1.1 mAP
 37 | 
 38 | The best prob_thre for Pascal VOC is 0.5. When using prob_thre=0.5, the performance of Loc-Faster-RCNN is listed as follows. So with dataset like Pascal VOC, Loc-Faster-RCNN can not achieve better result than Faster R-CNN. However, when apppied to dataset with lots of small and dense objects, Loc-Faster-RCNN is likely to achieve better performance.
 39 | 
 40 | | Implementation  |  mAP   |
 41 | | :-------------: | :----: |
 42 | | Loc-Faster-RCNN | 0.6527 |
 43 | |  Faster R-CNN   | 0.7097 |
 44 | 
 45 | #### 2.1.2 Differences between models predictions in Pascal VOC
 46 | 
 47 | - LocNet part improves localization accuracy of  Loc-Faster-RCNN by predicting the probability rather than locations. This helps when models is used to detection small objects or objects that are not so obvious. Like shown in the first 2 rows below, Loc-Faster-RCNN detected a person(row 1) and plant(row 2) even the objects are too small and not obvious.
 48 | - However, LocNet part also hinders the model from identifying small parts of objects,which are more densely connected with the background rather than the main part of that object, like tail of a cat or wings of a bird ,as shown in the 3th~5th rows bellow.
 49 | - What's more, if objects are overlaping or densely connected with each other in the same image, Loc-Faster-RCNN also have difficulty in drawing accurate bounding boxes around objects, as shown in the last row bellow.
 50 | 
 51 | |                         Ground Truth                         |                       Loc-Faster-RCNN                        |                         Faster R-CNN                         |
 52 | | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
 53 | | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-1.0.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-1.1.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-1.2.png" width="300" />  </p> |
 54 | | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-2.0.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-2.1.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-2.2.png" width="300" />  </p> |
 55 | | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-3.0.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-3.1.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-3.2.png" width="300" />  </p> |
 56 | | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-4.0.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-4.1.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-4.2.png" width="300" />  </p> |
 57 | | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-5.0.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-5.1.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-5.2.png" width="300" />  </p> |
 58 | | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-6.0.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-6.1.png" width="300" />  </p> | <p float="left"> <img src="http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-6.2.png" width="300" />  </p> |
 59 | 
 60 | ### 2.2 Text detection dataset 
 61 | 
 62 | ICDAR-2011 and ICDAR-2013 are used in training and eveluating.
 63 | 
 64 | TBD.
 65 | 
 66 | ## 3. Install dependencies
 67 | 
 68 | This repo is built basically on [Faster R-CNN](https://github.com/chenyuntc/simple-faster-rcnn-pytorch). You can check this repo to see dependencies.
 69 | 
 70 | ## 4. Train
 71 | 
 72 | Compared with Faster R-CNN, Loc-Faster-RCNN is a little bit harder to train. If same initinal learning rate of 1e-3 is applied, the model may not converge after several epoches because px pr py would be nan. So if you encounter the same problem when using Loc-Faster-RCNN on your own dataset, maybe a smaller learning rate of 1e-4 or 1e-5 should work. 
 73 | 
 74 | ## Troubleshooting
 75 | 
 76 | 
 77 | 
 78 | ## More
 79 | 
 80 | - [x] model structure
 81 | - [ ] maybe : skip pooling
 82 | - [ ] Maybe : conv 3X3 and conv 5X5 in RPN
 83 | - [ ] High likely : Feature Pyramid Network as backbone
 84 | - [ ] High likely : RoI Align rather than RoI Pooling
 85 | 
 86 | ## Acknowledgement
 87 | 
 88 | This work builds on many excellent works, which include:
 89 | 
 90 | - [Faster R-CNN by chenyuntc](https://github.com/chenyuntc/simple-faster-rcnn-pytorch), on which this repo is built on. The best implementation of Faster R-CNN in Pytorch I've ever seen.
 91 | 
 92 | - [LocNet by the paper author](https://github.com/gidariss/LocNet).
 93 | 
 94 |   
 95 | 
 96 | ***
 97 | 
 98 | Licensed under MIT, see the LICENSE for more detail.
 99 | 
100 | Contribution Welcome.
101 | 
102 | If you encounter any problem, feel free to open an issue.
103 | 
104 | Correct me if anything is wrong or unclear.


--------------------------------------------------------------------------------
/model/faster_rcnn_vgg16.py:
--------------------------------------------------------------------------------
  1 | import torch as t
  2 | from torch import nn
  3 | import torch.nn.functional as F
  4 | from torchvision.models import vgg16
  5 | from model.region_proposal_network import RegionProposalNetwork
  6 | from model.faster_rcnn import FasterRCNN
  7 | from model.roi_module import RoIPooling2D
  8 | from utils import array_tool as at
  9 | from utils.config import opt
 10 | 
 11 | 
 12 | def decom_vgg16():
 13 |     # the 30th layer of features is relu of conv5_3
 14 | 
 15 |     # use either caffe or pytorch pretrained model
 16 |     if opt.caffe_pretrain:
 17 |         model = vgg16(pretrained=False)
 18 |         if not opt.load_path:
 19 |             model.load_state_dict(t.load(opt.caffe_pretrain_path))
 20 |     else:
 21 |         model = vgg16(not opt.load_path)  # use pretrained torchvision vgg net
 22 | 
 23 |     features = list(model.features)[:30]
 24 |     
 25 |     # get the classification layer and drop some of them, leave the rest for use
 26 | 
 27 |     # classifier defined in pytorch source code.
 28 |     #     self.classifier = nn.Sequential(
 29 |     # 0    nn.Linear(512 * 7 * 7, 4096),
 30 |     # 1    nn.ReLU(True),
 31 |     # 2    nn.Dropout(),
 32 |     # 3    nn.Linear(4096, 4096),
 33 |     # 4    nn.ReLU(True),
 34 |     # 5    nn.Dropout(),
 35 |     # 6    nn.Linear(4096, num_classes),
 36 |     # )
 37 |     # only two linear and two ReLU layers are kept for classifier.
 38 | 
 39 |     classifier = model.classifier
 40 |     classifier = list(classifier)
 41 |     del classifier[6]
 42 |     if not opt.use_drop:
 43 |         del classifier[5]
 44 |         del classifier[2]
 45 |     classifier = nn.Sequential(*classifier)
 46 | 
 47 |     # freeze top4 conv
 48 |     for layer in features[:10]:
 49 |         for p in layer.parameters():
 50 |             p.requires_grad = False
 51 | 
 52 |     return nn.Sequential(*features), classifier
 53 | 
 54 | 
 55 | class FasterRCNNVGG16(FasterRCNN):
 56 |     """Faster R-CNN based on VGG-16.
 57 |     For descriptions on the interface of this model, please refer to
 58 |     :class:`model.faster_rcnn.FasterRCNN`.
 59 | 
 60 |     Args:
 61 |         n_fg_class (int): The number of classes excluding the background.
 62 |         ratios (list of floats): This is ratios of width to height of
 63 |             the anchors.
 64 |         anchor_scales (list of numbers): This is areas of anchors.
 65 |             Those areas will be the product of the square of an element in
 66 |             :obj:`anchor_scales` and the original area of the reference
 67 |             window.
 68 | 
 69 |     """
 70 | 
 71 | 
 72 |     feat_stride = 16  # downsample 16x for output of conv5 in vgg16
 73 | 
 74 |     def __init__(self,
 75 |                  n_fg_class=20,
 76 |                  ratios=[0.5, 1, 2],
 77 |                  anchor_scales=[8, 16, 32]
 78 |                  ):
 79 |         # extractor is for base net of faster rcnn and classifier is for the final ROIHead part.
 80 |         # These are just some layers, not values.         
 81 |         extractor, classifier = decom_vgg16()
 82 | 
 83 |         rpn = RegionProposalNetwork(
 84 |             512, 512,
 85 |             ratios=ratios,
 86 |             anchor_scales=anchor_scales,
 87 |             feat_stride=self.feat_stride,
 88 |         )
 89 | 
 90 |         head = VGG16RoIHead(
 91 |             n_class=n_fg_class + 1,
 92 |             roi_size=7,
 93 |             spatial_scale=(1. / self.feat_stride),
 94 |             M=28,
 95 |             classifier=classifier
 96 |         )   
 97 | 
 98 |         super(FasterRCNNVGG16, self).__init__(
 99 |             extractor,
100 |             rpn,
101 |             head,
102 |         )
103 | 
104 | 
105 | class VGG16RoIHead(nn.Module):
106 |     """Faster R-CNN Head for VGG-16 based implementation.
107 |     This class is used as a head for Faster R-CNN.
108 |     This outputs class-wise localizations and classification based on feature
109 |     maps in the given RoIs.
110 |     
111 |     Args:
112 |         n_class (int): The number of classes possibly including the background.
113 |         roi_size (int): Height and width of the feature maps after RoI-pooling.
114 |         spatial_scale (float): Scale of the roi is resized.
115 |         classifier (nn.Module): Two layer Linear ported from vgg16
116 | 
117 |     """
118 | 
119 |     def __init__(self, n_class, roi_size, spatial_scale, M,
120 |                  classifier):
121 |         # n_class includes the background
122 |         super(VGG16RoIHead, self).__init__()
123 | 
124 |         self.n_class = n_class
125 |         self.roi_size = roi_size
126 |         self.spatial_scale = spatial_scale
127 |         self.M = M
128 | 
129 | 
130 |         # branch_1
131 |         self.roi_1 = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale)     # roi shape of (N, C, outh, outw)
132 |         self.classifier = classifier
133 |         self.score = nn.Linear(4096, n_class)
134 |   
135 |         # branch_2
136 |         self.roi_2 = RoIPooling2D(self.roi_size*2, self.roi_size*2, self.spatial_scale) # roi shape of (N, C, outh*2, outw*2)
137 |         self.conv_21 = nn.Conv2d(512, 512, (3,3), padding=1)
138 |         self.conv_22 = nn.Conv2d(512, 512, (3,3), padding=1)  # output shape (1, 512, 14, 14)
139 |         self.max_x = nn.MaxPool2d((14,1))         # output shape (1, 512, 1, 14)
140 |         self.max_y = nn.MaxPool2d((1,14))         # output shape (1, 512, 14, 1)
141 |         self.fc_x = nn.Linear(7168, M)
142 |         self.fc_y = nn.Linear(7168, M)
143 | 
144 | 
145 |         normal_init(self.score, 0, 0.01)
146 |         normal_init(self.conv_21, 0, 0.01)
147 |         normal_init(self.conv_22, 0, 0.01)
148 |         normal_init(self.fc_x, 0, 0.01)
149 |         normal_init(self.fc_y, 0, 0.01)
150 | 
151 | 
152 |     def forward(self, x, rois, seach_regions, roi_indices):
153 |         """Forward the chain.
154 | 
155 |         We assume that there are :math:`N` batches.
156 | 
157 |         Args:
158 |             x (Variable): 4D image variable. (batch_size, channels, width, height)
159 |             rois (Tensor): A bounding box array containing coordinates of
160 |                 proposal boxes.  This is a concatenation of bounding box
161 |                 arrays from multiple images in the batch.
162 |                 Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
163 |                 RoIs from the :math:`i` th image,
164 |                 :math:`R' = \\sum _{i=1} ^ N R_i`.
165 |             roi_indices (Tensor): An array containing indices of images to
166 |                 which bounding boxes correspond to. Its shape is :math:`(R',)`.
167 | 
168 |         """
169 |         # in case roi_indices is  ndarray
170 |         
171 |         roi_indices = at.totensor(roi_indices).float()
172 |         
173 |         rois = at.totensor(rois).float()        
174 |         indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
175 |         # NOTE: important: yx->xy
176 |         xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
177 |         indices_and_rois = t.autograd.Variable(xy_indices_and_rois.contiguous()) # [index, x1, y1, x2, y2] now
178 | 
179 |         seach_regions = at.totensor(seach_regions).float()  
180 |         indices_and_search_regions = t.cat([roi_indices[:, None], seach_regions], dim=1)
181 |         # NOTE: important: yx->xy
182 |         xy_indices_and_search_regions = indices_and_search_regions[:, [0, 2, 1, 4, 3]]
183 |         indices_and_search_regions = t.autograd.Variable(xy_indices_and_search_regions.contiguous()) # [index, x1, y1, x2, y2] now
184 | 
185 |         # branch_1
186 |         pool_1 = self.roi_1(x, indices_and_rois) # get all the ROI pooling, shape of (N, C, outh, outw)
187 |         pool_1 = pool_1.view(pool_1.size(0), -1)   # shape of shape of (N, C * outh * outw) where C=512    
188 |         fc7 = self.classifier(pool_1)
189 |         roi_scores = self.score(fc7)
190 |   
191 |         # branch_2
192 |         pool_2 = self.roi_2(x, indices_and_search_regions)
193 |         conv_1 = self.conv_21(pool_2)
194 |         conv_2 = self.conv_22(conv_1)
195 |         max_x_ = self.max_x(conv_2)
196 |         max_y_ = self.max_y(conv_2)
197 |         max_x_ = max_x_.view(max_x_.size(0), -1)
198 |         max_y_ = max_y_.view(max_y_.size(0), -1)
199 |         px = F.sigmoid(self.fc_x(max_x_))
200 |         py = F.sigmoid(self.fc_y(max_y_))
201 | 
202 |         return (px, py), roi_scores
203 | 
204 | 
205 | def normal_init(m, mean, stddev, truncated=False):
206 |     """
207 |     weight initalizer: truncated normal and random normal.
208 |     """
209 |     # x is a parameter
210 |     if truncated:
211 |         m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
212 |     else:
213 |         m.weight.data.normal_(mean, stddev)
214 |         m.bias.data.zero_()
215 | 


--------------------------------------------------------------------------------
/utils/vis_tool.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import numpy as np
  4 | import matplotlib
  5 | import torch as t
  6 | import visdom
  7 | 
  8 | matplotlib.use('Agg')
  9 | from matplotlib import pyplot as plot
 10 | 
 11 | # from data.voc_dataset import VOC_BBOX_LABEL_NAMES
 12 | 
 13 | 
 14 | VOC_BBOX_LABEL_NAMES = (
 15 |     'fly',
 16 |     'bike',
 17 |     'bird',
 18 |     'boat',
 19 |     'pin',
 20 |     'bus',
 21 |     'c',
 22 |     'cat',
 23 |     'chair',
 24 |     'cow',
 25 |     'table',
 26 |     'dog',
 27 |     'horse',
 28 |     'moto',
 29 |     'p',
 30 |     'plant',
 31 |     'shep',
 32 |     'sofa',
 33 |     'train',
 34 |     'tv',
 35 | )
 36 | 
 37 | 
 38 | def vis_image(img, ax=None):
 39 |     """Visualize a color image.
 40 | 
 41 |     Args:
 42 |         img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
 43 |             This is in RGB format and the range of its value is
 44 |             :math:`[0, 255]`.
 45 |         ax (matplotlib.axes.Axis): The visualization is displayed on this
 46 |             axis. If this is :obj:`None` (default), a new axis is created.
 47 | 
 48 |     Returns:
 49 |         ~matploblib.axes.Axes:
 50 |         Returns the Axes object with the plot for further tweaking.
 51 | 
 52 |     """
 53 | 
 54 |     if ax is None:
 55 |         fig = plot.figure()
 56 |         ax = fig.add_subplot(1, 1, 1)
 57 |     # CHW -> HWC
 58 |     img = img.transpose((1, 2, 0))
 59 |     ax.imshow(img.astype(np.uint8))
 60 |     return ax
 61 | 
 62 | 
 63 | def vis_bbox(img, bbox, label=None, score=None, ax=None):
 64 |     """Visualize bounding boxes inside image.
 65 | 
 66 |     Args:
 67 |         img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
 68 |             This is in RGB format and the range of its value is
 69 |             :math:`[0, 255]`.
 70 |         bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where
 71 |             :math:`R` is the number of bounding boxes in the image.
 72 |             Each element is organized
 73 |             by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis.
 74 |         label (~numpy.ndarray): An integer array of shape :math:`(R,)`.
 75 |             The values correspond to id for label names stored in
 76 |             :obj:`label_names`. This is optional.
 77 |         score (~numpy.ndarray): A float array of shape :math:`(R,)`.
 78 |              Each value indicates how confident the prediction is.
 79 |              This is optional.
 80 |         label_names (iterable of strings): Name of labels ordered according
 81 |             to label ids. If this is :obj:`None`, labels will be skipped.
 82 |         ax (matplotlib.axes.Axis): The visualization is displayed on this
 83 |             axis. If this is :obj:`None` (default), a new axis is created.
 84 | 
 85 |     Returns:
 86 |         ~matploblib.axes.Axes:
 87 |         Returns the Axes object with the plot for further tweaking.
 88 | 
 89 |     """
 90 | 
 91 |     label_names = list(VOC_BBOX_LABEL_NAMES) + ['bg']
 92 |     # add for index `-1`
 93 |     if label is not None and not len(bbox) == len(label):
 94 |         raise ValueError('The length of label must be same as that of bbox')
 95 |     if score is not None and not len(bbox) == len(score):
 96 |         raise ValueError('The length of score must be same as that of bbox')
 97 | 
 98 |     # Returns newly instantiated matplotlib.axes.Axes object if ax is None
 99 |     ax = vis_image(img, ax=ax)
100 | 
101 |     # If there is no bounding box to display, visualize the image and exit.
102 |     if len(bbox) == 0:
103 |         return ax
104 | 
105 |     for i, bb in enumerate(bbox):
106 |         xy = (bb[1], bb[0])
107 |         height = bb[2] - bb[0]
108 |         width = bb[3] - bb[1]
109 |         ax.add_patch(plot.Rectangle(
110 |             xy, width, height, fill=False, edgecolor='red', linewidth=2))
111 | 
112 |         caption = list()
113 | 
114 |         if label is not None and label_names is not None:
115 |             lb = label[i]
116 |             if not (-1 <= lb < len(label_names)):  # modfy here to add backgroud
117 |                 raise ValueError('No corresponding name is given')
118 |             caption.append(label_names[lb])
119 |         if score is not None:
120 |             sc = score[i]
121 |             caption.append('{:.2f}'.format(sc))
122 | 
123 |         if len(caption) > 0:
124 |             ax.text(bb[1], bb[0],
125 |                     ': '.join(caption),
126 |                     style='italic',
127 |                     bbox={'facecolor': 'white', 'alpha': 0.5, 'pad': 0})
128 |     return ax
129 | 
130 | 
131 | def fig2data(fig):
132 |     """
133 |     brief Convert a Matplotlib figure to a 4D numpy array with RGBA 
134 |     channels and return it
135 | 
136 |     @param fig： a matplotlib figure
137 |     @return a numpy 3D array of RGBA values
138 |     """
139 |     # draw the renderer
140 |     fig.canvas.draw()
141 | 
142 |     # Get the RGBA buffer from the figure
143 |     w, h = fig.canvas.get_width_height()
144 |     buf = np.fromstring(fig.canvas.tostring_argb(), dtype=np.uint8)
145 |     buf.shape = (w, h, 4)
146 | 
147 |     # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode
148 |     buf = np.roll(buf, 3, axis=2)
149 |     return buf.reshape(h, w, 4)
150 | 
151 | 
152 | def fig4vis(fig):
153 |     """
154 |     convert figure to ndarray
155 |     """
156 |     ax = fig.get_figure()
157 |     img_data = fig2data(ax).astype(np.int32)
158 |     plot.close()
159 |     # HWC->CHW
160 |     return img_data[:, :, :3].transpose((2, 0, 1)) / 255.
161 | 
162 | 
163 | def visdom_bbox(*args, **kwargs):
164 |     fig = vis_bbox(*args, **kwargs)
165 |     data = fig4vis(fig)
166 |     return data
167 | 
168 | 
169 | class Visualizer(object):
170 |     """
171 |     wrapper for visdom
172 |     you can still access naive visdom function by 
173 |     self.line, self.scater,self._send,etc.
174 |     due to the implementation of `__getattr__`
175 |     """
176 | 
177 |     def __init__(self, env='default', **kwargs):
178 |         self.vis = visdom.Visdom(env=env, **kwargs)
179 |         self._vis_kw = kwargs
180 | 
181 |         # e.g.（’loss',23） the 23th value of loss
182 |         self.index = {}
183 |         self.log_text = ''
184 | 
185 |     def reinit(self, env='default', **kwargs):
186 |         """
187 |         change the config of visdom
188 |         """
189 |         self.vis = visdom.Visdom(env=env, **kwargs)
190 |         return self
191 | 
192 |     def plot_many(self, d):
193 |         """
194 |         plot multi values
195 |         @params d: dict (name,value) i.e. ('loss',0.11)
196 |         """
197 |         for k, v in d.items():
198 |             if v is not None:
199 |                 self.plot(k, v)
200 | 
201 |     def img_many(self, d):
202 |         for k, v in d.items():
203 |             self.img(k, v)
204 | 
205 |     def plot(self, name, y, **kwargs):
206 |         """
207 |         self.plot('loss',1.00)
208 |         """
209 |         x = self.index.get(name, 0)
210 |         self.vis.line(Y=np.array([y]), X=np.array([x]),
211 |                       win=name,
212 |                       opts=dict(title=name),
213 |                       update=None if x == 0 else 'append',
214 |                       **kwargs
215 |                       )
216 |         self.index[name] = x + 1
217 | 
218 |     def img(self, name, img_, **kwargs):
219 |         """
220 |         self.img('input_img',t.Tensor(64,64))
221 |         self.img('input_imgs',t.Tensor(3,64,64))
222 |         self.img('input_imgs',t.Tensor(100,1,64,64))
223 |         self.img('input_imgs',t.Tensor(100,3,64,64),nrows=10)
224 |         ！！！don‘t ~~self.img('input_imgs',t.Tensor(100,64,64),nrows=10)~~！！！
225 |         """
226 |         self.vis.images(t.Tensor(img_).cpu().numpy(),
227 |                         win=name,
228 |                         opts=dict(title=name),
229 |                         **kwargs
230 |                         )
231 | 
232 |     def log(self, info, win='log_text'):
233 |         """
234 |         self.log({'loss':1,'lr':0.0001})
235 |         """
236 |         self.log_text += ('[{time}] {info} <br>'.format(
237 |             time=time.strftime('%m%d_%H%M%S'), \
238 |             info=info))
239 |         self.vis.text(self.log_text, win)
240 | 
241 |     def __getattr__(self, name):
242 |         return getattr(self.vis, name)
243 | 
244 |     def state_dict(self):
245 |         return {
246 |             'index': self.index,
247 |             'vis_kw': self._vis_kw,
248 |             'log_text': self.log_text,
249 |             'env': self.vis.env
250 |         }
251 | 
252 |     def load_state_dict(self, d):
253 |         self.vis = visdom.Visdom(env=d.get('env', self.vis.env), **(self.d.get('vis_kw')))
254 |         self.log_text = d.get('log_text', '')
255 |         self.index = d.get('index', dict())
256 |         return self
257 | 


--------------------------------------------------------------------------------
/model/region_proposal_network.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from torch.nn import functional as F
  3 | import torch as t
  4 | from torch import nn
  5 | 
  6 | from model.utils.bbox_tools import generate_anchor_base
  7 | from model.utils.creator_tool import ProposalCreator
  8 | 
  9 | 
 10 | class RegionProposalNetwork(nn.Module):
 11 |     """Region Proposal Network introduced in Faster R-CNN.
 12 | 
 13 |     This is Region Proposal Network introduced in Faster R-CNN [#]_.
 14 |     This takes features extracted from images and propose
 15 |     class agnostic bounding boxes around "objects".
 16 | 
 17 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
 18 |     Faster R-CNN: Towards Real-Time Object Detection with \
 19 |     Region Proposal Networks. NIPS 2015.
 20 | 
 21 |     Args:
 22 |         in_channels (int): The channel size of input.
 23 |         mid_channels (int): The channel size of the intermediate tensor.
 24 |         ratios (list of floats): This is ratios of width to height of
 25 |             the anchors.
 26 |         anchor_scales (list of numbers): This is areas of anchors.
 27 |             Those areas will be the product of the square of an element in
 28 |             :obj:`anchor_scales` and the original area of the reference
 29 |             window.
 30 |         feat_stride (int): Stride size after extracting features from an
 31 |             image.
 32 |         initialW (callable): Initial weight value. If :obj:`None` then this
 33 |             function uses Gaussian distribution scaled by 0.1 to
 34 |             initialize weight.
 35 |             May also be a callable that takes an array and edits its values.
 36 |         proposal_creator_params (dict): Key valued paramters for
 37 |             :class:`model.utils.creator_tools.ProposalCreator`.
 38 | 
 39 |     .. seealso::
 40 |         :class:`~model.utils.creator_tools.ProposalCreator`
 41 | 
 42 |     """
 43 | 
 44 |     def __init__(
 45 |             self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2],
 46 |             anchor_scales=[8, 16, 32], feat_stride=16,
 47 |             proposal_creator_params=dict(),
 48 |     ):
 49 |         super(RegionProposalNetwork, self).__init__()
 50 | 
 51 |         # 在reshaped image的尺度上，以feature map上一个点对应的一个16*16的左上角点为原点，计算得到的所有anchorbox的角点的相对坐标
 52 |         # 为了后边计算reshaped image上的所有anchor box做准备
 53 |         self.anchor_base = generate_anchor_base(anchor_scales=anchor_scales, ratios=ratios)
 54 |         self.feat_stride = feat_stride
 55 |         self.proposal_layer = ProposalCreator(self, **proposal_creator_params) # parent_model = instance of RegionProposalNetwork, and use other default parameters
 56 |         n_anchor = self.anchor_base.shape[0]
 57 |         self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
 58 |         self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)
 59 |         self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
 60 |         normal_init(self.conv1, 0, 0.01)
 61 |         normal_init(self.score, 0, 0.01)
 62 |         normal_init(self.loc, 0, 0.01)
 63 | 
 64 |     def forward(self, x, img_size, scale=1.):
 65 |         """Forward Region Proposal Network.
 66 | 
 67 |         Here are notations.
 68 | 
 69 |         * :math:`N` is batch size.
 70 |         * :math:`C` channel size of the input.
 71 |         * :math:`H` and :math:`W` are height and witdh of the input feature.
 72 |         * :math:`A` is number of anchors assigned to each pixel.
 73 | 
 74 |         Args:
 75 |             x (~torch.autograd.Variable): The Features extracted from images.
 76 |                 Its shape is :math:`(N, C, H, W)`.
 77 |             img_size (tuple of ints): A tuple :obj:`height, width`,
 78 |                 which contains image size after scaling.
 79 |             scale (float): The amount of scaling done to the input images after
 80 |                 reading them from files.
 81 | 
 82 |         Returns:
 83 |             (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array):
 84 | 
 85 |             This is a tuple of five following values.
 86 | 
 87 |             * **rpn_locs**: Predicted bounding box offsets and scales for anchors. Its shape is :math:`(N, H W A, 4)`.
 88 |             * **rpn_scores**:  Predicted foreground scores for anchors. Its shape is :math:`(N, H W A, 2)`.
 89 |             * **rois**: A bounding box array containing coordinates of proposal boxes.  This is a concatenation of bounding box \
 90 |                 arrays from multiple images in the batch. Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \
 91 |                 bounding boxes from the :math:`i` th image, :math:`R' = \\sum _{i=1} ^ N R_i`.
 92 |             * **roi_indices**: An array containing indices of images to which RoIs correspond to. Its shape is :math:`(R',)`.
 93 |             * **anchor**: Coordinates of enumerated shifted anchors. Its shape is :math:`(H W A, 4)`.
 94 | 
 95 |         """
 96 |         n, _, hh, ww = x.shape  # n is always 1 here.
 97 | 
 98 |         # reshaped image中的所有anchor box, shape (hh*ww*n_anchor, 4)
 99 |         anchor = _enumerate_shifted_anchor(np.array(self.anchor_base),
100 |                                            self.feat_stride, hh, ww)
101 | 
102 |         n_anchor = anchor.shape[0] // (hh * ww)  # feature map中每一个点上的anchor box数量
103 |         h = F.relu(self.conv1(x))
104 |         
105 |         rpn_locs = self.loc(h)
106 |         rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)   # shape (n, hh*ww*n_anchor, 4)
107 |         
108 |         rpn_scores = self.score(h)
109 |         rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
110 |         rpn_scores = rpn_scores.view(n, -1, 2)       # shape (n, hh*ww*n_anchor, 2)
111 | 
112 |         rpn_fg_scores = rpn_scores.view(n, hh, ww, n_anchor, 2)[:, :, :, :, 1].contiguous() # 该anchor是前景的概率
113 |         rpn_fg_scores = rpn_fg_scores.view(n, -1)    # shape (n, hh*ww*n_anchor)
114 | 
115 |         rois = list()
116 |         search_regions = list()
117 |         roi_indices = list()
118 |         for i in range(n):
119 |             roi, search_region = self.proposal_layer(rpn_locs[i].cpu().data.numpy(),
120 |                                                       rpn_fg_scores[i].cpu().data.numpy(),
121 |                                                       anchor, img_size,
122 |                                                       scale=scale)
123 |             batch_index = i * np.ones((len(roi),), dtype=np.int32)
124 |             rois.append(roi)
125 |             search_regions.append(search_region)
126 |             roi_indices.append(batch_index)
127 | 
128 |         rois = np.concatenate(rois, axis=0)                      # shape (num_rois, 4)
129 |         search_regions = np.concatenate(search_regions, axis=0)  # shape (num_rois, 4)
130 |         roi_indices = np.concatenate(roi_indices, axis=0)   # shape (num_rois,)
131 |         return rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor
132 | 
133 | 
134 | 
135 | def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
136 |     # Enumerate all shifted anchors:
137 |     #
138 |     # add A anchors (1, A, 4) to
139 |     # cell K shifts (K, 1, 4) to get
140 |     # shift anchors (K, A, 4)
141 |     # reshape to (K*A, 4) shifted anchors
142 |     # return (K*A, 4)
143 | 
144 |     # !TODO: add support for torch.CudaTensor
145 |     # xp = cuda.get_array_module(anchor_base)
146 |     # it seems that it can't be boosed using GPU
147 |     import numpy as xp
148 |     shift_y = xp.arange(0, height * feat_stride, feat_stride)
149 |     shift_x = xp.arange(0, width * feat_stride, feat_stride)
150 |     shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
151 |     shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
152 |                       shift_y.ravel(), shift_x.ravel()), axis=1)
153 | 
154 |     A = anchor_base.shape[0]
155 |     K = shift.shape[0]
156 |     anchor = anchor_base.reshape((1, A, 4)) + \
157 |              shift.reshape((1, K, 4)).transpose((1, 0, 2))
158 |     anchor = anchor.reshape((K * A, 4)).astype(np.float32)
159 |     return anchor
160 | 
161 | 
162 | def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width):
163 |     # Enumerate all shifted anchors:
164 |     #
165 |     # add A anchors (1, A, 4) to
166 |     # cell K shifts (K, 1, 4) to get
167 |     # shift anchors (K, A, 4)
168 |     # reshape to (K*A, 4) shifted anchors
169 |     # return (K*A, 4)
170 | 
171 |     # !TODO: add support for torch.CudaTensor
172 |     # xp = cuda.get_array_module(anchor_base)
173 |     import torch as t
174 |     shift_y = t.arange(0, height * feat_stride, feat_stride)
175 |     shift_x = t.arange(0, width * feat_stride, feat_stride)
176 |     shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
177 |     shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
178 |                       shift_y.ravel(), shift_x.ravel()), axis=1)
179 | 
180 |     A = anchor_base.shape[0]
181 |     K = shift.shape[0]
182 |     anchor = anchor_base.reshape((1, A, 4)) + \
183 |              shift.reshape((1, K, 4)).transpose((1, 0, 2))
184 |     anchor = anchor.reshape((K * A, 4)).astype(np.float32)
185 |     return anchor
186 | 
187 | 
188 | def normal_init(m, mean, stddev, truncated=False):
189 |     """
190 |     weight initalizer: truncated normal and random normal.
191 |     """
192 |     # x is a parameter
193 |     if truncated:
194 |         m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
195 |     else:
196 |         m.weight.data.normal_(mean, stddev)
197 |         m.bias.data.zero_()
198 | 


--------------------------------------------------------------------------------
/data/util.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from PIL import Image
  3 | import random
  4 | 
  5 | 
  6 | def read_image(path, dtype=np.float32, color=True):
  7 |     """Read an image from a file.
  8 | 
  9 |     This function reads an image from given file. The image is CHW format and
 10 |     the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the
 11 |     order of the channels is RGB.
 12 | 
 13 |     Args:
 14 |         path (str): A path of image file.
 15 |         dtype: The type of array. The default value is :obj:`~numpy.float32`.
 16 |         color (bool): This option determines the number of channels.
 17 |             If :obj:`True`, the number of channels is three. In this case,
 18 |             the order of the channels is RGB. This is the default behaviour.
 19 |             If :obj:`False`, this function returns a grayscale image.
 20 | 
 21 |     Returns:
 22 |         ~numpy.ndarray: An image.
 23 |     """
 24 | 
 25 |     f = Image.open(path)
 26 |     try:
 27 |         if color:
 28 |             img = f.convert('RGB')
 29 |         else:
 30 |             img = f.convert('P')
 31 |         img = np.asarray(img, dtype=dtype)
 32 |     finally:
 33 |         if hasattr(f, 'close'):
 34 |             f.close()
 35 | 
 36 |     if img.ndim == 2:
 37 |         # reshape (H, W) -> (1, H, W)
 38 |         return img[np.newaxis]
 39 |     else:
 40 |         # transpose (H, W, C) -> (C, H, W)
 41 |         return img.transpose((2, 0, 1))
 42 | 
 43 | 
 44 | def resize_bbox(bbox, in_size, out_size):
 45 |     """Resize bounding boxes according to image resize.
 46 | 
 47 |     The bounding boxes are expected to be packed into a two dimensional
 48 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
 49 |     bounding boxes in the image. The second axis represents attributes of
 50 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
 51 |     where the four attributes are coordinates of the top left and the
 52 |     bottom right vertices.
 53 | 
 54 |     Args:
 55 |         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
 56 |             :math:`R` is the number of bounding boxes.
 57 |         in_size (tuple): A tuple of length 2. The height and the width
 58 |             of the image before resized.
 59 |         out_size (tuple): A tuple of length 2. The height and the width
 60 |             of the image after resized.
 61 | 
 62 |     Returns:
 63 |         ~numpy.ndarray:
 64 |         Bounding boxes rescaled according to the given image shapes.
 65 | 
 66 |     """
 67 |     bbox = bbox.copy()
 68 |     y_scale = float(out_size[0]) / in_size[0]
 69 |     x_scale = float(out_size[1]) / in_size[1]
 70 |     bbox[:, 0] = y_scale * bbox[:, 0]
 71 |     bbox[:, 2] = y_scale * bbox[:, 2]
 72 |     bbox[:, 1] = x_scale * bbox[:, 1]
 73 |     bbox[:, 3] = x_scale * bbox[:, 3]
 74 |     return bbox
 75 | 
 76 | 
 77 | def flip_bbox(bbox, size, y_flip=False, x_flip=False):
 78 |     """Flip bounding boxes accordingly.
 79 | 
 80 |     The bounding boxes are expected to be packed into a two dimensional
 81 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
 82 |     bounding boxes in the image. The second axis represents attributes of
 83 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
 84 |     where the four attributes are coordinates of the top left and the
 85 |     bottom right vertices.
 86 | 
 87 |     Args:
 88 |         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
 89 |             :math:`R` is the number of bounding boxes.
 90 |         size (tuple): A tuple of length 2. The height and the width
 91 |             of the image before resized.
 92 |         y_flip (bool): Flip bounding box according to a vertical flip of
 93 |             an image.
 94 |         x_flip (bool): Flip bounding box according to a horizontal flip of
 95 |             an image.
 96 | 
 97 |     Returns:
 98 |         ~numpy.ndarray:
 99 |         Bounding boxes flipped according to the given flips.
100 | 
101 |     """
102 |     H, W = size
103 |     bbox = bbox.copy()
104 |     if y_flip:
105 |         y_max = H - bbox[:, 0]
106 |         y_min = H - bbox[:, 2]
107 |         bbox[:, 0] = y_min
108 |         bbox[:, 2] = y_max
109 |     if x_flip:
110 |         x_max = W - bbox[:, 1]
111 |         x_min = W - bbox[:, 3]
112 |         bbox[:, 1] = x_min
113 |         bbox[:, 3] = x_max
114 |     return bbox
115 | 
116 | 
117 | def crop_bbox(
118 |         bbox, y_slice=None, x_slice=None,
119 |         allow_outside_center=True, return_param=False):
120 |     """Translate bounding boxes to fit within the cropped area of an image.
121 | 
122 |     This method is mainly used together with image cropping.
123 |     This method translates the coordinates of bounding boxes like
124 |     :func:`data.util.translate_bbox`. In addition,
125 |     this function truncates the bounding boxes to fit within the cropped area.
126 |     If a bounding box does not overlap with the cropped area,
127 |     this bounding box will be removed.
128 | 
129 |     The bounding boxes are expected to be packed into a two dimensional
130 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
131 |     bounding boxes in the image. The second axis represents attributes of
132 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
133 |     where the four attributes are coordinates of the top left and the
134 |     bottom right vertices.
135 | 
136 |     Args:
137 |         bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
138 |             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
139 |         y_slice (slice): The slice of y axis.
140 |         x_slice (slice): The slice of x axis.
141 |         allow_outside_center (bool): If this argument is :obj:`False`,
142 |             bounding boxes whose centers are outside of the cropped area
143 |             are removed. The default value is :obj:`True`.
144 |         return_param (bool): If :obj:`True`, this function returns
145 |             indices of kept bounding boxes.
146 | 
147 |     Returns:
148 |         ~numpy.ndarray or (~numpy.ndarray, dict):
149 | 
150 |         If :obj:`return_param = False`, returns an array :obj:`bbox`.
151 | 
152 |         If :obj:`return_param = True`,
153 |         returns a tuple whose elements are :obj:`bbox, param`.
154 |         :obj:`param` is a dictionary of intermediate parameters whose
155 |         contents are listed below with key, value-type and the description
156 |         of the value.
157 | 
158 |         * **index** (*numpy.ndarray*): An array holding indices of used \
159 |             bounding boxes.
160 | 
161 |     """
162 | 
163 |     t, b = _slice_to_bounds(y_slice)
164 |     l, r = _slice_to_bounds(x_slice)
165 |     crop_bb = np.array((t, l, b, r))
166 | 
167 |     if allow_outside_center:
168 |         mask = np.ones(bbox.shape[0], dtype=bool)
169 |     else:
170 |         center = (bbox[:, :2] + bbox[:, 2:]) / 2
171 |         mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \
172 |             .all(axis=1)
173 | 
174 |     bbox = bbox.copy()
175 |     bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
176 |     bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
177 |     bbox[:, :2] -= crop_bb[:2]
178 |     bbox[:, 2:] -= crop_bb[:2]
179 | 
180 |     mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
181 |     bbox = bbox[mask]
182 | 
183 |     if return_param:
184 |         return bbox, {'index': np.flatnonzero(mask)}
185 |     else:
186 |         return bbox
187 | 
188 | 
189 | def _slice_to_bounds(slice_):
190 |     if slice_ is None:
191 |         return 0, np.inf
192 | 
193 |     if slice_.start is None:
194 |         l = 0
195 |     else:
196 |         l = slice_.start
197 | 
198 |     if slice_.stop is None:
199 |         u = np.inf
200 |     else:
201 |         u = slice_.stop
202 | 
203 |     return l, u
204 | 
205 | 
206 | def translate_bbox(bbox, y_offset=0, x_offset=0):
207 |     """Translate bounding boxes.
208 | 
209 |     This method is mainly used together with image transforms, such as padding
210 |     and cropping, which translates the left top point of the image from
211 |     coordinate :math:`(0, 0)` to coordinate
212 |     :math:`(y, x) = (y_{offset}, x_{offset})`.
213 | 
214 |     The bounding boxes are expected to be packed into a two dimensional
215 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
216 |     bounding boxes in the image. The second axis represents attributes of
217 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
218 |     where the four attributes are coordinates of the top left and the
219 |     bottom right vertices.
220 | 
221 |     Args:
222 |         bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
223 |             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
224 |         y_offset (int or float): The offset along y axis.
225 |         x_offset (int or float): The offset along x axis.
226 | 
227 |     Returns:
228 |         ~numpy.ndarray:
229 |         Bounding boxes translated according to the given offsets.
230 | 
231 |     """
232 | 
233 |     out_bbox = bbox.copy()
234 |     out_bbox[:, :2] += (y_offset, x_offset)
235 |     out_bbox[:, 2:] += (y_offset, x_offset)
236 | 
237 |     return out_bbox
238 | 
239 | 
240 | def random_flip(img, y_random=False, x_random=False,
241 |                 return_param=False, copy=False):
242 |     """Randomly flip an image in vertical or horizontal direction.
243 | 
244 |     Args:
245 |         img (~numpy.ndarray): An array that gets flipped. This is in
246 |             CHW format.
247 |         y_random (bool): Randomly flip in vertical direction.
248 |         x_random (bool): Randomly flip in horizontal direction.
249 |         return_param (bool): Returns information of flip.
250 |         copy (bool): If False, a view of :obj:`img` will be returned.
251 | 
252 |     Returns:
253 |         ~numpy.ndarray or (~numpy.ndarray, dict):
254 | 
255 |         If :obj:`return_param = False`,
256 |         returns an array :obj:`out_img` that is the result of flipping.
257 | 
258 |         If :obj:`return_param = True`,
259 |         returns a tuple whose elements are :obj:`out_img, param`.
260 |         :obj:`param` is a dictionary of intermediate parameters whose
261 |         contents are listed below with key, value-type and the description
262 |         of the value.
263 | 
264 |         * **y_flip** (*bool*): Whether the image was flipped in the\
265 |             vertical direction or not.
266 |         * **x_flip** (*bool*): Whether the image was flipped in the\
267 |             horizontal direction or not.
268 | 
269 |     """
270 |     y_flip, x_flip = False, False
271 |     if y_random:
272 |         y_flip = random.choice([True, False])
273 |     if x_random:
274 |         x_flip = random.choice([True, False])
275 | 
276 |     if y_flip:
277 |         img = img[:, ::-1, :]
278 |     if x_flip:
279 |         img = img[:, :, ::-1]
280 | 
281 |     if copy:
282 |         img = img.copy()
283 | 
284 |     if return_param:
285 |         return img, {'y_flip': y_flip, 'x_flip': x_flip}
286 |     else:
287 |         return img
288 | 


--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
  1 | from collections import namedtuple
  2 | import time
  3 | from torch.nn import functional as F
  4 | from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator
  5 | 
  6 | from torch import nn
  7 | import torch as t
  8 | from torch.autograd import Variable
  9 | from utils import array_tool as at
 10 | from utils.vis_tool import Visualizer
 11 | 
 12 | from utils.config import opt
 13 | from torchnet.meter import ConfusionMeter, AverageValueMeter
 14 | 
 15 | 
 16 | import numpy as np
 17 | 
 18 | 
 19 | 
 20 | 
 21 | # create a namedtuple
 22 | LossTuple = namedtuple('LossTuple',
 23 |                        ['rpn_loc_loss',
 24 |                         'rpn_cls_loss',
 25 |                         'roi_loc_loss',
 26 |                         'roi_cls_loss',
 27 |                         'total_loss'
 28 |                         ])
 29 | 
 30 | 
 31 | class FasterRCNNTrainer(nn.Module):
 32 |     """wrapper for conveniently training. return losses
 33 | 
 34 |     The losses include:
 35 | 
 36 |     * :obj:`rpn_loc_loss`: The localization loss for Region Proposal Network (RPN).
 37 |     * :obj:`rpn_cls_loss`: The classification loss for RPN.
 38 |     * :obj:`roi_loc_loss`: The localization loss for the head module.
 39 |     * :obj:`roi_cls_loss`: The classification loss for the head module.
 40 |     * :obj:`total_loss`: The sum of 4 loss above.
 41 | 
 42 |     Args:
 43 |         faster_rcnn (model.FasterRCNN):
 44 |             A Faster R-CNN model that is going to be trained.
 45 |     """
 46 | 
 47 |     def __init__(self, faster_rcnn):
 48 |         super(FasterRCNNTrainer, self).__init__()
 49 | 
 50 |         self.faster_rcnn = faster_rcnn
 51 |         self.rpn_sigma = opt.rpn_sigma
 52 |         self.roi_sigma = opt.roi_sigma
 53 | 
 54 |         # target creator create gt_bbox gt_label etc as training targets. 
 55 |         self.anchor_target_creator = AnchorTargetCreator()
 56 |         self.proposal_target_creator = ProposalTargetCreator()
 57 | 
 58 |         self.optimizer = self.faster_rcnn.get_optimizer()
 59 |         # visdom wrapper
 60 |         self.vis = Visualizer(env=opt.env)
 61 | 
 62 |         # indicators for training status
 63 |         self.rpn_cm = ConfusionMeter(2)
 64 |         self.roi_cm = ConfusionMeter(21)
 65 |         self.meters = {k: AverageValueMeter() for k in LossTuple._fields}  # average loss
 66 | 
 67 |     def forward(self, imgs, bboxes, labels, scale):
 68 |         """Forward Faster R-CNN and calculate losses.
 69 | 
 70 |         Here are notations used.
 71 | 
 72 |         * :math:`N` is the batch size.
 73 |         * :math:`R` is the number of bounding boxes per image.
 74 | 
 75 |         Currently, only :math:`N=1` is supported.
 76 | 
 77 |         Args:
 78 |             imgs (~torch.autograd.Variable): A variable with a batch of images.
 79 |             bboxes (~torch.autograd.Variable): A batch of bounding boxes.
 80 |                 Its shape is :math:`(N, R, 4)`.
 81 |             labels (~torch.autograd..Variable): A batch of labels.
 82 |                 Its shape is :math:`(N, R)`. The background is excluded from
 83 |                 the definition, which means that the range of the value
 84 |                 is :math:`[0, L - 1]`. :math:`L` is the number of foreground
 85 |                 classes.
 86 |             scale (float): Amount of scaling applied to
 87 |                 the raw image during preprocessing.
 88 | 
 89 |         Returns:
 90 |             namedtuple of 5 losses
 91 |         """
 92 |         n = bboxes.shape[0] # number of input images one time
 93 |         if n != 1:
 94 |             raise ValueError('Currently only batch size 1 is supported.')
 95 | 
 96 |         _, _, H, W = imgs.shape # should be (1,3,H,W)
 97 |         img_size = (H, W)
 98 | 
 99 |         # need more feature maps here when you are trying to use features of different scale
100 |         features = self.faster_rcnn.extractor(imgs)
101 | 
102 |         rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = self.faster_rcnn.rpn(features, img_size, scale)
103 | 
104 |         # Since batch size is one, convert variables to singular form
105 |         # different parameters here :
106 |         #     num_boxes : number of ground truth bounding boxes in a image.
107 |         #     num_anchors : number of anchors in images(or to say in a feature map).
108 |         #     num_rois : number of ROIs that are generated by RPN, which will be used in Fast RCNN.
109 |         bbox = bboxes[0]                # shape (num_boxes, 4)
110 |         label = labels[0]               # shape (num_boxes,)
111 |         rpn_score = rpn_scores[0]       # shape (num_anchors,)
112 |         rpn_loc = rpn_locs[0]           # shape (num_anchors, 4)
113 |         roi = rois                      # shape (num_rois, 4)
114 |         search_region = search_regions  # shape (num_rois, 4)
115 | 
116 |         # Sample RoIs and forward
117 |         # it's fine to break the computation graph of rois, 
118 |         # consider them as constant input
119 |         sample_roi, sample_search_region, (Tx,Ty), gt_roi_label = self.proposal_target_creator(roi,
120 |                                                                                               search_region,
121 |                                                                                               at.tonumpy(bbox),
122 |                                                                                               at.tonumpy(label))
123 | 
124 |         # NOTE it's all zero because now it only support for batch=1 now
125 |         sample_roi_index = t.zeros(len(sample_roi))
126 |         (px, py), roi_score = self.faster_rcnn.head(features,
127 |                                                     sample_roi,
128 |                                                     sample_search_region,
129 |                                                     sample_roi_index)
130 | 
131 | 
132 |         # ------------------ RPN losses -------------------#
133 |         gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(at.tonumpy(bbox),
134 |                                                               anchor,
135 |                                                               img_size)
136 |         gt_rpn_label = at.tovariable(gt_rpn_label).long()
137 |         gt_rpn_loc = at.tovariable(gt_rpn_loc)
138 |         rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
139 |                                            gt_rpn_loc,
140 |                                            gt_rpn_label.data,
141 |                                            self.rpn_sigma)
142 |         
143 |         # NOTE: default value of ignore_index is -100 ...
144 |         rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
145 |         _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
146 |         _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
147 |         self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())
148 | 
149 | 
150 |         # ------------------ ROI losses (fast rcnn loss) -------------------#
151 |         n_sample = px.shape[0]
152 |         # (px, py) and (Tx, Ty) are to be used to caculate loss ：roi_loc_loss
153 | 
154 |         Tx = at.tovariable(Tx).float()
155 |         Ty = at.tovariable(Ty).float()
156 | 
157 |         print("px is ", px)
158 |         # print("max of px is ", t.max(px))
159 |         # print("min of px is ", t.min(px))
160 |         # print(t.max(Tx))
161 |         # print(t.max(Ty))
162 |         # print(Tx.shape, Ty.shape, px.shape, py.shape)
163 | 
164 |         roi_loc_loss = _LocNet_loss(Tx, Ty, px, py, gt_roi_label.data, self.roi_sigma)
165 | 
166 |         
167 |         gt_roi_label = at.tovariable(gt_roi_label).long()
168 |         roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
169 | 
170 |         self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())
171 | 
172 |         
173 |         losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
174 | 
175 |         print("losses", losses)
176 | 
177 |         losses = losses + [sum(losses)]
178 | 
179 |         return LossTuple(*losses)  # return a namedtuple
180 | 
181 |     def train_step(self, imgs, bboxes, labels, scale):
182 |         self.optimizer.zero_grad()
183 |         losses = self.forward(imgs, bboxes, labels, scale) # losses is a namedtuple
184 |         losses.total_loss.backward()                       # use total_loss to backprop
185 |         self.optimizer.step()
186 |         self.update_meters(losses)
187 |         return losses
188 | 
189 |     def save(self, save_optimizer=False, save_path=None, **kwargs):
190 |         """serialize models include optimizer and other info
191 |         return path where the model-file is stored.
192 | 
193 |         Args:
194 |             save_optimizer (bool): whether save optimizer.state_dict().
195 |             save_path (string): where to save model, if it's None, save_path
196 |                 is generate using time str and info from kwargs.
197 |         
198 |         Returns:
199 |             save_path(str): the path to save models.
200 |         """
201 |         save_dict = dict()
202 | 
203 |         save_dict['model'] = self.faster_rcnn.state_dict()
204 |         save_dict['config'] = opt._state_dict()
205 |         save_dict['other_info'] = kwargs
206 |         save_dict['vis_info'] = self.vis.state_dict()
207 | 
208 |         if save_optimizer:
209 |             save_dict['optimizer'] = self.optimizer.state_dict()
210 | 
211 |         if save_path is None:
212 |             timestr = time.strftime('%m%d%H%M')
213 |             save_path = 'checkpoints/fasterrcnn_%s' % timestr
214 |             for k_, v_ in kwargs.items():
215 |                 save_path += '_%s' % v_
216 | 
217 |         t.save(save_dict, save_path)
218 |         self.vis.save([self.vis.env])
219 |         return save_path
220 | 
221 |     def load(self, path, load_optimizer=True, parse_opt=False, ):
222 |         state_dict = t.load(path)
223 |         if 'model' in state_dict:
224 |             self.faster_rcnn.load_state_dict(state_dict['model'])
225 |         else:  # legacy way, for backward compatibility
226 |             self.faster_rcnn.load_state_dict(state_dict)
227 |             return self
228 |         if parse_opt:
229 |             opt._parse(state_dict['config'])
230 |         if 'optimizer' in state_dict and load_optimizer:
231 |             self.optimizer.load_state_dict(state_dict['optimizer'])
232 |         return self
233 | 
234 |     def update_meters(self, losses):
235 |         loss_d = {k: at.scalar(v) for k, v in losses._asdict().items()}
236 |         for key, meter in self.meters.items():
237 |             meter.add(loss_d[key])
238 | 
239 |     def reset_meters(self):
240 |         for key, meter in self.meters.items():
241 |             meter.reset()
242 |         self.roi_cm.reset()
243 |         self.rpn_cm.reset()
244 | 
245 |     def get_meter_data(self):
246 |         return {k: v.value()[0] for k, v in self.meters.items()}
247 | 
248 | 
249 | def _smooth_l1_loss(x, t, in_weight, sigma):
250 |     sigma2 = sigma ** 2
251 |     diff = in_weight * (x - t)
252 |     abs_diff = diff.abs()
253 |     flag = (abs_diff.data < (1. / sigma2)).float()
254 |     flag = Variable(flag)
255 |     y = (flag * (sigma2 / 2.) * (diff ** 2) +
256 |          (1 - flag) * (abs_diff - 0.5 / sigma2))
257 |     return y.sum()
258 | 
259 | 
260 | def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
261 |     in_weight = t.zeros(gt_loc.shape).cuda()
262 |     # Localization loss is calculated only for positive rois.
263 |     # NOTE:  unlike origin implementation, 
264 |     # we don't need inside_weight and outside_weight, they can calculate by gt_label
265 |     in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
266 |     loc_loss = _smooth_l1_loss(pred_loc, gt_loc, Variable(in_weight), sigma)
267 |     # Normalize by total number of negtive and positive rois.
268 |     loc_loss /= (gt_label >= 0).sum().float()  # ignore gt_label==-1 for rpn_loss
269 |     return loc_loss
270 | 
271 | def _LocNet_loss(Tx, Ty, Px, Py, gt_label, sigma):
272 |     '''
273 |     Args: 
274 |         Tx, Ty : ground truth value for all points in all boxes. 
275 |                  shape of (R, M) of which R is the number of boxes used in \
276 |                  Head part and M is the number of parts along x and y axis of a box.
277 |         Px, Py : predicted value of Tx and Ty, shape of (R, M)
278 |         gt_label : class id of the box, 0 means background. shape of (R)
279 |     '''
280 |     # print("Tx.shape ", Tx.shape)
281 |     # print("Ty shape ", Ty.shape)
282 |     # print("Tx max ", t.max(Tx))
283 |     # print("Tx min", t.min(Tx))
284 | 
285 |     s = t.sum(Tx * t.log(Px), dim=1) + t.sum((1-Tx) * t.log(1-Px), dim=1) + t.sum(Ty * t.log(Py), dim=1) + t.sum((1-Ty) * t.log(1-Py), dim=1)   
286 |     s = (-1) * s
287 | 
288 |     # Localization loss is calculated only for positive rois.
289 |     in_weight = t.zeros(s.shape).cuda()
290 |     for i in range(len(gt_label)):
291 |         if gt_label[i]>0:
292 |             in_weight[i] = 1
293 |     in_weight = Variable(in_weight)
294 | 
295 |     result = sigma * (in_weight * s).sum()
296 | 
297 |     return result
298 | 
299 | 
300 | 
301 | 


--------------------------------------------------------------------------------
/model/faster_rcnn.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import torch as t
  3 | import numpy as np
  4 | import cupy as cp
  5 | from utils import array_tool as at
  6 | from model.utils.bbox_tools import loc2bbox, p2bbox
  7 | from model.utils.nms import non_maximum_suppression
  8 | 
  9 | from torch import nn
 10 | from data.dataset import preprocess
 11 | from torch.nn import functional as F
 12 | from utils.config import opt
 13 | 
 14 | 
 15 | class FasterRCNN(nn.Module):
 16 |     """Base class for Faster R-CNN.
 17 | 
 18 |     This is a base class for Faster R-CNN links supporting object detection
 19 |     API [#]_. The following three stages constitute Faster R-CNN.
 20 | 
 21 |     1. **Feature extraction**: Images are taken and their \
 22 |         feature maps are calculated.
 23 |     2. **Region Proposal Networks**: Given the feature maps calculated in \
 24 |         the previous stage, produce set of RoIs around objects.
 25 |     3. **Localization and Classification Heads**: Using feature maps that \
 26 |         belong to the proposed RoIs, classify the categories of the objects \
 27 |         in the RoIs and improve localizations.
 28 | 
 29 |     Each stage is carried out by one of the callable
 30 |     :class:`torch.nn.Module` objects :obj:`feature`, :obj:`rpn` and :obj:`head`.
 31 | 
 32 |     There are two functions :meth:`predict` and :meth:`__call__` to conduct
 33 |     object detection.
 34 |     :meth:`predict` takes images and returns bounding boxes that are converted
 35 |     to image coordinates. This will be useful for a scenario when
 36 |     Faster R-CNN is treated as a black box function, for instance.
 37 |     :meth:`__call__` is provided for a scnerario when intermediate outputs
 38 |     are needed, for instance, for training and debugging.
 39 | 
 40 |     Links that support obejct detection API have method :meth:`predict` with
 41 |     the same interface. Please refer to :meth:`predict` for
 42 |     further details.
 43 | 
 44 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
 45 |     Faster R-CNN: Towards Real-Time Object Detection with \
 46 |     Region Proposal Networks. NIPS 2015.
 47 | 
 48 |     Args:
 49 |         extractor (nn.Module): A module that takes a BCHW image
 50 |             array and returns feature maps.
 51 |         rpn (nn.Module): A module that has the same interface as
 52 |             :class:`model.region_proposal_network.RegionProposalNetwork`.
 53 |             Please refer to the documentation found there.
 54 |         head (nn.Module): A module that takes
 55 |             a BCHW variable, RoIs and batch indices for RoIs. This returns class
 56 |             dependent localization paramters and class scores.
 57 |         loc_normalize_mean (tuple of four floats): Mean values of
 58 |             localization estimates.
 59 |         loc_normalize_std (tupler of four floats): Standard deviation
 60 |             of localization estimates.
 61 | 
 62 |     """
 63 | 
 64 |     def __init__(self, extractor, rpn, head,
 65 |                 loc_normalize_mean = (0., 0., 0., 0.),
 66 |                 loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
 67 |     ):
 68 |         super(FasterRCNN, self).__init__()
 69 |         self.extractor = extractor
 70 |         self.rpn = rpn
 71 |         self.head = head
 72 | 
 73 |         # mean and std
 74 |         self.loc_normalize_mean = loc_normalize_mean
 75 |         self.loc_normalize_std = loc_normalize_std
 76 |         self.use_preset('evaluate')
 77 | 
 78 |     @property
 79 |     def n_class(self):
 80 |         # Total number of classes including the background.
 81 |         return self.head.n_class
 82 | 
 83 |     def forward(self, x, scale=1.):
 84 |         """Forward Faster R-CNN.
 85 | 
 86 |         Scaling paramter :obj:`scale` is used by RPN to determine the
 87 |         threshold to select small objects, which are going to be
 88 |         rejected irrespective of their confidence scores.
 89 | 
 90 |         Here are notations used.
 91 | 
 92 |         * :math:`N` is the number of batch size
 93 |         * :math:`R'` is the total number of RoIs produced across batches. \
 94 |             Given :math:`R_i` proposed RoIs from the :math:`i` th image, \
 95 |             :math:`R' = \\sum _{i=1} ^ N R_i`.
 96 |         * :math:`L` is the number of classes excluding the background.
 97 | 
 98 |         Classes are ordered by the background, the first class, ..., and
 99 |         the :math:`L` th class.
100 | 
101 |         Args:
102 |             x (autograd.Variable): 4D image variable.
103 |             scale (float): Amount of scaling applied to the raw image
104 |                 during preprocessing.
105 | 
106 |         Returns:
107 |             Variable, Variable, array, array:
108 |             Returns tuple of four values listed below.
109 | 
110 |             * **roi_cls_locs**: Offsets and scalings for the proposed RoIs. \
111 |                 Its shape is :math:`(R', (L + 1) \\times 4)`.
112 |             * **roi_scores**: Class predictions for the proposed RoIs. \
113 |                 Its shape is :math:`(R', L + 1)`.
114 |             * **rois**: RoIs proposed by RPN. Its shape is \
115 |                 :math:`(R', 4)`.
116 |             * **roi_indices**: Batch indices of RoIs. Its shape is \
117 |                 :math:`(R',)`.
118 | 
119 |         """
120 |         img_size = x.shape[2:]
121 | 
122 |         h = self.extractor(x)
123 |         rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = self.rpn(h, img_size, scale)
124 |         (px, py), roi_scores = self.head(h, rois, search_regions, roi_indices)
125 |         return (px, py), roi_scores, rois, search_regions, roi_indices
126 | 
127 | 
128 | 
129 |     def use_preset(self, preset):
130 |         """Use the given preset during prediction.
131 | 
132 |         This method changes values of :obj:`self.nms_thresh` and
133 |         :obj:`self.score_thresh`. These values are a threshold value
134 |         used for non maximum suppression and a threshold value
135 |         to discard low confidence proposals in :meth:`predict`,
136 |         respectively.
137 | 
138 |         If the attributes need to be changed to something
139 |         other than the values provided in the presets, please modify
140 |         them by directly accessing the public attributes.
141 | 
142 |         Args:
143 |             preset ({'visualize', 'evaluate'): A string to determine the
144 |                 preset to use.
145 | 
146 |         """
147 |         if preset == 'visualize':
148 |             self.nms_thresh = 0.3
149 |             self.score_thresh = 0.7
150 |         elif preset == 'evaluate':
151 |             self.nms_thresh = 0.3
152 |             self.score_thresh = 0.05
153 |         else:
154 |             raise ValueError('preset must be visualize or evaluate')
155 | 
156 |     def _suppress(self, raw_cls_bbox, raw_prob):
157 |         bbox = list()
158 |         label = list()
159 |         score = list()
160 |         # skip cls_id = 0 because it is the background class
161 |         for l in range(1, self.n_class):
162 | 
163 |             cls_bbox_l = raw_cls_bbox
164 |             prob_l = raw_prob[:, l]
165 |             
166 |             mask = prob_l > self.score_thresh
167 |             cls_bbox_l = cls_bbox_l[mask]
168 |             prob_l = prob_l[mask]
169 |             
170 |             keep = non_maximum_suppression(
171 |                 cp.array(cls_bbox_l), self.nms_thresh, prob_l)
172 |             keep = cp.asnumpy(keep)
173 |             
174 |             bbox.append(cls_bbox_l[keep])
175 |             # The labels are in [0, self.n_class - 2].
176 |             label.append((l - 1) * np.ones((len(keep),)))
177 |             score.append(prob_l[keep])
178 | 
179 |         bbox = np.concatenate(bbox, axis=0).astype(np.float32)
180 |         label = np.concatenate(label, axis=0).astype(np.int32)
181 |         score = np.concatenate(score, axis=0).astype(np.float32)
182 |         
183 |         return bbox, label, score
184 | 
185 |     def predict(self, imgs, sizes=None, visualize=False, prob_thre=0.7):
186 |         """Detect objects from images.
187 | 
188 |         This method predicts objects for each image.
189 | 
190 |         Args:
191 |             imgs (iterable of numpy.ndarray): Arrays holding images.
192 |                 All images are in CHW and RGB format
193 |                 and the range of their value is :math:`[0, 255]`.
194 | 
195 |         Returns:
196 |            tuple of lists:
197 |            This method returns a tuple of three lists,
198 |            :obj:`(bboxes, labels, scores)`.
199 | 
200 |            * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
201 |                where :math:`R` is the number of bounding boxes in a image. \
202 |                Each bouding box is organized by \
203 |                :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
204 |                in the second axis.
205 |            * **labels** : A list of integer arrays of shape :math:`(R,)`. \
206 |                Each value indicates the class of the bounding box. \
207 |                Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
208 |                number of the foreground classes.
209 |            * **scores** : A list of float arrays of shape :math:`(R,)`. \
210 |                Each value indicates how confident the prediction is.
211 | 
212 |         """
213 |         self.eval()
214 | 
215 |         # sizes changes when visualize is set to different values
216 |         if visualize:
217 |             self.use_preset('visualize')
218 |             prepared_imgs = list()
219 |             sizes = list()
220 |             for img in imgs:
221 |                 size = img.shape[1:]  # reshaped image size
222 |                 img = preprocess(at.tonumpy(img))
223 |                 prepared_imgs.append(img)
224 |                 sizes.append(size)
225 |         else:
226 |              prepared_imgs = imgs 
227 | 
228 |         bboxes = list()
229 |         labels = list()
230 |         scores = list()
231 | 
232 |         for img, size in zip(prepared_imgs, sizes):
233 |             img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
234 |             
235 |             # judge and change type if necessary
236 |             if t.is_tensor(size[1]) : 
237 |                 size[1] = int(size[1])
238 | 
239 |             if t.is_tensor(img.shape[3]):
240 |                 img.shape[3] = int(img.shape[3])
241 | 
242 |             scale = img.shape[3] / size[1]
243 | 
244 |             (px, py), roi_scores, rois, search_regions, _ = self(img, scale=scale)
245 |             # We are assuming that batch size is 1.
246 |             roi_score = roi_scores.data
247 |             px = px.data
248 |             py = py.data
249 | 
250 |             roi = at.totensor(rois) / scale
251 |             search_regions = at.totensor(search_regions) / scale
252 |             
253 |             # Convert to numpy array
254 |             px = at.tonumpy(px)
255 |             py = at.tonumpy(py)
256 |             search_regions = at.tonumpy(search_regions)
257 | 
258 |             # Convert predictions to bounding boxes in image coordinates.
259 |             # Bounding boxes are scaled to the scale of the input images.
260 |             
261 |             # use px, py and search_regions to generate boxes
262 |             cls_bbox = p2bbox(px, py, search_regions, threshold=prob_thre)         
263 |             cls_bbox = at.totensor(cls_bbox)
264 |         
265 |             # clip bounding box
266 |             cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
267 |             cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
268 | 
269 |             prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))
270 | 
271 |             raw_cls_bbox = at.tonumpy(cls_bbox)
272 |             raw_prob = at.tonumpy(prob)
273 | 
274 |             # print("raw_cls_bbox shape : ", raw_cls_bbox.shape)
275 |             # print("raw_prob : ", raw_prob)
276 | 
277 |             bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
278 | 
279 |             bboxes.append(bbox)
280 |             labels.append(label)
281 |             scores.append(score)
282 | 
283 |         self.use_preset('evaluate')
284 |         self.train()
285 | 
286 |         return bboxes, labels, scores
287 | 
288 |     def get_optimizer(self):
289 |         """
290 |         return optimizer, It could be overwriten if you want to specify 
291 |         special optimizer
292 |         """
293 |         lr = opt.lr
294 |         params = []
295 | 
296 |         # different learning rate for different parameters
297 |         for key, value in dict(self.named_parameters()).items():
298 |             if value.requires_grad:
299 |                 if 'bias' in key:
300 |                     params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
301 |                 else:
302 |                     params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}]
303 |         if opt.use_adam:
304 |             self.optimizer = t.optim.Adam(params)
305 |         else:
306 |             self.optimizer = t.optim.SGD(params, momentum=0.9)
307 |         return self.optimizer
308 | 
309 |     def scale_lr(self, decay=0.1):
310 |         for param_group in self.optimizer.param_groups:
311 |             param_group['lr'] *= decay
312 |         return self.optimizer
313 | 
314 | 
315 | 
316 | 
317 | 


--------------------------------------------------------------------------------
/utils/eval_tool.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | from collections import defaultdict
  4 | import itertools
  5 | import numpy as np
  6 | import six
  7 | 
  8 | from model.utils.bbox_tools import bbox_iou
  9 | 
 10 | 
 11 | def eval_detection_voc(
 12 |         pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
 13 |         gt_difficults=None,
 14 |         iou_thresh=0.5, use_07_metric=False):
 15 |     """Calculate average precisions based on evaluation code of PASCAL VOC.
 16 | 
 17 |     This function evaluates predicted bounding boxes obtained from a dataset
 18 |     which has :math:`N` images by using average precision for each class.
 19 |     The code is based on the evaluation code used in PASCAL VOC Challenge.
 20 | 
 21 |     Args:
 22 |         pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
 23 |             sets of bounding boxes.
 24 |             Its index corresponds to an index for the base dataset.
 25 |             Each element of :obj:`pred_bboxes` is a set of coordinates
 26 |             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
 27 |             where :math:`R` corresponds
 28 |             to the number of bounding boxes, which may vary among boxes.
 29 |             The second axis corresponds to
 30 |             :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
 31 |         pred_labels (iterable of numpy.ndarray): An iterable of labels.
 32 |             Similar to :obj:`pred_bboxes`, its index corresponds to an
 33 |             index for the base dataset. Its length is :math:`N`.
 34 |         pred_scores (iterable of numpy.ndarray): An iterable of confidence
 35 |             scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
 36 |             its index corresponds to an index for the base dataset.
 37 |             Its length is :math:`N`.
 38 |         gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
 39 |             bounding boxes
 40 |             whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
 41 |             bounding box whose shape is :math:`(R, 4)`. Note that the number of
 42 |             bounding boxes in each image does not need to be same as the number
 43 |             of corresponding predicted boxes.
 44 |         gt_labels (iterable of numpy.ndarray): An iterable of ground truth
 45 |             labels which are organized similarly to :obj:`gt_bboxes`.
 46 |         gt_difficults (iterable of numpy.ndarray): An iterable of boolean
 47 |             arrays which is organized similarly to :obj:`gt_bboxes`.
 48 |             This tells whether the
 49 |             corresponding ground truth bounding box is difficult or not.
 50 |             By default, this is :obj:`None`. In that case, this function
 51 |             considers all bounding boxes to be not difficult.
 52 |         iou_thresh (float): A prediction is correct if its Intersection over
 53 |             Union with the ground truth is above this value.
 54 |         use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
 55 |             for calculating average precision. The default value is
 56 |             :obj:`False`.
 57 | 
 58 |     Returns:
 59 |         dict:
 60 | 
 61 |         The keys, value-types and the description of the values are listed
 62 |         below.
 63 | 
 64 |         * **ap** (*numpy.ndarray*): An array of average precisions. \
 65 |             The :math:`l`-th value corresponds to the average precision \
 66 |             for class :math:`l`. If class :math:`l` does not exist in \
 67 |             either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \
 68 |             value is set to :obj:`numpy.nan`.
 69 |         * **map** (*float*): The average of Average Precisions over classes.
 70 | 
 71 |     """
 72 | 
 73 |     prec, rec = calc_detection_voc_prec_rec(
 74 |         pred_bboxes, pred_labels, pred_scores,
 75 |         gt_bboxes, gt_labels, gt_difficults,
 76 |         iou_thresh=iou_thresh)
 77 | 
 78 |     ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric)
 79 | 
 80 |     return {'ap': ap, 'map': np.nanmean(ap)}
 81 | 
 82 | 
 83 | def calc_detection_voc_prec_rec(
 84 |         pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
 85 |         gt_difficults=None,
 86 |         iou_thresh=0.5):
 87 |     """Calculate precision and recall based on evaluation code of PASCAL VOC.
 88 | 
 89 |     This function calculates precision and recall of
 90 |     predicted bounding boxes obtained from a dataset which has :math:`N`
 91 |     images.
 92 |     The code is based on the evaluation code used in PASCAL VOC Challenge.
 93 | 
 94 |     Args:
 95 |         pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
 96 |             sets of bounding boxes.
 97 |             Its index corresponds to an index for the base dataset.
 98 |             Each element of :obj:`pred_bboxes` is a set of coordinates
 99 |             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
100 |             where :math:`R` corresponds
101 |             to the number of bounding boxes, which may vary among boxes.
102 |             The second axis corresponds to
103 |             :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
104 |         pred_labels (iterable of numpy.ndarray): An iterable of labels.
105 |             Similar to :obj:`pred_bboxes`, its index corresponds to an
106 |             index for the base dataset. Its length is :math:`N`.
107 |         pred_scores (iterable of numpy.ndarray): An iterable of confidence
108 |             scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
109 |             its index corresponds to an index for the base dataset.
110 |             Its length is :math:`N`.
111 |         gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
112 |             bounding boxes
113 |             whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
114 |             bounding box whose shape is :math:`(R, 4)`. Note that the number of
115 |             bounding boxes in each image does not need to be same as the number
116 |             of corresponding predicted boxes.
117 |         gt_labels (iterable of numpy.ndarray): An iterable of ground truth
118 |             labels which are organized similarly to :obj:`gt_bboxes`.
119 |         gt_difficults (iterable of numpy.ndarray): An iterable of boolean
120 |             arrays which is organized similarly to :obj:`gt_bboxes`.
121 |             This tells whether the
122 |             corresponding ground truth bounding box is difficult or not.
123 |             By default, this is :obj:`None`. In that case, this function
124 |             considers all bounding boxes to be not difficult.
125 |         iou_thresh (float): A prediction is correct if its Intersection over
126 |             Union with the ground truth is above this value..
127 | 
128 |     Returns:
129 |         tuple of two lists:
130 |         This function returns two lists: :obj:`prec` and :obj:`rec`.
131 | 
132 |         * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \
133 |             for class :math:`l`. If class :math:`l` does not exist in \
134 |             either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \
135 |             set to :obj:`None`.
136 |         * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \
137 |             for class :math:`l`. If class :math:`l` that is not marked as \
138 |             difficult does not exist in \
139 |             :obj:`gt_labels`, :obj:`rec[l]` is \
140 |             set to :obj:`None`.
141 | 
142 |     """
143 | 
144 |     pred_bboxes = iter(pred_bboxes)
145 |     pred_labels = iter(pred_labels)
146 |     pred_scores = iter(pred_scores)
147 |     gt_bboxes = iter(gt_bboxes)
148 |     gt_labels = iter(gt_labels)
149 |     if gt_difficults is None:
150 |         gt_difficults = itertools.repeat(None)
151 |     else:
152 |         gt_difficults = iter(gt_difficults)
153 | 
154 |     n_pos = defaultdict(int)
155 |     score = defaultdict(list)
156 |     match = defaultdict(list)
157 | 
158 |     for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \
159 |             six.moves.zip(
160 |                 pred_bboxes, pred_labels, pred_scores,
161 |                 gt_bboxes, gt_labels, gt_difficults):
162 | 
163 |         if gt_difficult is None:
164 |             gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)
165 | 
166 |         for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
167 |             pred_mask_l = pred_label == l
168 |             pred_bbox_l = pred_bbox[pred_mask_l]
169 |             pred_score_l = pred_score[pred_mask_l]
170 |             # sort by score
171 |             order = pred_score_l.argsort()[::-1]
172 |             pred_bbox_l = pred_bbox_l[order]
173 |             pred_score_l = pred_score_l[order]
174 | 
175 |             gt_mask_l = gt_label == l
176 |             gt_bbox_l = gt_bbox[gt_mask_l]
177 |             gt_difficult_l = gt_difficult[gt_mask_l]
178 | 
179 |             n_pos[l] += np.logical_not(gt_difficult_l).sum()
180 |             score[l].extend(pred_score_l)
181 | 
182 |             if len(pred_bbox_l) == 0:
183 |                 continue
184 |             if len(gt_bbox_l) == 0:
185 |                 match[l].extend((0,) * pred_bbox_l.shape[0])
186 |                 continue
187 | 
188 |             # VOC evaluation follows integer typed bounding boxes.
189 |             pred_bbox_l = pred_bbox_l.copy()
190 |             pred_bbox_l[:, 2:] += 1
191 |             gt_bbox_l = gt_bbox_l.copy()
192 |             gt_bbox_l[:, 2:] += 1
193 | 
194 |             iou = bbox_iou(pred_bbox_l, gt_bbox_l)
195 |             gt_index = iou.argmax(axis=1)
196 |             # set -1 if there is no matching ground truth
197 |             gt_index[iou.max(axis=1) < iou_thresh] = -1
198 |             del iou
199 | 
200 |             selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
201 |             for gt_idx in gt_index:
202 |                 if gt_idx >= 0:
203 |                     if gt_difficult_l[gt_idx]:
204 |                         match[l].append(-1)
205 |                     else:
206 |                         if not selec[gt_idx]:
207 |                             match[l].append(1)
208 |                         else:
209 |                             match[l].append(0)
210 |                     selec[gt_idx] = True
211 |                 else:
212 |                     match[l].append(0)
213 | 
214 |     for iter_ in (
215 |             pred_bboxes, pred_labels, pred_scores,
216 |             gt_bboxes, gt_labels, gt_difficults):
217 |         if next(iter_, None) is not None:
218 |             raise ValueError('Length of input iterables need to be same.')
219 | 
220 |     n_fg_class = max(n_pos.keys()) + 1
221 |     prec = [None] * n_fg_class
222 |     rec = [None] * n_fg_class
223 | 
224 |     for l in n_pos.keys():
225 |         score_l = np.array(score[l])
226 |         match_l = np.array(match[l], dtype=np.int8)
227 | 
228 |         order = score_l.argsort()[::-1]
229 |         match_l = match_l[order]
230 | 
231 |         tp = np.cumsum(match_l == 1)
232 |         fp = np.cumsum(match_l == 0)
233 | 
234 |         # If an element of fp + tp is 0,
235 |         # the corresponding element of prec[l] is nan.
236 |         prec[l] = tp / (fp + tp)
237 |         # If n_pos[l] is 0, rec[l] is None.
238 |         if n_pos[l] > 0:
239 |             rec[l] = tp / n_pos[l]
240 | 
241 |     return prec, rec
242 | 
243 | 
244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False):
245 |     """Calculate average precisions based on evaluation code of PASCAL VOC.
246 | 
247 |     This function calculates average precisions
248 |     from given precisions and recalls.
249 |     The code is based on the evaluation code used in PASCAL VOC Challenge.
250 | 
251 |     Args:
252 |         prec (list of numpy.array): A list of arrays.
253 |             :obj:`prec[l]` indicates precision for class :math:`l`.
254 |             If :obj:`prec[l]` is :obj:`None`, this function returns
255 |             :obj:`numpy.nan` for class :math:`l`.
256 |         rec (list of numpy.array): A list of arrays.
257 |             :obj:`rec[l]` indicates recall for class :math:`l`.
258 |             If :obj:`rec[l]` is :obj:`None`, this function returns
259 |             :obj:`numpy.nan` for class :math:`l`.
260 |         use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
261 |             for calculating average precision. The default value is
262 |             :obj:`False`.
263 | 
264 |     Returns:
265 |         ~numpy.ndarray:
266 |         This function returns an array of average precisions.
267 |         The :math:`l`-th value corresponds to the average precision
268 |         for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
269 |         :obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
270 | 
271 |     """
272 | 
273 |     n_fg_class = len(prec)
274 |     ap = np.empty(n_fg_class)
275 |     for l in six.moves.range(n_fg_class):
276 |         if prec[l] is None or rec[l] is None:
277 |             ap[l] = np.nan
278 |             continue
279 | 
280 |         if use_07_metric:
281 |             # 11 point metric
282 |             ap[l] = 0
283 |             for t in np.arange(0., 1.1, 0.1):
284 |                 if np.sum(rec[l] >= t) == 0:
285 |                     p = 0
286 |                 else:
287 |                     p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
288 |                 ap[l] += p / 11
289 |         else:
290 |             # correct AP calculation
291 |             # first append sentinel values at the end
292 |             mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
293 |             mrec = np.concatenate(([0], rec[l], [1]))
294 | 
295 |             mpre = np.maximum.accumulate(mpre[::-1])[::-1]
296 | 
297 |             # to calculate area under PR curve, look for points
298 |             # where X axis (recall) changes value
299 |             i = np.where(mrec[1:] != mrec[:-1])[0]
300 | 
301 |             # and sum (\Delta recall) * prec
302 |             ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
303 | 
304 |     return ap
305 | 


--------------------------------------------------------------------------------
/model/utils/bbox_tools.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import numpy as xp
  3 | 
  4 | import six
  5 | from six import __init__
  6 | 
  7 | 
  8 | def _area_of_box(box):
  9 |     ymin, xmin, ymax, xmax = box
 10 | 
 11 |     return (ymax-ymin) * (xmax-xmin)
 12 | 
 13 | def p2bbox(px, py, search_regions, threshold=0.5):
 14 |     '''
 15 |     use px, py to get bounding boxes from search_regions.
 16 |     Args :
 17 |         px, py : probability of content. shape of (S, M)
 18 |         search_regions : shape of (S, 4), (ymin, xmin, ymax, xmax)
 19 | 
 20 |     Return :
 21 |         bboxes shape of (S,4)
 22 |     '''
 23 |     boxes = np.zeros(search_regions.shape)
 24 |     # boxes = []
 25 | 
 26 |     M = px.shape[1]
 27 | 
 28 |     for i in range(px.shape[0]):
 29 |         
 30 |         x = px[i,:]
 31 |         y = py[i,:]
 32 | 
 33 |         x = np.where(x>threshold)
 34 |         y = np.where(y>threshold)
 35 | 
 36 |         # print("x.shape and y.sghape", x[0].shape, y[0].shape)
 37 |         # if x[0].any() and y[0].any() :
 38 |         if len(x[0])>=5 and len(y[0])>=5 :
 39 |             x_s, x_e = x[0][0], x[0][-1]
 40 |             y_s, y_e = y[0][0], y[0][-1]
 41 |         
 42 |             ymin, xmin, ymax, xmax = search_regions[i,:]
 43 |             height_block = (ymax - ymin) / M
 44 |             width_block = (xmax - xmin) / M
 45 | 
 46 |             y_start = ymin + y_s * height_block
 47 |             y_end  = ymin + y_e * height_block
 48 |             x_start = xmin + x_s * width_block
 49 |             x_end  = xmin + x_e * width_block
 50 |             
 51 |             boxes[i, :] = [y_start, x_start, y_end, x_end]
 52 |             # boxes.append([y_start, x_start, y_end, x_end])
 53 | 
 54 |     # boxes = np.array(boxes)
 55 | 
 56 |     return boxes
 57 | 
 58 | 
 59 | def bbox_intersection(box_a, box_b):
 60 |     '''
 61 |     Args:
 62 |         box_a (array): A array of coordinates of a box.
 63 |             Its shape is :math:`(4,)`. These coordinates are
 64 |             :math:`ymin, xmin, ymax, xmax`.
 65 |         box_b (array): A array of coordinates of a box.
 66 |             Its shape is :math:`(4,)`. These coordinates are
 67 |             :math:`ymin, xmin, ymax, xmax`.
 68 |     
 69 |     Return:
 70 |         intersection (array): A array of coordinates of the intersection box.
 71 |             Its shape is :math:`(4,)`. These coordinates are
 72 |             :math:`ymin, xmin, ymax, xmax`.
 73 |     '''
 74 |     y_min_a, x_min_a, y_max_a, x_max_a = box_a
 75 |     y_min_b, x_min_b, y_max_b, x_max_b = box_b
 76 | 
 77 |     left = max(x_min_a, x_min_b)
 78 |     right = min(x_max_a, x_max_b)
 79 |     bottom = max(y_min_a, y_min_b)
 80 |     top = min(y_max_a, y_max_b)
 81 | 
 82 |     # 两个 box 没有交集
 83 |     if right<left or top<bottom :
 84 |         return np.array([0, 0, 0, 0])
 85 |     # 两个 box 有交集，就返回交集的矩形 (ymin, xmin, ymax, xmax)
 86 |     else :
 87 |         return np.array([bottom, left, top, right])
 88 | 
 89 | 
 90 | def bbox2T(search_regions, bboxes, M=28):
 91 |     '''
 92 |     Encodes the bboxes to Tx and Ty.
 93 | 
 94 |     Args:
 95 |         search_regions (array): A array of coordinates of search_region.
 96 |             Its shape is :math:`(S, 4)`. These coordinates are
 97 |             :math:`ymin, xmin, ymax, xmax`.
 98 |         bboxes (array): An array of bounding boxes.
 99 |             Its shape is :math:`(S, 4)`. These coordinates are
100 |             :math:`ymin, xmin, ymax, xmax`.
101 |         M (int): Number of parts in x or y dorection.
102 | 
103 |     Returns:
104 |         array:
105 | 
106 |     '''
107 | 
108 |     intersections = np.zeros(search_regions.shape)
109 | 
110 |     for ii, (search_region, bbox) in enumerate(zip(search_regions, bboxes)):
111 |         intersection = bbox_intersection(search_region, bbox)
112 | 
113 |         if _area_of_box(intersection):
114 |             intersections[ii,:] = intersection
115 |     
116 | 
117 |     Tx = np.zeros((search_regions.shape[0], M))
118 |     Ty = np.zeros((search_regions.shape[0], M))
119 | 
120 | 
121 |     for jj, (search_region, intersection) in enumerate(zip(search_regions, intersections)):
122 |         
123 |         if _area_of_box(intersection):
124 | 
125 |             ymin, xmin, ymax, xmax = search_region
126 |             bottom, left, top, right = intersection
127 | 
128 |             # xmin~xmax段M等分，设等分尺寸为dx
129 |             # x在(left-dx, right+dx)的开区间内，对应的label就是1，否则是0   
130 |             dx = (xmax-xmin)/(M-1)
131 |             tx = []
132 |             for x in np.linspace(xmin, xmax, num=M):
133 |                 if left-dx < x < right+dx:
134 |                     tx.append(1)
135 |                 else:
136 |                     tx.append(0)
137 | 
138 |             Tx[jj,:] = np.array(tx)
139 |             
140 |             # ymin~ymax段M等分，设等分尺寸为dy
141 |             # y在(bottom-dy, top+dy)的开区间内，对应的label就是1，否则是0 
142 |             dy = (ymax-ymin)/(M-1)
143 |             ty = []
144 |             for y in np.linspace(ymin, ymax, num=M):
145 |                 if bottom-dy < y < top+dy:
146 |                     ty.append(1)
147 |                 else:
148 |                     ty.append(0)
149 | 
150 |             Ty[jj,:] = np.array(ty)
151 |     
152 |     # Tx can be all 0, because some of search_regions are negtive. 
153 |     # if (np.max(Tx))== 0:
154 |     #     print("Tx all zeros!!!!")
155 |  
156 |     return Tx, Ty
157 | 
158 | 
159 | def loc2bbox(src_bbox, loc):
160 |     """Decode bounding boxes from bounding box offsets and scales.
161 | 
162 |     Given bounding box offsets and scales computed by
163 |     :meth:`bbox2loc`, this function decodes the representation to
164 |     coordinates in 2D image coordinates.
165 | 
166 |     Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding
167 |     box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`,
168 |     the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x`
169 |     and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated
170 |     by the following formulas.
171 | 
172 |     * :math:`\\hat{g}_y = p_h t_y + p_y`
173 |     * :math:`\\hat{g}_x = p_w t_x + p_x`
174 |     * :math:`\\hat{g}_h = p_h \\exp(t_h)`
175 |     * :math:`\\hat{g}_w = p_w \\exp(t_w)`
176 | 
177 |     The decoding formulas are used in works such as R-CNN [#]_.
178 | 
179 |     The output is same type as the type of the inputs.
180 | 
181 |     .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \
182 |     Rich feature hierarchies for accurate object detection and semantic \
183 |     segmentation. CVPR 2014.
184 | 
185 |     Args:
186 |         src_bbox (array): A coordinates of bounding boxes.
187 |             Its shape is :math:`(R, 4)`. These coordinates are
188 |             :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
189 |         loc (array): An array with offsets and scales.
190 |             The shapes of :obj:`src_bbox` and :obj:`loc` should be same.
191 |             This contains values :math:`t_y, t_x, t_h, t_w`.
192 | 
193 |     Returns:
194 |         array:
195 |         Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \
196 |         The second axis contains four values \
197 |         :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin},
198 |         \\hat{g}_{ymax}, \\hat{g}_{xmax}`.
199 | 
200 |     """
201 | 
202 |     if src_bbox.shape[0] == 0:
203 |         return xp.zeros((0, 4), dtype=loc.dtype)
204 | 
205 |     src_bbox = src_bbox.astype(src_bbox.dtype, copy=False)
206 | 
207 |     src_height = src_bbox[:, 2] - src_bbox[:, 0]
208 |     src_width = src_bbox[:, 3] - src_bbox[:, 1]
209 |     src_ctr_y = src_bbox[:, 0] + 0.5 * src_height
210 |     src_ctr_x = src_bbox[:, 1] + 0.5 * src_width
211 | 
212 |     dy = loc[:, 0::4]
213 |     dx = loc[:, 1::4]
214 |     dh = loc[:, 2::4]
215 |     dw = loc[:, 3::4]
216 | 
217 |     # print("dh = ", dh)
218 | 
219 |     ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis]
220 |     ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis]
221 |     h = xp.exp(dh) * src_height[:, xp.newaxis]
222 |     w = xp.exp(dw) * src_width[:, xp.newaxis]
223 | 
224 |     dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype)
225 |     dst_bbox[:, 0::4] = ctr_y - 0.5 * h
226 |     dst_bbox[:, 1::4] = ctr_x - 0.5 * w
227 |     dst_bbox[:, 2::4] = ctr_y + 0.5 * h
228 |     dst_bbox[:, 3::4] = ctr_x + 0.5 * w
229 | 
230 |     return dst_bbox
231 | 
232 | 
233 | def bbox2loc(src_bbox, dst_bbox):
234 |     """Encodes the source and the destination bounding boxes to "loc".
235 | 
236 |     Given bounding boxes, this function computes offsets and scales
237 |     to match the source bounding boxes to the target bounding boxes.
238 |     Mathematcially, given a bounding box whose center is
239 |     :math:`(y, x) = p_y, p_x` and
240 |     size :math:`p_h, p_w` and the target bounding box whose center is
241 |     :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales
242 |     :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas.
243 | 
244 |     * :math:`t_y = \\frac{(g_y - p_y)} {p_h}`
245 |     * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
246 |     * :math:`t_h = \\log(\\frac{g_h} {p_h})`
247 |     * :math:`t_w = \\log(\\frac{g_w} {p_w})`
248 | 
249 |     The output is same type as the type of the inputs.
250 |     The encoding formulas are used in works such as R-CNN [#]_.
251 | 
252 |     .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \
253 |     Rich feature hierarchies for accurate object detection and semantic \
254 |     segmentation. CVPR 2014.
255 | 
256 |     Args:
257 |         src_bbox (array): An image coordinate array whose shape is
258 |             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
259 |             These coordinates are
260 |             :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
261 |         dst_bbox (array): An image coordinate array whose shape is
262 |             :math:`(R, 4)`.
263 |             These coordinates are
264 |             :math:`g_{ymin}, g_{xmin}, g_{ymax}, g_{xmax}`.
265 | 
266 |     Returns:
267 |         array:
268 |         Bounding box offsets and scales from :obj:`src_bbox` \
269 |         to :obj:`dst_bbox`. \
270 |         This has shape :math:`(R, 4)`.
271 |         The second axis contains four values :math:`t_y, t_x, t_h, t_w`.
272 | 
273 |     """
274 | 
275 |     height = src_bbox[:, 2] - src_bbox[:, 0]
276 |     width = src_bbox[:, 3] - src_bbox[:, 1]
277 |     ctr_y = src_bbox[:, 0] + 0.5 * height
278 |     ctr_x = src_bbox[:, 1] + 0.5 * width
279 | 
280 |     base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
281 |     base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
282 |     base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
283 |     base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width
284 | 
285 |     eps = xp.finfo(height.dtype).eps
286 |     height = xp.maximum(height, eps)
287 |     width = xp.maximum(width, eps)
288 | 
289 |     dy = (base_ctr_y - ctr_y) / height
290 |     dx = (base_ctr_x - ctr_x) / width
291 |     dh = xp.log(base_height / height)
292 |     dw = xp.log(base_width / width)
293 | 
294 |     loc = xp.vstack((dy, dx, dh, dw)).transpose()
295 |     return loc
296 | 
297 | 
298 | def bbox_iou(bbox_a, bbox_b):
299 |     """Calculate the Intersection of Unions (IoUs) between bounding boxes.
300 | 
301 |     IoU is calculated as a ratio of area of the intersection
302 |     and area of the union.
303 | 
304 |     This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as
305 |     inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be
306 |     same type.
307 |     The output is same type as the type of the inputs.
308 | 
309 |     Args:
310 |         bbox_a (array): An array whose shape is :math:`(N, 4)`.
311 |             :math:`N` is the number of bounding boxes.
312 |             The dtype should be :obj:`numpy.float32`.
313 |         bbox_b (array): An array similar to :obj:`bbox_a`,
314 |             whose shape is :math:`(K, 4)`.
315 |             The dtype should be :obj:`numpy.float32`.
316 | 
317 |     Returns:
318 |         array:
319 |         An array whose shape is :math:`(N, K)`. \
320 |         An element at index :math:`(n, k)` contains IoUs between \
321 |         :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \
322 |         box in :obj:`bbox_b`.
323 | 
324 |     """
325 |     if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
326 |         raise IndexError
327 | 
328 |     # top left
329 |     tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
330 |     # bottom right
331 |     br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
332 | 
333 |     area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2)
334 |     area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
335 |     area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
336 |     return area_i / (area_a[:, None] + area_b - area_i)
337 | 
338 | 
339 | def __test():
340 |     pass
341 | 
342 | 
343 | if __name__ == '__main__':
344 |     __test()
345 | 
346 | 
347 | def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
348 |                          anchor_scales=[8, 16, 32]):
349 |     """Generate anchor base windows by enumerating aspect ratio and scales.
350 | 
351 |     Generate anchors that are scaled and modified to the given aspect ratios.
352 |     Area of a scaled anchor is preserved when modifying to the given aspect
353 |     ratio.
354 | 
355 |     :obj:`R = len(ratios) * len(anchor_scales)` anchors are generated by this
356 |     function.
357 |     The :obj:`i * len(anchor_scales) + j` th anchor corresponds to an anchor
358 |     generated by :obj:`ratios[i]` and :obj:`anchor_scales[j]`.
359 | 
360 |     For example, if the scale is :math:`8` and the ratio is :math:`0.25`,
361 |     the width and the height of the base window will be stretched by :math:`8`.
362 |     For modifying the anchor to the given aspect ratio,
363 |     the height is halved and the width is doubled.
364 | 
365 |     Args:
366 |         base_size (number): The width and the height of the reference window.
367 |         ratios (list of floats): This is ratios of width to height of
368 |             the anchors.
369 |         anchor_scales (list of numbers): This is areas of anchors.
370 |             Those areas will be the product of the square of an element in
371 |             :obj:`anchor_scales` and the original area of the reference
372 |             window.
373 | 
374 |     Returns:
375 |         ~numpy.ndarray:
376 |         An array of shape :math:`(R, 4)`.
377 |         Each element is a set of coordinates of a bounding box.
378 |         The second axis corresponds to
379 |         :math:`(y_{min}, x_{min}, y_{max}, x_{max})` of a bounding box.
380 | 
381 |     """
382 |     py = base_size / 2.
383 |     px = base_size / 2.
384 | 
385 |     anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
386 |                            dtype=np.float32)
387 |     for i in six.moves.range(len(ratios)):
388 |         for j in six.moves.range(len(anchor_scales)):
389 |             h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
390 |             w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
391 | 
392 |             index = i * len(anchor_scales) + j
393 |             anchor_base[index, 0] = py - h / 2.
394 |             anchor_base[index, 1] = px - w / 2.
395 |             anchor_base[index, 2] = py + h / 2.
396 |             anchor_base[index, 3] = px + w / 2.
397 |     return anchor_base
398 | 


--------------------------------------------------------------------------------
/model/utils/creator_tool.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import cupy as cp
  3 | 
  4 | from model.utils.bbox_tools import bbox2loc, bbox_iou, loc2bbox, bbox2T
  5 | from model.utils.nms import non_maximum_suppression
  6 | 
  7 | 
  8 | class ProposalTargetCreator(object):
  9 |     """
 10 |     When using LocNet to improve Fast RCNN, you need to change this class \
 11 |     to generate ground truth label in different forms.
 12 |     
 13 |     Assign ground truth bounding boxes to given RoIs.
 14 | 
 15 |     The :meth:`__call__` of this class generates training targets
 16 |     for each object proposal.
 17 |     This is used to train Faster RCNN [#]_.
 18 | 
 19 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
 20 |     Faster R-CNN: Towards Real-Time Object Detection with \
 21 |     Region Proposal Networks. NIPS 2015.
 22 | 
 23 |     Args:
 24 |         n_sample (int): The number of sampled regions.
 25 |         pos_ratio (float): Fraction of regions that is labeled as a foreground.
 26 |         pos_iou_thresh (float): IoU threshold for a RoI to be considered as a foreground.
 27 |         neg_iou_thresh_hi (float): RoI is considered to be the background if IoU is in
 28 |             [:obj:`neg_iou_thresh_lo`, :obj:`neg_iou_thresh_hi`).
 29 |         neg_iou_thresh_lo (float): See above.
 30 | 
 31 |     """
 32 | 
 33 |     def __init__(self,
 34 |                  n_sample=128,
 35 |                  pos_ratio=0.25, pos_iou_thresh=0.5,
 36 |                  neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
 37 |                  ):
 38 |         self.n_sample = n_sample
 39 |         self.pos_ratio = pos_ratio
 40 |         self.pos_iou_thresh = pos_iou_thresh
 41 |         self.neg_iou_thresh_hi = neg_iou_thresh_hi
 42 |         self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE: py-faster-rcnn默认的值是0.1
 43 | 
 44 |     def __call__(self, roi, search_region, bbox, label):
 45 |         """Assigns ground truth to sampled proposals.
 46 | 
 47 |         This function samples total of :obj:`self.n_sample` RoIs
 48 |         from the combination of :obj:`roi` and :obj:`bbox`.
 49 |         The RoIs are assigned with the ground truth class labels as well as
 50 |         bounding box offsets and scales to match the ground truth bounding
 51 |         boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are
 52 |         sampled as foregrounds.
 53 | 
 54 |         Offsets and scales of bounding boxes are calculated using
 55 |         :func:`model.utils.bbox_tools.bbox2loc`.
 56 |         Also, types of input arrays and output arrays are same.
 57 | 
 58 |         Here are notations.
 59 | 
 60 |         * :math:`S` is the total number of sampled RoIs, which equals \
 61 |             :obj:`self.n_sample`.
 62 |         * :math:`L` is number of object classes possibly including the \
 63 |             background.
 64 | 
 65 |         Args:
 66 |             roi (array): Region of Interests (RoIs) from which we sample.
 67 |                 Its shape is :math:`(R, 4)`
 68 |             bbox (array): The coordinates of ground truth bounding boxes.
 69 |                 Its shape is :math:`(R', 4)`.
 70 |             label (array): Ground truth bounding box labels. Its shape
 71 |                 is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
 72 |                 :math:`L` is the number of foreground classes.
 73 |             loc_normalize_mean (tuple of four floats): Mean values to normalize
 74 |                 coordinates of bouding boxes.
 75 |             loc_normalize_std (tupler of four floats): Standard deviation of
 76 |                 the coordinates of bounding boxes.
 77 | 
 78 |         Returns:
 79 |             (array, array, array):
 80 | 
 81 |             * **sample_roi**: Regions of interests that are sampled. \
 82 |                 Its shape is :math:`(S, 4)`.
 83 |             * **sample_search_region**: Search_region that are sampled. \
 84 |                 Its shape is :math:`(S, 4)`.
 85 |             * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \
 86 |                 :math:`(S,)`. Its range is :math:`[0, L]`. The label with \
 87 |                 value 0 is the background.
 88 |             
 89 |             # * **gt_roi_loc**: Offsets and scales to match \
 90 |             #     the sampled RoIs to the ground truth bounding boxes. \
 91 |             #     Its shape is :math:`(S, 4)`.
 92 | 
 93 | 
 94 |         """
 95 |         n_bbox, _ = bbox.shape
 96 | 
 97 |         # print("n_bbox : ", n_bbox)
 98 | 
 99 |         # roi shape (N,4) , bbox shape (M,4), so the new iou shape is (N+M,4)
100 |         # roi = np.concatenate((roi, bbox), axis=0) # 为什么把 roi 和 bbox 连起来了
101 |         # search_boxes = _generate_search_region(bbox, Sh=1.2, Sw=1.2) # enlarge bbox to get some search_regions
102 |         # search_region = np.concatenate((search_region, search_boxes), axis=0)
103 | 
104 |         pos_roi_per_image = np.round(self.n_sample * self.pos_ratio) # 正样本的数量
105 |         iou = bbox_iou(roi, bbox) #  iou shape is (N, M) or (N+M, M)    
106 |         gt_assignment = iou.argmax(axis=1)  # 含义是把某一个 roi 分配给了哪一个 bbox
107 |         max_iou = iou.max(axis=1)
108 | 
109 |         # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
110 |         # The label with value 0 is the background.
111 |         gt_roi_label = label[gt_assignment] + 1
112 | 
113 |         # Select foreground RoIs as those with >= pos_iou_thresh IoU.
114 |         pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
115 | 
116 |         pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
117 |         if pos_index.size > 0:
118 |             pos_index = np.random.choice(
119 |                 pos_index, size=pos_roi_per_this_image, replace=False)
120 | 
121 |         # Select background RoIs as those within
122 |         # [neg_iou_thresh_lo, neg_iou_thresh_hi).
123 |         neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
124 |                              (max_iou >= self.neg_iou_thresh_lo))[0]
125 |         neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
126 |         neg_roi_per_this_image = int(min(neg_roi_per_this_image,
127 |                                          neg_index.size))
128 |         if neg_index.size > 0:
129 |             neg_index = np.random.choice(
130 |                 neg_index, size=neg_roi_per_this_image, replace=False)
131 | 
132 |         # The indices that we're selecting (both positive and negative).
133 |         keep_index = np.append(pos_index, neg_index)
134 | 
135 |         gt_roi_label = gt_roi_label[keep_index]
136 |         gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
137 |         
138 |         # print("gt_roi_label ", gt_roi_label)
139 |         # print(keep_index.shape)
140 |         # print("search_region.shape", search_region.shape)
141 |         # print("max of search_region", np.max(search_region))
142 |         # for i in range(search_region.shape[0]):
143 |         # print(search_region[i,:])
144 |         # print(roi.shape)
145 | 
146 |         sample_roi = roi[keep_index]
147 |         sample_search_region = search_region[keep_index]
148 | 
149 |         # use search region and bbox to generate Tx and Ty
150 |         # sample_roi 和 sample_bboxes 是一一对应的
151 |         # 当然这里可能出现一个 bbox 对应了多个 sample_roi 的情况
152 |         # sample_bboxes shape (S,4)
153 |         sample_bbox = bbox[gt_assignment[keep_index]]
154 |         
155 |         # print("", gt_assignment[keep_index])
156 |         # print("sample_bbox ", sample_bbox)
157 |         
158 |         Tx, Ty = bbox2T(sample_search_region, sample_bbox)
159 | 
160 |         return sample_roi, sample_search_region, (Tx,Ty), gt_roi_label
161 | 
162 | 
163 | class AnchorTargetCreator(object):
164 |     """Assign the ground truth bounding boxes to anchors.
165 | 
166 |     Assigns the ground truth bounding boxes to anchors for training Region
167 |     Proposal Networks introduced in Faster R-CNN [#]_.
168 | 
169 |     Offsets and scales to match anchors to the ground truth are
170 |     calculated using the encoding scheme of
171 |     :func:`model.utils.bbox_tools.bbox2loc`.
172 | 
173 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
174 |     Faster R-CNN: Towards Real-Time Object Detection with \
175 |     Region Proposal Networks. NIPS 2015.
176 | 
177 |     Args:
178 |         n_sample (int): The number of regions to produce.
179 |         pos_iou_thresh (float): Anchors with IoU above this
180 |             threshold will be assigned as positive.
181 |         neg_iou_thresh (float): Anchors with IoU below this
182 |             threshold will be assigned as negative.
183 |         pos_ratio (float): Ratio of positive regions in the
184 |             sampled regions.
185 | 
186 |     """
187 | 
188 |     def __init__(self,
189 |                  n_sample=256,
190 |                  pos_iou_thresh=0.7, 
191 |                  neg_iou_thresh=0.3,
192 |                  pos_ratio=0.5):
193 |         self.n_sample = n_sample
194 |         self.pos_iou_thresh = pos_iou_thresh
195 |         self.neg_iou_thresh = neg_iou_thresh
196 |         self.pos_ratio = pos_ratio
197 | 
198 |     def __call__(self, bbox, anchor, img_size):
199 |         """Assign ground truth supervision to sampled subset of anchors.
200 | 
201 |         Types of input arrays and output arrays are same.
202 | 
203 |         Here are notations.
204 | 
205 |         * :math:`S` is the number of anchors.
206 |         * :math:`R` is the number of bounding boxes.
207 | 
208 |         Args:
209 |             bbox (array): Coordinates of bounding boxes. Its shape is
210 |                 :math:`(R, 4)`.
211 |             anchor (array): Coordinates of anchors. Its shape is
212 |                 :math:`(S, 4)`.
213 |             img_size (tuple of ints): A tuple :obj:`H, W`, which
214 |                 is a tuple of height and width of an image.
215 | 
216 |         Returns:
217 |             (array, array):
218 | 
219 |             #NOTE: it's scale not only  offset
220 |             * **loc**: Offsets and scales to match the anchors to \
221 |                 the ground truth bounding boxes. Its shape is :math:`(S, 4)`.
222 |             * **label**: Labels of anchors with values \
223 |                 :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape \
224 |                 is :math:`(S,)`.
225 | 
226 |         """
227 | 
228 |         img_H, img_W = img_size
229 | 
230 |         n_anchor = len(anchor)
231 |         inside_index = _get_inside_index(anchor, img_H, img_W)
232 |         anchor = anchor[inside_index]
233 |         argmax_ious, label = self._create_label(
234 |             inside_index, anchor, bbox)
235 | 
236 |         # compute bounding box regression targets
237 |         loc = bbox2loc(anchor, bbox[argmax_ious])
238 | 
239 |         # map up to original set of anchors
240 |         label = _unmap(label, n_anchor, inside_index, fill=-1)
241 |         loc = _unmap(loc, n_anchor, inside_index, fill=0)
242 | 
243 |         return loc, label
244 | 
245 |     def _create_label(self, inside_index, anchor, bbox):
246 |         # label: 1 is positive, 0 is negative, -1 is dont care
247 |         label = np.empty((len(inside_index),), dtype=np.int32)
248 |         label.fill(-1)
249 | 
250 |         argmax_ious, max_ious, gt_argmax_ious = \
251 |             self._calc_ious(anchor, bbox, inside_index)
252 | 
253 |         # assign negative labels first so that positive labels can clobber them
254 |         label[max_ious < self.neg_iou_thresh] = 0
255 | 
256 |         # positive label: for each gt, anchor with highest iou
257 |         label[gt_argmax_ious] = 1
258 | 
259 |         # positive label: above threshold IOU
260 |         label[max_ious >= self.pos_iou_thresh] = 1
261 | 
262 |         # subsample positive labels if we have too many
263 |         n_pos = int(self.pos_ratio * self.n_sample)
264 |         pos_index = np.where(label == 1)[0]
265 |         if len(pos_index) > n_pos:
266 |             disable_index = np.random.choice(
267 |                 pos_index, size=(len(pos_index) - n_pos), replace=False)
268 |             label[disable_index] = -1
269 | 
270 |         # subsample negative labels if we have too many
271 |         n_neg = self.n_sample - np.sum(label == 1)
272 |         neg_index = np.where(label == 0)[0]
273 |         if len(neg_index) > n_neg:
274 |             disable_index = np.random.choice(
275 |                 neg_index, size=(len(neg_index) - n_neg), replace=False)
276 |             label[disable_index] = -1
277 | 
278 |         return argmax_ious, label
279 | 
280 |     def _calc_ious(self, anchor, bbox, inside_index):
281 |         # ious between the anchors and the gt boxes
282 |         ious = bbox_iou(anchor, bbox)
283 |         argmax_ious = ious.argmax(axis=1)
284 |         max_ious = ious[np.arange(len(inside_index)), argmax_ious]
285 |         gt_argmax_ious = ious.argmax(axis=0)
286 |         gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
287 |         gt_argmax_ious = np.where(ious == gt_max_ious)[0]
288 | 
289 |         return argmax_ious, max_ious, gt_argmax_ious
290 | 
291 | 
292 | def _unmap(data, count, index, fill=0):
293 |     # Unmap a subset of item (data) back to the original set of items (of
294 |     # size count)
295 | 
296 |     if len(data.shape) == 1:
297 |         ret = np.empty((count,), dtype=data.dtype)
298 |         ret.fill(fill)
299 |         ret[index] = data
300 |     else:
301 |         ret = np.empty((count,) + data.shape[1:], dtype=data.dtype)
302 |         ret.fill(fill)
303 |         ret[index, :] = data
304 |     return ret
305 | 
306 | 
307 | def _get_inside_index(anchor, H, W):
308 |     # Calc indicies of anchors which are located completely inside of the image
309 |     # whose size is speficied.
310 |     index_inside = np.where(
311 |         (anchor[:, 0] >= 0) &
312 |         (anchor[:, 1] >= 0) &
313 |         (anchor[:, 2] <= H) &
314 |         (anchor[:, 3] <= W)
315 |     )[0]
316 |     return index_inside
317 | 
318 | 
319 | class ProposalCreator:
320 |     """
321 |     Changed : search_regions are also generated by call this object.
322 | 
323 |     Proposal regions are generated by calling this object.
324 | 
325 |     The :meth:`__call__` of this object outputs object detection proposals by
326 |     applying estimated bounding box offsets to a set of anchors.
327 | 
328 |     This class takes parameters to control number of bounding boxes to
329 |     pass to NMS and keep after NMS.
330 |     If the paramters are negative, it uses all the bounding boxes supplied
331 |     or keep all the bounding boxes returned by NMS.
332 | 
333 |     This class is used for Region Proposal Networks introduced in
334 |     Faster R-CNN [#]_.
335 | 
336 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
337 |     Faster R-CNN: Towards Real-Time Object Detection with \
338 |     Region Proposal Networks. NIPS 2015.
339 | 
340 |     Args:
341 |         nms_thresh (float): Threshold value used when calling NMS.
342 |         n_train_pre_nms (int): Number of top scored bounding boxes
343 |             to keep before passing to NMS in train mode.
344 |         n_train_post_nms (int): Number of top scored bounding boxes
345 |             to keep after passing to NMS in train mode.
346 |         n_test_pre_nms (int): Number of top scored bounding boxes
347 |             to keep before passing to NMS in test mode.
348 |         n_test_post_nms (int): Number of top scored bounding boxes
349 |             to keep after passing to NMS in test mode.
350 |         force_cpu_nms (bool): If this is :obj:`True`,
351 |             always use NMS in CPU mode. If :obj:`False`,
352 |             the NMS mode is selected based on the type of inputs.
353 |         min_size (int): A paramter to determine the threshold on
354 |             discarding bounding boxes based on their sizes.
355 | 
356 |     """
357 | 
358 |     def __init__(self,
359 |                  parent_model,
360 |                  nms_thresh=0.7,
361 |                  n_train_pre_nms=12000,
362 |                  n_train_post_nms=2000,
363 |                  n_test_pre_nms=6000,
364 |                  n_test_post_nms=300,
365 |                  min_size=16
366 |                  ):
367 |         self.parent_model = parent_model
368 |         self.nms_thresh = nms_thresh
369 |         self.n_train_pre_nms = n_train_pre_nms
370 |         self.n_train_post_nms = n_train_post_nms
371 |         self.n_test_pre_nms = n_test_pre_nms
372 |         self.n_test_post_nms = n_test_post_nms
373 |         self.min_size = min_size
374 | 
375 |     def __call__(self, loc, score,
376 |                  anchor, img_size, scale=1.):
377 |         """input should  be ndarray
378 |         Propose RoIs.
379 | 
380 |         Inputs :obj:`loc, score, anchor` refer to the same anchor when indexed
381 |         by the same index.
382 | 
383 |         On notations, :math:`R` is the total number of anchors. This is equal
384 |         to product of the height and the width of an image and the number of
385 |         anchor bases per pixel.
386 | 
387 |         Type of the output is same as the inputs.
388 | 
389 |         Args:
390 |             loc (array): Predicted offsets and scaling to anchors.
391 |                 Its shape is :math:`(R, 4)`.
392 |             score (array): Predicted foreground probability for anchors.
393 |                 Its shape is :math:`(R,)`.
394 |             anchor (array): Coordinates of anchors. Its shape is
395 |                 :math:`(R, 4)`.
396 |             img_size (tuple of ints): A tuple :obj:`height, width`,
397 |                 which contains image size after scaling.
398 |             scale (float): The scaling factor used to scale an image after
399 |                 reading it from a file.
400 | 
401 |         Returns:
402 |             array:
403 |             An array of coordinates of proposal boxes.
404 |             Its shape is :math:`(S, 4)`. :math:`S` is less than
405 |             :obj:`self.n_test_post_nms` in test time and less than
406 |             :obj:`self.n_train_post_nms` in train time. :math:`S` depends on
407 |             the size of the predicted bounding boxes and the number of
408 |             bounding boxes discarded by NMS.
409 |         
410 |         Steps:
411 |             1. set different parameters when training or predicting.
412 |             2. use anchor and loc to caculate boxes coordinates.
413 |             3. clip.
414 |             4. remove too small boxes.
415 |             5. select top score boxes.
416 | 
417 |         """
418 |         # NOTE: when test, remember
419 |         # faster_rcnn.eval()
420 |         # to set self.traing = False
421 |         if self.parent_model.training:
422 |             n_pre_nms = self.n_train_pre_nms
423 |             n_post_nms = self.n_train_post_nms
424 |         else:
425 |             n_pre_nms = self.n_test_pre_nms
426 |             n_post_nms = self.n_test_post_nms
427 | 
428 |         # Convert anchors into proposal via bbox transformations.
429 |         # roi = loc2bbox(anchor, loc)
430 |         # 使用 anchor 和 loc 来计算rpn预测的box的具体坐标
431 |         # roi shape (R, 4)
432 |         # roi : (ymin, xmin, ymax, xmax)
433 |         roi = loc2bbox(anchor, loc) 
434 | 
435 |         # generate search_region using roi
436 |         search_region = _generate_search_region(roi)
437 | 
438 |         # Clip predicted boxes to image. 
439 |         # 使用reshaped image的上下左右边来裁剪 roi，使得roi都在reshaped image里边
440 |         roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
441 |         roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])
442 |         # 使用reshaped image的上下左右边来裁剪 search_region，使得search_region都在reshaped image里边
443 |         search_region[:, slice(0, 4, 2)] = np.clip(search_region[:, slice(0, 4, 2)], 0, img_size[0])
444 |         search_region[:, slice(1, 4, 2)] = np.clip(search_region[:, slice(1, 4, 2)], 0, img_size[1])
445 | 
446 |         # Remove predicted boxes with either height or width < threshold.
447 |         min_size = self.min_size * scale
448 |         hs = roi[:, 2] - roi[:, 0]
449 |         ws = roi[:, 3] - roi[:, 1]
450 |         keep = np.where((hs >= min_size) & (ws >= min_size))[0]
451 |         roi = roi[keep, :]
452 |         search_region = search_region[keep, :]
453 |         score = score[keep]
454 | 
455 |         # Sort all (proposal, score) pairs by score from highest to lowest.
456 |         # Take top pre_nms_topN (e.g. 6000).
457 |         order = score.ravel().argsort()[::-1]
458 |         if n_pre_nms > 0:
459 |             order = order[:n_pre_nms]
460 |         roi = roi[order, :]
461 |         search_region = search_region[order, :]
462 | 
463 |         # Apply nms (e.g. threshold = 0.7).
464 |         # Take after_nms_topN (e.g. 300).
465 | 
466 |         # unNOTE: somthing is wrong here!
467 |         # TODO: remove cuda.to_gpu
468 |         keep = non_maximum_suppression(cp.ascontiguousarray(cp.asarray(roi)),
469 |                                        thresh=self.nms_thresh)
470 |         if n_post_nms > 0:
471 |             keep = keep[:n_post_nms]
472 | 
473 |         roi = roi[keep]
474 |         search_region = search_region[keep]
475 |       
476 |         return roi, search_region
477 | 
478 | 
479 | def _generate_search_region(roi, Sh=1.2, Sw=1.2):
480 | 
481 |     search_region = np.zeros(roi.shape)
482 | 
483 |     for i in range(roi.shape[0]):
484 |         ymin_roi, xmin_roi, ymax_roi, xmax_roi = roi[i,:]
485 | 
486 |         y_center = (ymin_roi + ymax_roi)/2
487 |         x_center = (xmin_roi + xmax_roi)/2
488 |         
489 |         height_roi = ymax_roi - ymin_roi
490 |         width_roi = xmax_roi - xmin_roi
491 |         
492 |         # search region parameters
493 |         height_s = height_roi * Sh
494 |         width_s = width_roi * Sw
495 | 
496 |         ymin_s = y_center - height_s/2
497 |         ymax_s = y_center + height_s/2
498 |         xmin_s = x_center - width_s/2
499 |         xmax_s = x_center + width_s/2
500 | 
501 |         search_region[i,:] = ymin_s, xmin_s, ymax_s, xmax_s
502 |     
503 |     return search_region
504 | 


--------------------------------------------------------------------------------