├── data ├── __init__.py ├── dataset.py ├── voc_dataset.py └── util.py ├── model ├── __init__.py ├── utils │ ├── __init__.py │ ├── nms │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── non_maximum_suppression.cpython-36.pyc │ │ ├── build │ │ │ └── temp.linux-x86_64-3.6 │ │ │ │ └── _nms_gpu_post.o │ │ ├── _nms_gpu_post.cpython-36m-x86_64-linux-gnu.so │ │ ├── build.py │ │ ├── _nms_gpu_post_py.py │ │ ├── _nms_gpu_post.pyx │ │ ├── nohup.out │ │ └── non_maximum_suppression.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── bbox_tools.cpython-36.pyc │ │ ├── roi_sample.cpython-36.pyc │ │ └── rpn_gt_loc_label.cpython-36.pyc │ ├── roi_sample.py │ ├── rpn_gt_loc_label.py │ └── bbox_tools.py ├── vgg16.py ├── rpn.py ├── faster_rcnn.py └── roi_module.py ├── utils ├── __init__.py ├── __pycache__ │ ├── config.cpython-36.pyc │ ├── py_nms.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── anchors.cpython-36.pyc │ ├── data_load.cpython-36.pyc │ ├── eval_tool.cpython-36.pyc │ └── array_tool.cpython-36.pyc ├── py_nms.py ├── array_tool.py ├── config.py ├── anchors.py ├── data_load.py └── eval_tool.py ├── trainer ├── __init__.py ├── __pycache__ │ ├── trainer.cpython-36.pyc │ └── __init__.cpython-36.pyc └── trainer.py ├── demo.jpg └── Train.ipynb /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/demo.jpg -------------------------------------------------------------------------------- /model/utils/nms/__init__.py: -------------------------------------------------------------------------------- 1 | from model.utils.nms.non_maximum_suppression import non_maximum_suppression -------------------------------------------------------------------------------- /utils/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/py_nms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/py_nms.cpython-36.pyc -------------------------------------------------------------------------------- /trainer/__pycache__/trainer.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/trainer/__pycache__/trainer.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/anchors.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/anchors.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_load.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/data_load.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/eval_tool.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/eval_tool.cpython-36.pyc -------------------------------------------------------------------------------- /trainer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/trainer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/array_tool.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/array_tool.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/bbox_tools.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/bbox_tools.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/roi_sample.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/roi_sample.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/nms/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/rpn_gt_loc_label.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/rpn_gt_loc_label.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/nms/build/temp.linux-x86_64-3.6/_nms_gpu_post.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/build/temp.linux-x86_64-3.6/_nms_gpu_post.o -------------------------------------------------------------------------------- /model/utils/nms/_nms_gpu_post.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/_nms_gpu_post.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /model/utils/nms/__pycache__/non_maximum_suppression.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/__pycache__/non_maximum_suppression.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/nms/build.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | 5 | ext_modules = [Extension("_nms_gpu_post", ["_nms_gpu_post.pyx"])] 6 | setup( 7 | name="Hello pyx", 8 | cmdclass={'build_ext': build_ext}, 9 | ext_modules=ext_modules 10 | ) 11 | -------------------------------------------------------------------------------- /model/vgg16.py: -------------------------------------------------------------------------------- 1 | from torchvision.models import vgg16 2 | from torch import nn 3 | 4 | def decom_vgg16(): 5 | model = vgg16(pretrained=True) 6 | features = list(model.features)[:30] 7 | classifier = list(model.classifier) 8 | # remove last layer and dropout layer 9 | del classifier[6] 10 | del classifier[5] 11 | del classifier[2] 12 | # free top layer params 13 | for layer in features[:10]: 14 | for p in layer.parameters(): 15 | p.requires_grad = False 16 | return nn.Sequential(*features), nn.Sequential(*classifier) -------------------------------------------------------------------------------- /model/utils/nms/_nms_gpu_post_py.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def _nms_gpu_post( mask, 5 | n_bbox, 6 | threads_per_block, 7 | col_blocks 8 | ): 9 | n_selection = 0 10 | one_ull = np.array([1],dtype=np.uint64) 11 | selection = np.zeros((n_bbox,), dtype=np.int32) 12 | remv = np.zeros((col_blocks,), dtype=np.uint64) 13 | 14 | for i in range(n_bbox): 15 | nblock = i // threads_per_block 16 | inblock = i % threads_per_block 17 | 18 | if not (remv[nblock] & one_ull << inblock): 19 | selection[n_selection] = i 20 | n_selection += 1 21 | 22 | index = i * col_blocks 23 | for j in range(nblock, col_blocks): 24 | remv[j] |= mask[index + j] 25 | return selection, n_selection 26 | -------------------------------------------------------------------------------- /utils/py_nms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def py_cpu_nms(rois, 
thresh): 4 | """ 5 | Pure Python NMS baseline. 6 | Already Sorted 7 | 8 | return: 9 | keep: roi keep indice 10 | """ 11 | y1 = rois[:, 0] 12 | x1 = rois[:, 1] 13 | y2 = rois[:, 2] 14 | x2 = rois[:, 3] 15 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 16 | 17 | N = len(rois) 18 | order = np.array(range(N)) 19 | 20 | keep = [] 21 | while order.size > 0: 22 | i = order[0] 23 | keep.append(i) 24 | xx1 = np.maximum(x1[i], x1[order[1:]]) 25 | yy1 = np.maximum(y1[i], y1[order[1:]]) 26 | xx2 = np.minimum(x2[i], x2[order[1:]]) 27 | yy2 = np.minimum(y2[i], y2[order[1:]]) 28 | 29 | w = np.maximum(0.0, xx2 - xx1 + 1) 30 | h = np.maximum(0.0, yy2 - yy1 + 1) 31 | inter = w * h 32 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 33 | 34 | inds = np.where(ovr <= thresh)[0] 35 | order = order[inds + 1] 36 | 37 | return keep -------------------------------------------------------------------------------- /model/utils/nms/_nms_gpu_post.pyx: -------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | from libc.stdint cimport uint64_t 3 | 4 | import numpy as np 5 | 6 | def _nms_gpu_post(np.ndarray[np.uint64_t, ndim=1] mask, 7 | int n_bbox, 8 | int threads_per_block, 9 | int col_blocks 10 | ): 11 | cdef: 12 | int i, j, nblock, index 13 | uint64_t inblock 14 | int n_selection = 0 15 | uint64_t one_ull = 1 16 | np.ndarray[np.int32_t, ndim=1] selection 17 | np.ndarray[np.uint64_t, ndim=1] remv 18 | 19 | selection = np.zeros((n_bbox,), dtype=np.int32) 20 | remv = np.zeros((col_blocks,), dtype=np.uint64) 21 | 22 | for i in range(n_bbox): 23 | nblock = i // threads_per_block 24 | inblock = i % threads_per_block 25 | 26 | if not (remv[nblock] & one_ull << inblock): 27 | selection[n_selection] = i 28 | n_selection += 1 29 | 30 | index = i * col_blocks 31 | for j in range(nblock, col_blocks): 32 | remv[j] |= mask[index + j] 33 | return selection, n_selection 34 | -------------------------------------------------------------------------------- /utils/array_tool.py: -------------------------------------------------------------------------------- 1 | """ 2 | tools to convert specified type 3 | """ 4 | import torch as t 5 | import numpy as np 6 | 7 | 8 | def tonumpy(data): 9 | if isinstance(data, np.ndarray): 10 | return data 11 | if isinstance(data, t._TensorBase): 12 | return data.cpu().numpy() 13 | if isinstance(data, t.autograd.Variable): 14 | return tonumpy(data.data) 15 | 16 | 17 | def totensor(data, cuda=True): 18 | if isinstance(data, np.ndarray): 19 | tensor = t.from_numpy(data) 20 | if isinstance(data, t._TensorBase): 21 | tensor = data 22 | if isinstance(data, t.autograd.Variable): 23 | tensor = data.data 24 | if cuda: 25 | tensor = tensor.cuda() 26 | return tensor 27 | 28 | 29 | def tovariable(data): 30 | if isinstance(data, np.ndarray): 31 | return tovariable(totensor(data)) 32 | if isinstance(data, t._TensorBase): 33 | return t.autograd.Variable(data) 34 | if isinstance(data, t.autograd.Variable): 35 | return data 36 | else: 37 | raise ValueError("UnKnow data type: %s, input should be {np.ndarray,Tensor,Variable}" %type(data)) 38 | 39 | 40 | def scalar(data): 41 | if isinstance(data, np.ndarray): 42 | return data.reshape(1)[0] 43 | if isinstance(data, t._TensorBase): 44 | return data.view(1)[0] 45 | if isinstance(data, t.autograd.Variable): 46 | return data.data.view(1)[0] 47 | -------------------------------------------------------------------------------- /utils/config.py: -------------------------------------------------------------------------------- 1 | from 
pprint import pprint 2 | 3 | 4 | # Default Configs for training 5 | # NOTE that, config items could be overwriten by passing argument through command line. 6 | # e.g. --voc-data-dir='./data/' 7 | 8 | class Config: 9 | # data 10 | voc_data_dir = '/home/guangyaoyang/data/VOCdevkit/VOC2007' 11 | min_size = 600 # image resize 12 | max_size = 1000 # image resize 13 | num_workers = 8 14 | test_num_workers = 8 15 | 16 | # # sigma for l1_smooth_loss 17 | rpn_sigma = 3. 18 | roi_sigma = 1. 19 | 20 | # # param for optimizer 21 | # # 0.0005 in origin paper but 0.0001 in tf-faster-rcnn 22 | weight_decay = 0.0005 23 | # lr_decay = 0.1 # 1e-3 -> 1e-4 24 | lr = 1e-3 25 | 26 | 27 | # # visualization 28 | # env = 'faster-rcnn' # visdom env 29 | # port = 8097 30 | # plot_every = 40 # vis every N iter 31 | 32 | # # preset 33 | # data = 'voc' 34 | # pretrained_model = 'vgg16' 35 | 36 | # # training 37 | epoch = 14 38 | 39 | 40 | use_adam = False # Use Adam optimizer 41 | # use_chainer = False # try match everything as chainer 42 | # use_drop = False # use dropout in RoIHead 43 | # # debug 44 | # debug_file = '/tmp/debugf' 45 | 46 | test_num = 10000 47 | # # model 48 | # load_path = None 49 | 50 | caffe_pretrain = False # use caffe pretrained model instead of torchvision 51 | caffe_pretrain_path = 'checkpoints/vgg16-caffe.pth' 52 | 53 | # def _parse(self, kwargs): 54 | # state_dict = self._state_dict() 55 | # for k, v in kwargs.items(): 56 | # if k not in state_dict: 57 | # raise ValueError('UnKnown Option: "--%s"' % k) 58 | # setattr(self, k, v) 59 | 60 | # print('======user config========') 61 | # pprint(self._state_dict()) 62 | # print('==========end============') 63 | 64 | # def _state_dict(self): 65 | # return {k: getattr(self, k) for k, _ in Config.__dict__.items() \ 66 | # if not k.startswith('_')} 67 | 68 | 69 | opt = Config() 70 | -------------------------------------------------------------------------------- /utils/anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def generate_anchor_base(side_length=16, ratios=[0.5, 1, 2], 4 | scales=[0.5, 1, 2], strides=16): 5 | """ 6 | Generate anchors for a single 16*16 block. Then transform the anchors 7 | to the original image space. 8 | 9 | Input: 10 | side_length: block side length 11 | ratios 12 | scales 13 | strides: network strides 14 | 15 | Return 16 | anchor_base: base anchor of the original image 17 | """ 18 | py = side_length / 2. 19 | px = side_length / 2. 20 | 21 | anchor_base = np.zeros((len(ratios) * len(scales), 4), 22 | dtype=np.float32) 23 | for i in range(len(ratios)): 24 | for j in range(len(scales)): 25 | h = side_length * strides * scales[j] * np.sqrt(ratios[i]) 26 | w = side_length * strides * scales[j] * np.sqrt(1. / ratios[i]) 27 | 28 | index = i * len(scales) + j 29 | anchor_base[index, 0] = py - h / 2. 30 | anchor_base[index, 1] = px - w / 2. 31 | anchor_base[index, 2] = py + h / 2. 32 | anchor_base[index, 3] = px + w / 2. 
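    # each row of anchor_base is (y_min, x_min, y_max, x_max) centred on (py, px);
    # with the defaults (side_length=16, strides=16, scales=[0.5, 1, 2]) the anchors
    # cover areas of roughly 128^2, 256^2 and 512^2 pixels in the input image.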
33 | return anchor_base 34 | 35 | 36 | def get_anchors(anchor_base, feat_stride, height, width): 37 | anchors_y = np.arange(height) * feat_stride 38 | anchors_x = np.arange(width) * feat_stride 39 | anchors_x, anchors_y = np.meshgrid(anchors_x, anchors_y) 40 | shift = np.stack((anchors_y.ravel(), anchors_x.ravel(), 41 | anchors_y.ravel(), anchors_x.ravel()), axis=1) 42 | anchors = np.repeat(shift, repeats=len(anchor_base), axis=0) + \ 43 | np.tile(anchor_base, [len(shift),1]) 44 | return anchors 45 | 46 | def get_rois_from_loc_anchors(anchors, rpn_locs): 47 | """Decode bounding boxes from bounding box offsets and scales. 48 | 49 | Given bounding box offsets and scales computed by 50 | :meth:`bbox2loc`, this function decodes the representation to 51 | coordinates in 2D image coordinates. 52 | 53 | Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding 54 | box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`, 55 | the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x` 56 | and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated 57 | by the following formulas. 58 | 59 | * :math:`\\hat{g}_y = p_h t_y + p_y` 60 | * :math:`\\hat{g}_x = p_w t_x + p_x` 61 | * :math:`\\hat{g}_h = p_h \\exp(t_h)` 62 | * :math:`\\hat{g}_w = p_w \\exp(t_w)` 63 | 64 | Args: 65 | anchors (array): A coordinates of bounding boxes. 66 | Its shape is :math:`(R, 4)`. These coordinates are 67 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 68 | rpn_locs (array): An array with offsets and scales. 69 | The shapes of :obj:`anchors` and :obj:`rpn_locs` should be same. 70 | This contains values :math:`t_y, t_x, t_h, t_w`. 71 | 72 | Returns: 73 | array: 74 | Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \ 75 | The second axis contains four values \ 76 | :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin}, 77 | \\hat{g}_{ymax}, \\hat{g}_{xmax}`. 78 | 79 | """ 80 | src_bbox = anchors 81 | src_bbox = src_bbox.astype(src_bbox.dtype, copy=False) 82 | 83 | src_height = src_bbox[:, 2] - src_bbox[:, 0] 84 | src_width = src_bbox[:, 3] - src_bbox[:, 1] 85 | src_ctr_y = src_bbox[:, 0] + 0.5 * src_height 86 | src_ctr_x = src_bbox[:, 1] + 0.5 * src_width 87 | 88 | dy = rpn_locs[:, 0] 89 | dx = rpn_locs[:, 1] 90 | dh = rpn_locs[:, 2] 91 | dw = rpn_locs[:, 3] 92 | 93 | dst_y = dy * src_height + src_ctr_y 94 | dst_x = dx * src_width + src_ctr_x 95 | dst_h = np.exp(dh) * src_height 96 | dst_w = np.exp(dw) * src_width 97 | 98 | dst_bbox = np.zeros(rpn_locs.shape, dtype=rpn_locs.dtype) 99 | dst_bbox[:, 0] = dst_y - 0.5 * dst_h 100 | dst_bbox[:, 1] = dst_x - 0.5 * dst_w 101 | dst_bbox[:, 2] = dst_y + 0.5 * dst_h 102 | dst_bbox[:, 3] = dst_x + 0.5 * dst_w 103 | 104 | return dst_bbox -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | from .voc_dataset import VOCBboxDataset 3 | from skimage import transform as sktsf 4 | from torchvision import transforms as tvtsf 5 | from . 
import util 6 | import numpy as np 7 | from utils.config import opt 8 | 9 | 10 | def inverse_normalize(img): 11 | if opt.caffe_pretrain: 12 | img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)) 13 | return img[::-1, :, :] 14 | # approximate un-normalize for visualize 15 | return (img * 0.225 + 0.45).clip(min=0, max=1) * 255 16 | 17 | 18 | def pytorch_normalze(img): 19 | """ 20 | https://github.com/pytorch/vision/issues/223 21 | return appr -1~1 RGB 22 | """ 23 | normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406], 24 | std=[0.229, 0.224, 0.225]) 25 | img = normalize(t.from_numpy(img)) 26 | return img.numpy() 27 | 28 | 29 | def caffe_normalize(img): 30 | """ 31 | return appr -125-125 BGR 32 | """ 33 | img = img[[2, 1, 0], :, :] # RGB-BGR 34 | img = img * 255 35 | mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1) 36 | img = (img - mean).astype(np.float32, copy=True) 37 | return img 38 | 39 | 40 | def preprocess(img, min_size=600, max_size=1000): 41 | """Preprocess an image for feature extraction. 42 | 43 | The length of the shorter edge is scaled to :obj:`self.min_size`. 44 | After the scaling, if the length of the longer edge is longer than 45 | :param min_size: 46 | :obj:`self.max_size`, the image is scaled to fit the longer edge 47 | to :obj:`self.max_size`. 48 | 49 | After resizing the image, the image is subtracted by a mean image value 50 | :obj:`self.mean`. 51 | 52 | Args: 53 | img (~numpy.ndarray): An image. This is in CHW and RGB format. 54 | The range of its value is :math:`[0, 255]`. 55 | (~numpy.ndarray): An image. This is in CHW and RGB format. 56 | The range of its value is :math:`[0, 255]`. 57 | 58 | Returns: 59 | ~numpy.ndarray: 60 | A preprocessed image. 61 | 62 | """ 63 | C, H, W = img.shape 64 | scale1 = min_size / min(H, W) 65 | scale2 = max_size / max(H, W) 66 | scale = min(scale1, scale2) 67 | img = img / 255. 68 | img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect') 69 | # both the longer and shorter should be less than 70 | # max_size and min_size 71 | if opt.caffe_pretrain: 72 | normalize = caffe_normalize 73 | else: 74 | normalize = pytorch_normalze 75 | return normalize(img) 76 | 77 | 78 | class Transform(object): 79 | 80 | def __init__(self, min_size=600, max_size=1000): 81 | self.min_size = min_size 82 | self.max_size = max_size 83 | 84 | def __call__(self, in_data): 85 | img, bbox, label = in_data 86 | _, H, W = img.shape 87 | img = preprocess(img, self.min_size, self.max_size) 88 | _, o_H, o_W = img.shape 89 | scale = o_H / H 90 | bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W)) 91 | 92 | # horizontally flip 93 | img, params = util.random_flip( 94 | img, x_random=True, return_param=True) 95 | bbox = util.flip_bbox( 96 | bbox, (o_H, o_W), x_flip=params['x_flip']) 97 | 98 | return img, bbox, label, scale 99 | 100 | 101 | class Dataset: 102 | def __init__(self, opt): 103 | self.opt = opt 104 | self.db = VOCBboxDataset(opt.voc_data_dir) 105 | self.tsf = Transform(opt.min_size, opt.max_size) 106 | 107 | def __getitem__(self, idx): 108 | ori_img, bbox, label, difficult = self.db.get_example(idx) 109 | 110 | img, bbox, label, scale = self.tsf((ori_img, bbox, label)) 111 | # TODO: check whose stride is negative to fix this instead copy all 112 | # some of the strides of a given numpy array are negative. 
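        # copying yields contiguous arrays with positive strides, which torch.from_numpy
        # can wrap directly (it rejects negatively-strided views such as flipped images).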
113 | return img.copy(), bbox.copy(), label.copy(), scale 114 | 115 | def __len__(self): 116 | return len(self.db) 117 | 118 | 119 | class TestDataset: 120 | def __init__(self, opt, split='test', use_difficult=True): 121 | self.opt = opt 122 | self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult) 123 | 124 | def __getitem__(self, idx): 125 | ori_img, bbox, label, difficult = self.db.get_example(idx) 126 | img = preprocess(ori_img) 127 | return img, ori_img.shape[1:], bbox, label, difficult 128 | 129 | def __len__(self): 130 | return len(self.db) 131 | -------------------------------------------------------------------------------- /model/utils/roi_sample.py: -------------------------------------------------------------------------------- 1 | from model.utils.bbox_tools import bbox2loc, bbox_iou 2 | import numpy as np 3 | 4 | class ProposalTargetCreator(object): 5 | """Assign ground truth bounding boxes to given RoIs. 6 | 7 | The :meth:`__call__` of this class generates training targets 8 | for each object proposal. 9 | This is used to train Faster RCNN [#]_. 10 | 11 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 12 | Faster R-CNN: Towards Real-Time Object Detection with \ 13 | Region Proposal Networks. NIPS 2015. 14 | 15 | Args: 16 | n_sample (int): The number of sampled regions. 17 | pos_ratio (float): Fraction of regions that is labeled as a 18 | foreground. 19 | pos_iou_thresh (float): IoU threshold for a RoI to be considered as a 20 | foreground. 21 | neg_iou_thresh_hi (float): RoI is considered to be the background 22 | if IoU is in 23 | [:obj:`neg_iou_thresh_hi`, :obj:`neg_iou_thresh_hi`). 24 | neg_iou_thresh_lo (float): See above. 25 | 26 | """ 27 | 28 | def __init__(self, 29 | n_sample=128, 30 | pos_ratio=0.25, pos_iou_thresh=0.5, 31 | neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0 32 | ): 33 | self.n_sample = n_sample 34 | self.pos_ratio = pos_ratio 35 | self.pos_iou_thresh = pos_iou_thresh 36 | self.neg_iou_thresh_hi = neg_iou_thresh_hi 37 | self.neg_iou_thresh_lo = neg_iou_thresh_lo # NOTE: py-faster-rcnn默认的值是0.1 38 | 39 | def __call__(self, roi, bbox, label, 40 | loc_normalize_mean=(0., 0., 0., 0.), 41 | loc_normalize_std=(0.1, 0.1, 0.2, 0.2)): 42 | """Assigns ground truth to sampled proposals. 43 | 44 | This function samples total of :obj:`self.n_sample` RoIs 45 | from the combination of :obj:`roi` and :obj:`bbox`. 46 | The RoIs are assigned with the ground truth class labels as well as 47 | bounding box offsets and scales to match the ground truth bounding 48 | boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are 49 | sampled as foregrounds. 50 | 51 | Offsets and scales of bounding boxes are calculated using 52 | :func:`model.utils.bbox_tools.bbox2loc`. 53 | Also, types of input arrays and output arrays are same. 54 | 55 | Here are notations. 56 | 57 | * :math:`S` is the total number of sampled RoIs, which equals \ 58 | :obj:`self.n_sample`. 59 | * :math:`L` is number of object classes possibly including the \ 60 | background. 61 | 62 | Args: 63 | roi (array): Region of Interests (RoIs) from which we sample. 64 | Its shape is :math:`(R, 4)` 65 | bbox (array): The coordinates of ground truth bounding boxes. 66 | Its shape is :math:`(R', 4)`. 67 | label (array): Ground truth bounding box labels. Its shape 68 | is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where 69 | :math:`L` is the number of foreground classes. 70 | loc_normalize_mean (tuple of four floats): Mean values to normalize 71 | coordinates of bouding boxes. 
72 | loc_normalize_std (tupler of four floats): Standard deviation of 73 | the coordinates of bounding boxes. 74 | 75 | Returns: 76 | (array, array, array): 77 | 78 | * **sample_roi**: Regions of interests that are sampled. \ 79 | Its shape is :math:`(S, 4)`. 80 | * **gt_roi_loc**: Offsets and scales to match \ 81 | the sampled RoIs to the ground truth bounding boxes. \ 82 | Its shape is :math:`(S, 4)`. 83 | * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \ 84 | :math:`(S,)`. Its range is :math:`[0, L]`. The label with \ 85 | value 0 is the background. 86 | 87 | """ 88 | n_bbox, _ = bbox.shape 89 | 90 | roi = np.concatenate((roi, bbox), axis=0) 91 | 92 | pos_roi_per_image = np.round(self.n_sample * self.pos_ratio) 93 | iou = bbox_iou(roi, bbox) 94 | gt_assignment = iou.argmax(axis=1) 95 | max_iou = iou.max(axis=1) 96 | # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class]. 97 | # The label with value 0 is the background. 98 | gt_roi_label = label[gt_assignment] + 1 99 | 100 | # Select foreground RoIs as those with >= pos_iou_thresh IoU. 101 | pos_index = np.where(max_iou >= self.pos_iou_thresh)[0] 102 | pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size)) 103 | if pos_index.size > 0: 104 | pos_index = np.random.choice( 105 | pos_index, size=pos_roi_per_this_image, replace=False) 106 | 107 | # Select background RoIs as those within 108 | # [neg_iou_thresh_lo, neg_iou_thresh_hi). 109 | neg_index = np.where((max_iou < self.neg_iou_thresh_hi) & 110 | (max_iou >= self.neg_iou_thresh_lo))[0] 111 | neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image 112 | neg_roi_per_this_image = int(min(neg_roi_per_this_image, 113 | neg_index.size)) 114 | if neg_index.size > 0: 115 | neg_index = np.random.choice( 116 | neg_index, size=neg_roi_per_this_image, replace=False) 117 | 118 | # The indices that we're selecting (both positive and negative). 119 | keep_index = np.append(pos_index, neg_index) 120 | gt_roi_label = gt_roi_label[keep_index] 121 | gt_roi_label[pos_roi_per_this_image:] = 0 # negative labels --> 0 122 | sample_roi = roi[keep_index] 123 | 124 | # Compute offsets and scales to match sampled RoIs to the GTs. 125 | gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]]) 126 | gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32) 127 | ) / np.array(loc_normalize_std, np.float32)) 128 | 129 | return sample_roi, gt_roi_loc, gt_roi_label -------------------------------------------------------------------------------- /model/utils/rpn_gt_loc_label.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from model.utils.bbox_tools import bbox2loc, bbox_iou 4 | 5 | class AnchorTargetCreator(object): 6 | """Assign the ground truth bounding boxes to anchors. 7 | 8 | Assigns the ground truth bounding boxes to anchors for training Region 9 | Proposal Networks introduced in Faster R-CNN [#]_. 10 | 11 | Offsets and scales to match anchors to the ground truth are 12 | calculated using the encoding scheme of 13 | :func:`model.utils.bbox_tools.bbox2loc`. 14 | 15 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 16 | Faster R-CNN: Towards Real-Time Object Detection with \ 17 | Region Proposal Networks. NIPS 2015. 18 | 19 | Args: 20 | n_sample (int): The number of regions to produce. 21 | pos_iou_thresh (float): Anchors with IoU above this 22 | threshold will be assigned as positive. 
23 | neg_iou_thresh (float): Anchors with IoU below this 24 | threshold will be assigned as negative. 25 | pos_ratio (float): Ratio of positive regions in the 26 | sampled regions. 27 | 28 | """ 29 | 30 | def __init__(self, 31 | n_sample=256, 32 | pos_iou_thresh=0.7, neg_iou_thresh=0.3, 33 | pos_ratio=0.5): 34 | self.n_sample = n_sample 35 | self.pos_iou_thresh = pos_iou_thresh 36 | self.neg_iou_thresh = neg_iou_thresh 37 | self.pos_ratio = pos_ratio 38 | 39 | def __call__(self, bbox, anchor, img_size): 40 | """Assign ground truth supervision to sampled subset of anchors. 41 | 42 | Types of input arrays and output arrays are same. 43 | 44 | Here are notations. 45 | 46 | * :math:`S` is the number of anchors. 47 | * :math:`R` is the number of bounding boxes. 48 | 49 | Args: 50 | bbox (array): Coordinates of bounding boxes. Its shape is 51 | :math:`(R, 4)`. 52 | anchor (array): Coordinates of anchors. Its shape is 53 | :math:`(S, 4)`. 54 | img_size (tuple of ints): A tuple :obj:`H, W`, which 55 | is a tuple of height and width of an image. 56 | 57 | Returns: 58 | (array, array): 59 | 60 | #NOTE: it's scale not only offset 61 | * **loc**: Offsets and scales to match the anchors to \ 62 | the ground truth bounding boxes. Its shape is :math:`(S, 4)`. 63 | * **label**: Labels of anchors with values \ 64 | :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape \ 65 | is :math:`(S,)`. 66 | 67 | """ 68 | 69 | img_H, img_W = img_size 70 | 71 | n_anchor = len(anchor) 72 | inside_index = _get_inside_index(anchor, img_H, img_W) 73 | anchor = anchor[inside_index] 74 | argmax_ious, label = self._create_label( 75 | inside_index, anchor, bbox) 76 | 77 | # compute bounding box regression targets 78 | loc = bbox2loc(anchor, bbox[argmax_ious]) 79 | 80 | # map up to original set of anchors 81 | label = _unmap(label, n_anchor, inside_index, fill=-1) 82 | loc = _unmap(loc, n_anchor, inside_index, fill=0) 83 | 84 | return loc, label 85 | 86 | def _create_label(self, inside_index, anchor, bbox): 87 | # label: 1 is positive, 0 is negative, -1 is dont care 88 | label = np.empty((len(inside_index),), dtype=np.int32) 89 | label.fill(-1) 90 | 91 | argmax_ious, max_ious, gt_argmax_ious = \ 92 | self._calc_ious(anchor, bbox, inside_index) 93 | 94 | # assign negative labels first so that positive labels can clobber them 95 | label[max_ious < self.neg_iou_thresh] = 0 96 | 97 | # positive label: for each gt, anchor with highest iou 98 | label[gt_argmax_ious] = 1 99 | 100 | # positive label: above threshold IOU 101 | label[max_ious >= self.pos_iou_thresh] = 1 102 | 103 | # subsample positive labels if we have too many 104 | n_pos = int(self.pos_ratio * self.n_sample) 105 | pos_index = np.where(label == 1)[0] 106 | if len(pos_index) > n_pos: 107 | disable_index = np.random.choice( 108 | pos_index, size=(len(pos_index) - n_pos), replace=False) 109 | label[disable_index] = -1 110 | 111 | # subsample negative labels if we have too many 112 | n_neg = self.n_sample - np.sum(label == 1) 113 | neg_index = np.where(label == 0)[0] 114 | if len(neg_index) > n_neg: 115 | disable_index = np.random.choice( 116 | neg_index, size=(len(neg_index) - n_neg), replace=False) 117 | label[disable_index] = -1 118 | 119 | return argmax_ious, label 120 | 121 | def _calc_ious(self, anchor, bbox, inside_index): 122 | # ious between the anchors and the gt boxes 123 | ious = bbox_iou(anchor, bbox) 124 | argmax_ious = ious.argmax(axis=1) 125 | max_ious = ious[np.arange(len(inside_index)), argmax_ious] 126 | gt_argmax_ious = ious.argmax(axis=0) 127 | 
gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])] 128 | gt_argmax_ious = np.where(ious == gt_max_ious)[0] 129 | 130 | return argmax_ious, max_ious, gt_argmax_ious 131 | 132 | 133 | def _unmap(data, count, index, fill=0): 134 | # Unmap a subset of item (data) back to the original set of items (of 135 | # size count) 136 | 137 | if len(data.shape) == 1: 138 | ret = np.empty((count,), dtype=data.dtype) 139 | ret.fill(fill) 140 | ret[index] = data 141 | else: 142 | ret = np.empty((count,) + data.shape[1:], dtype=data.dtype) 143 | ret.fill(fill) 144 | ret[index, :] = data 145 | return ret 146 | 147 | 148 | def _get_inside_index(anchor, H, W): 149 | # Calc indicies of anchors which are located completely inside of the image 150 | # whose size is speficied. 151 | index_inside = np.where( 152 | (anchor[:, 0] >= 0) & 153 | (anchor[:, 1] >= 0) & 154 | (anchor[:, 2] <= H) & 155 | (anchor[:, 3] <= W) 156 | )[0] 157 | return index_inside -------------------------------------------------------------------------------- /data/voc_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | 4 | import numpy as np 5 | 6 | from .util import read_image 7 | 8 | 9 | class VOCBboxDataset: 10 | """Bounding box dataset for PASCAL `VOC`_. 11 | 12 | .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/ 13 | 14 | The index corresponds to each image. 15 | 16 | When queried by an index, if :obj:`return_difficult == False`, 17 | this dataset returns a corresponding 18 | :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels. 19 | This is the default behaviour. 20 | If :obj:`return_difficult == True`, this dataset returns corresponding 21 | :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array 22 | that indicates whether bounding boxes are labeled as difficult or not. 23 | 24 | The bounding boxes are packed into a two dimensional tensor of shape 25 | :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in 26 | the image. The second axis represents attributes of the bounding box. 27 | They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the 28 | four attributes are coordinates of the top left and the bottom right 29 | vertices. 30 | 31 | The labels are packed into a one dimensional tensor of shape :math:`(R,)`. 32 | :math:`R` is the number of bounding boxes in the image. 33 | The class name of the label :math:`l` is :math:`l` th element of 34 | :obj:`VOC_BBOX_LABEL_NAMES`. 35 | 36 | The array :obj:`difficult` is a one dimensional boolean array of shape 37 | :math:`(R,)`. :math:`R` is the number of bounding boxes in the image. 38 | If :obj:`use_difficult` is :obj:`False`, this array is 39 | a boolean array with all :obj:`False`. 40 | 41 | The type of the image, the bounding boxes and the labels are as follows. 42 | 43 | * :obj:`img.dtype == numpy.float32` 44 | * :obj:`bbox.dtype == numpy.float32` 45 | * :obj:`label.dtype == numpy.int32` 46 | * :obj:`difficult.dtype == numpy.bool` 47 | 48 | Args: 49 | data_dir (string): Path to the root of the training data. 50 | i.e. "/data/image/voc/VOCdevkit/VOC2007/" 51 | split ({'train', 'val', 'trainval', 'test'}): Select a split of the 52 | dataset. :obj:`test` split is only available for 53 | 2007 dataset. 54 | year ({'2007', '2012'}): Use a dataset prepared for a challenge 55 | held in :obj:`year`. 56 | use_difficult (bool): If :obj:`True`, use images that are labeled as 57 | difficult in the original annotation. 
58 | return_difficult (bool): If :obj:`True`, this dataset returns 59 | a boolean array 60 | that indicates whether bounding boxes are labeled as difficult 61 | or not. The default value is :obj:`False`. 62 | 63 | """ 64 | 65 | def __init__(self, data_dir, split='trainval', 66 | use_difficult=False, return_difficult=False, 67 | ): 68 | 69 | # if split not in ['train', 'trainval', 'val']: 70 | # if not (split == 'test' and year == '2007'): 71 | # warnings.warn( 72 | # 'please pick split from \'train\', \'trainval\', \'val\'' 73 | # 'for 2012 dataset. For 2007 dataset, you can pick \'test\'' 74 | # ' in addition to the above mentioned splits.' 75 | # ) 76 | id_list_file = os.path.join( 77 | data_dir, 'ImageSets/Main/{0}.txt'.format(split)) 78 | 79 | self.ids = [id_.strip() for id_ in open(id_list_file)] 80 | self.data_dir = data_dir 81 | self.use_difficult = use_difficult 82 | self.return_difficult = return_difficult 83 | self.label_names = VOC_BBOX_LABEL_NAMES 84 | 85 | def __len__(self): 86 | return len(self.ids) 87 | 88 | def get_example(self, i): 89 | """Returns the i-th example. 90 | 91 | Returns a color image and bounding boxes. The image is in CHW format. 92 | The returned image is RGB. 93 | 94 | Args: 95 | i (int): The index of the example. 96 | 97 | Returns: 98 | tuple of an image and bounding boxes 99 | 100 | """ 101 | id_ = self.ids[i] 102 | anno = ET.parse( 103 | os.path.join(self.data_dir, 'Annotations', id_ + '.xml')) 104 | bbox = list() 105 | label = list() 106 | difficult = list() 107 | for obj in anno.findall('object'): 108 | # when in not using difficult split, and the object is 109 | # difficult, skipt it. 110 | if not self.use_difficult and int(obj.find('difficult').text) == 1: 111 | continue 112 | 113 | difficult.append(int(obj.find('difficult').text)) 114 | bndbox_anno = obj.find('bndbox') 115 | # subtract 1 to make pixel indexes 0-based 116 | bbox.append([ 117 | int(bndbox_anno.find(tag).text) - 1 118 | for tag in ('ymin', 'xmin', 'ymax', 'xmax')]) 119 | name = obj.find('name').text.lower().strip() 120 | label.append(VOC_BBOX_LABEL_NAMES.index(name)) 121 | bbox = np.stack(bbox).astype(np.float32) 122 | label = np.stack(label).astype(np.int32) 123 | # When `use_difficult==False`, all elements in `difficult` are False. 124 | difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch don't support np.bool 125 | 126 | # Load a image 127 | img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg') 128 | img = read_image(img_file, color=True) 129 | 130 | # if self.return_difficult: 131 | # return img, bbox, label, difficult 132 | return img, bbox, label, difficult 133 | 134 | __getitem__ = get_example 135 | 136 | 137 | VOC_BBOX_LABEL_NAMES = ( 138 | 'aeroplane', 139 | 'bicycle', 140 | 'bird', 141 | 'boat', 142 | 'bottle', 143 | 'bus', 144 | 'car', 145 | 'cat', 146 | 'chair', 147 | 'cow', 148 | 'diningtable', 149 | 'dog', 150 | 'horse', 151 | 'motorbike', 152 | 'person', 153 | 'pottedplant', 154 | 'sheep', 155 | 'sofa', 156 | 'train', 157 | 'tvmonitor') 158 | -------------------------------------------------------------------------------- /model/rpn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils.anchors import generate_anchor_base, get_anchors, get_rois_from_loc_anchors 3 | from utils.py_nms import py_cpu_nms 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | class RegionProposalNetwork(nn.Module): 8 | """Region Proposal Network introduced in Faster R-CNN. 
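    A 3x3 convolution slides over the feature map and two 1x1 heads predict,
    for every anchor, two objectness scores and four box offsets. The offsets
    are decoded against the enumerated anchors, the boxes are clipped to the
    image, boxes smaller than the minimum size are dropped, and the
    top-scoring proposals are reduced with non-maximum suppression.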
9 | 10 | Args: 11 | in_channels (int): The channel size of input. 12 | mid_channels (int): The channel size of the intermediate tensor. 13 | ratios (list of floats): This is ratios of width to height of 14 | the anchors. 15 | anchor_scales (list of numbers): This is areas of anchors. 16 | Those areas will be the product of the square of an element in 17 | :obj:`anchor_scales` and the original area of the reference 18 | window. 19 | feat_stride (int): Stride size after extracting features from an 20 | image. 21 | initialW (callable): Initial weight value. If :obj:`None` then this 22 | function uses Gaussian distribution scaled by 0.1 to 23 | initialize weight. 24 | May also be a callable that takes an array and edits its values. 25 | proposal_creator_params (dict): Key valued paramters for 26 | :class:`model.utils.creator_tools.ProposalCreator`. 27 | 28 | .. seealso:: 29 | :class:`~model.utils.creator_tools.ProposalCreator` 30 | 31 | """ 32 | 33 | def __init__( 34 | self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2], 35 | scales=[0.5, 1, 2], feat_stride=16 36 | ): 37 | super(RegionProposalNetwork, self).__init__() 38 | # prepare anchor base 39 | self.anchor_base = generate_anchor_base(side_length=16, 40 | ratios=ratios, scales=scales, strides=feat_stride) 41 | self.feat_stride = feat_stride 42 | # network params 43 | n_anchor = self.anchor_base.shape[0] 44 | self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1) 45 | self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0) 46 | self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0) 47 | normal_init(self.conv1, 0, 0.01) 48 | normal_init(self.score, 0, 0.01) 49 | normal_init(self.loc, 0, 0.01) 50 | 51 | def forward(self, h, img_size, scale=1.): 52 | """Forward Region Proposal Network. 53 | 54 | Here are notations. 55 | 56 | * :math:`N` is batch size. 57 | * :math:`C` channel size of the input. 58 | * :math:`H` and :math:`W` are height and witdh of the input feature. 59 | * :math:`A` is number of anchors assigned to each pixel. 60 | 61 | Args: 62 | x (~torch.autograd.Variable): The Features extracted from images. 63 | Its shape is :math:`(N, C, H, W)`. 64 | img_size (tuple of ints): A tuple :obj:`height, width`, 65 | which contains image size after scaling. 66 | scale (float): The amount of scaling done to the input images after 67 | reading them from files. 68 | 69 | Returns: 70 | (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array): 71 | 72 | This is a tuple of five following values. 73 | 74 | * **rpn_locs**: Predicted bounding box offsets and scales for \ 75 | anchors. Its shape is :math:`(N, H W A, 4)`. 76 | * **rpn_scores**: Predicted foreground scores for \ 77 | anchors. Its shape is :math:`(N, H W A, 2)`. 78 | * **rois**: A bounding box array containing coordinates of \ 79 | proposal boxes. This is a concatenation of bounding box \ 80 | arrays from multiple images in the batch. \ 81 | Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \ 82 | bounding boxes from the :math:`i` th image, \ 83 | :math:`R' = \\sum _{i=1} ^ N R_i`. 84 | * **roi_indices**: An array containing indices of images to \ 85 | which RoIs correspond to. Its shape is :math:`(R',)`. 86 | * **anchor**: Coordinates of enumerated shifted anchors. \ 87 | Its shape is :math:`(H W A, 4)`. 
88 | 89 | """ 90 | n_pre_nms = 12000 91 | n_post_nms = 2000 92 | nms_thresh = 0.7 93 | 94 | # get anchors predifined 95 | n, _, hh, ww = h.shape 96 | anchors = get_anchors(self.anchor_base, self.feat_stride, hh, ww) 97 | 98 | # main forward 99 | hidd = F.relu(self.conv1(h)) 100 | rpn_locs = self.loc(hidd) 101 | rpn_scores = self.score(hidd) 102 | 103 | # view data 104 | # rpn_locs, rpn_scores 105 | rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4) 106 | rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous().view(n, -1, 2) 107 | scores = rpn_scores[:,:,1].data.cpu().numpy()[0] 108 | 109 | # get rois, roi_indices 110 | rois = get_rois_from_loc_anchors(anchors, rpn_locs[0].data.cpu().numpy()) 111 | ## clip 112 | rois[:, ::2] = np.clip(rois[:, ::2], 0, img_size[0]) 113 | rois[:, 1::2] = np.clip(rois[:, 1::2], 0, img_size[1]) 114 | ## remove < min_size 115 | min_size = 16 116 | min_size = min_size * scale 117 | hs = rois[:, 2] - rois[:, 0] 118 | ws = rois[:, 3] - rois[:, 1] 119 | keep = np.where((hs >= min_size) & (ws >= min_size))[0] 120 | rois = rois[keep, :] 121 | scores = scores[keep] 122 | # Sort all (proposal, score) pairs by score from highest to lowest. 123 | # Take top pre_nms_topN (e.g. 6000). 124 | order = scores.ravel().argsort()[::-1] 125 | if n_pre_nms > 0: 126 | order = order[:n_pre_nms] 127 | rois = rois[order, :] 128 | 129 | # NMS 130 | keep = py_cpu_nms(rois, nms_thresh) 131 | keep = keep[:n_post_nms] 132 | rois = rois[keep] 133 | return rpn_locs, rpn_scores, rois, [0]*len(rois), anchors 134 | 135 | 136 | def normal_init(m, mean, stddev, truncated=False): 137 | """ 138 | weight initalizer: truncated normal and random normal. 139 | """ 140 | # x is a parameter 141 | if truncated: 142 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 143 | else: 144 | m.weight.data.normal_(mean, stddev) 145 | m.bias.data.zero_() -------------------------------------------------------------------------------- /model/utils/nms/nohup.out: -------------------------------------------------------------------------------- 1 | Traceback (most recent call last): 2 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1318, in do_open 3 | encode_chunked=req.has_header('Transfer-encoding')) 4 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1239, in request 5 | self._send_request(method, url, body, headers, encode_chunked) 6 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1285, in _send_request 7 | self.endheaders(body, encode_chunked=encode_chunked) 8 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1234, in endheaders 9 | self._send_output(message_body, encode_chunked=encode_chunked) 10 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1026, in _send_output 11 | self.send(msg) 12 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 964, in send 13 | self.connect() 14 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1392, in connect 15 | super().connect() 16 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 936, in connect 17 | (self.host,self.port), self.timeout, self.source_address) 18 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/socket.py", line 724, in create_connection 19 | raise err 20 | File 
"/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/socket.py", line 713, in create_connection 21 | sock.connect(sa) 22 | TimeoutError: [Errno 110] Connection timed out 23 | 24 | During handling of the above exception, another exception occurred: 25 | 26 | Traceback (most recent call last): 27 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 693, in download_scripts 28 | data = opener.open(req).read() 29 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 526, in open 30 | response = self._open(req, data) 31 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 544, in _open 32 | '_open', req) 33 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 504, in _call_chain 34 | result = func(*args) 35 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1361, in https_open 36 | context=self._context, check_hostname=self._check_hostname) 37 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1320, in do_open 38 | raise URLError(err) 39 | urllib.error.URLError: 40 | 41 | During handling of the above exception, another exception occurred: 42 | 43 | Traceback (most recent call last): 44 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 193, in _run_module_as_main 45 | "__main__", mod_spec) 46 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 85, in _run_code 47 | exec(code, run_globals) 48 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 717, in 49 | download_scripts() 50 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 697, in download_scripts 51 | logging.error('Error {} while downloading {}'.format(exc.code, key)) 52 | AttributeError: 'URLError' object has no attribute 'code' 53 | Downloading scripts. It might take a while. 
54 | Traceback (most recent call last): 55 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1318, in do_open 56 | encode_chunked=req.has_header('Transfer-encoding')) 57 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1239, in request 58 | self._send_request(method, url, body, headers, encode_chunked) 59 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1285, in _send_request 60 | self.endheaders(body, encode_chunked=encode_chunked) 61 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1234, in endheaders 62 | self._send_output(message_body, encode_chunked=encode_chunked) 63 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1026, in _send_output 64 | self.send(msg) 65 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 964, in send 66 | self.connect() 67 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1400, in connect 68 | server_hostname=server_hostname) 69 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 407, in wrap_socket 70 | _context=self, _session=session) 71 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 814, in __init__ 72 | self.do_handshake() 73 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 1068, in do_handshake 74 | self._sslobj.do_handshake() 75 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 689, in do_handshake 76 | self._sslobj.do_handshake() 77 | ConnectionResetError: [Errno 104] Connection reset by peer 78 | 79 | During handling of the above exception, another exception occurred: 80 | 81 | Traceback (most recent call last): 82 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 693, in download_scripts 83 | data = opener.open(req).read() 84 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 526, in open 85 | response = self._open(req, data) 86 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 544, in _open 87 | '_open', req) 88 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 504, in _call_chain 89 | result = func(*args) 90 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1361, in https_open 91 | context=self._context, check_hostname=self._check_hostname) 92 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1320, in do_open 93 | raise URLError(err) 94 | urllib.error.URLError: 95 | 96 | During handling of the above exception, another exception occurred: 97 | 98 | Traceback (most recent call last): 99 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 193, in _run_module_as_main 100 | "__main__", mod_spec) 101 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 85, in _run_code 102 | exec(code, run_globals) 103 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 717, in 104 | download_scripts() 105 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 697, in download_scripts 106 | logging.error('Error {} while downloading {}'.format(exc.code, key)) 107 | AttributeError: 'URLError' object has no attribute 'code' 108 | Downloading scripts. 
It might take a while. 109 | -------------------------------------------------------------------------------- /model/utils/bbox_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy as xp 3 | 4 | def bbox2loc(src_bbox, dst_bbox): 5 | """Encodes the source and the destination bounding boxes to "loc". 6 | 7 | Given bounding boxes, this function computes offsets and scales 8 | to match the source bounding boxes to the target bounding boxes. 9 | Mathematcially, given a bounding box whose center is 10 | :math:`(y, x) = p_y, p_x` and 11 | size :math:`p_h, p_w` and the target bounding box whose center is 12 | :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales 13 | :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas. 14 | 15 | * :math:`t_y = \\frac{(g_y - p_y)} {p_h}` 16 | * :math:`t_x = \\frac{(g_x - p_x)} {p_w}` 17 | * :math:`t_h = \\log(\\frac{g_h} {p_h})` 18 | * :math:`t_w = \\log(\\frac{g_w} {p_w})` 19 | 20 | The output is same type as the type of the inputs. 21 | The encoding formulas are used in works such as R-CNN [#]_. 22 | 23 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \ 24 | Rich feature hierarchies for accurate object detection and semantic \ 25 | segmentation. CVPR 2014. 26 | 27 | Args: 28 | src_bbox (array): An image coordinate array whose shape is 29 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 30 | These coordinates are 31 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 32 | dst_bbox (array): An image coordinate array whose shape is 33 | :math:`(R, 4)`. 34 | These coordinates are 35 | :math:`g_{ymin}, g_{xmin}, g_{ymax}, g_{xmax}`. 36 | 37 | Returns: 38 | array: 39 | Bounding box offsets and scales from :obj:`src_bbox` \ 40 | to :obj:`dst_bbox`. \ 41 | This has shape :math:`(R, 4)`. 42 | The second axis contains four values :math:`t_y, t_x, t_h, t_w`. 43 | 44 | """ 45 | 46 | height = src_bbox[:, 2] - src_bbox[:, 0] 47 | width = src_bbox[:, 3] - src_bbox[:, 1] 48 | ctr_y = src_bbox[:, 0] + 0.5 * height 49 | ctr_x = src_bbox[:, 1] + 0.5 * width 50 | 51 | base_height = dst_bbox[:, 2] - dst_bbox[:, 0] 52 | base_width = dst_bbox[:, 3] - dst_bbox[:, 1] 53 | base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height 54 | base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width 55 | 56 | eps = xp.finfo(height.dtype).eps 57 | height = xp.maximum(height, eps) 58 | width = xp.maximum(width, eps) 59 | 60 | dy = (base_ctr_y - ctr_y) / height 61 | dx = (base_ctr_x - ctr_x) / width 62 | dh = xp.log(base_height / height) 63 | dw = xp.log(base_width / width) 64 | 65 | loc = xp.vstack((dy, dx, dh, dw)).transpose() 66 | return loc 67 | 68 | 69 | def bbox_iou(bbox_a, bbox_b): 70 | """Calculate the Intersection of Unions (IoUs) between bounding boxes. 71 | 72 | IoU is calculated as a ratio of area of the intersection 73 | and area of the union. 74 | 75 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as 76 | inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be 77 | same type. 78 | The output is same type as the type of the inputs. 79 | 80 | Args: 81 | bbox_a (array): An array whose shape is :math:`(N, 4)`. 82 | :math:`N` is the number of bounding boxes. 83 | The dtype should be :obj:`numpy.float32`. 84 | bbox_b (array): An array similar to :obj:`bbox_a`, 85 | whose shape is :math:`(K, 4)`. 86 | The dtype should be :obj:`numpy.float32`. 87 | 88 | Returns: 89 | array: 90 | An array whose shape is :math:`(N, K)`. 
\ 91 | An element at index :math:`(n, k)` contains IoUs between \ 92 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ 93 | box in :obj:`bbox_b`. 94 | 95 | """ 96 | if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4: 97 | raise IndexError 98 | 99 | # top left 100 | tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2]) 101 | # bottom right 102 | br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:]) 103 | 104 | area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2) 105 | area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1) 106 | area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1) 107 | return area_i / (area_a[:, None] + area_b - area_i) 108 | 109 | 110 | 111 | def loc2bbox(src_bbox, loc): 112 | """Decode bounding boxes from bounding box offsets and scales. 113 | 114 | Given bounding box offsets and scales computed by 115 | :meth:`bbox2loc`, this function decodes the representation to 116 | coordinates in 2D image coordinates. 117 | 118 | Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding 119 | box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`, 120 | the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x` 121 | and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated 122 | by the following formulas. 123 | 124 | * :math:`\\hat{g}_y = p_h t_y + p_y` 125 | * :math:`\\hat{g}_x = p_w t_x + p_x` 126 | * :math:`\\hat{g}_h = p_h \\exp(t_h)` 127 | * :math:`\\hat{g}_w = p_w \\exp(t_w)` 128 | 129 | The decoding formulas are used in works such as R-CNN [#]_. 130 | 131 | The output is same type as the type of the inputs. 132 | 133 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \ 134 | Rich feature hierarchies for accurate object detection and semantic \ 135 | segmentation. CVPR 2014. 136 | 137 | Args: 138 | src_bbox (array): A coordinates of bounding boxes. 139 | Its shape is :math:`(R, 4)`. These coordinates are 140 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 141 | loc (array): An array with offsets and scales. 142 | The shapes of :obj:`src_bbox` and :obj:`loc` should be same. 143 | This contains values :math:`t_y, t_x, t_h, t_w`. 144 | 145 | Returns: 146 | array: 147 | Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \ 148 | The second axis contains four values \ 149 | :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin}, 150 | \\hat{g}_{ymax}, \\hat{g}_{xmax}`. 
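    A minimal round-trip sketch (plain NumPy, hand-made boxes, assuming this
    module is importable as model.utils.bbox_tools) showing that these decoding
    formulas invert the encoding performed by :meth:`bbox2loc`:

        import numpy as np
        from model.utils.bbox_tools import bbox2loc, loc2bbox

        src = np.array([[10., 10., 50., 90.]], dtype=np.float32)  # source box
        dst = np.array([[20., 30., 60., 70.]], dtype=np.float32)  # target box

        loc = bbox2loc(src, dst)    # (1, 4) offsets and scales t_y, t_x, t_h, t_w
        back = loc2bbox(src, loc)   # decode the offsets against the same source box
        assert np.allclose(back, dst)   # recovers the target box up to float error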
151 | 152 | """ 153 | 154 | if src_bbox.shape[0] == 0: 155 | return xp.zeros((0, 4), dtype=loc.dtype) 156 | 157 | src_bbox = src_bbox.astype(src_bbox.dtype, copy=False) 158 | 159 | src_height = src_bbox[:, 2] - src_bbox[:, 0] 160 | src_width = src_bbox[:, 3] - src_bbox[:, 1] 161 | src_ctr_y = src_bbox[:, 0] + 0.5 * src_height 162 | src_ctr_x = src_bbox[:, 1] + 0.5 * src_width 163 | 164 | dy = loc[:, 0::4] 165 | dx = loc[:, 1::4] 166 | dh = loc[:, 2::4] 167 | dw = loc[:, 3::4] 168 | 169 | ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis] 170 | ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis] 171 | h = xp.exp(dh) * src_height[:, xp.newaxis] 172 | w = xp.exp(dw) * src_width[:, xp.newaxis] 173 | 174 | dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype) 175 | dst_bbox[:, 0::4] = ctr_y - 0.5 * h 176 | dst_bbox[:, 1::4] = ctr_x - 0.5 * w 177 | dst_bbox[:, 2::4] = ctr_y + 0.5 * h 178 | dst_bbox[:, 3::4] = ctr_x + 0.5 * w 179 | 180 | return dst_bbox 181 | -------------------------------------------------------------------------------- /model/utils/nms/non_maximum_suppression.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import cupy as cp 4 | import torch as t 5 | try: 6 | from ._nms_gpu_post import _nms_gpu_post 7 | except: 8 | import warnings 9 | warnings.warn(''' 10 | the python code for non_maximum_suppression is about 2x slow 11 | It is strongly recommended to build cython code: 12 | `cd model/utils/nms/; python3 build.py build_ext --inplace''') 13 | from ._nms_gpu_post_py import _nms_gpu_post 14 | 15 | 16 | @cp.util.memoize(for_each_device=True) 17 | def _load_kernel(kernel_name, code, options=()): 18 | cp.cuda.runtime.free(0) 19 | assert isinstance(options, tuple) 20 | kernel_code = cp.cuda.compile_with_cache(code, options=options) 21 | return kernel_code.get_function(kernel_name) 22 | 23 | 24 | def non_maximum_suppression(bbox, thresh, score=None, 25 | limit=None): 26 | """Suppress bounding boxes according to their IoUs. 27 | 28 | This method checks each bounding box sequentially and selects the bounding 29 | box if the Intersection over Unions (IoUs) between the bounding box and the 30 | previously selected bounding boxes is less than :obj:`thresh`. This method 31 | is mainly used as postprocessing of object detection. 32 | The bounding boxes are selected from ones with higher scores. 33 | If :obj:`score` is not provided as an argument, the bounding box 34 | is ordered by its index in ascending order. 35 | 36 | The bounding boxes are expected to be packed into a two dimensional 37 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 38 | bounding boxes in the image. The second axis represents attributes of 39 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 40 | where the four attributes are coordinates of the top left and the 41 | bottom right vertices. 42 | 43 | :obj:`score` is a float array of shape :math:`(R,)`. Each score indicates 44 | confidence of prediction. 45 | 46 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as 47 | an input. Please note that both :obj:`bbox` and :obj:`score` need to be 48 | the same type. 49 | The type of the output is the same as the input. 50 | 51 | Args: 52 | bbox (array): Bounding boxes to be transformed. The shape is 53 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 54 | thresh (float): Threshold of IoUs. 
55 | score (array): An array of confidences whose shape is :math:`(R,)`. 56 | limit (int): The upper bound of the number of the output bounding 57 | boxes. If it is not specified, this method selects as many 58 | bounding boxes as possible. 59 | 60 | Returns: 61 | array: 62 | An array with indices of bounding boxes that are selected. \ 63 | They are sorted by the scores of bounding boxes in descending \ 64 | order. \ 65 | The shape of this array is :math:`(K,)` and its dtype is\ 66 | :obj:`numpy.int32`. Note that :math:`K \\leq R`. 67 | 68 | """ 69 | 70 | return _non_maximum_suppression_gpu(bbox, thresh, score, limit) 71 | 72 | 73 | def _non_maximum_suppression_gpu(bbox, thresh, score=None, limit=None): 74 | if len(bbox) == 0: 75 | return cp.zeros((0,), dtype=np.int32) 76 | 77 | n_bbox = bbox.shape[0] 78 | 79 | if score is not None: 80 | order = score.argsort()[::-1].astype(np.int32) 81 | else: 82 | order = cp.arange(n_bbox, dtype=np.int32) 83 | 84 | sorted_bbox = bbox[order, :] 85 | selec, n_selec = _call_nms_kernel( 86 | sorted_bbox, thresh) 87 | selec = selec[:n_selec] 88 | selec = order[selec] 89 | if limit is not None: 90 | selec = selec[:limit] 91 | return cp.asnumpy(selec) 92 | 93 | 94 | _nms_gpu_code = ''' 95 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 96 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 97 | 98 | __device__ 99 | inline float devIoU(float const *const bbox_a, float const *const bbox_b) { 100 | float top = max(bbox_a[0], bbox_b[0]); 101 | float bottom = min(bbox_a[2], bbox_b[2]); 102 | float left = max(bbox_a[1], bbox_b[1]); 103 | float right = min(bbox_a[3], bbox_b[3]); 104 | float height = max(bottom - top, 0.f); 105 | float width = max(right - left, 0.f); 106 | float area_i = height * width; 107 | float area_a = (bbox_a[2] - bbox_a[0]) * (bbox_a[3] - bbox_a[1]); 108 | float area_b = (bbox_b[2] - bbox_b[0]) * (bbox_b[3] - bbox_b[1]); 109 | return area_i / (area_a + area_b - area_i); 110 | } 111 | 112 | extern "C" 113 | __global__ 114 | void nms_kernel(const int n_bbox, const float thresh, 115 | const float *dev_bbox, 116 | unsigned long long *dev_mask) { 117 | const int row_start = blockIdx.y; 118 | const int col_start = blockIdx.x; 119 | 120 | const int row_size = 121 | min(n_bbox - row_start * threadsPerBlock, threadsPerBlock); 122 | const int col_size = 123 | min(n_bbox - col_start * threadsPerBlock, threadsPerBlock); 124 | 125 | __shared__ float block_bbox[threadsPerBlock * 4]; 126 | if (threadIdx.x < col_size) { 127 | block_bbox[threadIdx.x * 4 + 0] = 128 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0]; 129 | block_bbox[threadIdx.x * 4 + 1] = 130 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1]; 131 | block_bbox[threadIdx.x * 4 + 2] = 132 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2]; 133 | block_bbox[threadIdx.x * 4 + 3] = 134 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3]; 135 | } 136 | __syncthreads(); 137 | 138 | if (threadIdx.x < row_size) { 139 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 140 | const float *cur_box = dev_bbox + cur_box_idx * 4; 141 | int i = 0; 142 | unsigned long long t = 0; 143 | int start = 0; 144 | if (row_start == col_start) { 145 | start = threadIdx.x + 1; 146 | } 147 | for (i = start; i < col_size; i++) { 148 | if (devIoU(cur_box, block_bbox + i * 4) >= thresh) { 149 | t |= 1ULL << i; 150 | } 151 | } 152 | const int col_blocks = DIVUP(n_bbox, threadsPerBlock); 153 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 154 | 
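      // The 64-bit word stored above records, for this thread's box
      // (cur_box_idx), which of the up-to-threadsPerBlock boxes in the current
      // column block overlap it with IoU >= thresh (bit i set).  The full mask
      // is copied back to the host and turned into the final keep list by
      // _nms_gpu_post, called from _call_nms_kernel below.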
} 155 | } 156 | ''' 157 | 158 | 159 | def _call_nms_kernel(bbox, thresh): 160 | # PyTorch does not support unsigned long Tensor. 161 | # Doesn't matter,since it returns ndarray finally. 162 | # So I'll keep it unmodified. 163 | n_bbox = bbox.shape[0] 164 | threads_per_block = 64 165 | col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32) 166 | blocks = (col_blocks, col_blocks, 1) 167 | threads = (threads_per_block, 1, 1) 168 | 169 | mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64) 170 | bbox = cp.ascontiguousarray(bbox, dtype=np.float32) # NOTE: 变成连续的 171 | kern = _load_kernel('nms_kernel', _nms_gpu_code) 172 | kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), 173 | bbox, mask_dev)) 174 | 175 | mask_host = mask_dev.get() 176 | selection, n_selec = _nms_gpu_post( 177 | mask_host, n_bbox, threads_per_block, col_blocks) 178 | return selection, n_selec 179 | -------------------------------------------------------------------------------- /trainer/trainer.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from torch.nn import functional as F 4 | from model.utils.roi_sample import ProposalTargetCreator 5 | from model.utils.rpn_gt_loc_label import AnchorTargetCreator 6 | 7 | from torch import nn 8 | import torch as t 9 | from torch.autograd import Variable 10 | from utils import array_tool as at 11 | 12 | from utils.config import opt 13 | 14 | LossTuple = namedtuple('LossTuple', 15 | ['rpn_loc_loss', 16 | 'rpn_cls_loss', 17 | 'roi_loc_loss', 18 | 'roi_cls_loss', 19 | 'total_loss' 20 | ]) 21 | 22 | 23 | class FasterRCNNTrainer(nn.Module): 24 | """wrapper for conveniently training. return losses 25 | 26 | The losses include: 27 | 28 | * :obj:`rpn_loc_loss`: The localization loss for \ 29 | Region Proposal Network (RPN). 30 | * :obj:`rpn_cls_loss`: The classification loss for RPN. 31 | * :obj:`roi_loc_loss`: The localization loss for the head module. 32 | * :obj:`roi_cls_loss`: The classification loss for the head module. 33 | * :obj:`total_loss`: The sum of 4 loss above. 34 | 35 | Args: 36 | faster_rcnn (model.FasterRCNN): 37 | A Faster R-CNN model that is going to be trained. 38 | """ 39 | 40 | def __init__(self, faster_rcnn): 41 | super(FasterRCNNTrainer, self).__init__() 42 | 43 | self.faster_rcnn = faster_rcnn 44 | self.rpn_sigma = opt.rpn_sigma 45 | self.roi_sigma = opt.roi_sigma 46 | 47 | # target creator create gt_bbox gt_label etc as training targets. 48 | self.anchor_target_creator = AnchorTargetCreator() 49 | self.proposal_target_creator = ProposalTargetCreator() 50 | 51 | self.loc_normalize_mean = faster_rcnn.loc_normalize_mean 52 | self.loc_normalize_std = faster_rcnn.loc_normalize_std 53 | 54 | self.optimizer = self.faster_rcnn.get_optimizer() 55 | 56 | def forward(self, imgs, bboxes, labels, scale): 57 | """Forward Faster R-CNN and calculate losses. 58 | 59 | Here are notations used. 60 | 61 | * :math:`N` is the batch size. 62 | * :math:`R` is the number of bounding boxes per image. 63 | 64 | Currently, only :math:`N=1` is supported. 65 | 66 | Args: 67 | imgs (~torch.autograd.Variable): A variable with a batch of images. 68 | bboxes (~torch.autograd.Variable): A batch of bounding boxes. 69 | Its shape is :math:`(N, R, 4)`. 70 | labels (~torch.autograd..Variable): A batch of labels. 71 | Its shape is :math:`(N, R)`. The background is excluded from 72 | the definition, which means that the range of the value 73 | is :math:`[0, L - 1]`. 
:math:`L` is the number of foreground 74 | classes. 75 | scale (float): Amount of scaling applied to 76 | the raw image during preprocessing. 77 | 78 | Returns: 79 | namedtuple of 5 losses 80 | """ 81 | n = bboxes.shape[0] 82 | if n != 1: 83 | raise ValueError('Currently only batch size 1 is supported.') 84 | 85 | _, _, H, W = imgs.shape 86 | img_size = (H, W) 87 | 88 | features = self.faster_rcnn.extractor(imgs) 89 | 90 | rpn_locs, rpn_scores, rois, roi_indices, anchor = \ 91 | self.faster_rcnn.rpn(features, img_size, scale) 92 | 93 | # Since batch size is one, convert variables to singular form 94 | bbox = bboxes[0] 95 | label = labels[0] 96 | rpn_score = rpn_scores[0] 97 | rpn_loc = rpn_locs[0] 98 | roi = rois 99 | 100 | # Sample RoIs and forward 101 | # it's fine to break the computation graph of rois, 102 | # consider them as constant input 103 | sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator( 104 | roi, 105 | at.tonumpy(bbox), 106 | at.tonumpy(label), 107 | self.loc_normalize_mean, 108 | self.loc_normalize_std) 109 | # NOTE it's all zero because now it only support for batch=1 now 110 | sample_roi_index = t.zeros(len(sample_roi)) 111 | roi_cls_loc, roi_score = self.faster_rcnn.head( 112 | features, 113 | sample_roi, 114 | sample_roi_index) 115 | 116 | # ------------------ RPN losses -------------------# 117 | gt_rpn_loc, gt_rpn_label = self.anchor_target_creator( 118 | at.tonumpy(bbox), 119 | anchor, 120 | img_size) 121 | gt_rpn_label = at.tovariable(gt_rpn_label).long() 122 | gt_rpn_loc = at.tovariable(gt_rpn_loc) 123 | rpn_loc_loss = _fast_rcnn_loc_loss( 124 | rpn_loc, 125 | gt_rpn_loc, 126 | gt_rpn_label.data, 127 | self.rpn_sigma) 128 | 129 | # NOTE: default value of ignore_index is -100 ... 130 | rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1) 131 | 132 | # ------------------ ROI losses (fast rcnn loss) -------------------# 133 | n_sample = roi_cls_loc.shape[0] 134 | roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4) 135 | roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \ 136 | at.totensor(gt_roi_label).long()] 137 | gt_roi_label = at.tovariable(gt_roi_label).long() 138 | gt_roi_loc = at.tovariable(gt_roi_loc) 139 | 140 | roi_loc_loss = _fast_rcnn_loc_loss( 141 | roi_loc.contiguous(), 142 | gt_roi_loc.float(), 143 | gt_roi_label.data, 144 | self.roi_sigma) 145 | 146 | 147 | roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda()) 148 | 149 | losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss] 150 | losses = losses + [sum(losses)] 151 | 152 | return LossTuple(*losses) 153 | 154 | def train_step(self, imgs, bboxes, labels, scale): 155 | self.optimizer.zero_grad() 156 | losses = self.forward(imgs, bboxes, labels, scale) 157 | losses.total_loss.backward() 158 | self.optimizer.step() 159 | return losses 160 | 161 | def _smooth_l1_loss(x, t, in_weight, sigma): 162 | sigma2 = sigma ** 2 163 | # print ("------------") 164 | # print ("in_weight: ", in_weight) 165 | # print ("------------") 166 | # print ("x: ", x) 167 | # print ("------------") 168 | # print ("t: ", t) 169 | # print ("------------") 170 | t = t.float() 171 | diff = in_weight * (x - t) 172 | abs_diff = diff.abs() 173 | flag = (abs_diff.data < (1. / sigma2)).float() 174 | flag = Variable(flag) 175 | y = (flag * (sigma2 / 2.) 
* (diff ** 2) + 176 | (1 - flag) * (abs_diff - 0.5 / sigma2)) 177 | return y.sum() 178 | 179 | 180 | def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma): 181 | in_weight = t.zeros(gt_loc.shape).cuda() 182 | # Localization loss is calculated only for positive rois. 183 | # NOTE: unlike origin implementation, 184 | # we don't need inside_weight and outside_weight, they can calculate by gt_label 185 | in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1 186 | loc_loss = _smooth_l1_loss(pred_loc, gt_loc, Variable(in_weight), sigma) 187 | # Normalize by total number of negtive and positive rois. 188 | loc_loss /= (gt_label >= 0).sum() # ignore gt_label==-1 for rpn_loss 189 | return loc_loss 190 | -------------------------------------------------------------------------------- /utils/data_load.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from torch.utils.data import Dataset, DataLoader 6 | import random 7 | import cv2 8 | import pickle 9 | 10 | # ---------------------------------------------------------------- 11 | # LOAD AND SAVE USING PICKLE 12 | def save_pkl(filename, f): 13 | with open(filename, 'wb') as handle: 14 | pickle.dump(f, handle, protocol=pickle.HIGHEST_PROTOCOL) 15 | 16 | def load_pkl(filename): 17 | with open(filename, 'rb') as handle: 18 | b = pickle.load(handle) 19 | return b 20 | 21 | 22 | # ---------------------------------------------------------------- 23 | # VOC Objection Datasets 24 | class VOCBboxDataset: 25 | """Bounding box dataset for PASCAL `VOC`_. 26 | 27 | .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/ 28 | 29 | The index corresponds to each image. 30 | 31 | When queried by an index, if :obj:`return_difficult == False`, 32 | this dataset returns a corresponding 33 | :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels. 34 | This is the default behaviour. 35 | If :obj:`return_difficult == True`, this dataset returns corresponding 36 | :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array 37 | that indicates whether bounding boxes are labeled as difficult or not. 38 | 39 | The bounding boxes are packed into a two dimensional tensor of shape 40 | :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in 41 | the image. The second axis represents attributes of the bounding box. 42 | They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the 43 | four attributes are coordinates of the top left and the bottom right 44 | vertices. 45 | 46 | The labels are packed into a one dimensional tensor of shape :math:`(R,)`. 47 | :math:`R` is the number of bounding boxes in the image. 48 | The class name of the label :math:`l` is :math:`l` th element of 49 | :obj:`VOC_BBOX_LABEL_NAMES`. 50 | 51 | The array :obj:`difficult` is a one dimensional boolean array of shape 52 | :math:`(R,)`. :math:`R` is the number of bounding boxes in the image. 53 | If :obj:`use_difficult` is :obj:`False`, this array is 54 | a boolean array with all :obj:`False`. 55 | 56 | The type of the image, the bounding boxes and the labels are as follows. 57 | 58 | * :obj:`img.dtype == numpy.float32` 59 | * :obj:`bbox.dtype == numpy.float32` 60 | * :obj:`label.dtype == numpy.int32` 61 | * :obj:`difficult.dtype == numpy.bool` 62 | 63 | Args: 64 | data_dir (string): Path to the root of the training data. 65 | i.e. 
"/data/image/voc/VOCdevkit/VOC2007/" 66 | split ({'train', 'val', 'trainval', 'test'}): Select a split of the 67 | dataset. :obj:`test` split is only available for 68 | 2007 dataset. 69 | year ({'2007', '2012'}): Use a dataset prepared for a challenge 70 | held in :obj:`year`. 71 | use_difficult (bool): If :obj:`True`, use images that are labeled as 72 | difficult in the original annotation. 73 | return_difficult (bool): If :obj:`True`, this dataset returns 74 | a boolean array 75 | that indicates whether bounding boxes are labeled as difficult 76 | or not. The default value is :obj:`False`. 77 | 78 | """ 79 | 80 | def __init__(self, data_dir, split='trainval', 81 | use_difficult=False, return_difficult=False, 82 | ): 83 | 84 | id_list_file = os.path.join( 85 | data_dir, 'ImageSets/Main/{0}.txt'.format(split)) 86 | 87 | self.ids = [id_.strip() for id_ in open(id_list_file)] 88 | self.data_dir = data_dir 89 | self.use_difficult = use_difficult 90 | self.return_difficult = return_difficult 91 | self.label_names = VOC_BBOX_LABEL_NAMES 92 | 93 | def __len__(self): 94 | return len(self.ids) 95 | 96 | def get_example(self, i): 97 | """Returns the i-th example. 98 | 99 | Returns a color image and bounding boxes. The image is in CHW format. 100 | The returned image is RGB. 101 | 102 | Args: 103 | i (int): The index of the example. 104 | 105 | Returns: 106 | tuple of an image and bounding boxes 107 | img: RGB image with shape [H, W, C], type float32 108 | 109 | """ 110 | id_ = self.ids[i] 111 | anno = ET.parse( 112 | os.path.join(self.data_dir, 'Annotations', id_ + '.xml')) 113 | bbox = list() 114 | label = list() 115 | difficult = list() 116 | for obj in anno.findall('object'): 117 | # when in not using difficult split, and the object is 118 | # difficult, skipt it. 119 | if not self.use_difficult and int(obj.find('difficult').text) == 1: 120 | continue 121 | 122 | difficult.append(int(obj.find('difficult').text)) 123 | bndbox_anno = obj.find('bndbox') 124 | # subtract 1 to make pixel indexes 0-based 125 | bbox.append([ 126 | int(bndbox_anno.find(tag).text) - 1 127 | for tag in ('ymin', 'xmin', 'ymax', 'xmax')]) 128 | name = obj.find('name').text.lower().strip() 129 | label.append(VOC_BBOX_LABEL_NAMES.index(name)) 130 | bbox = np.stack(bbox).astype(np.float32) 131 | label = np.stack(label).astype(np.int32) 132 | # When `use_difficult==False`, all elements in `difficult` are False. 
133 | difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch don't support np.bool 134 | 135 | # Load a image 136 | img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg') 137 | img = plt.imread(img_file) 138 | img = img.astype(np.float32) 139 | 140 | # if self.return_difficult: 141 | # return img, bbox, label, difficult 142 | return img, bbox, label, difficult 143 | 144 | __getitem__ = get_example 145 | 146 | 147 | VOC_BBOX_LABEL_NAMES = ( 148 | 'aeroplane', 149 | 'bicycle', 150 | 'bird', 151 | 'boat', 152 | 'bottle', 153 | 'bus', 154 | 'car', 155 | 'cat', 156 | 'chair', 157 | 'cow', 158 | 'diningtable', 159 | 'dog', 160 | 'horse', 161 | 'motorbike', 162 | 'person', 163 | 'pottedplant', 164 | 'sheep', 165 | 'sofa', 166 | 'train', 167 | 'tvmonitor') 168 | 169 | # ---------------------------------------------------------------- 170 | # Datasets Using in Training and Testing 171 | # 172 | # link: http://pytorch.org/docs/0.3.0/data.html?highlight=dataset 173 | class VOCDataset(Dataset): 174 | """ 175 | returned image: 176 | scaled image (mean, std, /255), float32, HWC, RGB 177 | mean=[0.485, 0.456, 0.406] 178 | std=[0.229, 0.224, 0.225] 179 | """ 180 | def __init__(self, opt, train=True): 181 | self.opt = opt 182 | self.train = train 183 | if train: 184 | self.db = VOCBboxDataset(opt.voc_data_dir) 185 | else: 186 | self.db = VOCBboxDataset(opt.voc_data_dir, split='test', use_difficult=True) 187 | 188 | def __getitem__(self, idx): 189 | ori_img, bbox, label, difficult = self.db.get_example(idx) 190 | 191 | # RESCALE ---------------------------- 192 | # image rescale to [opt.min_size, opt.max_size] 193 | H0, W0, C = ori_img.shape 194 | scale = min(self.opt.min_size/min(H0, W0), self.opt.max_size/max(H0, W0)) 195 | scaled_img = cv2.resize(ori_img, (0,0), fx=scale, fy=scale) 196 | H1, W1, _ = scaled_img.shape 197 | # bbox rescale 198 | bbox = scale * bbox 199 | 200 | # NORMALIZE ---------------------------- 201 | normalized_img = scaled_img / 255.0 202 | normalized_img = (normalized_img - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225]) 203 | 204 | # HORIZON FLIP ---------------------------- 205 | if self.train and random.random() < 0.5: 206 | normalized_img = normalized_img[:,::-1,:] 207 | bbox[:,1], bbox[:,3] = W1 - bbox[:, 3], W1 - bbox[:, 1] 208 | return normalized_img.astype(np.float32), bbox, label, scale 209 | 210 | def __len__(self): 211 | return len(self.db) -------------------------------------------------------------------------------- /model/faster_rcnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | from model.vgg16 import decom_vgg16 4 | from model.rpn import RegionProposalNetwork 5 | from model.roi_module import VGG16RoIHead 6 | from utils.config import opt 7 | import torch as t 8 | from utils import array_tool as at 9 | 10 | 11 | import cupy as cp 12 | from model.utils.nms import non_maximum_suppression 13 | from model.utils.bbox_tools import loc2bbox 14 | 15 | from torch.nn import functional as F 16 | 17 | class FasterRCNN(nn.Module): 18 | def __init__(self, ratios=[0.5, 1, 2], anchor_scales=[0.5, 1, 2], \ 19 | loc_normalize_mean = (0., 0., 0., 0.), \ 20 | loc_normalize_std = (0.1, 0.1, 0.2, 0.2)): 21 | super(FasterRCNN, self).__init__() 22 | 23 | # prepare 24 | extractor, classifier = decom_vgg16() 25 | rpn = RegionProposalNetwork( 26 | 512, 512, 27 | ratios=ratios, 28 | scales=anchor_scales, 29 | feat_stride=16 30 | ) 31 | 32 | head = 
VGG16RoIHead( 33 | n_class=20 + 1, 34 | roi_size=7, 35 | spatial_scale=(1. / 16), 36 | classifier=classifier.cuda() 37 | ) 38 | self.extractor = extractor.cuda() 39 | self.rpn = rpn.cuda() 40 | self.head = head 41 | 42 | # mean and std 43 | self.loc_normalize_mean = loc_normalize_mean 44 | self.loc_normalize_std = loc_normalize_std 45 | 46 | 47 | @property 48 | def n_class(self): 49 | # Total number of classes including the background. 50 | return self.head.n_class 51 | 52 | def forward(self, x, scale=1.): 53 | 54 | img_size = x.shape[2:] 55 | 56 | h = self.extractor(x) 57 | rpn_locs, rpn_scores, rois, roi_indices, anchor = \ 58 | self.rpn(h, img_size, scale) 59 | roi_cls_locs, roi_scores = self.head( 60 | h.cuda(), rois, np.array(roi_indices)) 61 | return roi_cls_locs, roi_scores, rois, roi_indices 62 | 63 | 64 | def get_optimizer(self): 65 | """ 66 | return optimizer, It could be overwriten if you want to specify 67 | special optimizer 68 | """ 69 | lr = opt.lr 70 | params = [] 71 | for key, value in dict(self.named_parameters()).items(): 72 | if value.requires_grad: 73 | if 'bias' in key: 74 | params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}] 75 | else: 76 | params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}] 77 | # if opt.use_adam: 78 | # self.optimizer = t.optim.Adam(params) 79 | # else: 80 | self.optimizer = t.optim.SGD(params, momentum=0.9) 81 | return self.optimizer 82 | 83 | 84 | def use_preset(self, preset): 85 | """Use the given preset during prediction. 86 | 87 | This method changes values of :obj:`self.nms_thresh` and 88 | :obj:`self.score_thresh`. These values are a threshold value 89 | used for non maximum suppression and a threshold value 90 | to discard low confidence proposals in :meth:`predict`, 91 | respectively. 92 | 93 | If the attributes need to be changed to something 94 | other than the values provided in the presets, please modify 95 | them by directly accessing the public attributes. 96 | 97 | Args: 98 | preset ({'visualize', 'evaluate'): A string to determine the 99 | preset to use. 100 | 101 | """ 102 | if preset == 'visualize': 103 | self.nms_thresh = 0.3 104 | self.score_thresh = 0.7 105 | elif preset == 'evaluate': 106 | self.nms_thresh = 0.3 107 | self.score_thresh = 0.05 108 | else: 109 | raise ValueError('preset must be visualize or evaluate') 110 | 111 | def _suppress(self, raw_cls_bbox, raw_prob): 112 | bbox = list() 113 | label = list() 114 | score = list() 115 | # skip cls_id = 0 because it is the background class 116 | for l in range(1, self.n_class): 117 | cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :] 118 | prob_l = raw_prob[:, l] 119 | mask = prob_l > self.score_thresh 120 | cls_bbox_l = cls_bbox_l[mask] 121 | prob_l = prob_l[mask] 122 | keep = non_maximum_suppression( 123 | cp.array(cls_bbox_l), self.nms_thresh, prob_l) 124 | keep = cp.asnumpy(keep) 125 | bbox.append(cls_bbox_l[keep]) 126 | # The labels are in [0, self.n_class - 2]. 127 | label.append((l - 1) * np.ones((len(keep),))) 128 | score.append(prob_l[keep]) 129 | bbox = np.concatenate(bbox, axis=0).astype(np.float32) 130 | label = np.concatenate(label, axis=0).astype(np.int32) 131 | score = np.concatenate(score, axis=0).astype(np.float32) 132 | return bbox, label, score 133 | 134 | def predict(self, imgs,sizes=None,visualize=False): 135 | """Detect objects from images. 136 | 137 | This method predicts objects for each image. 138 | 139 | Args: 140 | imgs (iterable of numpy.ndarray): Arrays holding images. 
141 | All images are in CHW and RGB format 142 | and the range of their value is :math:`[0, 255]`. 143 | 144 | Returns: 145 | tuple of lists: 146 | This method returns a tuple of three lists, 147 | :obj:`(bboxes, labels, scores)`. 148 | 149 | * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \ 150 | where :math:`R` is the number of bounding boxes in a image. \ 151 | Each bouding box is organized by \ 152 | :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \ 153 | in the second axis. 154 | * **labels** : A list of integer arrays of shape :math:`(R,)`. \ 155 | Each value indicates the class of the bounding box. \ 156 | Values are in range :math:`[0, L - 1]`, where :math:`L` is the \ 157 | number of the foreground classes. 158 | * **scores** : A list of float arrays of shape :math:`(R,)`. \ 159 | Each value indicates how confident the prediction is. 160 | 161 | """ 162 | self.eval() 163 | self.use_preset('evaluate') 164 | if visualize: 165 | self.use_preset('visualize') 166 | prepared_imgs = list() 167 | sizes = list() 168 | for img in imgs: 169 | size = img.shape[1:] 170 | img = preprocess(at.tonumpy(img)) 171 | prepared_imgs.append(img) 172 | sizes.append(size) 173 | else: 174 | prepared_imgs = imgs 175 | bboxes = list() 176 | labels = list() 177 | scores = list() 178 | for img, size in zip(prepared_imgs, sizes): 179 | img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True) 180 | scale = img.shape[3] / size[1] 181 | roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale) 182 | # We are assuming that batch size is 1. 183 | roi_score = roi_scores.data 184 | roi_cls_loc = roi_cls_loc.data 185 | roi = at.totensor(rois) / scale 186 | 187 | # Convert predictions to bounding boxes in image coordinates. 188 | # Bounding boxes are scaled to the scale of the input images. 189 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \ 190 | repeat(self.n_class)[None] 191 | std = t.Tensor(self.loc_normalize_std).cuda(). \ 192 | repeat(self.n_class)[None] 193 | 194 | roi_cls_loc = (roi_cls_loc * std + mean) 195 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4) 196 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc) 197 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)), 198 | at.tonumpy(roi_cls_loc).reshape((-1, 4))) 199 | cls_bbox = at.totensor(cls_bbox) 200 | cls_bbox = cls_bbox.view(-1, self.n_class * 4) 201 | # clip bounding box 202 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) 203 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) 204 | 205 | prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1)) 206 | 207 | raw_cls_bbox = at.tonumpy(cls_bbox) 208 | raw_prob = at.tonumpy(prob) 209 | 210 | bbox, label, score = self._suppress(raw_cls_bbox, raw_prob) 211 | bboxes.append(bbox) 212 | labels.append(label) 213 | scores.append(score) 214 | 215 | self.use_preset('evaluate') 216 | self.train() 217 | return bboxes, labels, scores -------------------------------------------------------------------------------- /data/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import random 4 | 5 | 6 | def read_image(path, dtype=np.float32, color=True): 7 | """Read an image from a file. 8 | 9 | This function reads an image from given file. The image is CHW format and 10 | the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the 11 | order of the channels is RGB. 12 | 13 | Args: 14 | path (str): A path of image file. 
15 | dtype: The type of array. The default value is :obj:`~numpy.float32`. 16 | color (bool): This option determines the number of channels. 17 | If :obj:`True`, the number of channels is three. In this case, 18 | the order of the channels is RGB. This is the default behaviour. 19 | If :obj:`False`, this function returns a grayscale image. 20 | 21 | Returns: 22 | ~numpy.ndarray: An image. 23 | """ 24 | 25 | f = Image.open(path) 26 | try: 27 | if color: 28 | img = f.convert('RGB') 29 | else: 30 | img = f.convert('P') 31 | img = np.asarray(img, dtype=dtype) 32 | finally: 33 | if hasattr(f, 'close'): 34 | f.close() 35 | 36 | if img.ndim == 2: 37 | # reshape (H, W) -> (1, H, W) 38 | return img[np.newaxis] 39 | else: 40 | # transpose (H, W, C) -> (C, H, W) 41 | return img.transpose((2, 0, 1)) 42 | 43 | 44 | def resize_bbox(bbox, in_size, out_size): 45 | """Resize bounding boxes according to image resize. 46 | 47 | The bounding boxes are expected to be packed into a two dimensional 48 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 49 | bounding boxes in the image. The second axis represents attributes of 50 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 51 | where the four attributes are coordinates of the top left and the 52 | bottom right vertices. 53 | 54 | Args: 55 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 56 | :math:`R` is the number of bounding boxes. 57 | in_size (tuple): A tuple of length 2. The height and the width 58 | of the image before resized. 59 | out_size (tuple): A tuple of length 2. The height and the width 60 | of the image after resized. 61 | 62 | Returns: 63 | ~numpy.ndarray: 64 | Bounding boxes rescaled according to the given image shapes. 65 | 66 | """ 67 | bbox = bbox.copy() 68 | y_scale = float(out_size[0]) / in_size[0] 69 | x_scale = float(out_size[1]) / in_size[1] 70 | bbox[:, 0] = y_scale * bbox[:, 0] 71 | bbox[:, 2] = y_scale * bbox[:, 2] 72 | bbox[:, 1] = x_scale * bbox[:, 1] 73 | bbox[:, 3] = x_scale * bbox[:, 3] 74 | return bbox 75 | 76 | 77 | def flip_bbox(bbox, size, y_flip=False, x_flip=False): 78 | """Flip bounding boxes accordingly. 79 | 80 | The bounding boxes are expected to be packed into a two dimensional 81 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 82 | bounding boxes in the image. The second axis represents attributes of 83 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 84 | where the four attributes are coordinates of the top left and the 85 | bottom right vertices. 86 | 87 | Args: 88 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 89 | :math:`R` is the number of bounding boxes. 90 | size (tuple): A tuple of length 2. The height and the width 91 | of the image before resized. 92 | y_flip (bool): Flip bounding box according to a vertical flip of 93 | an image. 94 | x_flip (bool): Flip bounding box according to a horizontal flip of 95 | an image. 96 | 97 | Returns: 98 | ~numpy.ndarray: 99 | Bounding boxes flipped according to the given flips. 
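    A minimal sketch (hand-made coordinates, assuming this module is importable
    as data.util) of the horizontal case: with image width :math:`W`, a flipped
    box spans :math:`(W - x_{max}, W - x_{min})` along the x axis.

        import numpy as np
        from data.util import flip_bbox

        bbox = np.array([[10., 20., 30., 60.]], dtype=np.float32)  # (ymin, xmin, ymax, xmax)
        flipped = flip_bbox(bbox, size=(100, 200), x_flip=True)
        # flipped == [[10., 140., 30., 180.]]: x_min -> 200 - 60, x_max -> 200 - 20,
        # while the y coordinates are left untouched.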
100 | 101 | """ 102 | H, W = size 103 | bbox = bbox.copy() 104 | if y_flip: 105 | y_max = H - bbox[:, 0] 106 | y_min = H - bbox[:, 2] 107 | bbox[:, 0] = y_min 108 | bbox[:, 2] = y_max 109 | if x_flip: 110 | x_max = W - bbox[:, 1] 111 | x_min = W - bbox[:, 3] 112 | bbox[:, 1] = x_min 113 | bbox[:, 3] = x_max 114 | return bbox 115 | 116 | 117 | def crop_bbox( 118 | bbox, y_slice=None, x_slice=None, 119 | allow_outside_center=True, return_param=False): 120 | """Translate bounding boxes to fit within the cropped area of an image. 121 | 122 | This method is mainly used together with image cropping. 123 | This method translates the coordinates of bounding boxes like 124 | :func:`data.util.translate_bbox`. In addition, 125 | this function truncates the bounding boxes to fit within the cropped area. 126 | If a bounding box does not overlap with the cropped area, 127 | this bounding box will be removed. 128 | 129 | The bounding boxes are expected to be packed into a two dimensional 130 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 131 | bounding boxes in the image. The second axis represents attributes of 132 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 133 | where the four attributes are coordinates of the top left and the 134 | bottom right vertices. 135 | 136 | Args: 137 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is 138 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 139 | y_slice (slice): The slice of y axis. 140 | x_slice (slice): The slice of x axis. 141 | allow_outside_center (bool): If this argument is :obj:`False`, 142 | bounding boxes whose centers are outside of the cropped area 143 | are removed. The default value is :obj:`True`. 144 | return_param (bool): If :obj:`True`, this function returns 145 | indices of kept bounding boxes. 146 | 147 | Returns: 148 | ~numpy.ndarray or (~numpy.ndarray, dict): 149 | 150 | If :obj:`return_param = False`, returns an array :obj:`bbox`. 151 | 152 | If :obj:`return_param = True`, 153 | returns a tuple whose elements are :obj:`bbox, param`. 154 | :obj:`param` is a dictionary of intermediate parameters whose 155 | contents are listed below with key, value-type and the description 156 | of the value. 157 | 158 | * **index** (*numpy.ndarray*): An array holding indices of used \ 159 | bounding boxes. 
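    A minimal usage sketch (hand-made boxes and slices, assuming this module is
    importable as data.util) of the behaviour described above:

        import numpy as np
        from data.util import crop_bbox

        bbox = np.array([[ 0.,  0., 20., 20.],    # partly inside the crop window
                         [80., 80., 90., 90.]],   # entirely outside the crop window
                        dtype=np.float32)
        cropped, param = crop_bbox(bbox, y_slice=slice(10, 60), x_slice=slice(10, 60),
                                   return_param=True)
        # cropped == [[0., 0., 10., 10.]]: the first box is clipped to the window and
        # shifted so that (10, 10) becomes the new origin; the second box is removed.
        # param['index'] == [0] lists the indices of the boxes that were kept.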
160 | 161 | """ 162 | 163 | t, b = _slice_to_bounds(y_slice) 164 | l, r = _slice_to_bounds(x_slice) 165 | crop_bb = np.array((t, l, b, r)) 166 | 167 | if allow_outside_center: 168 | mask = np.ones(bbox.shape[0], dtype=bool) 169 | else: 170 | center = (bbox[:, :2] + bbox[:, 2:]) / 2 171 | mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \ 172 | .all(axis=1) 173 | 174 | bbox = bbox.copy() 175 | bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2]) 176 | bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:]) 177 | bbox[:, :2] -= crop_bb[:2] 178 | bbox[:, 2:] -= crop_bb[:2] 179 | 180 | mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1)) 181 | bbox = bbox[mask] 182 | 183 | if return_param: 184 | return bbox, {'index': np.flatnonzero(mask)} 185 | else: 186 | return bbox 187 | 188 | 189 | def _slice_to_bounds(slice_): 190 | if slice_ is None: 191 | return 0, np.inf 192 | 193 | if slice_.start is None: 194 | l = 0 195 | else: 196 | l = slice_.start 197 | 198 | if slice_.stop is None: 199 | u = np.inf 200 | else: 201 | u = slice_.stop 202 | 203 | return l, u 204 | 205 | 206 | def translate_bbox(bbox, y_offset=0, x_offset=0): 207 | """Translate bounding boxes. 208 | 209 | This method is mainly used together with image transforms, such as padding 210 | and cropping, which translates the left top point of the image from 211 | coordinate :math:`(0, 0)` to coordinate 212 | :math:`(y, x) = (y_{offset}, x_{offset})`. 213 | 214 | The bounding boxes are expected to be packed into a two dimensional 215 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 216 | bounding boxes in the image. The second axis represents attributes of 217 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 218 | where the four attributes are coordinates of the top left and the 219 | bottom right vertices. 220 | 221 | Args: 222 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is 223 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 224 | y_offset (int or float): The offset along y axis. 225 | x_offset (int or float): The offset along x axis. 226 | 227 | Returns: 228 | ~numpy.ndarray: 229 | Bounding boxes translated according to the given offsets. 230 | 231 | """ 232 | 233 | out_bbox = bbox.copy() 234 | out_bbox[:, :2] += (y_offset, x_offset) 235 | out_bbox[:, 2:] += (y_offset, x_offset) 236 | 237 | return out_bbox 238 | 239 | 240 | def random_flip(img, y_random=False, x_random=False, 241 | return_param=False, copy=False): 242 | """Randomly flip an image in vertical or horizontal direction. 243 | 244 | Args: 245 | img (~numpy.ndarray): An array that gets flipped. This is in 246 | CHW format. 247 | y_random (bool): Randomly flip in vertical direction. 248 | x_random (bool): Randomly flip in horizontal direction. 249 | return_param (bool): Returns information of flip. 250 | copy (bool): If False, a view of :obj:`img` will be returned. 251 | 252 | Returns: 253 | ~numpy.ndarray or (~numpy.ndarray, dict): 254 | 255 | If :obj:`return_param = False`, 256 | returns an array :obj:`out_img` that is the result of flipping. 257 | 258 | If :obj:`return_param = True`, 259 | returns a tuple whose elements are :obj:`out_img, param`. 260 | :obj:`param` is a dictionary of intermediate parameters whose 261 | contents are listed below with key, value-type and the description 262 | of the value. 263 | 264 | * **y_flip** (*bool*): Whether the image was flipped in the\ 265 | vertical direction or not. 
266 | * **x_flip** (*bool*): Whether the image was flipped in the\ 267 | horizontal direction or not. 268 | 269 | """ 270 | y_flip, x_flip = False, False 271 | if y_random: 272 | y_flip = random.choice([True, False]) 273 | if x_random: 274 | x_flip = random.choice([True, False]) 275 | 276 | if y_flip: 277 | img = img[:, ::-1, :] 278 | if x_flip: 279 | img = img[:, :, ::-1] 280 | 281 | if copy: 282 | img = img.copy() 283 | 284 | if return_param: 285 | return img, {'y_flip': y_flip, 'x_flip': x_flip} 286 | else: 287 | return img 288 | -------------------------------------------------------------------------------- /utils/eval_tool.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | from collections import defaultdict 4 | import itertools 5 | import numpy as np 6 | import six 7 | 8 | from model.utils.bbox_tools import bbox_iou 9 | 10 | 11 | def eval_detection_voc( 12 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 13 | gt_difficults=None, 14 | iou_thresh=0.5, use_07_metric=False): 15 | """Calculate average precisions based on evaluation code of PASCAL VOC. 16 | 17 | This function evaluates predicted bounding boxes obtained from a dataset 18 | which has :math:`N` images by using average precision for each class. 19 | The code is based on the evaluation code used in PASCAL VOC Challenge. 20 | 21 | Args: 22 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 23 | sets of bounding boxes. 24 | Its index corresponds to an index for the base dataset. 25 | Each element of :obj:`pred_bboxes` is a set of coordinates 26 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 27 | where :math:`R` corresponds 28 | to the number of bounding boxes, which may vary among boxes. 29 | The second axis corresponds to 30 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 31 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 32 | Similar to :obj:`pred_bboxes`, its index corresponds to an 33 | index for the base dataset. Its length is :math:`N`. 34 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 35 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 36 | its index corresponds to an index for the base dataset. 37 | Its length is :math:`N`. 38 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 39 | bounding boxes 40 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 41 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 42 | bounding boxes in each image does not need to be same as the number 43 | of corresponding predicted boxes. 44 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 45 | labels which are organized similarly to :obj:`gt_bboxes`. 46 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 47 | arrays which is organized similarly to :obj:`gt_bboxes`. 48 | This tells whether the 49 | corresponding ground truth bounding box is difficult or not. 50 | By default, this is :obj:`None`. In that case, this function 51 | considers all bounding boxes to be not difficult. 52 | iou_thresh (float): A prediction is correct if its Intersection over 53 | Union with the ground truth is above this value. 54 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 55 | for calculating average precision. The default value is 56 | :obj:`False`. 
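    A minimal call sketch (a single synthetic image, one class, assuming this
    module is importable as utils.eval_tool) showing the per-image list
    structure the arguments above are expected to have:

        import numpy as np
        from utils.eval_tool import eval_detection_voc

        pred_bboxes = [np.array([[ 8.,  8., 48., 48.]], dtype=np.float32)]  # one image
        pred_labels = [np.array([0], dtype=np.int32)]
        pred_scores = [np.array([0.9], dtype=np.float32)]
        gt_bboxes   = [np.array([[10., 10., 50., 50.]], dtype=np.float32)]
        gt_labels   = [np.array([0], dtype=np.int32)]

        result = eval_detection_voc(pred_bboxes, pred_labels, pred_scores,
                                    gt_bboxes, gt_labels)
        # result is a dict with keys 'ap' (per-class array) and 'map' (their mean);
        # here the single prediction overlaps its ground truth well above the
        # default iou_thresh of 0.5, so result['map'] is 1.0.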
57 | 58 | Returns: 59 | dict: 60 | 61 | The keys, value-types and the description of the values are listed 62 | below. 63 | 64 | * **ap** (*numpy.ndarray*): An array of average precisions. \ 65 | The :math:`l`-th value corresponds to the average precision \ 66 | for class :math:`l`. If class :math:`l` does not exist in \ 67 | either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \ 68 | value is set to :obj:`numpy.nan`. 69 | * **map** (*float*): The average of Average Precisions over classes. 70 | 71 | """ 72 | 73 | prec, rec = calc_detection_voc_prec_rec( 74 | pred_bboxes, pred_labels, pred_scores, 75 | gt_bboxes, gt_labels, gt_difficults, 76 | iou_thresh=iou_thresh) 77 | 78 | ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) 79 | 80 | return {'ap': ap, 'map': np.nanmean(ap)} 81 | 82 | 83 | def calc_detection_voc_prec_rec( 84 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 85 | gt_difficults=None, 86 | iou_thresh=0.5): 87 | """Calculate precision and recall based on evaluation code of PASCAL VOC. 88 | 89 | This function calculates precision and recall of 90 | predicted bounding boxes obtained from a dataset which has :math:`N` 91 | images. 92 | The code is based on the evaluation code used in PASCAL VOC Challenge. 93 | 94 | Args: 95 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 96 | sets of bounding boxes. 97 | Its index corresponds to an index for the base dataset. 98 | Each element of :obj:`pred_bboxes` is a set of coordinates 99 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 100 | where :math:`R` corresponds 101 | to the number of bounding boxes, which may vary among boxes. 102 | The second axis corresponds to 103 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 104 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 105 | Similar to :obj:`pred_bboxes`, its index corresponds to an 106 | index for the base dataset. Its length is :math:`N`. 107 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 108 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 109 | its index corresponds to an index for the base dataset. 110 | Its length is :math:`N`. 111 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 112 | bounding boxes 113 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 114 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 115 | bounding boxes in each image does not need to be same as the number 116 | of corresponding predicted boxes. 117 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 118 | labels which are organized similarly to :obj:`gt_bboxes`. 119 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 120 | arrays which is organized similarly to :obj:`gt_bboxes`. 121 | This tells whether the 122 | corresponding ground truth bounding box is difficult or not. 123 | By default, this is :obj:`None`. In that case, this function 124 | considers all bounding boxes to be not difficult. 125 | iou_thresh (float): A prediction is correct if its Intersection over 126 | Union with the ground truth is above this value.. 127 | 128 | Returns: 129 | tuple of two lists: 130 | This function returns two lists: :obj:`prec` and :obj:`rec`. 131 | 132 | * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \ 133 | for class :math:`l`. 
If class :math:`l` does not exist in \ 134 | either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \ 135 | set to :obj:`None`. 136 | * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \ 137 | for class :math:`l`. If class :math:`l` that is not marked as \ 138 | difficult does not exist in \ 139 | :obj:`gt_labels`, :obj:`rec[l]` is \ 140 | set to :obj:`None`. 141 | 142 | """ 143 | 144 | pred_bboxes = iter(pred_bboxes) 145 | pred_labels = iter(pred_labels) 146 | pred_scores = iter(pred_scores) 147 | gt_bboxes = iter(gt_bboxes) 148 | gt_labels = iter(gt_labels) 149 | if gt_difficults is None: 150 | gt_difficults = itertools.repeat(None) 151 | else: 152 | gt_difficults = iter(gt_difficults) 153 | 154 | n_pos = defaultdict(int) 155 | score = defaultdict(list) 156 | match = defaultdict(list) 157 | 158 | for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \ 159 | six.moves.zip( 160 | pred_bboxes, pred_labels, pred_scores, 161 | gt_bboxes, gt_labels, gt_difficults): 162 | 163 | if gt_difficult is None: 164 | gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool) 165 | 166 | for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): 167 | pred_mask_l = pred_label == l 168 | pred_bbox_l = pred_bbox[pred_mask_l] 169 | pred_score_l = pred_score[pred_mask_l] 170 | # sort by score 171 | order = pred_score_l.argsort()[::-1] 172 | pred_bbox_l = pred_bbox_l[order] 173 | pred_score_l = pred_score_l[order] 174 | 175 | gt_mask_l = gt_label == l 176 | gt_bbox_l = gt_bbox[gt_mask_l] 177 | gt_difficult_l = gt_difficult[gt_mask_l] 178 | 179 | n_pos[l] += np.logical_not(gt_difficult_l).sum() 180 | score[l].extend(pred_score_l) 181 | 182 | if len(pred_bbox_l) == 0: 183 | continue 184 | if len(gt_bbox_l) == 0: 185 | match[l].extend((0,) * pred_bbox_l.shape[0]) 186 | continue 187 | 188 | # VOC evaluation follows integer typed bounding boxes. 189 | pred_bbox_l = pred_bbox_l.copy() 190 | pred_bbox_l[:, 2:] += 1 191 | gt_bbox_l = gt_bbox_l.copy() 192 | gt_bbox_l[:, 2:] += 1 193 | 194 | iou = bbox_iou(pred_bbox_l, gt_bbox_l) 195 | gt_index = iou.argmax(axis=1) 196 | # set -1 if there is no matching ground truth 197 | gt_index[iou.max(axis=1) < iou_thresh] = -1 198 | del iou 199 | 200 | selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) 201 | for gt_idx in gt_index: 202 | if gt_idx >= 0: 203 | if gt_difficult_l[gt_idx]: 204 | match[l].append(-1) 205 | else: 206 | if not selec[gt_idx]: 207 | match[l].append(1) 208 | else: 209 | match[l].append(0) 210 | selec[gt_idx] = True 211 | else: 212 | match[l].append(0) 213 | 214 | for iter_ in ( 215 | pred_bboxes, pred_labels, pred_scores, 216 | gt_bboxes, gt_labels, gt_difficults): 217 | if next(iter_, None) is not None: 218 | raise ValueError('Length of input iterables need to be same.') 219 | 220 | n_fg_class = max(n_pos.keys()) + 1 221 | prec = [None] * n_fg_class 222 | rec = [None] * n_fg_class 223 | 224 | for l in n_pos.keys(): 225 | score_l = np.array(score[l]) 226 | match_l = np.array(match[l], dtype=np.int8) 227 | 228 | order = score_l.argsort()[::-1] 229 | match_l = match_l[order] 230 | 231 | tp = np.cumsum(match_l == 1) 232 | fp = np.cumsum(match_l == 0) 233 | 234 | # If an element of fp + tp is 0, 235 | # the corresponding element of prec[l] is nan. 236 | prec[l] = tp / (fp + tp) 237 | # If n_pos[l] is 0, rec[l] is None. 
238 | if n_pos[l] > 0: 239 | rec[l] = tp / n_pos[l] 240 | 241 | return prec, rec 242 | 243 | 244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False): 245 | """Calculate average precisions based on evaluation code of PASCAL VOC. 246 | 247 | This function calculates average precisions 248 | from given precisions and recalls. 249 | The code is based on the evaluation code used in PASCAL VOC Challenge. 250 | 251 | Args: 252 | prec (list of numpy.array): A list of arrays. 253 | :obj:`prec[l]` indicates precision for class :math:`l`. 254 | If :obj:`prec[l]` is :obj:`None`, this function returns 255 | :obj:`numpy.nan` for class :math:`l`. 256 | rec (list of numpy.array): A list of arrays. 257 | :obj:`rec[l]` indicates recall for class :math:`l`. 258 | If :obj:`rec[l]` is :obj:`None`, this function returns 259 | :obj:`numpy.nan` for class :math:`l`. 260 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 261 | for calculating average precision. The default value is 262 | :obj:`False`. 263 | 264 | Returns: 265 | ~numpy.ndarray: 266 | This function returns an array of average precisions. 267 | The :math:`l`-th value corresponds to the average precision 268 | for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is 269 | :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. 270 | 271 | """ 272 | 273 | n_fg_class = len(prec) 274 | ap = np.empty(n_fg_class) 275 | for l in six.moves.range(n_fg_class): 276 | if prec[l] is None or rec[l] is None: 277 | ap[l] = np.nan 278 | continue 279 | 280 | if use_07_metric: 281 | # 11 point metric 282 | ap[l] = 0 283 | for t in np.arange(0., 1.1, 0.1): 284 | if np.sum(rec[l] >= t) == 0: 285 | p = 0 286 | else: 287 | p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) 288 | ap[l] += p / 11 289 | else: 290 | # correct AP calculation 291 | # first append sentinel values at the end 292 | mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) 293 | mrec = np.concatenate(([0], rec[l], [1])) 294 | 295 | mpre = np.maximum.accumulate(mpre[::-1])[::-1] 296 | 297 | # to calculate area under PR curve, look for points 298 | # where X axis (recall) changes value 299 | i = np.where(mrec[1:] != mrec[:-1])[0] 300 | 301 | # and sum (\Delta recall) * prec 302 | ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 303 | 304 | return ap 305 | -------------------------------------------------------------------------------- /model/roi_module.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from string import Template 3 | 4 | import cupy, torch 5 | import cupy as cp 6 | import torch as t 7 | from torch import nn 8 | from torch.autograd import Function 9 | 10 | from utils import array_tool as at 11 | 12 | 13 | def normal_init(m, mean, stddev, truncated=False): 14 | """ 15 | weight initalizer: truncated normal and random normal. 16 | """ 17 | # x is a parameter 18 | if truncated: 19 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 20 | else: 21 | m.weight.data.normal_(mean, stddev) 22 | m.bias.data.zero_() 23 | 24 | class VGG16RoIHead(nn.Module): 25 | """Faster R-CNN Head for VGG-16 based implementation. 26 | This class is used as a head for Faster R-CNN. 27 | This outputs class-wise localizations and classification based on feature 28 | maps in the given RoIs. 29 | 30 | Args: 31 | n_class (int): The number of classes possibly including the background. 32 | roi_size (int): Height and width of the feature maps after RoI-pooling. 
33 | spatial_scale (float): Scale of the roi is resized. 34 | classifier (nn.Module): Two layer Linear ported from vgg16 35 | 36 | """ 37 | 38 | def __init__(self, n_class, roi_size, spatial_scale, 39 | classifier): 40 | # n_class includes the background 41 | super(VGG16RoIHead, self).__init__() 42 | 43 | self.classifier = classifier.cuda() 44 | self.cls_loc = nn.Linear(4096, n_class * 4).cuda() 45 | self.score = nn.Linear(4096, n_class).cuda() 46 | 47 | normal_init(self.cls_loc, 0, 0.001) 48 | normal_init(self.score, 0, 0.01) 49 | 50 | self.n_class = n_class 51 | self.roi_size = roi_size 52 | self.spatial_scale = spatial_scale 53 | self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale) 54 | 55 | def forward(self, x, rois, roi_indices): 56 | """Forward the chain. 57 | 58 | We assume that there are :math:`N` batches. 59 | 60 | Args: 61 | x (Variable): 4D image variable. 62 | rois (Tensor): A bounding box array containing coordinates of 63 | proposal boxes. This is a concatenation of bounding box 64 | arrays from multiple images in the batch. 65 | Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed 66 | RoIs from the :math:`i` th image, 67 | :math:`R' = \\sum _{i=1} ^ N R_i`. 68 | roi_indices (Tensor): An array containing indices of images to 69 | which bounding boxes correspond to. Its shape is :math:`(R',)`. 70 | 71 | """ 72 | # in case roi_indices is ndarray 73 | roi_indices = at.totensor(roi_indices).float() 74 | rois = at.totensor(rois).float() 75 | indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1) 76 | # NOTE: important: yx->xy 77 | xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]] 78 | indices_and_rois = t.autograd.Variable(xy_indices_and_rois.contiguous()) 79 | 80 | pool = self.roi(x, indices_and_rois) 81 | pool = pool.view(pool.size(0), -1) 82 | fc7 = self.classifier(pool) 83 | roi_cls_locs = self.cls_loc(fc7) 84 | roi_scores = self.score(fc7) 85 | return roi_cls_locs, roi_scores 86 | 87 | Stream = namedtuple('Stream', ['ptr']) 88 | 89 | 90 | @cupy.util.memoize(for_each_device=True) 91 | def load_kernel(kernel_name, code, **kwargs): 92 | cp.cuda.runtime.free(0) 93 | code = Template(code).substitute(**kwargs) 94 | kernel_code = cupy.cuda.compile_with_cache(code) 95 | return kernel_code.get_function(kernel_name) 96 | 97 | 98 | CUDA_NUM_THREADS = 1024 99 | 100 | 101 | def GET_BLOCKS(N, K=CUDA_NUM_THREADS): 102 | return (N + K - 1) // K 103 | 104 | 105 | class RoI(Function): 106 | """ 107 | NOTE:only CUDA-compatible 108 | """ 109 | 110 | def __init__(self, outh, outw, spatial_scale): 111 | self.forward_fn = load_kernel('roi_forward', kernel_forward) 112 | self.backward_fn = load_kernel('roi_backward', kernel_backward) 113 | self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale 114 | 115 | def forward(self, x, rois): 116 | # NOTE: MAKE SURE input is contiguous too 117 | x = x.contiguous() 118 | rois = rois.contiguous() 119 | self.in_size = B, C, H, W = x.size() 120 | self.N = N = rois.size(0) 121 | output = t.zeros(N, C, self.outh, self.outw).cuda() 122 | self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda() 123 | self.rois = rois 124 | args = [x.data_ptr(), rois.data_ptr(), 125 | output.data_ptr(), 126 | self.argmax_data.data_ptr(), 127 | self.spatial_scale, C, H, W, 128 | self.outh, self.outw, 129 | output.numel()] 130 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 131 | self.forward_fn(args=args, 132 | block=(CUDA_NUM_THREADS, 1, 1), 133 | grid=(GET_BLOCKS(output.numel()), 1, 1), 134 | 
stream=stream) 135 | return output 136 | 137 | def backward(self, grad_output): 138 | ## NOTE: IMPORTANT: grad_output must be contiguous 139 | # TODO: input 140 | grad_output = grad_output.contiguous() 141 | B, C, H, W = self.in_size 142 | grad_input = t.zeros(self.in_size).cuda() 143 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 144 | args = [grad_output.data_ptr(), 145 | self.argmax_data.data_ptr(), 146 | self.rois.data_ptr(), 147 | grad_input.data_ptr(), 148 | self.N, self.spatial_scale, C, H, W, self.outh, self.outw, 149 | grad_input.numel()] 150 | self.backward_fn(args=args, 151 | block=(CUDA_NUM_THREADS, 1, 1), 152 | grid=(GET_BLOCKS(grad_input.numel()), 1, 1), 153 | stream=stream 154 | ) 155 | return grad_input, None 156 | 157 | 158 | class RoIPooling2D(nn.Module): 159 | 160 | def __init__(self, outh, outw, spatial_scale): 161 | super(RoIPooling2D, self).__init__() 162 | self.RoI = RoI(outh, outw, spatial_scale) 163 | 164 | def forward(self, x, rois): 165 | return self.RoI(x, rois) 166 | 167 | 168 | def test_roi_module(): 169 | ## fake data ## 170 | B, N, C, H, W, PH, PW = 2, 8, 4, 32, 32, 7, 7 171 | 172 | bottom_data = t.randn(B, C, H, W).cuda() 173 | bottom_rois = t.randn(N, 5) 174 | bottom_rois[:int(N / 2), 0] = 0 175 | bottom_rois[int(N / 2):, 0] = 1 176 | bottom_rois[:, 1:] = (t.rand(N, 4) * 100).float() 177 | bottom_rois = bottom_rois.cuda() 178 | spatial_scale = 1. / 16 179 | outh, outw = PH, PW 180 | 181 | # pytorch version 182 | module = RoIPooling2D(outh, outw, spatial_scale) 183 | x = t.autograd.Variable(bottom_data, requires_grad=True) 184 | rois = t.autograd.Variable(bottom_rois) 185 | output = module(x, rois) 186 | output.sum().backward() 187 | 188 | def t2c(variable): 189 | npa = variable.data.cpu().numpy() 190 | return cp.array(npa) 191 | 192 | def test_eq(variable, array, info): 193 | cc = cp.asnumpy(array) 194 | neq = (cc != variable.data.cpu().numpy()) 195 | assert neq.sum() == 0, 'test failed: %s' % info 196 | 197 | # chainer version; if you're going to run this, 198 | # pip install chainer 199 | import chainer.functions as F 200 | from chainer import Variable 201 | x_cn = Variable(t2c(x)) 202 | 203 | o_cn = F.roi_pooling_2d(x_cn, t2c(rois), outh, outw, spatial_scale) 204 | test_eq(output, o_cn.array, 'forward') 205 | F.sum(o_cn).backward() 206 | test_eq(x.grad, x_cn.grad, 'backward') 207 | print('test pass') 208 | 209 | 210 | def normal_init(m, mean, stddev, truncated=False): 211 | """ 212 | weight initializer: truncated normal and random normal.
213 | """ 214 | # x is a parameter 215 | if truncated: 216 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 217 | else: 218 | m.weight.data.normal_(mean, stddev) 219 | m.bias.data.zero_() 220 | 221 | 222 | kernel_forward = ''' 223 | extern "C" 224 | __global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois, 225 | float* top_data, int* argmax_data, 226 | const double spatial_scale,const int channels,const int height, 227 | const int width, const int pooled_height, 228 | const int pooled_width,const int NN 229 | ){ 230 | 231 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 232 | if(idx>=NN) 233 | return; 234 | const int pw = idx % pooled_width; 235 | const int ph = (idx / pooled_width) % pooled_height; 236 | const int c = (idx / pooled_width / pooled_height) % channels; 237 | int num = idx / pooled_width / pooled_height / channels; 238 | const int roi_batch_ind = bottom_rois[num * 5 + 0]; 239 | const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale); 240 | const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale); 241 | const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale); 242 | const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale); 243 | // Force malformed ROIs to be 1x1 244 | const int roi_width = max(roi_end_w - roi_start_w + 1, 1); 245 | const int roi_height = max(roi_end_h - roi_start_h + 1, 1); 246 | const float bin_size_h = static_cast<float>(roi_height) 247 | / static_cast<float>(pooled_height); 248 | const float bin_size_w = static_cast<float>(roi_width) 249 | / static_cast<float>(pooled_width); 250 | 251 | int hstart = static_cast<int>(floor(static_cast<float>(ph) 252 | * bin_size_h)); 253 | int wstart = static_cast<int>(floor(static_cast<float>(pw) 254 | * bin_size_w)); 255 | int hend = static_cast<int>(ceil(static_cast<float>(ph + 1) 256 | * bin_size_h)); 257 | int wend = static_cast<int>(ceil(static_cast<float>(pw + 1) 258 | * bin_size_w)); 259 | 260 | // Add roi offsets and clip to input boundaries 261 | hstart = min(max(hstart + roi_start_h, 0), height); 262 | hend = min(max(hend + roi_start_h, 0), height); 263 | wstart = min(max(wstart + roi_start_w, 0), width); 264 | wend = min(max(wend + roi_start_w, 0), width); 265 | bool is_empty = (hend <= hstart) || (wend <= wstart); 266 | 267 | // Define an empty pooling region to be zero 268 | float maxval = is_empty ?
0 : -1E+37; 269 | // If nothing is pooled, argmax=-1 causes nothing to be backprop'd 270 | int maxidx = -1; 271 | const int data_offset = (roi_batch_ind * channels + c) * height * width; 272 | for (int h = hstart; h < hend; ++h) { 273 | for (int w = wstart; w < wend; ++w) { 274 | int bottom_index = h * width + w; 275 | if (bottom_data[data_offset + bottom_index] > maxval) { 276 | maxval = bottom_data[data_offset + bottom_index]; 277 | maxidx = bottom_index; 278 | } 279 | } 280 | } 281 | top_data[idx]=maxval; 282 | argmax_data[idx]=maxidx; 283 | } 284 | ''' 285 | kernel_backward = ''' 286 | extern "C" 287 | __global__ void roi_backward(const float* const top_diff, 288 | const int* const argmax_data,const float* const bottom_rois, 289 | float* bottom_diff, const int num_rois, 290 | const double spatial_scale, int channels, 291 | int height, int width, int pooled_height, 292 | int pooled_width,const int NN) 293 | { 294 | 295 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 296 | //// Important: >= instead of > 297 | if(idx>=NN) 298 | return; 299 | int w = idx % width; 300 | int h = (idx / width) % height; 301 | int c = (idx/ (width * height)) % channels; 302 | int num = idx / (width * height * channels); 303 | 304 | float gradient = 0; 305 | // Accumulate gradient over all ROIs that pooled this element 306 | for (int roi_n = 0; roi_n < num_rois; ++roi_n) { 307 | // Skip if ROI's batch index doesn't match num 308 | if (num != static_cast<int>(bottom_rois[roi_n * 5])) { 309 | continue; 310 | } 311 | 312 | int roi_start_w = round(bottom_rois[roi_n * 5 + 1] 313 | * spatial_scale); 314 | int roi_start_h = round(bottom_rois[roi_n * 5 + 2] 315 | * spatial_scale); 316 | int roi_end_w = round(bottom_rois[roi_n * 5 + 3] 317 | * spatial_scale); 318 | int roi_end_h = round(bottom_rois[roi_n * 5 + 4] 319 | * spatial_scale); 320 | 321 | // Skip if ROI doesn't include (h, w) 322 | const bool in_roi = (w >= roi_start_w && w <= roi_end_w && 323 | h >= roi_start_h && h <= roi_end_h); 324 | if (!in_roi) { 325 | continue; 326 | } 327 | 328 | int offset = (roi_n * channels + c) * pooled_height 329 | * pooled_width; 330 | 331 | // Compute feasible set of pooled units that could have pooled 332 | // this bottom unit 333 | 334 | // Force malformed ROIs to be 1x1 335 | int roi_width = max(roi_end_w - roi_start_w + 1, 1); 336 | int roi_height = max(roi_end_h - roi_start_h + 1, 1); 337 | 338 | float bin_size_h = static_cast<float>(roi_height) 339 | / static_cast<float>(pooled_height); 340 | float bin_size_w = static_cast<float>(roi_width) 341 | / static_cast<float>(pooled_width); 342 | 343 | int phstart = floor(static_cast<float>(h - roi_start_h) 344 | / bin_size_h); 345 | int phend = ceil(static_cast<float>(h - roi_start_h + 1) 346 | / bin_size_h); 347 | int pwstart = floor(static_cast<float>(w - roi_start_w) 348 | / bin_size_w); 349 | int pwend = ceil(static_cast<float>(w - roi_start_w + 1) 350 | / bin_size_w); 351 | 352 | phstart = min(max(phstart, 0), pooled_height); 353 | phend = min(max(phend, 0), pooled_height); 354 | pwstart = min(max(pwstart, 0), pooled_width); 355 | pwend = min(max(pwend, 0), pooled_width); 356 | for (int ph = phstart; ph < phend; ++ph) { 357 | for (int pw = pwstart; pw < pwend; ++pw) { 358 | int index_ = ph * pooled_width + pw + offset; 359 | if (argmax_data[index_] == (h * width + w)) { 360 | gradient += top_diff[index_]; 361 | } 362 | } 363 | } 364 | } 365 | bottom_diff[idx] = gradient; 366 | } 367 | ''' 368 | -------------------------------------------------------------------------------- /Train.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from utils.config import opt\n", 15 | "from utils.data_load import save_pkl, load_pkl\n", 16 | "import numpy as np\n", 17 | "from torch import nn\n", 18 | "from torch.utils import data as data_\n", 19 | "from tqdm import tqdm\n", 20 | "import torch as t\n", 21 | "from utils import array_tool as at\n", 22 | "from torch.autograd import Variable" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Load Data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "from data.dataset import Dataset, TestDataset, inverse_normalize" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "dataset = Dataset(opt)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "dataloader = data_.DataLoader(dataset, \\\n", 61 | " batch_size=1, \\\n", 62 | " shuffle=True, \\\n", 63 | " # pin_memory=True,\n", 64 | " num_workers=opt.num_workers)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Load Net and Trainer" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "from model.faster_rcnn import FasterRCNN\n", 83 | "from trainer.trainer import FasterRCNNTrainer" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "faster_rcnn = FasterRCNN()\n", 95 | "trainer = FasterRCNNTrainer(faster_rcnn).cuda()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# Training" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 7, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | "5011it [20:59, 4.58it/s]\n" 115 | ] 116 | }, 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "--------------------------\n", 122 | "curr epoch: 0\n", 123 | "roi_cls loss: Variable containing:\n", 124 | " 0.3674\n", 125 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 126 | "\n", 127 | "roi_loc loss: Variable containing:\n", 128 | " 0.3669\n", 129 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 130 | "\n", 131 | "rpn_cls loss: Variable containing:\n", 132 | " 0.1877\n", 133 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 134 | "\n", 135 | "rpn_loc loss: Variable containing:\n", 136 | "1.00000e-02 *\n", 137 | " 7.2901\n", 138 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 139 | "\n", 140 | "--------------------------\n" 141 | ] 142 | }, 143 | { 144 | "name": "stderr", 145 | "output_type": "stream", 146 | "text": [ 147 | "5011it [21:20, 3.73it/s]\n" 148 | ] 149 | }, 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "--------------------------\n", 155 | "curr 
epoch: 1\n", 156 | "roi_cls loss: Variable containing:\n", 157 | " 0.2599\n", 158 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 159 | "\n", 160 | "roi_loc loss: Variable containing:\n", 161 | " 0.3091\n", 162 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 163 | "\n", 164 | "rpn_cls loss: Variable containing:\n", 165 | " 0.1426\n", 166 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 167 | "\n", 168 | "rpn_loc loss: Variable containing:\n", 169 | "1.00000e-02 *\n", 170 | " 6.3122\n", 171 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 172 | "\n", 173 | "--------------------------\n" 174 | ] 175 | }, 176 | { 177 | "name": "stderr", 178 | "output_type": "stream", 179 | "text": [ 180 | "5011it [21:15, 4.04it/s]\n" 181 | ] 182 | }, 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "--------------------------\n", 188 | "curr epoch: 2\n", 189 | "roi_cls loss: Variable containing:\n", 190 | " 0.2248\n", 191 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 192 | "\n", 193 | "roi_loc loss: Variable containing:\n", 194 | " 0.2781\n", 195 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 196 | "\n", 197 | "rpn_cls loss: Variable containing:\n", 198 | " 0.1257\n", 199 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 200 | "\n", 201 | "rpn_loc loss: Variable containing:\n", 202 | "1.00000e-02 *\n", 203 | " 5.9798\n", 204 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 205 | "\n", 206 | "--------------------------\n" 207 | ] 208 | }, 209 | { 210 | "name": "stderr", 211 | "output_type": "stream", 212 | "text": [ 213 | "5011it [21:37, 3.77it/s]\n" 214 | ] 215 | }, 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "--------------------------\n", 221 | "curr epoch: 3\n", 222 | "roi_cls loss: Variable containing:\n", 223 | " 0.1998\n", 224 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 225 | "\n", 226 | "roi_loc loss: Variable containing:\n", 227 | " 0.2560\n", 228 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 229 | "\n", 230 | "rpn_cls loss: Variable containing:\n", 231 | " 0.1126\n", 232 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 233 | "\n", 234 | "rpn_loc loss: Variable containing:\n", 235 | "1.00000e-02 *\n", 236 | " 5.7650\n", 237 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 238 | "\n", 239 | "--------------------------\n" 240 | ] 241 | }, 242 | { 243 | "name": "stderr", 244 | "output_type": "stream", 245 | "text": [ 246 | "5011it [22:38, 3.31it/s]\n" 247 | ] 248 | }, 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "--------------------------\n", 254 | "curr epoch: 4\n", 255 | "roi_cls loss: Variable containing:\n", 256 | " 0.1831\n", 257 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 258 | "\n", 259 | "roi_loc loss: Variable containing:\n", 260 | " 0.2406\n", 261 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 262 | "\n", 263 | "rpn_cls loss: Variable containing:\n", 264 | " 0.1040\n", 265 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 266 | "\n", 267 | "rpn_loc loss: Variable containing:\n", 268 | "1.00000e-02 *\n", 269 | " 5.5827\n", 270 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 271 | "\n", 272 | "--------------------------\n" 273 | ] 274 | }, 275 | { 276 | "name": "stderr", 277 | "output_type": "stream", 278 | "text": [ 279 | "5011it [22:29, 3.93it/s]\n" 280 | ] 281 | }, 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "--------------------------\n", 287 | "curr epoch: 5\n", 288 | "roi_cls loss: Variable 
containing:\n", 289 | " 0.1717\n", 290 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 291 | "\n", 292 | "roi_loc loss: Variable containing:\n", 293 | " 0.2253\n", 294 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 295 | "\n", 296 | "rpn_cls loss: Variable containing:\n", 297 | "1.00000e-02 *\n", 298 | " 9.6195\n", 299 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 300 | "\n", 301 | "rpn_loc loss: Variable containing:\n", 302 | "1.00000e-02 *\n", 303 | " 5.3951\n", 304 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 305 | "\n", 306 | "--------------------------\n" 307 | ] 308 | }, 309 | { 310 | "name": "stderr", 311 | "output_type": "stream", 312 | "text": [ 313 | "5011it [22:42, 4.02it/s]\n" 314 | ] 315 | }, 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "--------------------------\n", 321 | "curr epoch: 6\n", 322 | "roi_cls loss: Variable containing:\n", 323 | " 0.1593\n", 324 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 325 | "\n", 326 | "roi_loc loss: Variable containing:\n", 327 | " 0.2130\n", 328 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 329 | "\n", 330 | "rpn_cls loss: Variable containing:\n", 331 | "1.00000e-02 *\n", 332 | " 8.8023\n", 333 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 334 | "\n", 335 | "rpn_loc loss: Variable containing:\n", 336 | "1.00000e-02 *\n", 337 | " 5.3082\n", 338 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 339 | "\n", 340 | "--------------------------\n" 341 | ] 342 | }, 343 | { 344 | "name": "stderr", 345 | "output_type": "stream", 346 | "text": [ 347 | "5011it [22:38, 3.69it/s]\n" 348 | ] 349 | }, 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "--------------------------\n", 355 | "curr epoch: 7\n", 356 | "roi_cls loss: Variable containing:\n", 357 | " 0.1457\n", 358 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 359 | "\n", 360 | "roi_loc loss: Variable containing:\n", 361 | " 0.2030\n", 362 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 363 | "\n", 364 | "rpn_cls loss: Variable containing:\n", 365 | "1.00000e-02 *\n", 366 | " 8.0985\n", 367 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 368 | "\n", 369 | "rpn_loc loss: Variable containing:\n", 370 | "1.00000e-02 *\n", 371 | " 5.1582\n", 372 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 373 | "\n", 374 | "--------------------------\n" 375 | ] 376 | }, 377 | { 378 | "name": "stderr", 379 | "output_type": "stream", 380 | "text": [ 381 | "5011it [22:42, 4.22it/s]\n" 382 | ] 383 | }, 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "--------------------------\n", 389 | "curr epoch: 8\n", 390 | "roi_cls loss: Variable containing:\n", 391 | " 0.1402\n", 392 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 393 | "\n", 394 | "roi_loc loss: Variable containing:\n", 395 | " 0.1936\n", 396 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 397 | "\n", 398 | "rpn_cls loss: Variable containing:\n", 399 | "1.00000e-02 *\n", 400 | " 7.5001\n", 401 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 402 | "\n", 403 | "rpn_loc loss: Variable containing:\n", 404 | "1.00000e-02 *\n", 405 | " 5.0869\n", 406 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 407 | "\n", 408 | "--------------------------\n" 409 | ] 410 | }, 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "5011it [22:24, 3.66it/s]\n" 416 | ] 417 | }, 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "--------------------------\n", 423 | 
"curr epoch: 9\n", 424 | "roi_cls loss: Variable containing:\n", 425 | " 0.1342\n", 426 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 427 | "\n", 428 | "roi_loc loss: Variable containing:\n", 429 | " 0.1853\n", 430 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 431 | "\n", 432 | "rpn_cls loss: Variable containing:\n", 433 | "1.00000e-02 *\n", 434 | " 7.0720\n", 435 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 436 | "\n", 437 | "rpn_loc loss: Variable containing:\n", 438 | "1.00000e-02 *\n", 439 | " 5.0439\n", 440 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 441 | "\n", 442 | "--------------------------\n" 443 | ] 444 | }, 445 | { 446 | "name": "stderr", 447 | "output_type": "stream", 448 | "text": [ 449 | "5011it [22:48, 3.66it/s]\n" 450 | ] 451 | }, 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "--------------------------\n", 457 | "curr epoch: 10\n", 458 | "roi_cls loss: Variable containing:\n", 459 | " 0.1281\n", 460 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 461 | "\n", 462 | "roi_loc loss: Variable containing:\n", 463 | " 0.1791\n", 464 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 465 | "\n", 466 | "rpn_cls loss: Variable containing:\n", 467 | "1.00000e-02 *\n", 468 | " 6.5513\n", 469 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 470 | "\n", 471 | "rpn_loc loss: Variable containing:\n", 472 | "1.00000e-02 *\n", 473 | " 4.9259\n", 474 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 475 | "\n", 476 | "--------------------------\n" 477 | ] 478 | }, 479 | { 480 | "name": "stderr", 481 | "output_type": "stream", 482 | "text": [ 483 | "5011it [23:02, 4.23it/s]\n" 484 | ] 485 | }, 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "--------------------------\n", 491 | "curr epoch: 11\n", 492 | "roi_cls loss: Variable containing:\n", 493 | " 0.1220\n", 494 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 495 | "\n", 496 | "roi_loc loss: Variable containing:\n", 497 | " 0.1720\n", 498 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 499 | "\n", 500 | "rpn_cls loss: Variable containing:\n", 501 | "1.00000e-02 *\n", 502 | " 6.1539\n", 503 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 504 | "\n", 505 | "rpn_loc loss: Variable containing:\n", 506 | "1.00000e-02 *\n", 507 | " 4.8372\n", 508 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 509 | "\n", 510 | "--------------------------\n" 511 | ] 512 | }, 513 | { 514 | "name": "stderr", 515 | "output_type": "stream", 516 | "text": [ 517 | "5011it [22:35, 3.51it/s]\n" 518 | ] 519 | }, 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "--------------------------\n", 525 | "curr epoch: 12\n", 526 | "roi_cls loss: Variable containing:\n", 527 | " 0.1174\n", 528 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 529 | "\n", 530 | "roi_loc loss: Variable containing:\n", 531 | " 0.1660\n", 532 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 533 | "\n", 534 | "rpn_cls loss: Variable containing:\n", 535 | "1.00000e-02 *\n", 536 | " 5.8359\n", 537 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 538 | "\n", 539 | "rpn_loc loss: Variable containing:\n", 540 | "1.00000e-02 *\n", 541 | " 4.7706\n", 542 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 543 | "\n", 544 | "--------------------------\n" 545 | ] 546 | }, 547 | { 548 | "name": "stderr", 549 | "output_type": "stream", 550 | "text": [ 551 | "5011it [22:54, 3.90it/s]\n" 552 | ] 553 | }, 554 | { 555 | "name": "stdout", 556 | "output_type": "stream", 557 | 
"text": [ 558 | "--------------------------\n", 559 | "curr epoch: 13\n", 560 | "roi_cls loss: Variable containing:\n", 561 | " 0.1138\n", 562 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 563 | "\n", 564 | "roi_loc loss: Variable containing:\n", 565 | " 0.1598\n", 566 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 567 | "\n", 568 | "rpn_cls loss: Variable containing:\n", 569 | "1.00000e-02 *\n", 570 | " 5.4888\n", 571 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 572 | "\n", 573 | "rpn_loc loss: Variable containing:\n", 574 | "1.00000e-02 *\n", 575 | " 4.7324\n", 576 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 577 | "\n", 578 | "--------------------------\n" 579 | ] 580 | } 581 | ], 582 | "source": [ 583 | "for epoch in range(14):\n", 584 | " \n", 585 | " loss_list_roi_cls = []\n", 586 | " loss_list_roi_loc = []\n", 587 | " loss_list_rpn_cls = []\n", 588 | " loss_list_rpn_loc = []\n", 589 | " for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):\n", 590 | " \n", 591 | " scale = at.scalar(scale)\n", 592 | " img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()\n", 593 | " img, bbox, label = Variable(img), Variable(bbox), Variable(label)\n", 594 | " loss_list = trainer.train_step(img, bbox, label, scale)\n", 595 | "\n", 596 | " loss_list_roi_cls.append(loss_list.roi_cls_loss)\n", 597 | " loss_list_roi_loc.append(loss_list.roi_loc_loss)\n", 598 | " loss_list_rpn_cls.append(loss_list.rpn_cls_loss)\n", 599 | " loss_list_rpn_loc.append(loss_list.rpn_loc_loss)\n", 600 | " print (\"--------------------------\")\n", 601 | " print (\"curr epoch: \", epoch)\n", 602 | " print (\"roi_cls loss: \", np.array(loss_list_roi_cls).mean())\n", 603 | " print (\"roi_loc loss: \", np.array(loss_list_roi_loc).mean())\n", 604 | " print (\"rpn_cls loss: \", np.array(loss_list_rpn_cls).mean())\n", 605 | " print (\"rpn_loc loss: \", np.array(loss_list_rpn_loc).mean())\n", 606 | " print (\"--------------------------\")" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "# Evaluation" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 8, 619 | "metadata": { 620 | "collapsed": true 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "from utils.eval_tool import eval_detection_voc" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 9, 630 | "metadata": { 631 | "collapsed": true 632 | }, 633 | "outputs": [], 634 | "source": [ 635 | "def eval(dataloader, faster_rcnn, test_num=10000):\n", 636 | " pred_bboxes, pred_labels, pred_scores = list(), list(), list()\n", 637 | " gt_bboxes, gt_labels, gt_difficults = list(), list(), list()\n", 638 | " for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):\n", 639 | " sizes = [sizes[0][0], sizes[1][0]]\n", 640 | " pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])\n", 641 | " gt_bboxes += list(gt_bboxes_.numpy())\n", 642 | " gt_labels += list(gt_labels_.numpy())\n", 643 | " gt_difficults += list(gt_difficults_.numpy())\n", 644 | " pred_bboxes += pred_bboxes_\n", 645 | " pred_labels += pred_labels_\n", 646 | " pred_scores += pred_scores_\n", 647 | " if ii == test_num: break\n", 648 | "\n", 649 | " result = eval_detection_voc(\n", 650 | " pred_bboxes, pred_labels, pred_scores,\n", 651 | " gt_bboxes, gt_labels, gt_difficults,\n", 652 | " use_07_metric=True)\n", 653 | " return result" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 10, 659 | 
"metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "testset = TestDataset(opt)\n", 663 | "test_dataloader = data_.DataLoader(testset,\n", 664 | " batch_size=1,\n", 665 | " num_workers=8,\n", 666 | " shuffle=False, \\\n", 667 | " pin_memory=True\n", 668 | " )" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 11, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "name": "stderr", 678 | "output_type": "stream", 679 | "text": [ 680 | "1000it [07:31, 1.98it/s]" 681 | ] 682 | } 683 | ], 684 | "source": [ 685 | "eval_result = eval(test_dataloader, faster_rcnn, test_num=1000)" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 12, 691 | "metadata": {}, 692 | "outputs": [ 693 | { 694 | "data": { 695 | "text/plain": [ 696 | "{'ap': array([ 0.78703012, 0.80306497, 0.64505543, 0.60716555, 0.49361586,\n", 697 | " 0.8341593 , 0.84566703, 0.78718817, 0.45073836, 0.79517284,\n", 698 | " 0.62107752, 0.6427704 , 0.74794425, 0.79348821, 0.74581093,\n", 699 | " 0.38075758, 0.67097107, 0.49560663, 0.74021585, 0.66774319]),\n", 700 | " 'map': 0.67776216301200209}" 701 | ] 702 | }, 703 | "execution_count": 12, 704 | "metadata": {}, 705 | "output_type": "execute_result" 706 | } 707 | ], 708 | "source": [ 709 | "eval_result" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 26, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "data": { 719 | "text/plain": [ 720 | "{'ap': array([ 0.67399267, 0.58309746, 0.47840597, 0.38835252, 0.32832103,\n", 721 | " 0.65140195, 0.72522413, 0.67210257, 0.261774 , 0.60450732,\n", 722 | " 0.43700676, 0.38783834, 0.67360036, 0.58708554, 0.61982477,\n", 723 | " 0.32746522, 0.40122355, 0.38349585, 0.61877112, 0.59206734]),\n", 724 | " 'map': 0.51977792314647886}" 725 | ] 726 | }, 727 | "execution_count": 26, 728 | "metadata": {}, 729 | "output_type": "execute_result" 730 | } 731 | ], 732 | "source": [ 733 | "eval_result" 734 | ] 735 | } 736 | ], 737 | "metadata": { 738 | "kernelspec": { 739 | "display_name": "Python 3", 740 | "language": "python", 741 | "name": "python3" 742 | }, 743 | "language_info": { 744 | "codemirror_mode": { 745 | "name": "ipython", 746 | "version": 3 747 | }, 748 | "file_extension": ".py", 749 | "mimetype": "text/x-python", 750 | "name": "python", 751 | "nbconvert_exporter": "python", 752 | "pygments_lexer": "ipython3", 753 | "version": "3.6.3" 754 | } 755 | }, 756 | "nbformat": 4, 757 | "nbformat_minor": 2 758 | } 759 | --------------------------------------------------------------------------------