├── data ├── __init__.py ├── dataset.py ├── voc_dataset.py └── util.py ├── model ├── __init__.py ├── utils │ ├── __init__.py │ ├── nms │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── non_maximum_suppression.cpython-36.pyc │ │ ├── build │ │ │ └── temp.linux-x86_64-3.6 │ │ │ │ └── _nms_gpu_post.o │ │ ├── _nms_gpu_post.cpython-36m-x86_64-linux-gnu.so │ │ ├── build.py │ │ ├── _nms_gpu_post_py.py │ │ ├── _nms_gpu_post.pyx │ │ ├── nohup.out │ │ └── non_maximum_suppression.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── bbox_tools.cpython-36.pyc │ │ ├── roi_sample.cpython-36.pyc │ │ └── rpn_gt_loc_label.cpython-36.pyc │ ├── roi_sample.py │ ├── rpn_gt_loc_label.py │ └── bbox_tools.py ├── vgg16.py ├── rpn.py ├── faster_rcnn.py └── roi_module.py ├── utils ├── __init__.py ├── __pycache__ │ ├── config.cpython-36.pyc │ ├── py_nms.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── anchors.cpython-36.pyc │ ├── data_load.cpython-36.pyc │ ├── eval_tool.cpython-36.pyc │ └── array_tool.cpython-36.pyc ├── py_nms.py ├── array_tool.py ├── config.py ├── anchors.py ├── data_load.py └── eval_tool.py ├── trainer ├── __init__.py ├── __pycache__ │ ├── trainer.cpython-36.pyc │ └── __init__.cpython-36.pyc └── trainer.py ├── demo.jpg └── Train.ipynb /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/demo.jpg -------------------------------------------------------------------------------- /model/utils/nms/__init__.py: -------------------------------------------------------------------------------- 1 | from model.utils.nms.non_maximum_suppression import non_maximum_suppression -------------------------------------------------------------------------------- /utils/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/py_nms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/py_nms.cpython-36.pyc -------------------------------------------------------------------------------- /trainer/__pycache__/trainer.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/trainer/__pycache__/trainer.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/anchors.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/anchors.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_load.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/data_load.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/eval_tool.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/eval_tool.cpython-36.pyc -------------------------------------------------------------------------------- /trainer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/trainer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/array_tool.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/utils/__pycache__/array_tool.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/bbox_tools.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/bbox_tools.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/roi_sample.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/roi_sample.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/nms/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/__pycache__/rpn_gt_loc_label.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/__pycache__/rpn_gt_loc_label.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/nms/build/temp.linux-x86_64-3.6/_nms_gpu_post.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/build/temp.linux-x86_64-3.6/_nms_gpu_post.o -------------------------------------------------------------------------------- /model/utils/nms/_nms_gpu_post.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/_nms_gpu_post.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /model/utils/nms/__pycache__/non_maximum_suppression.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YoungGer/Faster-RCNN-Pytorch/HEAD/model/utils/nms/__pycache__/non_maximum_suppression.cpython-36.pyc -------------------------------------------------------------------------------- /model/utils/nms/build.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | 5 | ext_modules = [Extension("_nms_gpu_post", ["_nms_gpu_post.pyx"])] 6 | setup( 7 | name="Hello pyx", 8 | cmdclass={'build_ext': build_ext}, 9 | ext_modules=ext_modules 10 | ) 11 | -------------------------------------------------------------------------------- /model/vgg16.py: -------------------------------------------------------------------------------- 1 | from torchvision.models import vgg16 2 | from torch import nn 3 | 4 | def decom_vgg16(): 5 | model = vgg16(pretrained=True) 6 | features = list(model.features)[:30] 7 | classifier = list(model.classifier) 8 | # remove last layer and dropout layer 9 | del classifier[6] 10 | del classifier[5] 11 | del classifier[2] 12 | # free top layer params 13 | for layer in features[:10]: 14 | for p in layer.parameters(): 15 | p.requires_grad = False 16 | return nn.Sequential(*features), nn.Sequential(*classifier) -------------------------------------------------------------------------------- /model/utils/nms/_nms_gpu_post_py.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def _nms_gpu_post( mask, 5 | n_bbox, 6 | threads_per_block, 7 | col_blocks 8 | ): 9 | n_selection = 0 10 | one_ull = np.array([1],dtype=np.uint64) 11 | selection = np.zeros((n_bbox,), dtype=np.int32) 12 | remv = np.zeros((col_blocks,), dtype=np.uint64) 13 | 14 | for i in range(n_bbox): 15 | nblock = i // threads_per_block 16 | inblock = i % threads_per_block 17 | 18 | if not (remv[nblock] & one_ull << inblock): 19 | selection[n_selection] = i 20 | n_selection += 1 21 | 22 | index = i * col_blocks 23 | for j in range(nblock, col_blocks): 24 | remv[j] |= mask[index + j] 25 | return selection, n_selection 26 | -------------------------------------------------------------------------------- /utils/py_nms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def py_cpu_nms(rois, 
thresh): 4 | """ 5 | Pure Python NMS baseline. 6 | Already Sorted 7 | 8 | return: 9 | keep: roi keep indice 10 | """ 11 | y1 = rois[:, 0] 12 | x1 = rois[:, 1] 13 | y2 = rois[:, 2] 14 | x2 = rois[:, 3] 15 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 16 | 17 | N = len(rois) 18 | order = np.array(range(N)) 19 | 20 | keep = [] 21 | while order.size > 0: 22 | i = order[0] 23 | keep.append(i) 24 | xx1 = np.maximum(x1[i], x1[order[1:]]) 25 | yy1 = np.maximum(y1[i], y1[order[1:]]) 26 | xx2 = np.minimum(x2[i], x2[order[1:]]) 27 | yy2 = np.minimum(y2[i], y2[order[1:]]) 28 | 29 | w = np.maximum(0.0, xx2 - xx1 + 1) 30 | h = np.maximum(0.0, yy2 - yy1 + 1) 31 | inter = w * h 32 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 33 | 34 | inds = np.where(ovr <= thresh)[0] 35 | order = order[inds + 1] 36 | 37 | return keep -------------------------------------------------------------------------------- /model/utils/nms/_nms_gpu_post.pyx: -------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | from libc.stdint cimport uint64_t 3 | 4 | import numpy as np 5 | 6 | def _nms_gpu_post(np.ndarray[np.uint64_t, ndim=1] mask, 7 | int n_bbox, 8 | int threads_per_block, 9 | int col_blocks 10 | ): 11 | cdef: 12 | int i, j, nblock, index 13 | uint64_t inblock 14 | int n_selection = 0 15 | uint64_t one_ull = 1 16 | np.ndarray[np.int32_t, ndim=1] selection 17 | np.ndarray[np.uint64_t, ndim=1] remv 18 | 19 | selection = np.zeros((n_bbox,), dtype=np.int32) 20 | remv = np.zeros((col_blocks,), dtype=np.uint64) 21 | 22 | for i in range(n_bbox): 23 | nblock = i // threads_per_block 24 | inblock = i % threads_per_block 25 | 26 | if not (remv[nblock] & one_ull << inblock): 27 | selection[n_selection] = i 28 | n_selection += 1 29 | 30 | index = i * col_blocks 31 | for j in range(nblock, col_blocks): 32 | remv[j] |= mask[index + j] 33 | return selection, n_selection 34 | -------------------------------------------------------------------------------- /utils/array_tool.py: -------------------------------------------------------------------------------- 1 | """ 2 | tools to convert specified type 3 | """ 4 | import torch as t 5 | import numpy as np 6 | 7 | 8 | def tonumpy(data): 9 | if isinstance(data, np.ndarray): 10 | return data 11 | if isinstance(data, t._TensorBase): 12 | return data.cpu().numpy() 13 | if isinstance(data, t.autograd.Variable): 14 | return tonumpy(data.data) 15 | 16 | 17 | def totensor(data, cuda=True): 18 | if isinstance(data, np.ndarray): 19 | tensor = t.from_numpy(data) 20 | if isinstance(data, t._TensorBase): 21 | tensor = data 22 | if isinstance(data, t.autograd.Variable): 23 | tensor = data.data 24 | if cuda: 25 | tensor = tensor.cuda() 26 | return tensor 27 | 28 | 29 | def tovariable(data): 30 | if isinstance(data, np.ndarray): 31 | return tovariable(totensor(data)) 32 | if isinstance(data, t._TensorBase): 33 | return t.autograd.Variable(data) 34 | if isinstance(data, t.autograd.Variable): 35 | return data 36 | else: 37 | raise ValueError("UnKnow data type: %s, input should be {np.ndarray,Tensor,Variable}" %type(data)) 38 | 39 | 40 | def scalar(data): 41 | if isinstance(data, np.ndarray): 42 | return data.reshape(1)[0] 43 | if isinstance(data, t._TensorBase): 44 | return data.view(1)[0] 45 | if isinstance(data, t.autograd.Variable): 46 | return data.data.view(1)[0] 47 | -------------------------------------------------------------------------------- /utils/config.py: -------------------------------------------------------------------------------- 1 | from 
pprint import pprint 2 | 3 | 4 | # Default Configs for training 5 | # NOTE that, config items could be overwriten by passing argument through command line. 6 | # e.g. --voc-data-dir='./data/' 7 | 8 | class Config: 9 | # data 10 | voc_data_dir = '/home/guangyaoyang/data/VOCdevkit/VOC2007' 11 | min_size = 600 # image resize 12 | max_size = 1000 # image resize 13 | num_workers = 8 14 | test_num_workers = 8 15 | 16 | # # sigma for l1_smooth_loss 17 | rpn_sigma = 3. 18 | roi_sigma = 1. 19 | 20 | # # param for optimizer 21 | # # 0.0005 in origin paper but 0.0001 in tf-faster-rcnn 22 | weight_decay = 0.0005 23 | # lr_decay = 0.1 # 1e-3 -> 1e-4 24 | lr = 1e-3 25 | 26 | 27 | # # visualization 28 | # env = 'faster-rcnn' # visdom env 29 | # port = 8097 30 | # plot_every = 40 # vis every N iter 31 | 32 | # # preset 33 | # data = 'voc' 34 | # pretrained_model = 'vgg16' 35 | 36 | # # training 37 | epoch = 14 38 | 39 | 40 | use_adam = False # Use Adam optimizer 41 | # use_chainer = False # try match everything as chainer 42 | # use_drop = False # use dropout in RoIHead 43 | # # debug 44 | # debug_file = '/tmp/debugf' 45 | 46 | test_num = 10000 47 | # # model 48 | # load_path = None 49 | 50 | caffe_pretrain = False # use caffe pretrained model instead of torchvision 51 | caffe_pretrain_path = 'checkpoints/vgg16-caffe.pth' 52 | 53 | # def _parse(self, kwargs): 54 | # state_dict = self._state_dict() 55 | # for k, v in kwargs.items(): 56 | # if k not in state_dict: 57 | # raise ValueError('UnKnown Option: "--%s"' % k) 58 | # setattr(self, k, v) 59 | 60 | # print('======user config========') 61 | # pprint(self._state_dict()) 62 | # print('==========end============') 63 | 64 | # def _state_dict(self): 65 | # return {k: getattr(self, k) for k, _ in Config.__dict__.items() \ 66 | # if not k.startswith('_')} 67 | 68 | 69 | opt = Config() 70 | -------------------------------------------------------------------------------- /utils/anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def generate_anchor_base(side_length=16, ratios=[0.5, 1, 2], 4 | scales=[0.5, 1, 2], strides=16): 5 | """ 6 | Generate anchors for a single 16*16 block. Then transform the anchors 7 | to the original image space. 8 | 9 | Input: 10 | side_length: block side length 11 | ratios 12 | scales 13 | strides: network strides 14 | 15 | Return 16 | anchor_base: base anchor of the original image 17 | """ 18 | py = side_length / 2. 19 | px = side_length / 2. 20 | 21 | anchor_base = np.zeros((len(ratios) * len(scales), 4), 22 | dtype=np.float32) 23 | for i in range(len(ratios)): 24 | for j in range(len(scales)): 25 | h = side_length * strides * scales[j] * np.sqrt(ratios[i]) 26 | w = side_length * strides * scales[j] * np.sqrt(1. / ratios[i]) 27 | 28 | index = i * len(scales) + j 29 | anchor_base[index, 0] = py - h / 2. 30 | anchor_base[index, 1] = px - w / 2. 31 | anchor_base[index, 2] = py + h / 2. 32 | anchor_base[index, 3] = px + w / 2. 
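    # each row of anchor_base is (y_min, x_min, y_max, x_max) centred on (py, px);
    # with the defaults (side_length=16, strides=16, scales=[0.5, 1, 2]) the anchors
    # cover areas of roughly 128^2, 256^2 and 512^2 pixels in the input image.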
33 | return anchor_base 34 | 35 | 36 | def get_anchors(anchor_base, feat_stride, height, width): 37 | anchors_y = np.arange(height) * feat_stride 38 | anchors_x = np.arange(width) * feat_stride 39 | anchors_x, anchors_y = np.meshgrid(anchors_x, anchors_y) 40 | shift = np.stack((anchors_y.ravel(), anchors_x.ravel(), 41 | anchors_y.ravel(), anchors_x.ravel()), axis=1) 42 | anchors = np.repeat(shift, repeats=len(anchor_base), axis=0) + \ 43 | np.tile(anchor_base, [len(shift),1]) 44 | return anchors 45 | 46 | def get_rois_from_loc_anchors(anchors, rpn_locs): 47 | """Decode bounding boxes from bounding box offsets and scales. 48 | 49 | Given bounding box offsets and scales computed by 50 | :meth:`bbox2loc`, this function decodes the representation to 51 | coordinates in 2D image coordinates. 52 | 53 | Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding 54 | box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`, 55 | the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x` 56 | and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated 57 | by the following formulas. 58 | 59 | * :math:`\\hat{g}_y = p_h t_y + p_y` 60 | * :math:`\\hat{g}_x = p_w t_x + p_x` 61 | * :math:`\\hat{g}_h = p_h \\exp(t_h)` 62 | * :math:`\\hat{g}_w = p_w \\exp(t_w)` 63 | 64 | Args: 65 | anchors (array): A coordinates of bounding boxes. 66 | Its shape is :math:`(R, 4)`. These coordinates are 67 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 68 | rpn_locs (array): An array with offsets and scales. 69 | The shapes of :obj:`anchors` and :obj:`rpn_locs` should be same. 70 | This contains values :math:`t_y, t_x, t_h, t_w`. 71 | 72 | Returns: 73 | array: 74 | Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \ 75 | The second axis contains four values \ 76 | :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin}, 77 | \\hat{g}_{ymax}, \\hat{g}_{xmax}`. 78 | 79 | """ 80 | src_bbox = anchors 81 | src_bbox = src_bbox.astype(src_bbox.dtype, copy=False) 82 | 83 | src_height = src_bbox[:, 2] - src_bbox[:, 0] 84 | src_width = src_bbox[:, 3] - src_bbox[:, 1] 85 | src_ctr_y = src_bbox[:, 0] + 0.5 * src_height 86 | src_ctr_x = src_bbox[:, 1] + 0.5 * src_width 87 | 88 | dy = rpn_locs[:, 0] 89 | dx = rpn_locs[:, 1] 90 | dh = rpn_locs[:, 2] 91 | dw = rpn_locs[:, 3] 92 | 93 | dst_y = dy * src_height + src_ctr_y 94 | dst_x = dx * src_width + src_ctr_x 95 | dst_h = np.exp(dh) * src_height 96 | dst_w = np.exp(dw) * src_width 97 | 98 | dst_bbox = np.zeros(rpn_locs.shape, dtype=rpn_locs.dtype) 99 | dst_bbox[:, 0] = dst_y - 0.5 * dst_h 100 | dst_bbox[:, 1] = dst_x - 0.5 * dst_w 101 | dst_bbox[:, 2] = dst_y + 0.5 * dst_h 102 | dst_bbox[:, 3] = dst_x + 0.5 * dst_w 103 | 104 | return dst_bbox -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | from .voc_dataset import VOCBboxDataset 3 | from skimage import transform as sktsf 4 | from torchvision import transforms as tvtsf 5 | from . 
import util 6 | import numpy as np 7 | from utils.config import opt 8 | 9 | 10 | def inverse_normalize(img): 11 | if opt.caffe_pretrain: 12 | img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)) 13 | return img[::-1, :, :] 14 | # approximate un-normalize for visualize 15 | return (img * 0.225 + 0.45).clip(min=0, max=1) * 255 16 | 17 | 18 | def pytorch_normalze(img): 19 | """ 20 | https://github.com/pytorch/vision/issues/223 21 | return appr -1~1 RGB 22 | """ 23 | normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406], 24 | std=[0.229, 0.224, 0.225]) 25 | img = normalize(t.from_numpy(img)) 26 | return img.numpy() 27 | 28 | 29 | def caffe_normalize(img): 30 | """ 31 | return appr -125-125 BGR 32 | """ 33 | img = img[[2, 1, 0], :, :] # RGB-BGR 34 | img = img * 255 35 | mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1) 36 | img = (img - mean).astype(np.float32, copy=True) 37 | return img 38 | 39 | 40 | def preprocess(img, min_size=600, max_size=1000): 41 | """Preprocess an image for feature extraction. 42 | 43 | The length of the shorter edge is scaled to :obj:`self.min_size`. 44 | After the scaling, if the length of the longer edge is longer than 45 | :param min_size: 46 | :obj:`self.max_size`, the image is scaled to fit the longer edge 47 | to :obj:`self.max_size`. 48 | 49 | After resizing the image, the image is subtracted by a mean image value 50 | :obj:`self.mean`. 51 | 52 | Args: 53 | img (~numpy.ndarray): An image. This is in CHW and RGB format. 54 | The range of its value is :math:`[0, 255]`. 55 | (~numpy.ndarray): An image. This is in CHW and RGB format. 56 | The range of its value is :math:`[0, 255]`. 57 | 58 | Returns: 59 | ~numpy.ndarray: 60 | A preprocessed image. 61 | 62 | """ 63 | C, H, W = img.shape 64 | scale1 = min_size / min(H, W) 65 | scale2 = max_size / max(H, W) 66 | scale = min(scale1, scale2) 67 | img = img / 255. 68 | img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect') 69 | # both the longer and shorter should be less than 70 | # max_size and min_size 71 | if opt.caffe_pretrain: 72 | normalize = caffe_normalize 73 | else: 74 | normalize = pytorch_normalze 75 | return normalize(img) 76 | 77 | 78 | class Transform(object): 79 | 80 | def __init__(self, min_size=600, max_size=1000): 81 | self.min_size = min_size 82 | self.max_size = max_size 83 | 84 | def __call__(self, in_data): 85 | img, bbox, label = in_data 86 | _, H, W = img.shape 87 | img = preprocess(img, self.min_size, self.max_size) 88 | _, o_H, o_W = img.shape 89 | scale = o_H / H 90 | bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W)) 91 | 92 | # horizontally flip 93 | img, params = util.random_flip( 94 | img, x_random=True, return_param=True) 95 | bbox = util.flip_bbox( 96 | bbox, (o_H, o_W), x_flip=params['x_flip']) 97 | 98 | return img, bbox, label, scale 99 | 100 | 101 | class Dataset: 102 | def __init__(self, opt): 103 | self.opt = opt 104 | self.db = VOCBboxDataset(opt.voc_data_dir) 105 | self.tsf = Transform(opt.min_size, opt.max_size) 106 | 107 | def __getitem__(self, idx): 108 | ori_img, bbox, label, difficult = self.db.get_example(idx) 109 | 110 | img, bbox, label, scale = self.tsf((ori_img, bbox, label)) 111 | # TODO: check whose stride is negative to fix this instead copy all 112 | # some of the strides of a given numpy array are negative. 
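        # copying yields contiguous arrays with positive strides, which torch.from_numpy
        # can wrap directly (it rejects negatively-strided views such as flipped images).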
113 | return img.copy(), bbox.copy(), label.copy(), scale 114 | 115 | def __len__(self): 116 | return len(self.db) 117 | 118 | 119 | class TestDataset: 120 | def __init__(self, opt, split='test', use_difficult=True): 121 | self.opt = opt 122 | self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult) 123 | 124 | def __getitem__(self, idx): 125 | ori_img, bbox, label, difficult = self.db.get_example(idx) 126 | img = preprocess(ori_img) 127 | return img, ori_img.shape[1:], bbox, label, difficult 128 | 129 | def __len__(self): 130 | return len(self.db) 131 | -------------------------------------------------------------------------------- /model/utils/roi_sample.py: -------------------------------------------------------------------------------- 1 | from model.utils.bbox_tools import bbox2loc, bbox_iou 2 | import numpy as np 3 | 4 | class ProposalTargetCreator(object): 5 | """Assign ground truth bounding boxes to given RoIs. 6 | 7 | The :meth:`__call__` of this class generates training targets 8 | for each object proposal. 9 | This is used to train Faster RCNN [#]_. 10 | 11 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 12 | Faster R-CNN: Towards Real-Time Object Detection with \ 13 | Region Proposal Networks. NIPS 2015. 14 | 15 | Args: 16 | n_sample (int): The number of sampled regions. 17 | pos_ratio (float): Fraction of regions that is labeled as a 18 | foreground. 19 | pos_iou_thresh (float): IoU threshold for a RoI to be considered as a 20 | foreground. 21 | neg_iou_thresh_hi (float): RoI is considered to be the background 22 | if IoU is in 23 | [:obj:`neg_iou_thresh_hi`, :obj:`neg_iou_thresh_hi`). 24 | neg_iou_thresh_lo (float): See above. 25 | 26 | """ 27 | 28 | def __init__(self, 29 | n_sample=128, 30 | pos_ratio=0.25, pos_iou_thresh=0.5, 31 | neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0 32 | ): 33 | self.n_sample = n_sample 34 | self.pos_ratio = pos_ratio 35 | self.pos_iou_thresh = pos_iou_thresh 36 | self.neg_iou_thresh_hi = neg_iou_thresh_hi 37 | self.neg_iou_thresh_lo = neg_iou_thresh_lo # NOTE: py-faster-rcnn默认的值是0.1 38 | 39 | def __call__(self, roi, bbox, label, 40 | loc_normalize_mean=(0., 0., 0., 0.), 41 | loc_normalize_std=(0.1, 0.1, 0.2, 0.2)): 42 | """Assigns ground truth to sampled proposals. 43 | 44 | This function samples total of :obj:`self.n_sample` RoIs 45 | from the combination of :obj:`roi` and :obj:`bbox`. 46 | The RoIs are assigned with the ground truth class labels as well as 47 | bounding box offsets and scales to match the ground truth bounding 48 | boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are 49 | sampled as foregrounds. 50 | 51 | Offsets and scales of bounding boxes are calculated using 52 | :func:`model.utils.bbox_tools.bbox2loc`. 53 | Also, types of input arrays and output arrays are same. 54 | 55 | Here are notations. 56 | 57 | * :math:`S` is the total number of sampled RoIs, which equals \ 58 | :obj:`self.n_sample`. 59 | * :math:`L` is number of object classes possibly including the \ 60 | background. 61 | 62 | Args: 63 | roi (array): Region of Interests (RoIs) from which we sample. 64 | Its shape is :math:`(R, 4)` 65 | bbox (array): The coordinates of ground truth bounding boxes. 66 | Its shape is :math:`(R', 4)`. 67 | label (array): Ground truth bounding box labels. Its shape 68 | is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where 69 | :math:`L` is the number of foreground classes. 70 | loc_normalize_mean (tuple of four floats): Mean values to normalize 71 | coordinates of bouding boxes. 
72 | loc_normalize_std (tupler of four floats): Standard deviation of 73 | the coordinates of bounding boxes. 74 | 75 | Returns: 76 | (array, array, array): 77 | 78 | * **sample_roi**: Regions of interests that are sampled. \ 79 | Its shape is :math:`(S, 4)`. 80 | * **gt_roi_loc**: Offsets and scales to match \ 81 | the sampled RoIs to the ground truth bounding boxes. \ 82 | Its shape is :math:`(S, 4)`. 83 | * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \ 84 | :math:`(S,)`. Its range is :math:`[0, L]`. The label with \ 85 | value 0 is the background. 86 | 87 | """ 88 | n_bbox, _ = bbox.shape 89 | 90 | roi = np.concatenate((roi, bbox), axis=0) 91 | 92 | pos_roi_per_image = np.round(self.n_sample * self.pos_ratio) 93 | iou = bbox_iou(roi, bbox) 94 | gt_assignment = iou.argmax(axis=1) 95 | max_iou = iou.max(axis=1) 96 | # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class]. 97 | # The label with value 0 is the background. 98 | gt_roi_label = label[gt_assignment] + 1 99 | 100 | # Select foreground RoIs as those with >= pos_iou_thresh IoU. 101 | pos_index = np.where(max_iou >= self.pos_iou_thresh)[0] 102 | pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size)) 103 | if pos_index.size > 0: 104 | pos_index = np.random.choice( 105 | pos_index, size=pos_roi_per_this_image, replace=False) 106 | 107 | # Select background RoIs as those within 108 | # [neg_iou_thresh_lo, neg_iou_thresh_hi). 109 | neg_index = np.where((max_iou < self.neg_iou_thresh_hi) & 110 | (max_iou >= self.neg_iou_thresh_lo))[0] 111 | neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image 112 | neg_roi_per_this_image = int(min(neg_roi_per_this_image, 113 | neg_index.size)) 114 | if neg_index.size > 0: 115 | neg_index = np.random.choice( 116 | neg_index, size=neg_roi_per_this_image, replace=False) 117 | 118 | # The indices that we're selecting (both positive and negative). 119 | keep_index = np.append(pos_index, neg_index) 120 | gt_roi_label = gt_roi_label[keep_index] 121 | gt_roi_label[pos_roi_per_this_image:] = 0 # negative labels --> 0 122 | sample_roi = roi[keep_index] 123 | 124 | # Compute offsets and scales to match sampled RoIs to the GTs. 125 | gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]]) 126 | gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32) 127 | ) / np.array(loc_normalize_std, np.float32)) 128 | 129 | return sample_roi, gt_roi_loc, gt_roi_label -------------------------------------------------------------------------------- /model/utils/rpn_gt_loc_label.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from model.utils.bbox_tools import bbox2loc, bbox_iou 4 | 5 | class AnchorTargetCreator(object): 6 | """Assign the ground truth bounding boxes to anchors. 7 | 8 | Assigns the ground truth bounding boxes to anchors for training Region 9 | Proposal Networks introduced in Faster R-CNN [#]_. 10 | 11 | Offsets and scales to match anchors to the ground truth are 12 | calculated using the encoding scheme of 13 | :func:`model.utils.bbox_tools.bbox2loc`. 14 | 15 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 16 | Faster R-CNN: Towards Real-Time Object Detection with \ 17 | Region Proposal Networks. NIPS 2015. 18 | 19 | Args: 20 | n_sample (int): The number of regions to produce. 21 | pos_iou_thresh (float): Anchors with IoU above this 22 | threshold will be assigned as positive. 
23 | neg_iou_thresh (float): Anchors with IoU below this 24 | threshold will be assigned as negative. 25 | pos_ratio (float): Ratio of positive regions in the 26 | sampled regions. 27 | 28 | """ 29 | 30 | def __init__(self, 31 | n_sample=256, 32 | pos_iou_thresh=0.7, neg_iou_thresh=0.3, 33 | pos_ratio=0.5): 34 | self.n_sample = n_sample 35 | self.pos_iou_thresh = pos_iou_thresh 36 | self.neg_iou_thresh = neg_iou_thresh 37 | self.pos_ratio = pos_ratio 38 | 39 | def __call__(self, bbox, anchor, img_size): 40 | """Assign ground truth supervision to sampled subset of anchors. 41 | 42 | Types of input arrays and output arrays are same. 43 | 44 | Here are notations. 45 | 46 | * :math:`S` is the number of anchors. 47 | * :math:`R` is the number of bounding boxes. 48 | 49 | Args: 50 | bbox (array): Coordinates of bounding boxes. Its shape is 51 | :math:`(R, 4)`. 52 | anchor (array): Coordinates of anchors. Its shape is 53 | :math:`(S, 4)`. 54 | img_size (tuple of ints): A tuple :obj:`H, W`, which 55 | is a tuple of height and width of an image. 56 | 57 | Returns: 58 | (array, array): 59 | 60 | #NOTE: it's scale not only offset 61 | * **loc**: Offsets and scales to match the anchors to \ 62 | the ground truth bounding boxes. Its shape is :math:`(S, 4)`. 63 | * **label**: Labels of anchors with values \ 64 | :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape \ 65 | is :math:`(S,)`. 66 | 67 | """ 68 | 69 | img_H, img_W = img_size 70 | 71 | n_anchor = len(anchor) 72 | inside_index = _get_inside_index(anchor, img_H, img_W) 73 | anchor = anchor[inside_index] 74 | argmax_ious, label = self._create_label( 75 | inside_index, anchor, bbox) 76 | 77 | # compute bounding box regression targets 78 | loc = bbox2loc(anchor, bbox[argmax_ious]) 79 | 80 | # map up to original set of anchors 81 | label = _unmap(label, n_anchor, inside_index, fill=-1) 82 | loc = _unmap(loc, n_anchor, inside_index, fill=0) 83 | 84 | return loc, label 85 | 86 | def _create_label(self, inside_index, anchor, bbox): 87 | # label: 1 is positive, 0 is negative, -1 is dont care 88 | label = np.empty((len(inside_index),), dtype=np.int32) 89 | label.fill(-1) 90 | 91 | argmax_ious, max_ious, gt_argmax_ious = \ 92 | self._calc_ious(anchor, bbox, inside_index) 93 | 94 | # assign negative labels first so that positive labels can clobber them 95 | label[max_ious < self.neg_iou_thresh] = 0 96 | 97 | # positive label: for each gt, anchor with highest iou 98 | label[gt_argmax_ious] = 1 99 | 100 | # positive label: above threshold IOU 101 | label[max_ious >= self.pos_iou_thresh] = 1 102 | 103 | # subsample positive labels if we have too many 104 | n_pos = int(self.pos_ratio * self.n_sample) 105 | pos_index = np.where(label == 1)[0] 106 | if len(pos_index) > n_pos: 107 | disable_index = np.random.choice( 108 | pos_index, size=(len(pos_index) - n_pos), replace=False) 109 | label[disable_index] = -1 110 | 111 | # subsample negative labels if we have too many 112 | n_neg = self.n_sample - np.sum(label == 1) 113 | neg_index = np.where(label == 0)[0] 114 | if len(neg_index) > n_neg: 115 | disable_index = np.random.choice( 116 | neg_index, size=(len(neg_index) - n_neg), replace=False) 117 | label[disable_index] = -1 118 | 119 | return argmax_ious, label 120 | 121 | def _calc_ious(self, anchor, bbox, inside_index): 122 | # ious between the anchors and the gt boxes 123 | ious = bbox_iou(anchor, bbox) 124 | argmax_ious = ious.argmax(axis=1) 125 | max_ious = ious[np.arange(len(inside_index)), argmax_ious] 126 | gt_argmax_ious = ious.argmax(axis=0) 127 | 
gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])] 128 | gt_argmax_ious = np.where(ious == gt_max_ious)[0] 129 | 130 | return argmax_ious, max_ious, gt_argmax_ious 131 | 132 | 133 | def _unmap(data, count, index, fill=0): 134 | # Unmap a subset of item (data) back to the original set of items (of 135 | # size count) 136 | 137 | if len(data.shape) == 1: 138 | ret = np.empty((count,), dtype=data.dtype) 139 | ret.fill(fill) 140 | ret[index] = data 141 | else: 142 | ret = np.empty((count,) + data.shape[1:], dtype=data.dtype) 143 | ret.fill(fill) 144 | ret[index, :] = data 145 | return ret 146 | 147 | 148 | def _get_inside_index(anchor, H, W): 149 | # Calc indicies of anchors which are located completely inside of the image 150 | # whose size is speficied. 151 | index_inside = np.where( 152 | (anchor[:, 0] >= 0) & 153 | (anchor[:, 1] >= 0) & 154 | (anchor[:, 2] <= H) & 155 | (anchor[:, 3] <= W) 156 | )[0] 157 | return index_inside -------------------------------------------------------------------------------- /data/voc_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | 4 | import numpy as np 5 | 6 | from .util import read_image 7 | 8 | 9 | class VOCBboxDataset: 10 | """Bounding box dataset for PASCAL `VOC`_. 11 | 12 | .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/ 13 | 14 | The index corresponds to each image. 15 | 16 | When queried by an index, if :obj:`return_difficult == False`, 17 | this dataset returns a corresponding 18 | :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels. 19 | This is the default behaviour. 20 | If :obj:`return_difficult == True`, this dataset returns corresponding 21 | :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array 22 | that indicates whether bounding boxes are labeled as difficult or not. 23 | 24 | The bounding boxes are packed into a two dimensional tensor of shape 25 | :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in 26 | the image. The second axis represents attributes of the bounding box. 27 | They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the 28 | four attributes are coordinates of the top left and the bottom right 29 | vertices. 30 | 31 | The labels are packed into a one dimensional tensor of shape :math:`(R,)`. 32 | :math:`R` is the number of bounding boxes in the image. 33 | The class name of the label :math:`l` is :math:`l` th element of 34 | :obj:`VOC_BBOX_LABEL_NAMES`. 35 | 36 | The array :obj:`difficult` is a one dimensional boolean array of shape 37 | :math:`(R,)`. :math:`R` is the number of bounding boxes in the image. 38 | If :obj:`use_difficult` is :obj:`False`, this array is 39 | a boolean array with all :obj:`False`. 40 | 41 | The type of the image, the bounding boxes and the labels are as follows. 42 | 43 | * :obj:`img.dtype == numpy.float32` 44 | * :obj:`bbox.dtype == numpy.float32` 45 | * :obj:`label.dtype == numpy.int32` 46 | * :obj:`difficult.dtype == numpy.bool` 47 | 48 | Args: 49 | data_dir (string): Path to the root of the training data. 50 | i.e. "/data/image/voc/VOCdevkit/VOC2007/" 51 | split ({'train', 'val', 'trainval', 'test'}): Select a split of the 52 | dataset. :obj:`test` split is only available for 53 | 2007 dataset. 54 | year ({'2007', '2012'}): Use a dataset prepared for a challenge 55 | held in :obj:`year`. 56 | use_difficult (bool): If :obj:`True`, use images that are labeled as 57 | difficult in the original annotation. 
58 | return_difficult (bool): If :obj:`True`, this dataset returns 59 | a boolean array 60 | that indicates whether bounding boxes are labeled as difficult 61 | or not. The default value is :obj:`False`. 62 | 63 | """ 64 | 65 | def __init__(self, data_dir, split='trainval', 66 | use_difficult=False, return_difficult=False, 67 | ): 68 | 69 | # if split not in ['train', 'trainval', 'val']: 70 | # if not (split == 'test' and year == '2007'): 71 | # warnings.warn( 72 | # 'please pick split from \'train\', \'trainval\', \'val\'' 73 | # 'for 2012 dataset. For 2007 dataset, you can pick \'test\'' 74 | # ' in addition to the above mentioned splits.' 75 | # ) 76 | id_list_file = os.path.join( 77 | data_dir, 'ImageSets/Main/{0}.txt'.format(split)) 78 | 79 | self.ids = [id_.strip() for id_ in open(id_list_file)] 80 | self.data_dir = data_dir 81 | self.use_difficult = use_difficult 82 | self.return_difficult = return_difficult 83 | self.label_names = VOC_BBOX_LABEL_NAMES 84 | 85 | def __len__(self): 86 | return len(self.ids) 87 | 88 | def get_example(self, i): 89 | """Returns the i-th example. 90 | 91 | Returns a color image and bounding boxes. The image is in CHW format. 92 | The returned image is RGB. 93 | 94 | Args: 95 | i (int): The index of the example. 96 | 97 | Returns: 98 | tuple of an image and bounding boxes 99 | 100 | """ 101 | id_ = self.ids[i] 102 | anno = ET.parse( 103 | os.path.join(self.data_dir, 'Annotations', id_ + '.xml')) 104 | bbox = list() 105 | label = list() 106 | difficult = list() 107 | for obj in anno.findall('object'): 108 | # when in not using difficult split, and the object is 109 | # difficult, skipt it. 110 | if not self.use_difficult and int(obj.find('difficult').text) == 1: 111 | continue 112 | 113 | difficult.append(int(obj.find('difficult').text)) 114 | bndbox_anno = obj.find('bndbox') 115 | # subtract 1 to make pixel indexes 0-based 116 | bbox.append([ 117 | int(bndbox_anno.find(tag).text) - 1 118 | for tag in ('ymin', 'xmin', 'ymax', 'xmax')]) 119 | name = obj.find('name').text.lower().strip() 120 | label.append(VOC_BBOX_LABEL_NAMES.index(name)) 121 | bbox = np.stack(bbox).astype(np.float32) 122 | label = np.stack(label).astype(np.int32) 123 | # When `use_difficult==False`, all elements in `difficult` are False. 124 | difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch don't support np.bool 125 | 126 | # Load a image 127 | img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg') 128 | img = read_image(img_file, color=True) 129 | 130 | # if self.return_difficult: 131 | # return img, bbox, label, difficult 132 | return img, bbox, label, difficult 133 | 134 | __getitem__ = get_example 135 | 136 | 137 | VOC_BBOX_LABEL_NAMES = ( 138 | 'aeroplane', 139 | 'bicycle', 140 | 'bird', 141 | 'boat', 142 | 'bottle', 143 | 'bus', 144 | 'car', 145 | 'cat', 146 | 'chair', 147 | 'cow', 148 | 'diningtable', 149 | 'dog', 150 | 'horse', 151 | 'motorbike', 152 | 'person', 153 | 'pottedplant', 154 | 'sheep', 155 | 'sofa', 156 | 'train', 157 | 'tvmonitor') 158 | -------------------------------------------------------------------------------- /model/rpn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils.anchors import generate_anchor_base, get_anchors, get_rois_from_loc_anchors 3 | from utils.py_nms import py_cpu_nms 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | class RegionProposalNetwork(nn.Module): 8 | """Region Proposal Network introduced in Faster R-CNN. 
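    A 3x3 convolution slides over the feature map and two 1x1 heads predict,
    for every anchor, two objectness scores and four box offsets. The offsets
    are decoded against the enumerated anchors, the boxes are clipped to the
    image, boxes smaller than the minimum size are dropped, and the
    top-scoring proposals are reduced with non-maximum suppression.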
9 | 10 | Args: 11 | in_channels (int): The channel size of input. 12 | mid_channels (int): The channel size of the intermediate tensor. 13 | ratios (list of floats): This is ratios of width to height of 14 | the anchors. 15 | anchor_scales (list of numbers): This is areas of anchors. 16 | Those areas will be the product of the square of an element in 17 | :obj:`anchor_scales` and the original area of the reference 18 | window. 19 | feat_stride (int): Stride size after extracting features from an 20 | image. 21 | initialW (callable): Initial weight value. If :obj:`None` then this 22 | function uses Gaussian distribution scaled by 0.1 to 23 | initialize weight. 24 | May also be a callable that takes an array and edits its values. 25 | proposal_creator_params (dict): Key valued paramters for 26 | :class:`model.utils.creator_tools.ProposalCreator`. 27 | 28 | .. seealso:: 29 | :class:`~model.utils.creator_tools.ProposalCreator` 30 | 31 | """ 32 | 33 | def __init__( 34 | self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2], 35 | scales=[0.5, 1, 2], feat_stride=16 36 | ): 37 | super(RegionProposalNetwork, self).__init__() 38 | # prepare anchor base 39 | self.anchor_base = generate_anchor_base(side_length=16, 40 | ratios=ratios, scales=scales, strides=feat_stride) 41 | self.feat_stride = feat_stride 42 | # network params 43 | n_anchor = self.anchor_base.shape[0] 44 | self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1) 45 | self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0) 46 | self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0) 47 | normal_init(self.conv1, 0, 0.01) 48 | normal_init(self.score, 0, 0.01) 49 | normal_init(self.loc, 0, 0.01) 50 | 51 | def forward(self, h, img_size, scale=1.): 52 | """Forward Region Proposal Network. 53 | 54 | Here are notations. 55 | 56 | * :math:`N` is batch size. 57 | * :math:`C` channel size of the input. 58 | * :math:`H` and :math:`W` are height and witdh of the input feature. 59 | * :math:`A` is number of anchors assigned to each pixel. 60 | 61 | Args: 62 | x (~torch.autograd.Variable): The Features extracted from images. 63 | Its shape is :math:`(N, C, H, W)`. 64 | img_size (tuple of ints): A tuple :obj:`height, width`, 65 | which contains image size after scaling. 66 | scale (float): The amount of scaling done to the input images after 67 | reading them from files. 68 | 69 | Returns: 70 | (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array): 71 | 72 | This is a tuple of five following values. 73 | 74 | * **rpn_locs**: Predicted bounding box offsets and scales for \ 75 | anchors. Its shape is :math:`(N, H W A, 4)`. 76 | * **rpn_scores**: Predicted foreground scores for \ 77 | anchors. Its shape is :math:`(N, H W A, 2)`. 78 | * **rois**: A bounding box array containing coordinates of \ 79 | proposal boxes. This is a concatenation of bounding box \ 80 | arrays from multiple images in the batch. \ 81 | Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \ 82 | bounding boxes from the :math:`i` th image, \ 83 | :math:`R' = \\sum _{i=1} ^ N R_i`. 84 | * **roi_indices**: An array containing indices of images to \ 85 | which RoIs correspond to. Its shape is :math:`(R',)`. 86 | * **anchor**: Coordinates of enumerated shifted anchors. \ 87 | Its shape is :math:`(H W A, 4)`. 
88 | 89 | """ 90 | n_pre_nms = 12000 91 | n_post_nms = 2000 92 | nms_thresh = 0.7 93 | 94 | # get anchors predifined 95 | n, _, hh, ww = h.shape 96 | anchors = get_anchors(self.anchor_base, self.feat_stride, hh, ww) 97 | 98 | # main forward 99 | hidd = F.relu(self.conv1(h)) 100 | rpn_locs = self.loc(hidd) 101 | rpn_scores = self.score(hidd) 102 | 103 | # view data 104 | # rpn_locs, rpn_scores 105 | rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4) 106 | rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous().view(n, -1, 2) 107 | scores = rpn_scores[:,:,1].data.cpu().numpy()[0] 108 | 109 | # get rois, roi_indices 110 | rois = get_rois_from_loc_anchors(anchors, rpn_locs[0].data.cpu().numpy()) 111 | ## clip 112 | rois[:, ::2] = np.clip(rois[:, ::2], 0, img_size[0]) 113 | rois[:, 1::2] = np.clip(rois[:, 1::2], 0, img_size[1]) 114 | ## remove < min_size 115 | min_size = 16 116 | min_size = min_size * scale 117 | hs = rois[:, 2] - rois[:, 0] 118 | ws = rois[:, 3] - rois[:, 1] 119 | keep = np.where((hs >= min_size) & (ws >= min_size))[0] 120 | rois = rois[keep, :] 121 | scores = scores[keep] 122 | # Sort all (proposal, score) pairs by score from highest to lowest. 123 | # Take top pre_nms_topN (e.g. 6000). 124 | order = scores.ravel().argsort()[::-1] 125 | if n_pre_nms > 0: 126 | order = order[:n_pre_nms] 127 | rois = rois[order, :] 128 | 129 | # NMS 130 | keep = py_cpu_nms(rois, nms_thresh) 131 | keep = keep[:n_post_nms] 132 | rois = rois[keep] 133 | return rpn_locs, rpn_scores, rois, [0]*len(rois), anchors 134 | 135 | 136 | def normal_init(m, mean, stddev, truncated=False): 137 | """ 138 | weight initalizer: truncated normal and random normal. 139 | """ 140 | # x is a parameter 141 | if truncated: 142 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 143 | else: 144 | m.weight.data.normal_(mean, stddev) 145 | m.bias.data.zero_() -------------------------------------------------------------------------------- /model/utils/nms/nohup.out: -------------------------------------------------------------------------------- 1 | Traceback (most recent call last): 2 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1318, in do_open 3 | encode_chunked=req.has_header('Transfer-encoding')) 4 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1239, in request 5 | self._send_request(method, url, body, headers, encode_chunked) 6 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1285, in _send_request 7 | self.endheaders(body, encode_chunked=encode_chunked) 8 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1234, in endheaders 9 | self._send_output(message_body, encode_chunked=encode_chunked) 10 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1026, in _send_output 11 | self.send(msg) 12 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 964, in send 13 | self.connect() 14 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1392, in connect 15 | super().connect() 16 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 936, in connect 17 | (self.host,self.port), self.timeout, self.source_address) 18 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/socket.py", line 724, in create_connection 19 | raise err 20 | File 
"/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/socket.py", line 713, in create_connection 21 | sock.connect(sa) 22 | TimeoutError: [Errno 110] Connection timed out 23 | 24 | During handling of the above exception, another exception occurred: 25 | 26 | Traceback (most recent call last): 27 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 693, in download_scripts 28 | data = opener.open(req).read() 29 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 526, in open 30 | response = self._open(req, data) 31 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 544, in _open 32 | '_open', req) 33 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 504, in _call_chain 34 | result = func(*args) 35 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1361, in https_open 36 | context=self._context, check_hostname=self._check_hostname) 37 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1320, in do_open 38 | raise URLError(err) 39 | urllib.error.URLError: 40 | 41 | During handling of the above exception, another exception occurred: 42 | 43 | Traceback (most recent call last): 44 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 193, in _run_module_as_main 45 | "__main__", mod_spec) 46 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 85, in _run_code 47 | exec(code, run_globals) 48 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 717, in 49 | download_scripts() 50 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 697, in download_scripts 51 | logging.error('Error {} while downloading {}'.format(exc.code, key)) 52 | AttributeError: 'URLError' object has no attribute 'code' 53 | Downloading scripts. It might take a while. 
54 | Traceback (most recent call last): 55 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1318, in do_open 56 | encode_chunked=req.has_header('Transfer-encoding')) 57 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1239, in request 58 | self._send_request(method, url, body, headers, encode_chunked) 59 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1285, in _send_request 60 | self.endheaders(body, encode_chunked=encode_chunked) 61 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1234, in endheaders 62 | self._send_output(message_body, encode_chunked=encode_chunked) 63 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1026, in _send_output 64 | self.send(msg) 65 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 964, in send 66 | self.connect() 67 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/http/client.py", line 1400, in connect 68 | server_hostname=server_hostname) 69 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 407, in wrap_socket 70 | _context=self, _session=session) 71 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 814, in __init__ 72 | self.do_handshake() 73 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 1068, in do_handshake 74 | self._sslobj.do_handshake() 75 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/ssl.py", line 689, in do_handshake 76 | self._sslobj.do_handshake() 77 | ConnectionResetError: [Errno 104] Connection reset by peer 78 | 79 | During handling of the above exception, another exception occurred: 80 | 81 | Traceback (most recent call last): 82 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 693, in download_scripts 83 | data = opener.open(req).read() 84 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 526, in open 85 | response = self._open(req, data) 86 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 544, in _open 87 | '_open', req) 88 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 504, in _call_chain 89 | result = func(*args) 90 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1361, in https_open 91 | context=self._context, check_hostname=self._check_hostname) 92 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/urllib/request.py", line 1320, in do_open 93 | raise URLError(err) 94 | urllib.error.URLError: 95 | 96 | During handling of the above exception, another exception occurred: 97 | 98 | Traceback (most recent call last): 99 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 193, in _run_module_as_main 100 | "__main__", mod_spec) 101 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/runpy.py", line 85, in _run_code 102 | exec(code, run_globals) 103 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 717, in 104 | download_scripts() 105 | File "/home/guangyaoyang/anaconda3/envs/gluon/lib/python3.6/site-packages/visdom/server.py", line 697, in download_scripts 106 | logging.error('Error {} while downloading {}'.format(exc.code, key)) 107 | AttributeError: 'URLError' object has no attribute 'code' 108 | Downloading scripts. 
It might take a while. 109 | -------------------------------------------------------------------------------- /model/utils/bbox_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy as xp 3 | 4 | def bbox2loc(src_bbox, dst_bbox): 5 | """Encodes the source and the destination bounding boxes to "loc". 6 | 7 | Given bounding boxes, this function computes offsets and scales 8 | to match the source bounding boxes to the target bounding boxes. 9 | Mathematcially, given a bounding box whose center is 10 | :math:`(y, x) = p_y, p_x` and 11 | size :math:`p_h, p_w` and the target bounding box whose center is 12 | :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales 13 | :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas. 14 | 15 | * :math:`t_y = \\frac{(g_y - p_y)} {p_h}` 16 | * :math:`t_x = \\frac{(g_x - p_x)} {p_w}` 17 | * :math:`t_h = \\log(\\frac{g_h} {p_h})` 18 | * :math:`t_w = \\log(\\frac{g_w} {p_w})` 19 | 20 | The output is same type as the type of the inputs. 21 | The encoding formulas are used in works such as R-CNN [#]_. 22 | 23 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \ 24 | Rich feature hierarchies for accurate object detection and semantic \ 25 | segmentation. CVPR 2014. 26 | 27 | Args: 28 | src_bbox (array): An image coordinate array whose shape is 29 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 30 | These coordinates are 31 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 32 | dst_bbox (array): An image coordinate array whose shape is 33 | :math:`(R, 4)`. 34 | These coordinates are 35 | :math:`g_{ymin}, g_{xmin}, g_{ymax}, g_{xmax}`. 36 | 37 | Returns: 38 | array: 39 | Bounding box offsets and scales from :obj:`src_bbox` \ 40 | to :obj:`dst_bbox`. \ 41 | This has shape :math:`(R, 4)`. 42 | The second axis contains four values :math:`t_y, t_x, t_h, t_w`. 43 | 44 | """ 45 | 46 | height = src_bbox[:, 2] - src_bbox[:, 0] 47 | width = src_bbox[:, 3] - src_bbox[:, 1] 48 | ctr_y = src_bbox[:, 0] + 0.5 * height 49 | ctr_x = src_bbox[:, 1] + 0.5 * width 50 | 51 | base_height = dst_bbox[:, 2] - dst_bbox[:, 0] 52 | base_width = dst_bbox[:, 3] - dst_bbox[:, 1] 53 | base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height 54 | base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width 55 | 56 | eps = xp.finfo(height.dtype).eps 57 | height = xp.maximum(height, eps) 58 | width = xp.maximum(width, eps) 59 | 60 | dy = (base_ctr_y - ctr_y) / height 61 | dx = (base_ctr_x - ctr_x) / width 62 | dh = xp.log(base_height / height) 63 | dw = xp.log(base_width / width) 64 | 65 | loc = xp.vstack((dy, dx, dh, dw)).transpose() 66 | return loc 67 | 68 | 69 | def bbox_iou(bbox_a, bbox_b): 70 | """Calculate the Intersection of Unions (IoUs) between bounding boxes. 71 | 72 | IoU is calculated as a ratio of area of the intersection 73 | and area of the union. 74 | 75 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as 76 | inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be 77 | same type. 78 | The output is same type as the type of the inputs. 79 | 80 | Args: 81 | bbox_a (array): An array whose shape is :math:`(N, 4)`. 82 | :math:`N` is the number of bounding boxes. 83 | The dtype should be :obj:`numpy.float32`. 84 | bbox_b (array): An array similar to :obj:`bbox_a`, 85 | whose shape is :math:`(K, 4)`. 86 | The dtype should be :obj:`numpy.float32`. 87 | 88 | Returns: 89 | array: 90 | An array whose shape is :math:`(N, K)`. 
\ 91 | An element at index :math:`(n, k)` contains IoUs between \ 92 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ 93 | box in :obj:`bbox_b`. 94 | 95 | """ 96 | if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4: 97 | raise IndexError 98 | 99 | # top left 100 | tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2]) 101 | # bottom right 102 | br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:]) 103 | 104 | area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2) 105 | area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1) 106 | area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1) 107 | return area_i / (area_a[:, None] + area_b - area_i) 108 | 109 | 110 | 111 | def loc2bbox(src_bbox, loc): 112 | """Decode bounding boxes from bounding box offsets and scales. 113 | 114 | Given bounding box offsets and scales computed by 115 | :meth:`bbox2loc`, this function decodes the representation to 116 | coordinates in 2D image coordinates. 117 | 118 | Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding 119 | box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`, 120 | the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x` 121 | and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated 122 | by the following formulas. 123 | 124 | * :math:`\\hat{g}_y = p_h t_y + p_y` 125 | * :math:`\\hat{g}_x = p_w t_x + p_x` 126 | * :math:`\\hat{g}_h = p_h \\exp(t_h)` 127 | * :math:`\\hat{g}_w = p_w \\exp(t_w)` 128 | 129 | The decoding formulas are used in works such as R-CNN [#]_. 130 | 131 | The output is same type as the type of the inputs. 132 | 133 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \ 134 | Rich feature hierarchies for accurate object detection and semantic \ 135 | segmentation. CVPR 2014. 136 | 137 | Args: 138 | src_bbox (array): A coordinates of bounding boxes. 139 | Its shape is :math:`(R, 4)`. These coordinates are 140 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 141 | loc (array): An array with offsets and scales. 142 | The shapes of :obj:`src_bbox` and :obj:`loc` should be same. 143 | This contains values :math:`t_y, t_x, t_h, t_w`. 144 | 145 | Returns: 146 | array: 147 | Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \ 148 | The second axis contains four values \ 149 | :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin}, 150 | \\hat{g}_{ymax}, \\hat{g}_{xmax}`. 
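    A minimal round-trip sketch (plain NumPy, hand-made boxes, assuming this
    module is importable as model.utils.bbox_tools) showing that these decoding
    formulas invert the encoding performed by :meth:`bbox2loc`:

        import numpy as np
        from model.utils.bbox_tools import bbox2loc, loc2bbox

        src = np.array([[10., 10., 50., 90.]], dtype=np.float32)  # source box
        dst = np.array([[20., 30., 60., 70.]], dtype=np.float32)  # target box

        loc = bbox2loc(src, dst)    # (1, 4) offsets and scales t_y, t_x, t_h, t_w
        back = loc2bbox(src, loc)   # decode the offsets against the same source box
        assert np.allclose(back, dst)   # recovers the target box up to float error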
151 | 152 | """ 153 | 154 | if src_bbox.shape[0] == 0: 155 | return xp.zeros((0, 4), dtype=loc.dtype) 156 | 157 | src_bbox = src_bbox.astype(src_bbox.dtype, copy=False) 158 | 159 | src_height = src_bbox[:, 2] - src_bbox[:, 0] 160 | src_width = src_bbox[:, 3] - src_bbox[:, 1] 161 | src_ctr_y = src_bbox[:, 0] + 0.5 * src_height 162 | src_ctr_x = src_bbox[:, 1] + 0.5 * src_width 163 | 164 | dy = loc[:, 0::4] 165 | dx = loc[:, 1::4] 166 | dh = loc[:, 2::4] 167 | dw = loc[:, 3::4] 168 | 169 | ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis] 170 | ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis] 171 | h = xp.exp(dh) * src_height[:, xp.newaxis] 172 | w = xp.exp(dw) * src_width[:, xp.newaxis] 173 | 174 | dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype) 175 | dst_bbox[:, 0::4] = ctr_y - 0.5 * h 176 | dst_bbox[:, 1::4] = ctr_x - 0.5 * w 177 | dst_bbox[:, 2::4] = ctr_y + 0.5 * h 178 | dst_bbox[:, 3::4] = ctr_x + 0.5 * w 179 | 180 | return dst_bbox 181 | -------------------------------------------------------------------------------- /model/utils/nms/non_maximum_suppression.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import cupy as cp 4 | import torch as t 5 | try: 6 | from ._nms_gpu_post import _nms_gpu_post 7 | except: 8 | import warnings 9 | warnings.warn(''' 10 | the python code for non_maximum_suppression is about 2x slow 11 | It is strongly recommended to build cython code: 12 | `cd model/utils/nms/; python3 build.py build_ext --inplace''') 13 | from ._nms_gpu_post_py import _nms_gpu_post 14 | 15 | 16 | @cp.util.memoize(for_each_device=True) 17 | def _load_kernel(kernel_name, code, options=()): 18 | cp.cuda.runtime.free(0) 19 | assert isinstance(options, tuple) 20 | kernel_code = cp.cuda.compile_with_cache(code, options=options) 21 | return kernel_code.get_function(kernel_name) 22 | 23 | 24 | def non_maximum_suppression(bbox, thresh, score=None, 25 | limit=None): 26 | """Suppress bounding boxes according to their IoUs. 27 | 28 | This method checks each bounding box sequentially and selects the bounding 29 | box if the Intersection over Unions (IoUs) between the bounding box and the 30 | previously selected bounding boxes is less than :obj:`thresh`. This method 31 | is mainly used as postprocessing of object detection. 32 | The bounding boxes are selected from ones with higher scores. 33 | If :obj:`score` is not provided as an argument, the bounding box 34 | is ordered by its index in ascending order. 35 | 36 | The bounding boxes are expected to be packed into a two dimensional 37 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 38 | bounding boxes in the image. The second axis represents attributes of 39 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 40 | where the four attributes are coordinates of the top left and the 41 | bottom right vertices. 42 | 43 | :obj:`score` is a float array of shape :math:`(R,)`. Each score indicates 44 | confidence of prediction. 45 | 46 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as 47 | an input. Please note that both :obj:`bbox` and :obj:`score` need to be 48 | the same type. 49 | The type of the output is the same as the input. 50 | 51 | Args: 52 | bbox (array): Bounding boxes to be transformed. The shape is 53 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 54 | thresh (float): Threshold of IoUs. 
55 | score (array): An array of confidences whose shape is :math:`(R,)`. 56 | limit (int): The upper bound of the number of the output bounding 57 | boxes. If it is not specified, this method selects as many 58 | bounding boxes as possible. 59 | 60 | Returns: 61 | array: 62 | An array with indices of bounding boxes that are selected. \ 63 | They are sorted by the scores of bounding boxes in descending \ 64 | order. \ 65 | The shape of this array is :math:`(K,)` and its dtype is\ 66 | :obj:`numpy.int32`. Note that :math:`K \\leq R`. 67 | 68 | """ 69 | 70 | return _non_maximum_suppression_gpu(bbox, thresh, score, limit) 71 | 72 | 73 | def _non_maximum_suppression_gpu(bbox, thresh, score=None, limit=None): 74 | if len(bbox) == 0: 75 | return cp.zeros((0,), dtype=np.int32) 76 | 77 | n_bbox = bbox.shape[0] 78 | 79 | if score is not None: 80 | order = score.argsort()[::-1].astype(np.int32) 81 | else: 82 | order = cp.arange(n_bbox, dtype=np.int32) 83 | 84 | sorted_bbox = bbox[order, :] 85 | selec, n_selec = _call_nms_kernel( 86 | sorted_bbox, thresh) 87 | selec = selec[:n_selec] 88 | selec = order[selec] 89 | if limit is not None: 90 | selec = selec[:limit] 91 | return cp.asnumpy(selec) 92 | 93 | 94 | _nms_gpu_code = ''' 95 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 96 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 97 | 98 | __device__ 99 | inline float devIoU(float const *const bbox_a, float const *const bbox_b) { 100 | float top = max(bbox_a[0], bbox_b[0]); 101 | float bottom = min(bbox_a[2], bbox_b[2]); 102 | float left = max(bbox_a[1], bbox_b[1]); 103 | float right = min(bbox_a[3], bbox_b[3]); 104 | float height = max(bottom - top, 0.f); 105 | float width = max(right - left, 0.f); 106 | float area_i = height * width; 107 | float area_a = (bbox_a[2] - bbox_a[0]) * (bbox_a[3] - bbox_a[1]); 108 | float area_b = (bbox_b[2] - bbox_b[0]) * (bbox_b[3] - bbox_b[1]); 109 | return area_i / (area_a + area_b - area_i); 110 | } 111 | 112 | extern "C" 113 | __global__ 114 | void nms_kernel(const int n_bbox, const float thresh, 115 | const float *dev_bbox, 116 | unsigned long long *dev_mask) { 117 | const int row_start = blockIdx.y; 118 | const int col_start = blockIdx.x; 119 | 120 | const int row_size = 121 | min(n_bbox - row_start * threadsPerBlock, threadsPerBlock); 122 | const int col_size = 123 | min(n_bbox - col_start * threadsPerBlock, threadsPerBlock); 124 | 125 | __shared__ float block_bbox[threadsPerBlock * 4]; 126 | if (threadIdx.x < col_size) { 127 | block_bbox[threadIdx.x * 4 + 0] = 128 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0]; 129 | block_bbox[threadIdx.x * 4 + 1] = 130 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1]; 131 | block_bbox[threadIdx.x * 4 + 2] = 132 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2]; 133 | block_bbox[threadIdx.x * 4 + 3] = 134 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3]; 135 | } 136 | __syncthreads(); 137 | 138 | if (threadIdx.x < row_size) { 139 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 140 | const float *cur_box = dev_bbox + cur_box_idx * 4; 141 | int i = 0; 142 | unsigned long long t = 0; 143 | int start = 0; 144 | if (row_start == col_start) { 145 | start = threadIdx.x + 1; 146 | } 147 | for (i = start; i < col_size; i++) { 148 | if (devIoU(cur_box, block_bbox + i * 4) >= thresh) { 149 | t |= 1ULL << i; 150 | } 151 | } 152 | const int col_blocks = DIVUP(n_bbox, threadsPerBlock); 153 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 154 | 
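      // The 64-bit word stored above records, for this thread's box
      // (cur_box_idx), which of the up-to-threadsPerBlock boxes in the current
      // column block overlap it with IoU >= thresh (bit i set).  The full mask
      // is copied back to the host and turned into the final keep list by
      // _nms_gpu_post, called from _call_nms_kernel below.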
} 155 | } 156 | ''' 157 | 158 | 159 | def _call_nms_kernel(bbox, thresh): 160 | # PyTorch does not support unsigned long Tensor. 161 | # Doesn't matter,since it returns ndarray finally. 162 | # So I'll keep it unmodified. 163 | n_bbox = bbox.shape[0] 164 | threads_per_block = 64 165 | col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32) 166 | blocks = (col_blocks, col_blocks, 1) 167 | threads = (threads_per_block, 1, 1) 168 | 169 | mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64) 170 | bbox = cp.ascontiguousarray(bbox, dtype=np.float32) # NOTE: 变成连续的 171 | kern = _load_kernel('nms_kernel', _nms_gpu_code) 172 | kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), 173 | bbox, mask_dev)) 174 | 175 | mask_host = mask_dev.get() 176 | selection, n_selec = _nms_gpu_post( 177 | mask_host, n_bbox, threads_per_block, col_blocks) 178 | return selection, n_selec 179 | -------------------------------------------------------------------------------- /trainer/trainer.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from torch.nn import functional as F 4 | from model.utils.roi_sample import ProposalTargetCreator 5 | from model.utils.rpn_gt_loc_label import AnchorTargetCreator 6 | 7 | from torch import nn 8 | import torch as t 9 | from torch.autograd import Variable 10 | from utils import array_tool as at 11 | 12 | from utils.config import opt 13 | 14 | LossTuple = namedtuple('LossTuple', 15 | ['rpn_loc_loss', 16 | 'rpn_cls_loss', 17 | 'roi_loc_loss', 18 | 'roi_cls_loss', 19 | 'total_loss' 20 | ]) 21 | 22 | 23 | class FasterRCNNTrainer(nn.Module): 24 | """wrapper for conveniently training. return losses 25 | 26 | The losses include: 27 | 28 | * :obj:`rpn_loc_loss`: The localization loss for \ 29 | Region Proposal Network (RPN). 30 | * :obj:`rpn_cls_loss`: The classification loss for RPN. 31 | * :obj:`roi_loc_loss`: The localization loss for the head module. 32 | * :obj:`roi_cls_loss`: The classification loss for the head module. 33 | * :obj:`total_loss`: The sum of 4 loss above. 34 | 35 | Args: 36 | faster_rcnn (model.FasterRCNN): 37 | A Faster R-CNN model that is going to be trained. 38 | """ 39 | 40 | def __init__(self, faster_rcnn): 41 | super(FasterRCNNTrainer, self).__init__() 42 | 43 | self.faster_rcnn = faster_rcnn 44 | self.rpn_sigma = opt.rpn_sigma 45 | self.roi_sigma = opt.roi_sigma 46 | 47 | # target creator create gt_bbox gt_label etc as training targets. 48 | self.anchor_target_creator = AnchorTargetCreator() 49 | self.proposal_target_creator = ProposalTargetCreator() 50 | 51 | self.loc_normalize_mean = faster_rcnn.loc_normalize_mean 52 | self.loc_normalize_std = faster_rcnn.loc_normalize_std 53 | 54 | self.optimizer = self.faster_rcnn.get_optimizer() 55 | 56 | def forward(self, imgs, bboxes, labels, scale): 57 | """Forward Faster R-CNN and calculate losses. 58 | 59 | Here are notations used. 60 | 61 | * :math:`N` is the batch size. 62 | * :math:`R` is the number of bounding boxes per image. 63 | 64 | Currently, only :math:`N=1` is supported. 65 | 66 | Args: 67 | imgs (~torch.autograd.Variable): A variable with a batch of images. 68 | bboxes (~torch.autograd.Variable): A batch of bounding boxes. 69 | Its shape is :math:`(N, R, 4)`. 70 | labels (~torch.autograd..Variable): A batch of labels. 71 | Its shape is :math:`(N, R)`. The background is excluded from 72 | the definition, which means that the range of the value 73 | is :math:`[0, L - 1]`. 
:math:`L` is the number of foreground 74 | classes. 75 | scale (float): Amount of scaling applied to 76 | the raw image during preprocessing. 77 | 78 | Returns: 79 | namedtuple of 5 losses 80 | """ 81 | n = bboxes.shape[0] 82 | if n != 1: 83 | raise ValueError('Currently only batch size 1 is supported.') 84 | 85 | _, _, H, W = imgs.shape 86 | img_size = (H, W) 87 | 88 | features = self.faster_rcnn.extractor(imgs) 89 | 90 | rpn_locs, rpn_scores, rois, roi_indices, anchor = \ 91 | self.faster_rcnn.rpn(features, img_size, scale) 92 | 93 | # Since batch size is one, convert variables to singular form 94 | bbox = bboxes[0] 95 | label = labels[0] 96 | rpn_score = rpn_scores[0] 97 | rpn_loc = rpn_locs[0] 98 | roi = rois 99 | 100 | # Sample RoIs and forward 101 | # it's fine to break the computation graph of rois, 102 | # consider them as constant input 103 | sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator( 104 | roi, 105 | at.tonumpy(bbox), 106 | at.tonumpy(label), 107 | self.loc_normalize_mean, 108 | self.loc_normalize_std) 109 | # NOTE it's all zero because now it only support for batch=1 now 110 | sample_roi_index = t.zeros(len(sample_roi)) 111 | roi_cls_loc, roi_score = self.faster_rcnn.head( 112 | features, 113 | sample_roi, 114 | sample_roi_index) 115 | 116 | # ------------------ RPN losses -------------------# 117 | gt_rpn_loc, gt_rpn_label = self.anchor_target_creator( 118 | at.tonumpy(bbox), 119 | anchor, 120 | img_size) 121 | gt_rpn_label = at.tovariable(gt_rpn_label).long() 122 | gt_rpn_loc = at.tovariable(gt_rpn_loc) 123 | rpn_loc_loss = _fast_rcnn_loc_loss( 124 | rpn_loc, 125 | gt_rpn_loc, 126 | gt_rpn_label.data, 127 | self.rpn_sigma) 128 | 129 | # NOTE: default value of ignore_index is -100 ... 130 | rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1) 131 | 132 | # ------------------ ROI losses (fast rcnn loss) -------------------# 133 | n_sample = roi_cls_loc.shape[0] 134 | roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4) 135 | roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \ 136 | at.totensor(gt_roi_label).long()] 137 | gt_roi_label = at.tovariable(gt_roi_label).long() 138 | gt_roi_loc = at.tovariable(gt_roi_loc) 139 | 140 | roi_loc_loss = _fast_rcnn_loc_loss( 141 | roi_loc.contiguous(), 142 | gt_roi_loc.float(), 143 | gt_roi_label.data, 144 | self.roi_sigma) 145 | 146 | 147 | roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda()) 148 | 149 | losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss] 150 | losses = losses + [sum(losses)] 151 | 152 | return LossTuple(*losses) 153 | 154 | def train_step(self, imgs, bboxes, labels, scale): 155 | self.optimizer.zero_grad() 156 | losses = self.forward(imgs, bboxes, labels, scale) 157 | losses.total_loss.backward() 158 | self.optimizer.step() 159 | return losses 160 | 161 | def _smooth_l1_loss(x, t, in_weight, sigma): 162 | sigma2 = sigma ** 2 163 | # print ("------------") 164 | # print ("in_weight: ", in_weight) 165 | # print ("------------") 166 | # print ("x: ", x) 167 | # print ("------------") 168 | # print ("t: ", t) 169 | # print ("------------") 170 | t = t.float() 171 | diff = in_weight * (x - t) 172 | abs_diff = diff.abs() 173 | flag = (abs_diff.data < (1. / sigma2)).float() 174 | flag = Variable(flag) 175 | y = (flag * (sigma2 / 2.) 
* (diff ** 2) + 176 | (1 - flag) * (abs_diff - 0.5 / sigma2)) 177 | return y.sum() 178 | 179 | 180 | def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma): 181 | in_weight = t.zeros(gt_loc.shape).cuda() 182 | # Localization loss is calculated only for positive rois. 183 | # NOTE: unlike origin implementation, 184 | # we don't need inside_weight and outside_weight, they can calculate by gt_label 185 | in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1 186 | loc_loss = _smooth_l1_loss(pred_loc, gt_loc, Variable(in_weight), sigma) 187 | # Normalize by total number of negtive and positive rois. 188 | loc_loss /= (gt_label >= 0).sum() # ignore gt_label==-1 for rpn_loss 189 | return loc_loss 190 | -------------------------------------------------------------------------------- /utils/data_load.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from torch.utils.data import Dataset, DataLoader 6 | import random 7 | import cv2 8 | import pickle 9 | 10 | # ---------------------------------------------------------------- 11 | # LOAD AND SAVE USING PICKLE 12 | def save_pkl(filename, f): 13 | with open(filename, 'wb') as handle: 14 | pickle.dump(f, handle, protocol=pickle.HIGHEST_PROTOCOL) 15 | 16 | def load_pkl(filename): 17 | with open(filename, 'rb') as handle: 18 | b = pickle.load(handle) 19 | return b 20 | 21 | 22 | # ---------------------------------------------------------------- 23 | # VOC Objection Datasets 24 | class VOCBboxDataset: 25 | """Bounding box dataset for PASCAL `VOC`_. 26 | 27 | .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/ 28 | 29 | The index corresponds to each image. 30 | 31 | When queried by an index, if :obj:`return_difficult == False`, 32 | this dataset returns a corresponding 33 | :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels. 34 | This is the default behaviour. 35 | If :obj:`return_difficult == True`, this dataset returns corresponding 36 | :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array 37 | that indicates whether bounding boxes are labeled as difficult or not. 38 | 39 | The bounding boxes are packed into a two dimensional tensor of shape 40 | :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in 41 | the image. The second axis represents attributes of the bounding box. 42 | They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the 43 | four attributes are coordinates of the top left and the bottom right 44 | vertices. 45 | 46 | The labels are packed into a one dimensional tensor of shape :math:`(R,)`. 47 | :math:`R` is the number of bounding boxes in the image. 48 | The class name of the label :math:`l` is :math:`l` th element of 49 | :obj:`VOC_BBOX_LABEL_NAMES`. 50 | 51 | The array :obj:`difficult` is a one dimensional boolean array of shape 52 | :math:`(R,)`. :math:`R` is the number of bounding boxes in the image. 53 | If :obj:`use_difficult` is :obj:`False`, this array is 54 | a boolean array with all :obj:`False`. 55 | 56 | The type of the image, the bounding boxes and the labels are as follows. 57 | 58 | * :obj:`img.dtype == numpy.float32` 59 | * :obj:`bbox.dtype == numpy.float32` 60 | * :obj:`label.dtype == numpy.int32` 61 | * :obj:`difficult.dtype == numpy.bool` 62 | 63 | Args: 64 | data_dir (string): Path to the root of the training data. 65 | i.e. 
"/data/image/voc/VOCdevkit/VOC2007/" 66 | split ({'train', 'val', 'trainval', 'test'}): Select a split of the 67 | dataset. :obj:`test` split is only available for 68 | 2007 dataset. 69 | year ({'2007', '2012'}): Use a dataset prepared for a challenge 70 | held in :obj:`year`. 71 | use_difficult (bool): If :obj:`True`, use images that are labeled as 72 | difficult in the original annotation. 73 | return_difficult (bool): If :obj:`True`, this dataset returns 74 | a boolean array 75 | that indicates whether bounding boxes are labeled as difficult 76 | or not. The default value is :obj:`False`. 77 | 78 | """ 79 | 80 | def __init__(self, data_dir, split='trainval', 81 | use_difficult=False, return_difficult=False, 82 | ): 83 | 84 | id_list_file = os.path.join( 85 | data_dir, 'ImageSets/Main/{0}.txt'.format(split)) 86 | 87 | self.ids = [id_.strip() for id_ in open(id_list_file)] 88 | self.data_dir = data_dir 89 | self.use_difficult = use_difficult 90 | self.return_difficult = return_difficult 91 | self.label_names = VOC_BBOX_LABEL_NAMES 92 | 93 | def __len__(self): 94 | return len(self.ids) 95 | 96 | def get_example(self, i): 97 | """Returns the i-th example. 98 | 99 | Returns a color image and bounding boxes. The image is in CHW format. 100 | The returned image is RGB. 101 | 102 | Args: 103 | i (int): The index of the example. 104 | 105 | Returns: 106 | tuple of an image and bounding boxes 107 | img: RGB image with shape [H, W, C], type float32 108 | 109 | """ 110 | id_ = self.ids[i] 111 | anno = ET.parse( 112 | os.path.join(self.data_dir, 'Annotations', id_ + '.xml')) 113 | bbox = list() 114 | label = list() 115 | difficult = list() 116 | for obj in anno.findall('object'): 117 | # when in not using difficult split, and the object is 118 | # difficult, skipt it. 119 | if not self.use_difficult and int(obj.find('difficult').text) == 1: 120 | continue 121 | 122 | difficult.append(int(obj.find('difficult').text)) 123 | bndbox_anno = obj.find('bndbox') 124 | # subtract 1 to make pixel indexes 0-based 125 | bbox.append([ 126 | int(bndbox_anno.find(tag).text) - 1 127 | for tag in ('ymin', 'xmin', 'ymax', 'xmax')]) 128 | name = obj.find('name').text.lower().strip() 129 | label.append(VOC_BBOX_LABEL_NAMES.index(name)) 130 | bbox = np.stack(bbox).astype(np.float32) 131 | label = np.stack(label).astype(np.int32) 132 | # When `use_difficult==False`, all elements in `difficult` are False. 
133 | difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch don't support np.bool 134 | 135 | # Load a image 136 | img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg') 137 | img = plt.imread(img_file) 138 | img = img.astype(np.float32) 139 | 140 | # if self.return_difficult: 141 | # return img, bbox, label, difficult 142 | return img, bbox, label, difficult 143 | 144 | __getitem__ = get_example 145 | 146 | 147 | VOC_BBOX_LABEL_NAMES = ( 148 | 'aeroplane', 149 | 'bicycle', 150 | 'bird', 151 | 'boat', 152 | 'bottle', 153 | 'bus', 154 | 'car', 155 | 'cat', 156 | 'chair', 157 | 'cow', 158 | 'diningtable', 159 | 'dog', 160 | 'horse', 161 | 'motorbike', 162 | 'person', 163 | 'pottedplant', 164 | 'sheep', 165 | 'sofa', 166 | 'train', 167 | 'tvmonitor') 168 | 169 | # ---------------------------------------------------------------- 170 | # Datasets Using in Training and Testing 171 | # 172 | # link: http://pytorch.org/docs/0.3.0/data.html?highlight=dataset 173 | class VOCDataset(Dataset): 174 | """ 175 | returned image: 176 | scaled image (mean, std, /255), float32, HWC, RGB 177 | mean=[0.485, 0.456, 0.406] 178 | std=[0.229, 0.224, 0.225] 179 | """ 180 | def __init__(self, opt, train=True): 181 | self.opt = opt 182 | self.train = train 183 | if train: 184 | self.db = VOCBboxDataset(opt.voc_data_dir) 185 | else: 186 | self.db = VOCBboxDataset(opt.voc_data_dir, split='test', use_difficult=True) 187 | 188 | def __getitem__(self, idx): 189 | ori_img, bbox, label, difficult = self.db.get_example(idx) 190 | 191 | # RESCALE ---------------------------- 192 | # image rescale to [opt.min_size, opt.max_size] 193 | H0, W0, C = ori_img.shape 194 | scale = min(self.opt.min_size/min(H0, W0), self.opt.max_size/max(H0, W0)) 195 | scaled_img = cv2.resize(ori_img, (0,0), fx=scale, fy=scale) 196 | H1, W1, _ = scaled_img.shape 197 | # bbox rescale 198 | bbox = scale * bbox 199 | 200 | # NORMALIZE ---------------------------- 201 | normalized_img = scaled_img / 255.0 202 | normalized_img = (normalized_img - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225]) 203 | 204 | # HORIZON FLIP ---------------------------- 205 | if self.train and random.random() < 0.5: 206 | normalized_img = normalized_img[:,::-1,:] 207 | bbox[:,1], bbox[:,3] = W1 - bbox[:, 3], W1 - bbox[:, 1] 208 | return normalized_img.astype(np.float32), bbox, label, scale 209 | 210 | def __len__(self): 211 | return len(self.db) -------------------------------------------------------------------------------- /model/faster_rcnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | from model.vgg16 import decom_vgg16 4 | from model.rpn import RegionProposalNetwork 5 | from model.roi_module import VGG16RoIHead 6 | from utils.config import opt 7 | import torch as t 8 | from utils import array_tool as at 9 | 10 | 11 | import cupy as cp 12 | from model.utils.nms import non_maximum_suppression 13 | from model.utils.bbox_tools import loc2bbox 14 | 15 | from torch.nn import functional as F 16 | 17 | class FasterRCNN(nn.Module): 18 | def __init__(self, ratios=[0.5, 1, 2], anchor_scales=[0.5, 1, 2], \ 19 | loc_normalize_mean = (0., 0., 0., 0.), \ 20 | loc_normalize_std = (0.1, 0.1, 0.2, 0.2)): 21 | super(FasterRCNN, self).__init__() 22 | 23 | # prepare 24 | extractor, classifier = decom_vgg16() 25 | rpn = RegionProposalNetwork( 26 | 512, 512, 27 | ratios=ratios, 28 | scales=anchor_scales, 29 | feat_stride=16 30 | ) 31 | 32 | head = 
VGG16RoIHead( 33 | n_class=20 + 1, 34 | roi_size=7, 35 | spatial_scale=(1. / 16), 36 | classifier=classifier.cuda() 37 | ) 38 | self.extractor = extractor.cuda() 39 | self.rpn = rpn.cuda() 40 | self.head = head 41 | 42 | # mean and std 43 | self.loc_normalize_mean = loc_normalize_mean 44 | self.loc_normalize_std = loc_normalize_std 45 | 46 | 47 | @property 48 | def n_class(self): 49 | # Total number of classes including the background. 50 | return self.head.n_class 51 | 52 | def forward(self, x, scale=1.): 53 | 54 | img_size = x.shape[2:] 55 | 56 | h = self.extractor(x) 57 | rpn_locs, rpn_scores, rois, roi_indices, anchor = \ 58 | self.rpn(h, img_size, scale) 59 | roi_cls_locs, roi_scores = self.head( 60 | h.cuda(), rois, np.array(roi_indices)) 61 | return roi_cls_locs, roi_scores, rois, roi_indices 62 | 63 | 64 | def get_optimizer(self): 65 | """ 66 | return optimizer, It could be overwriten if you want to specify 67 | special optimizer 68 | """ 69 | lr = opt.lr 70 | params = [] 71 | for key, value in dict(self.named_parameters()).items(): 72 | if value.requires_grad: 73 | if 'bias' in key: 74 | params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}] 75 | else: 76 | params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}] 77 | # if opt.use_adam: 78 | # self.optimizer = t.optim.Adam(params) 79 | # else: 80 | self.optimizer = t.optim.SGD(params, momentum=0.9) 81 | return self.optimizer 82 | 83 | 84 | def use_preset(self, preset): 85 | """Use the given preset during prediction. 86 | 87 | This method changes values of :obj:`self.nms_thresh` and 88 | :obj:`self.score_thresh`. These values are a threshold value 89 | used for non maximum suppression and a threshold value 90 | to discard low confidence proposals in :meth:`predict`, 91 | respectively. 92 | 93 | If the attributes need to be changed to something 94 | other than the values provided in the presets, please modify 95 | them by directly accessing the public attributes. 96 | 97 | Args: 98 | preset ({'visualize', 'evaluate'): A string to determine the 99 | preset to use. 100 | 101 | """ 102 | if preset == 'visualize': 103 | self.nms_thresh = 0.3 104 | self.score_thresh = 0.7 105 | elif preset == 'evaluate': 106 | self.nms_thresh = 0.3 107 | self.score_thresh = 0.05 108 | else: 109 | raise ValueError('preset must be visualize or evaluate') 110 | 111 | def _suppress(self, raw_cls_bbox, raw_prob): 112 | bbox = list() 113 | label = list() 114 | score = list() 115 | # skip cls_id = 0 because it is the background class 116 | for l in range(1, self.n_class): 117 | cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :] 118 | prob_l = raw_prob[:, l] 119 | mask = prob_l > self.score_thresh 120 | cls_bbox_l = cls_bbox_l[mask] 121 | prob_l = prob_l[mask] 122 | keep = non_maximum_suppression( 123 | cp.array(cls_bbox_l), self.nms_thresh, prob_l) 124 | keep = cp.asnumpy(keep) 125 | bbox.append(cls_bbox_l[keep]) 126 | # The labels are in [0, self.n_class - 2]. 127 | label.append((l - 1) * np.ones((len(keep),))) 128 | score.append(prob_l[keep]) 129 | bbox = np.concatenate(bbox, axis=0).astype(np.float32) 130 | label = np.concatenate(label, axis=0).astype(np.int32) 131 | score = np.concatenate(score, axis=0).astype(np.float32) 132 | return bbox, label, score 133 | 134 | def predict(self, imgs,sizes=None,visualize=False): 135 | """Detect objects from images. 136 | 137 | This method predicts objects for each image. 138 | 139 | Args: 140 | imgs (iterable of numpy.ndarray): Arrays holding images. 
141 | All images are in CHW and RGB format 142 | and the range of their value is :math:`[0, 255]`. 143 | 144 | Returns: 145 | tuple of lists: 146 | This method returns a tuple of three lists, 147 | :obj:`(bboxes, labels, scores)`. 148 | 149 | * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \ 150 | where :math:`R` is the number of bounding boxes in a image. \ 151 | Each bouding box is organized by \ 152 | :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \ 153 | in the second axis. 154 | * **labels** : A list of integer arrays of shape :math:`(R,)`. \ 155 | Each value indicates the class of the bounding box. \ 156 | Values are in range :math:`[0, L - 1]`, where :math:`L` is the \ 157 | number of the foreground classes. 158 | * **scores** : A list of float arrays of shape :math:`(R,)`. \ 159 | Each value indicates how confident the prediction is. 160 | 161 | """ 162 | self.eval() 163 | self.use_preset('evaluate') 164 | if visualize: 165 | self.use_preset('visualize') 166 | prepared_imgs = list() 167 | sizes = list() 168 | for img in imgs: 169 | size = img.shape[1:] 170 | img = preprocess(at.tonumpy(img)) 171 | prepared_imgs.append(img) 172 | sizes.append(size) 173 | else: 174 | prepared_imgs = imgs 175 | bboxes = list() 176 | labels = list() 177 | scores = list() 178 | for img, size in zip(prepared_imgs, sizes): 179 | img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True) 180 | scale = img.shape[3] / size[1] 181 | roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale) 182 | # We are assuming that batch size is 1. 183 | roi_score = roi_scores.data 184 | roi_cls_loc = roi_cls_loc.data 185 | roi = at.totensor(rois) / scale 186 | 187 | # Convert predictions to bounding boxes in image coordinates. 188 | # Bounding boxes are scaled to the scale of the input images. 189 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \ 190 | repeat(self.n_class)[None] 191 | std = t.Tensor(self.loc_normalize_std).cuda(). \ 192 | repeat(self.n_class)[None] 193 | 194 | roi_cls_loc = (roi_cls_loc * std + mean) 195 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4) 196 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc) 197 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)), 198 | at.tonumpy(roi_cls_loc).reshape((-1, 4))) 199 | cls_bbox = at.totensor(cls_bbox) 200 | cls_bbox = cls_bbox.view(-1, self.n_class * 4) 201 | # clip bounding box 202 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) 203 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) 204 | 205 | prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1)) 206 | 207 | raw_cls_bbox = at.tonumpy(cls_bbox) 208 | raw_prob = at.tonumpy(prob) 209 | 210 | bbox, label, score = self._suppress(raw_cls_bbox, raw_prob) 211 | bboxes.append(bbox) 212 | labels.append(label) 213 | scores.append(score) 214 | 215 | self.use_preset('evaluate') 216 | self.train() 217 | return bboxes, labels, scores -------------------------------------------------------------------------------- /data/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import random 4 | 5 | 6 | def read_image(path, dtype=np.float32, color=True): 7 | """Read an image from a file. 8 | 9 | This function reads an image from given file. The image is CHW format and 10 | the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the 11 | order of the channels is RGB. 12 | 13 | Args: 14 | path (str): A path of image file. 
15 | dtype: The type of array. The default value is :obj:`~numpy.float32`. 16 | color (bool): This option determines the number of channels. 17 | If :obj:`True`, the number of channels is three. In this case, 18 | the order of the channels is RGB. This is the default behaviour. 19 | If :obj:`False`, this function returns a grayscale image. 20 | 21 | Returns: 22 | ~numpy.ndarray: An image. 23 | """ 24 | 25 | f = Image.open(path) 26 | try: 27 | if color: 28 | img = f.convert('RGB') 29 | else: 30 | img = f.convert('P') 31 | img = np.asarray(img, dtype=dtype) 32 | finally: 33 | if hasattr(f, 'close'): 34 | f.close() 35 | 36 | if img.ndim == 2: 37 | # reshape (H, W) -> (1, H, W) 38 | return img[np.newaxis] 39 | else: 40 | # transpose (H, W, C) -> (C, H, W) 41 | return img.transpose((2, 0, 1)) 42 | 43 | 44 | def resize_bbox(bbox, in_size, out_size): 45 | """Resize bounding boxes according to image resize. 46 | 47 | The bounding boxes are expected to be packed into a two dimensional 48 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 49 | bounding boxes in the image. The second axis represents attributes of 50 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 51 | where the four attributes are coordinates of the top left and the 52 | bottom right vertices. 53 | 54 | Args: 55 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 56 | :math:`R` is the number of bounding boxes. 57 | in_size (tuple): A tuple of length 2. The height and the width 58 | of the image before resized. 59 | out_size (tuple): A tuple of length 2. The height and the width 60 | of the image after resized. 61 | 62 | Returns: 63 | ~numpy.ndarray: 64 | Bounding boxes rescaled according to the given image shapes. 65 | 66 | """ 67 | bbox = bbox.copy() 68 | y_scale = float(out_size[0]) / in_size[0] 69 | x_scale = float(out_size[1]) / in_size[1] 70 | bbox[:, 0] = y_scale * bbox[:, 0] 71 | bbox[:, 2] = y_scale * bbox[:, 2] 72 | bbox[:, 1] = x_scale * bbox[:, 1] 73 | bbox[:, 3] = x_scale * bbox[:, 3] 74 | return bbox 75 | 76 | 77 | def flip_bbox(bbox, size, y_flip=False, x_flip=False): 78 | """Flip bounding boxes accordingly. 79 | 80 | The bounding boxes are expected to be packed into a two dimensional 81 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 82 | bounding boxes in the image. The second axis represents attributes of 83 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 84 | where the four attributes are coordinates of the top left and the 85 | bottom right vertices. 86 | 87 | Args: 88 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 89 | :math:`R` is the number of bounding boxes. 90 | size (tuple): A tuple of length 2. The height and the width 91 | of the image before resized. 92 | y_flip (bool): Flip bounding box according to a vertical flip of 93 | an image. 94 | x_flip (bool): Flip bounding box according to a horizontal flip of 95 | an image. 96 | 97 | Returns: 98 | ~numpy.ndarray: 99 | Bounding boxes flipped according to the given flips. 
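    A minimal sketch (hand-made coordinates, assuming this module is importable
    as data.util) of the horizontal case: with image width :math:`W`, a flipped
    box spans :math:`(W - x_{max}, W - x_{min})` along the x axis.

        import numpy as np
        from data.util import flip_bbox

        bbox = np.array([[10., 20., 30., 60.]], dtype=np.float32)  # (ymin, xmin, ymax, xmax)
        flipped = flip_bbox(bbox, size=(100, 200), x_flip=True)
        # flipped == [[10., 140., 30., 180.]]: x_min -> 200 - 60, x_max -> 200 - 20,
        # while the y coordinates are left untouched.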
100 | 101 | """ 102 | H, W = size 103 | bbox = bbox.copy() 104 | if y_flip: 105 | y_max = H - bbox[:, 0] 106 | y_min = H - bbox[:, 2] 107 | bbox[:, 0] = y_min 108 | bbox[:, 2] = y_max 109 | if x_flip: 110 | x_max = W - bbox[:, 1] 111 | x_min = W - bbox[:, 3] 112 | bbox[:, 1] = x_min 113 | bbox[:, 3] = x_max 114 | return bbox 115 | 116 | 117 | def crop_bbox( 118 | bbox, y_slice=None, x_slice=None, 119 | allow_outside_center=True, return_param=False): 120 | """Translate bounding boxes to fit within the cropped area of an image. 121 | 122 | This method is mainly used together with image cropping. 123 | This method translates the coordinates of bounding boxes like 124 | :func:`data.util.translate_bbox`. In addition, 125 | this function truncates the bounding boxes to fit within the cropped area. 126 | If a bounding box does not overlap with the cropped area, 127 | this bounding box will be removed. 128 | 129 | The bounding boxes are expected to be packed into a two dimensional 130 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 131 | bounding boxes in the image. The second axis represents attributes of 132 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 133 | where the four attributes are coordinates of the top left and the 134 | bottom right vertices. 135 | 136 | Args: 137 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is 138 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 139 | y_slice (slice): The slice of y axis. 140 | x_slice (slice): The slice of x axis. 141 | allow_outside_center (bool): If this argument is :obj:`False`, 142 | bounding boxes whose centers are outside of the cropped area 143 | are removed. The default value is :obj:`True`. 144 | return_param (bool): If :obj:`True`, this function returns 145 | indices of kept bounding boxes. 146 | 147 | Returns: 148 | ~numpy.ndarray or (~numpy.ndarray, dict): 149 | 150 | If :obj:`return_param = False`, returns an array :obj:`bbox`. 151 | 152 | If :obj:`return_param = True`, 153 | returns a tuple whose elements are :obj:`bbox, param`. 154 | :obj:`param` is a dictionary of intermediate parameters whose 155 | contents are listed below with key, value-type and the description 156 | of the value. 157 | 158 | * **index** (*numpy.ndarray*): An array holding indices of used \ 159 | bounding boxes. 
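    A minimal usage sketch (hand-made boxes and slices, assuming this module is
    importable as data.util) of the behaviour described above:

        import numpy as np
        from data.util import crop_bbox

        bbox = np.array([[ 0.,  0., 20., 20.],    # partly inside the crop window
                         [80., 80., 90., 90.]],   # entirely outside the crop window
                        dtype=np.float32)
        cropped, param = crop_bbox(bbox, y_slice=slice(10, 60), x_slice=slice(10, 60),
                                   return_param=True)
        # cropped == [[0., 0., 10., 10.]]: the first box is clipped to the window and
        # shifted so that (10, 10) becomes the new origin; the second box is removed.
        # param['index'] == [0] lists the indices of the boxes that were kept.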
160 | 161 | """ 162 | 163 | t, b = _slice_to_bounds(y_slice) 164 | l, r = _slice_to_bounds(x_slice) 165 | crop_bb = np.array((t, l, b, r)) 166 | 167 | if allow_outside_center: 168 | mask = np.ones(bbox.shape[0], dtype=bool) 169 | else: 170 | center = (bbox[:, :2] + bbox[:, 2:]) / 2 171 | mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \ 172 | .all(axis=1) 173 | 174 | bbox = bbox.copy() 175 | bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2]) 176 | bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:]) 177 | bbox[:, :2] -= crop_bb[:2] 178 | bbox[:, 2:] -= crop_bb[:2] 179 | 180 | mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1)) 181 | bbox = bbox[mask] 182 | 183 | if return_param: 184 | return bbox, {'index': np.flatnonzero(mask)} 185 | else: 186 | return bbox 187 | 188 | 189 | def _slice_to_bounds(slice_): 190 | if slice_ is None: 191 | return 0, np.inf 192 | 193 | if slice_.start is None: 194 | l = 0 195 | else: 196 | l = slice_.start 197 | 198 | if slice_.stop is None: 199 | u = np.inf 200 | else: 201 | u = slice_.stop 202 | 203 | return l, u 204 | 205 | 206 | def translate_bbox(bbox, y_offset=0, x_offset=0): 207 | """Translate bounding boxes. 208 | 209 | This method is mainly used together with image transforms, such as padding 210 | and cropping, which translates the left top point of the image from 211 | coordinate :math:`(0, 0)` to coordinate 212 | :math:`(y, x) = (y_{offset}, x_{offset})`. 213 | 214 | The bounding boxes are expected to be packed into a two dimensional 215 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 216 | bounding boxes in the image. The second axis represents attributes of 217 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 218 | where the four attributes are coordinates of the top left and the 219 | bottom right vertices. 220 | 221 | Args: 222 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is 223 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 224 | y_offset (int or float): The offset along y axis. 225 | x_offset (int or float): The offset along x axis. 226 | 227 | Returns: 228 | ~numpy.ndarray: 229 | Bounding boxes translated according to the given offsets. 230 | 231 | """ 232 | 233 | out_bbox = bbox.copy() 234 | out_bbox[:, :2] += (y_offset, x_offset) 235 | out_bbox[:, 2:] += (y_offset, x_offset) 236 | 237 | return out_bbox 238 | 239 | 240 | def random_flip(img, y_random=False, x_random=False, 241 | return_param=False, copy=False): 242 | """Randomly flip an image in vertical or horizontal direction. 243 | 244 | Args: 245 | img (~numpy.ndarray): An array that gets flipped. This is in 246 | CHW format. 247 | y_random (bool): Randomly flip in vertical direction. 248 | x_random (bool): Randomly flip in horizontal direction. 249 | return_param (bool): Returns information of flip. 250 | copy (bool): If False, a view of :obj:`img` will be returned. 251 | 252 | Returns: 253 | ~numpy.ndarray or (~numpy.ndarray, dict): 254 | 255 | If :obj:`return_param = False`, 256 | returns an array :obj:`out_img` that is the result of flipping. 257 | 258 | If :obj:`return_param = True`, 259 | returns a tuple whose elements are :obj:`out_img, param`. 260 | :obj:`param` is a dictionary of intermediate parameters whose 261 | contents are listed below with key, value-type and the description 262 | of the value. 263 | 264 | * **y_flip** (*bool*): Whether the image was flipped in the\ 265 | vertical direction or not. 
266 | * **x_flip** (*bool*): Whether the image was flipped in the\ 267 | horizontal direction or not. 268 | 269 | """ 270 | y_flip, x_flip = False, False 271 | if y_random: 272 | y_flip = random.choice([True, False]) 273 | if x_random: 274 | x_flip = random.choice([True, False]) 275 | 276 | if y_flip: 277 | img = img[:, ::-1, :] 278 | if x_flip: 279 | img = img[:, :, ::-1] 280 | 281 | if copy: 282 | img = img.copy() 283 | 284 | if return_param: 285 | return img, {'y_flip': y_flip, 'x_flip': x_flip} 286 | else: 287 | return img 288 | -------------------------------------------------------------------------------- /utils/eval_tool.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | from collections import defaultdict 4 | import itertools 5 | import numpy as np 6 | import six 7 | 8 | from model.utils.bbox_tools import bbox_iou 9 | 10 | 11 | def eval_detection_voc( 12 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 13 | gt_difficults=None, 14 | iou_thresh=0.5, use_07_metric=False): 15 | """Calculate average precisions based on evaluation code of PASCAL VOC. 16 | 17 | This function evaluates predicted bounding boxes obtained from a dataset 18 | which has :math:`N` images by using average precision for each class. 19 | The code is based on the evaluation code used in PASCAL VOC Challenge. 20 | 21 | Args: 22 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 23 | sets of bounding boxes. 24 | Its index corresponds to an index for the base dataset. 25 | Each element of :obj:`pred_bboxes` is a set of coordinates 26 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 27 | where :math:`R` corresponds 28 | to the number of bounding boxes, which may vary among boxes. 29 | The second axis corresponds to 30 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 31 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 32 | Similar to :obj:`pred_bboxes`, its index corresponds to an 33 | index for the base dataset. Its length is :math:`N`. 34 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 35 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 36 | its index corresponds to an index for the base dataset. 37 | Its length is :math:`N`. 38 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 39 | bounding boxes 40 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 41 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 42 | bounding boxes in each image does not need to be same as the number 43 | of corresponding predicted boxes. 44 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 45 | labels which are organized similarly to :obj:`gt_bboxes`. 46 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 47 | arrays which is organized similarly to :obj:`gt_bboxes`. 48 | This tells whether the 49 | corresponding ground truth bounding box is difficult or not. 50 | By default, this is :obj:`None`. In that case, this function 51 | considers all bounding boxes to be not difficult. 52 | iou_thresh (float): A prediction is correct if its Intersection over 53 | Union with the ground truth is above this value. 54 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 55 | for calculating average precision. The default value is 56 | :obj:`False`. 
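    A minimal call sketch (a single synthetic image, one class, assuming this
    module is importable as utils.eval_tool) showing the per-image list
    structure the arguments above are expected to have:

        import numpy as np
        from utils.eval_tool import eval_detection_voc

        pred_bboxes = [np.array([[ 8.,  8., 48., 48.]], dtype=np.float32)]  # one image
        pred_labels = [np.array([0], dtype=np.int32)]
        pred_scores = [np.array([0.9], dtype=np.float32)]
        gt_bboxes   = [np.array([[10., 10., 50., 50.]], dtype=np.float32)]
        gt_labels   = [np.array([0], dtype=np.int32)]

        result = eval_detection_voc(pred_bboxes, pred_labels, pred_scores,
                                    gt_bboxes, gt_labels)
        # result is a dict with keys 'ap' (per-class array) and 'map' (their mean);
        # here the single prediction overlaps its ground truth well above the
        # default iou_thresh of 0.5, so result['map'] is 1.0.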
57 | 58 | Returns: 59 | dict: 60 | 61 | The keys, value-types and the description of the values are listed 62 | below. 63 | 64 | * **ap** (*numpy.ndarray*): An array of average precisions. \ 65 | The :math:`l`-th value corresponds to the average precision \ 66 | for class :math:`l`. If class :math:`l` does not exist in \ 67 | either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \ 68 | value is set to :obj:`numpy.nan`. 69 | * **map** (*float*): The average of Average Precisions over classes. 70 | 71 | """ 72 | 73 | prec, rec = calc_detection_voc_prec_rec( 74 | pred_bboxes, pred_labels, pred_scores, 75 | gt_bboxes, gt_labels, gt_difficults, 76 | iou_thresh=iou_thresh) 77 | 78 | ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) 79 | 80 | return {'ap': ap, 'map': np.nanmean(ap)} 81 | 82 | 83 | def calc_detection_voc_prec_rec( 84 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 85 | gt_difficults=None, 86 | iou_thresh=0.5): 87 | """Calculate precision and recall based on evaluation code of PASCAL VOC. 88 | 89 | This function calculates precision and recall of 90 | predicted bounding boxes obtained from a dataset which has :math:`N` 91 | images. 92 | The code is based on the evaluation code used in PASCAL VOC Challenge. 93 | 94 | Args: 95 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 96 | sets of bounding boxes. 97 | Its index corresponds to an index for the base dataset. 98 | Each element of :obj:`pred_bboxes` is a set of coordinates 99 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 100 | where :math:`R` corresponds 101 | to the number of bounding boxes, which may vary among boxes. 102 | The second axis corresponds to 103 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 104 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 105 | Similar to :obj:`pred_bboxes`, its index corresponds to an 106 | index for the base dataset. Its length is :math:`N`. 107 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 108 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 109 | its index corresponds to an index for the base dataset. 110 | Its length is :math:`N`. 111 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 112 | bounding boxes 113 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 114 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 115 | bounding boxes in each image does not need to be same as the number 116 | of corresponding predicted boxes. 117 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 118 | labels which are organized similarly to :obj:`gt_bboxes`. 119 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 120 | arrays which is organized similarly to :obj:`gt_bboxes`. 121 | This tells whether the 122 | corresponding ground truth bounding box is difficult or not. 123 | By default, this is :obj:`None`. In that case, this function 124 | considers all bounding boxes to be not difficult. 125 | iou_thresh (float): A prediction is correct if its Intersection over 126 | Union with the ground truth is above this value.. 127 | 128 | Returns: 129 | tuple of two lists: 130 | This function returns two lists: :obj:`prec` and :obj:`rec`. 131 | 132 | * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \ 133 | for class :math:`l`. 
If class :math:`l` does not exist in \ 134 | either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \ 135 | set to :obj:`None`. 136 | * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \ 137 | for class :math:`l`. If class :math:`l` that is not marked as \ 138 | difficult does not exist in \ 139 | :obj:`gt_labels`, :obj:`rec[l]` is \ 140 | set to :obj:`None`. 141 | 142 | """ 143 | 144 | pred_bboxes = iter(pred_bboxes) 145 | pred_labels = iter(pred_labels) 146 | pred_scores = iter(pred_scores) 147 | gt_bboxes = iter(gt_bboxes) 148 | gt_labels = iter(gt_labels) 149 | if gt_difficults is None: 150 | gt_difficults = itertools.repeat(None) 151 | else: 152 | gt_difficults = iter(gt_difficults) 153 | 154 | n_pos = defaultdict(int) 155 | score = defaultdict(list) 156 | match = defaultdict(list) 157 | 158 | for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \ 159 | six.moves.zip( 160 | pred_bboxes, pred_labels, pred_scores, 161 | gt_bboxes, gt_labels, gt_difficults): 162 | 163 | if gt_difficult is None: 164 | gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool) 165 | 166 | for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): 167 | pred_mask_l = pred_label == l 168 | pred_bbox_l = pred_bbox[pred_mask_l] 169 | pred_score_l = pred_score[pred_mask_l] 170 | # sort by score 171 | order = pred_score_l.argsort()[::-1] 172 | pred_bbox_l = pred_bbox_l[order] 173 | pred_score_l = pred_score_l[order] 174 | 175 | gt_mask_l = gt_label == l 176 | gt_bbox_l = gt_bbox[gt_mask_l] 177 | gt_difficult_l = gt_difficult[gt_mask_l] 178 | 179 | n_pos[l] += np.logical_not(gt_difficult_l).sum() 180 | score[l].extend(pred_score_l) 181 | 182 | if len(pred_bbox_l) == 0: 183 | continue 184 | if len(gt_bbox_l) == 0: 185 | match[l].extend((0,) * pred_bbox_l.shape[0]) 186 | continue 187 | 188 | # VOC evaluation follows integer typed bounding boxes. 189 | pred_bbox_l = pred_bbox_l.copy() 190 | pred_bbox_l[:, 2:] += 1 191 | gt_bbox_l = gt_bbox_l.copy() 192 | gt_bbox_l[:, 2:] += 1 193 | 194 | iou = bbox_iou(pred_bbox_l, gt_bbox_l) 195 | gt_index = iou.argmax(axis=1) 196 | # set -1 if there is no matching ground truth 197 | gt_index[iou.max(axis=1) < iou_thresh] = -1 198 | del iou 199 | 200 | selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) 201 | for gt_idx in gt_index: 202 | if gt_idx >= 0: 203 | if gt_difficult_l[gt_idx]: 204 | match[l].append(-1) 205 | else: 206 | if not selec[gt_idx]: 207 | match[l].append(1) 208 | else: 209 | match[l].append(0) 210 | selec[gt_idx] = True 211 | else: 212 | match[l].append(0) 213 | 214 | for iter_ in ( 215 | pred_bboxes, pred_labels, pred_scores, 216 | gt_bboxes, gt_labels, gt_difficults): 217 | if next(iter_, None) is not None: 218 | raise ValueError('Length of input iterables need to be same.') 219 | 220 | n_fg_class = max(n_pos.keys()) + 1 221 | prec = [None] * n_fg_class 222 | rec = [None] * n_fg_class 223 | 224 | for l in n_pos.keys(): 225 | score_l = np.array(score[l]) 226 | match_l = np.array(match[l], dtype=np.int8) 227 | 228 | order = score_l.argsort()[::-1] 229 | match_l = match_l[order] 230 | 231 | tp = np.cumsum(match_l == 1) 232 | fp = np.cumsum(match_l == 0) 233 | 234 | # If an element of fp + tp is 0, 235 | # the corresponding element of prec[l] is nan. 236 | prec[l] = tp / (fp + tp) 237 | # If n_pos[l] is 0, rec[l] is None. 
238 | if n_pos[l] > 0: 239 | rec[l] = tp / n_pos[l] 240 | 241 | return prec, rec 242 | 243 | 244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False): 245 | """Calculate average precisions based on evaluation code of PASCAL VOC. 246 | 247 | This function calculates average precisions 248 | from given precisions and recalls. 249 | The code is based on the evaluation code used in PASCAL VOC Challenge. 250 | 251 | Args: 252 | prec (list of numpy.array): A list of arrays. 253 | :obj:`prec[l]` indicates precision for class :math:`l`. 254 | If :obj:`prec[l]` is :obj:`None`, this function returns 255 | :obj:`numpy.nan` for class :math:`l`. 256 | rec (list of numpy.array): A list of arrays. 257 | :obj:`rec[l]` indicates recall for class :math:`l`. 258 | If :obj:`rec[l]` is :obj:`None`, this function returns 259 | :obj:`numpy.nan` for class :math:`l`. 260 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 261 | for calculating average precision. The default value is 262 | :obj:`False`. 263 | 264 | Returns: 265 | ~numpy.ndarray: 266 | This function returns an array of average precisions. 267 | The :math:`l`-th value corresponds to the average precision 268 | for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is 269 | :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. 270 | 271 | """ 272 | 273 | n_fg_class = len(prec) 274 | ap = np.empty(n_fg_class) 275 | for l in six.moves.range(n_fg_class): 276 | if prec[l] is None or rec[l] is None: 277 | ap[l] = np.nan 278 | continue 279 | 280 | if use_07_metric: 281 | # 11 point metric 282 | ap[l] = 0 283 | for t in np.arange(0., 1.1, 0.1): 284 | if np.sum(rec[l] >= t) == 0: 285 | p = 0 286 | else: 287 | p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) 288 | ap[l] += p / 11 289 | else: 290 | # correct AP calculation 291 | # first append sentinel values at the end 292 | mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) 293 | mrec = np.concatenate(([0], rec[l], [1])) 294 | 295 | mpre = np.maximum.accumulate(mpre[::-1])[::-1] 296 | 297 | # to calculate area under PR curve, look for points 298 | # where X axis (recall) changes value 299 | i = np.where(mrec[1:] != mrec[:-1])[0] 300 | 301 | # and sum (\Delta recall) * prec 302 | ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 303 | 304 | return ap 305 | -------------------------------------------------------------------------------- /model/roi_module.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from string import Template 3 | 4 | import cupy, torch 5 | import cupy as cp 6 | import torch as t 7 | from torch import nn 8 | from torch.autograd import Function 9 | 10 | from utils import array_tool as at 11 | 12 | 13 | def normal_init(m, mean, stddev, truncated=False): 14 | """ 15 | weight initalizer: truncated normal and random normal. 16 | """ 17 | # x is a parameter 18 | if truncated: 19 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 20 | else: 21 | m.weight.data.normal_(mean, stddev) 22 | m.bias.data.zero_() 23 | 24 | class VGG16RoIHead(nn.Module): 25 | """Faster R-CNN Head for VGG-16 based implementation. 26 | This class is used as a head for Faster R-CNN. 27 | This outputs class-wise localizations and classification based on feature 28 | maps in the given RoIs. 29 | 30 | Args: 31 | n_class (int): The number of classes possibly including the background. 32 | roi_size (int): Height and width of the feature maps after RoI-pooling. 
33 | spatial_scale (float): Scale of the roi is resized. 34 | classifier (nn.Module): Two layer Linear ported from vgg16 35 | 36 | """ 37 | 38 | def __init__(self, n_class, roi_size, spatial_scale, 39 | classifier): 40 | # n_class includes the background 41 | super(VGG16RoIHead, self).__init__() 42 | 43 | self.classifier = classifier.cuda() 44 | self.cls_loc = nn.Linear(4096, n_class * 4).cuda() 45 | self.score = nn.Linear(4096, n_class).cuda() 46 | 47 | normal_init(self.cls_loc, 0, 0.001) 48 | normal_init(self.score, 0, 0.01) 49 | 50 | self.n_class = n_class 51 | self.roi_size = roi_size 52 | self.spatial_scale = spatial_scale 53 | self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale) 54 | 55 | def forward(self, x, rois, roi_indices): 56 | """Forward the chain. 57 | 58 | We assume that there are :math:`N` batches. 59 | 60 | Args: 61 | x (Variable): 4D image variable. 62 | rois (Tensor): A bounding box array containing coordinates of 63 | proposal boxes. This is a concatenation of bounding box 64 | arrays from multiple images in the batch. 65 | Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed 66 | RoIs from the :math:`i` th image, 67 | :math:`R' = \\sum _{i=1} ^ N R_i`. 68 | roi_indices (Tensor): An array containing indices of images to 69 | which bounding boxes correspond to. Its shape is :math:`(R',)`. 70 | 71 | """ 72 | # in case roi_indices is ndarray 73 | roi_indices = at.totensor(roi_indices).float() 74 | rois = at.totensor(rois).float() 75 | indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1) 76 | # NOTE: important: yx->xy 77 | xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]] 78 | indices_and_rois = t.autograd.Variable(xy_indices_and_rois.contiguous()) 79 | 80 | pool = self.roi(x, indices_and_rois) 81 | pool = pool.view(pool.size(0), -1) 82 | fc7 = self.classifier(pool) 83 | roi_cls_locs = self.cls_loc(fc7) 84 | roi_scores = self.score(fc7) 85 | return roi_cls_locs, roi_scores 86 | 87 | Stream = namedtuple('Stream', ['ptr']) 88 | 89 | 90 | @cupy.util.memoize(for_each_device=True) 91 | def load_kernel(kernel_name, code, **kwargs): 92 | cp.cuda.runtime.free(0) 93 | code = Template(code).substitute(**kwargs) 94 | kernel_code = cupy.cuda.compile_with_cache(code) 95 | return kernel_code.get_function(kernel_name) 96 | 97 | 98 | CUDA_NUM_THREADS = 1024 99 | 100 | 101 | def GET_BLOCKS(N, K=CUDA_NUM_THREADS): 102 | return (N + K - 1) // K 103 | 104 | 105 | class RoI(Function): 106 | """ 107 | NOTE:only CUDA-compatible 108 | """ 109 | 110 | def __init__(self, outh, outw, spatial_scale): 111 | self.forward_fn = load_kernel('roi_forward', kernel_forward) 112 | self.backward_fn = load_kernel('roi_backward', kernel_backward) 113 | self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale 114 | 115 | def forward(self, x, rois): 116 | # NOTE: MAKE SURE input is contiguous too 117 | x = x.contiguous() 118 | rois = rois.contiguous() 119 | self.in_size = B, C, H, W = x.size() 120 | self.N = N = rois.size(0) 121 | output = t.zeros(N, C, self.outh, self.outw).cuda() 122 | self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda() 123 | self.rois = rois 124 | args = [x.data_ptr(), rois.data_ptr(), 125 | output.data_ptr(), 126 | self.argmax_data.data_ptr(), 127 | self.spatial_scale, C, H, W, 128 | self.outh, self.outw, 129 | output.numel()] 130 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 131 | self.forward_fn(args=args, 132 | block=(CUDA_NUM_THREADS, 1, 1), 133 | grid=(GET_BLOCKS(output.numel()), 1, 1), 134 | 
stream=stream) 135 | return output 136 | 137 | def backward(self, grad_output): 138 | ## NOTE: IMPORTANT: grad_output must be contiguous 139 | # TODO: input 140 | grad_output = grad_output.contiguous() 141 | B, C, H, W = self.in_size 142 | grad_input = t.zeros(self.in_size).cuda() 143 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 144 | args = [grad_output.data_ptr(), 145 | self.argmax_data.data_ptr(), 146 | self.rois.data_ptr(), 147 | grad_input.data_ptr(), 148 | self.N, self.spatial_scale, C, H, W, self.outh, self.outw, 149 | grad_input.numel()] 150 | self.backward_fn(args=args, 151 | block=(CUDA_NUM_THREADS, 1, 1), 152 | grid=(GET_BLOCKS(grad_input.numel()), 1, 1), 153 | stream=stream 154 | ) 155 | return grad_input, None 156 | 157 | 158 | class RoIPooling2D(nn.Module): 159 | 160 | def __init__(self, outh, outw, spatial_scale): 161 | super(RoIPooling2D, self).__init__() 162 | self.RoI = RoI(outh, outw, spatial_scale) 163 | 164 | def forward(self, x, rois): 165 | return self.RoI(x, rois) 166 | 167 | 168 | def test_roi_module(): 169 | ## fake data ## 170 | B, N, C, H, W, PH, PW = 2, 8, 4, 32, 32, 7, 7 171 | 172 | bottom_data = t.randn(B, C, H, W).cuda() 173 | bottom_rois = t.randn(N, 5) 174 | bottom_rois[:int(N / 2), 0] = 0 175 | bottom_rois[int(N / 2):, 0] = 1 176 | bottom_rois[:, 1:] = (t.rand(N, 4) * 100).float() 177 | bottom_rois = bottom_rois.cuda() 178 | spatial_scale = 1. / 16 179 | outh, outw = PH, PW 180 | 181 | # pytorch version 182 | module = RoIPooling2D(outh, outw, spatial_scale) 183 | x = t.autograd.Variable(bottom_data, requires_grad=True) 184 | rois = t.autograd.Variable(bottom_rois) 185 | output = module(x, rois) 186 | output.sum().backward() 187 | 188 | def t2c(variable): 189 | npa = variable.data.cpu().numpy() 190 | return cp.array(npa) 191 | 192 | def test_eq(variable, array, info): 193 | cc = cp.asnumpy(array) 194 | neq = (cc != variable.data.cpu().numpy()) 195 | assert neq.sum() == 0, 'test failed: %s' % info 196 | 197 | # chainer version; if you're going to run this, 198 | # pip install chainer 199 | import chainer.functions as F 200 | from chainer import Variable 201 | x_cn = Variable(t2c(x)) 202 | 203 | o_cn = F.roi_pooling_2d(x_cn, t2c(rois), outh, outw, spatial_scale) 204 | test_eq(output, o_cn.array, 'forward') 205 | F.sum(o_cn).backward() 206 | test_eq(x.grad, x_cn.grad, 'backward') 207 | print('test pass') 208 | 209 | 210 | def normal_init(m, mean, stddev, truncated=False): 211 | """ 212 | weight initializer: truncated normal and random normal.
213 | """ 214 | # x is a parameter 215 | if truncated: 216 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 217 | else: 218 | m.weight.data.normal_(mean, stddev) 219 | m.bias.data.zero_() 220 | 221 | 222 | kernel_forward = ''' 223 | extern "C" 224 | __global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois, 225 | float* top_data, int* argmax_data, 226 | const double spatial_scale,const int channels,const int height, 227 | const int width, const int pooled_height, 228 | const int pooled_width,const int NN 229 | ){ 230 | 231 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 232 | if(idx>=NN) 233 | return; 234 | const int pw = idx % pooled_width; 235 | const int ph = (idx / pooled_width) % pooled_height; 236 | const int c = (idx / pooled_width / pooled_height) % channels; 237 | int num = idx / pooled_width / pooled_height / channels; 238 | const int roi_batch_ind = bottom_rois[num * 5 + 0]; 239 | const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale); 240 | const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale); 241 | const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale); 242 | const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale); 243 | // Force malformed ROIs to be 1x1 244 | const int roi_width = max(roi_end_w - roi_start_w + 1, 1); 245 | const int roi_height = max(roi_end_h - roi_start_h + 1, 1); 246 | const float bin_size_h = static_cast<float>(roi_height) 247 | / static_cast<float>(pooled_height); 248 | const float bin_size_w = static_cast<float>(roi_width) 249 | / static_cast<float>(pooled_width); 250 | 251 | int hstart = static_cast<int>(floor(static_cast<float>(ph) 252 | * bin_size_h)); 253 | int wstart = static_cast<int>(floor(static_cast<float>(pw) 254 | * bin_size_w)); 255 | int hend = static_cast<int>(ceil(static_cast<float>(ph + 1) 256 | * bin_size_h)); 257 | int wend = static_cast<int>(ceil(static_cast<float>(pw + 1) 258 | * bin_size_w)); 259 | 260 | // Add roi offsets and clip to input boundaries 261 | hstart = min(max(hstart + roi_start_h, 0), height); 262 | hend = min(max(hend + roi_start_h, 0), height); 263 | wstart = min(max(wstart + roi_start_w, 0), width); 264 | wend = min(max(wend + roi_start_w, 0), width); 265 | bool is_empty = (hend <= hstart) || (wend <= wstart); 266 | 267 | // Define an empty pooling region to be zero 268 | float maxval = is_empty ?
0 : -1E+37; 269 | // If nothing is pooled, argmax=-1 causes nothing to be backprop'd 270 | int maxidx = -1; 271 | const int data_offset = (roi_batch_ind * channels + c) * height * width; 272 | for (int h = hstart; h < hend; ++h) { 273 | for (int w = wstart; w < wend; ++w) { 274 | int bottom_index = h * width + w; 275 | if (bottom_data[data_offset + bottom_index] > maxval) { 276 | maxval = bottom_data[data_offset + bottom_index]; 277 | maxidx = bottom_index; 278 | } 279 | } 280 | } 281 | top_data[idx]=maxval; 282 | argmax_data[idx]=maxidx; 283 | } 284 | ''' 285 | kernel_backward = ''' 286 | extern "C" 287 | __global__ void roi_backward(const float* const top_diff, 288 | const int* const argmax_data,const float* const bottom_rois, 289 | float* bottom_diff, const int num_rois, 290 | const double spatial_scale, int channels, 291 | int height, int width, int pooled_height, 292 | int pooled_width,const int NN) 293 | { 294 | 295 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 296 | //// Important: >= instead of > 297 | if(idx>=NN) 298 | return; 299 | int w = idx % width; 300 | int h = (idx / width) % height; 301 | int c = (idx/ (width * height)) % channels; 302 | int num = idx / (width * height * channels); 303 | 304 | float gradient = 0; 305 | // Accumulate gradient over all ROIs that pooled this element 306 | for (int roi_n = 0; roi_n < num_rois; ++roi_n) { 307 | // Skip if ROI's batch index doesn't match num 308 | if (num != static_cast<int>(bottom_rois[roi_n * 5])) { 309 | continue; 310 | } 311 | 312 | int roi_start_w = round(bottom_rois[roi_n * 5 + 1] 313 | * spatial_scale); 314 | int roi_start_h = round(bottom_rois[roi_n * 5 + 2] 315 | * spatial_scale); 316 | int roi_end_w = round(bottom_rois[roi_n * 5 + 3] 317 | * spatial_scale); 318 | int roi_end_h = round(bottom_rois[roi_n * 5 + 4] 319 | * spatial_scale); 320 | 321 | // Skip if ROI doesn't include (h, w) 322 | const bool in_roi = (w >= roi_start_w && w <= roi_end_w && 323 | h >= roi_start_h && h <= roi_end_h); 324 | if (!in_roi) { 325 | continue; 326 | } 327 | 328 | int offset = (roi_n * channels + c) * pooled_height 329 | * pooled_width; 330 | 331 | // Compute feasible set of pooled units that could have pooled 332 | // this bottom unit 333 | 334 | // Force malformed ROIs to be 1x1 335 | int roi_width = max(roi_end_w - roi_start_w + 1, 1); 336 | int roi_height = max(roi_end_h - roi_start_h + 1, 1); 337 | 338 | float bin_size_h = static_cast<float>(roi_height) 339 | / static_cast<float>(pooled_height); 340 | float bin_size_w = static_cast<float>(roi_width) 341 | / static_cast<float>(pooled_width); 342 | 343 | int phstart = floor(static_cast<float>(h - roi_start_h) 344 | / bin_size_h); 345 | int phend = ceil(static_cast<float>(h - roi_start_h + 1) 346 | / bin_size_h); 347 | int pwstart = floor(static_cast<float>(w - roi_start_w) 348 | / bin_size_w); 349 | int pwend = ceil(static_cast<float>(w - roi_start_w + 1) 350 | / bin_size_w); 351 | 352 | phstart = min(max(phstart, 0), pooled_height); 353 | phend = min(max(phend, 0), pooled_height); 354 | pwstart = min(max(pwstart, 0), pooled_width); 355 | pwend = min(max(pwend, 0), pooled_width); 356 | for (int ph = phstart; ph < phend; ++ph) { 357 | for (int pw = pwstart; pw < pwend; ++pw) { 358 | int index_ = ph * pooled_width + pw + offset; 359 | if (argmax_data[index_] == (h * width + w)) { 360 | gradient += top_diff[index_]; 361 | } 362 | } 363 | } 364 | } 365 | bottom_diff[idx] = gradient; 366 | } 367 | ''' 368 | -------------------------------------------------------------------------------- /Train.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from utils.config import opt\n", 15 | "from utils.data_load import save_pkl, load_pkl\n", 16 | "import numpy as np\n", 17 | "from torch import nn\n", 18 | "from torch.utils import data as data_\n", 19 | "from tqdm import tqdm\n", 20 | "import torch as t\n", 21 | "from utils import array_tool as at\n", 22 | "from torch.autograd import Variable" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Load Data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "from data.dataset import Dataset, TestDataset, inverse_normalize" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "dataset = Dataset(opt)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "dataloader = data_.DataLoader(dataset, \\\n", 61 | " batch_size=1, \\\n", 62 | " shuffle=True, \\\n", 63 | " # pin_memory=True,\n", 64 | " num_workers=opt.num_workers)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Load Net and Trainer" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "from model.faster_rcnn import FasterRCNN\n", 83 | "from trainer.trainer import FasterRCNNTrainer" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "faster_rcnn = FasterRCNN()\n", 95 | "trainer = FasterRCNNTrainer(faster_rcnn).cuda()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# Training" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 7, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | "5011it [20:59, 4.58it/s]\n" 115 | ] 116 | }, 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "--------------------------\n", 122 | "curr epoch: 0\n", 123 | "roi_cls loss: Variable containing:\n", 124 | " 0.3674\n", 125 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 126 | "\n", 127 | "roi_loc loss: Variable containing:\n", 128 | " 0.3669\n", 129 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 130 | "\n", 131 | "rpn_cls loss: Variable containing:\n", 132 | " 0.1877\n", 133 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 134 | "\n", 135 | "rpn_loc loss: Variable containing:\n", 136 | "1.00000e-02 *\n", 137 | " 7.2901\n", 138 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 139 | "\n", 140 | "--------------------------\n" 141 | ] 142 | }, 143 | { 144 | "name": "stderr", 145 | "output_type": "stream", 146 | "text": [ 147 | "5011it [21:20, 3.73it/s]\n" 148 | ] 149 | }, 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "--------------------------\n", 155 | "curr 
epoch: 1\n", 156 | "roi_cls loss: Variable containing:\n", 157 | " 0.2599\n", 158 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 159 | "\n", 160 | "roi_loc loss: Variable containing:\n", 161 | " 0.3091\n", 162 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 163 | "\n", 164 | "rpn_cls loss: Variable containing:\n", 165 | " 0.1426\n", 166 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 167 | "\n", 168 | "rpn_loc loss: Variable containing:\n", 169 | "1.00000e-02 *\n", 170 | " 6.3122\n", 171 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 172 | "\n", 173 | "--------------------------\n" 174 | ] 175 | }, 176 | { 177 | "name": "stderr", 178 | "output_type": "stream", 179 | "text": [ 180 | "5011it [21:15, 4.04it/s]\n" 181 | ] 182 | }, 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "--------------------------\n", 188 | "curr epoch: 2\n", 189 | "roi_cls loss: Variable containing:\n", 190 | " 0.2248\n", 191 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 192 | "\n", 193 | "roi_loc loss: Variable containing:\n", 194 | " 0.2781\n", 195 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 196 | "\n", 197 | "rpn_cls loss: Variable containing:\n", 198 | " 0.1257\n", 199 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 200 | "\n", 201 | "rpn_loc loss: Variable containing:\n", 202 | "1.00000e-02 *\n", 203 | " 5.9798\n", 204 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 205 | "\n", 206 | "--------------------------\n" 207 | ] 208 | }, 209 | { 210 | "name": "stderr", 211 | "output_type": "stream", 212 | "text": [ 213 | "5011it [21:37, 3.77it/s]\n" 214 | ] 215 | }, 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "--------------------------\n", 221 | "curr epoch: 3\n", 222 | "roi_cls loss: Variable containing:\n", 223 | " 0.1998\n", 224 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 225 | "\n", 226 | "roi_loc loss: Variable containing:\n", 227 | " 0.2560\n", 228 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 229 | "\n", 230 | "rpn_cls loss: Variable containing:\n", 231 | " 0.1126\n", 232 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 233 | "\n", 234 | "rpn_loc loss: Variable containing:\n", 235 | "1.00000e-02 *\n", 236 | " 5.7650\n", 237 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 238 | "\n", 239 | "--------------------------\n" 240 | ] 241 | }, 242 | { 243 | "name": "stderr", 244 | "output_type": "stream", 245 | "text": [ 246 | "5011it [22:38, 3.31it/s]\n" 247 | ] 248 | }, 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "--------------------------\n", 254 | "curr epoch: 4\n", 255 | "roi_cls loss: Variable containing:\n", 256 | " 0.1831\n", 257 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 258 | "\n", 259 | "roi_loc loss: Variable containing:\n", 260 | " 0.2406\n", 261 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 262 | "\n", 263 | "rpn_cls loss: Variable containing:\n", 264 | " 0.1040\n", 265 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 266 | "\n", 267 | "rpn_loc loss: Variable containing:\n", 268 | "1.00000e-02 *\n", 269 | " 5.5827\n", 270 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 271 | "\n", 272 | "--------------------------\n" 273 | ] 274 | }, 275 | { 276 | "name": "stderr", 277 | "output_type": "stream", 278 | "text": [ 279 | "5011it [22:29, 3.93it/s]\n" 280 | ] 281 | }, 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "--------------------------\n", 287 | "curr epoch: 5\n", 288 | "roi_cls loss: Variable 
containing:\n", 289 | " 0.1717\n", 290 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 291 | "\n", 292 | "roi_loc loss: Variable containing:\n", 293 | " 0.2253\n", 294 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 295 | "\n", 296 | "rpn_cls loss: Variable containing:\n", 297 | "1.00000e-02 *\n", 298 | " 9.6195\n", 299 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 300 | "\n", 301 | "rpn_loc loss: Variable containing:\n", 302 | "1.00000e-02 *\n", 303 | " 5.3951\n", 304 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 305 | "\n", 306 | "--------------------------\n" 307 | ] 308 | }, 309 | { 310 | "name": "stderr", 311 | "output_type": "stream", 312 | "text": [ 313 | "5011it [22:42, 4.02it/s]\n" 314 | ] 315 | }, 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "--------------------------\n", 321 | "curr epoch: 6\n", 322 | "roi_cls loss: Variable containing:\n", 323 | " 0.1593\n", 324 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 325 | "\n", 326 | "roi_loc loss: Variable containing:\n", 327 | " 0.2130\n", 328 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 329 | "\n", 330 | "rpn_cls loss: Variable containing:\n", 331 | "1.00000e-02 *\n", 332 | " 8.8023\n", 333 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 334 | "\n", 335 | "rpn_loc loss: Variable containing:\n", 336 | "1.00000e-02 *\n", 337 | " 5.3082\n", 338 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 339 | "\n", 340 | "--------------------------\n" 341 | ] 342 | }, 343 | { 344 | "name": "stderr", 345 | "output_type": "stream", 346 | "text": [ 347 | "5011it [22:38, 3.69it/s]\n" 348 | ] 349 | }, 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "--------------------------\n", 355 | "curr epoch: 7\n", 356 | "roi_cls loss: Variable containing:\n", 357 | " 0.1457\n", 358 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 359 | "\n", 360 | "roi_loc loss: Variable containing:\n", 361 | " 0.2030\n", 362 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 363 | "\n", 364 | "rpn_cls loss: Variable containing:\n", 365 | "1.00000e-02 *\n", 366 | " 8.0985\n", 367 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 368 | "\n", 369 | "rpn_loc loss: Variable containing:\n", 370 | "1.00000e-02 *\n", 371 | " 5.1582\n", 372 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 373 | "\n", 374 | "--------------------------\n" 375 | ] 376 | }, 377 | { 378 | "name": "stderr", 379 | "output_type": "stream", 380 | "text": [ 381 | "5011it [22:42, 4.22it/s]\n" 382 | ] 383 | }, 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "--------------------------\n", 389 | "curr epoch: 8\n", 390 | "roi_cls loss: Variable containing:\n", 391 | " 0.1402\n", 392 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 393 | "\n", 394 | "roi_loc loss: Variable containing:\n", 395 | " 0.1936\n", 396 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 397 | "\n", 398 | "rpn_cls loss: Variable containing:\n", 399 | "1.00000e-02 *\n", 400 | " 7.5001\n", 401 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 402 | "\n", 403 | "rpn_loc loss: Variable containing:\n", 404 | "1.00000e-02 *\n", 405 | " 5.0869\n", 406 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 407 | "\n", 408 | "--------------------------\n" 409 | ] 410 | }, 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "5011it [22:24, 3.66it/s]\n" 416 | ] 417 | }, 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "--------------------------\n", 423 | 
"curr epoch: 9\n", 424 | "roi_cls loss: Variable containing:\n", 425 | " 0.1342\n", 426 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 427 | "\n", 428 | "roi_loc loss: Variable containing:\n", 429 | " 0.1853\n", 430 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 431 | "\n", 432 | "rpn_cls loss: Variable containing:\n", 433 | "1.00000e-02 *\n", 434 | " 7.0720\n", 435 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 436 | "\n", 437 | "rpn_loc loss: Variable containing:\n", 438 | "1.00000e-02 *\n", 439 | " 5.0439\n", 440 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 441 | "\n", 442 | "--------------------------\n" 443 | ] 444 | }, 445 | { 446 | "name": "stderr", 447 | "output_type": "stream", 448 | "text": [ 449 | "5011it [22:48, 3.66it/s]\n" 450 | ] 451 | }, 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "--------------------------\n", 457 | "curr epoch: 10\n", 458 | "roi_cls loss: Variable containing:\n", 459 | " 0.1281\n", 460 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 461 | "\n", 462 | "roi_loc loss: Variable containing:\n", 463 | " 0.1791\n", 464 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 465 | "\n", 466 | "rpn_cls loss: Variable containing:\n", 467 | "1.00000e-02 *\n", 468 | " 6.5513\n", 469 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 470 | "\n", 471 | "rpn_loc loss: Variable containing:\n", 472 | "1.00000e-02 *\n", 473 | " 4.9259\n", 474 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 475 | "\n", 476 | "--------------------------\n" 477 | ] 478 | }, 479 | { 480 | "name": "stderr", 481 | "output_type": "stream", 482 | "text": [ 483 | "5011it [23:02, 4.23it/s]\n" 484 | ] 485 | }, 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "--------------------------\n", 491 | "curr epoch: 11\n", 492 | "roi_cls loss: Variable containing:\n", 493 | " 0.1220\n", 494 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 495 | "\n", 496 | "roi_loc loss: Variable containing:\n", 497 | " 0.1720\n", 498 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 499 | "\n", 500 | "rpn_cls loss: Variable containing:\n", 501 | "1.00000e-02 *\n", 502 | " 6.1539\n", 503 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 504 | "\n", 505 | "rpn_loc loss: Variable containing:\n", 506 | "1.00000e-02 *\n", 507 | " 4.8372\n", 508 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 509 | "\n", 510 | "--------------------------\n" 511 | ] 512 | }, 513 | { 514 | "name": "stderr", 515 | "output_type": "stream", 516 | "text": [ 517 | "5011it [22:35, 3.51it/s]\n" 518 | ] 519 | }, 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "--------------------------\n", 525 | "curr epoch: 12\n", 526 | "roi_cls loss: Variable containing:\n", 527 | " 0.1174\n", 528 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 529 | "\n", 530 | "roi_loc loss: Variable containing:\n", 531 | " 0.1660\n", 532 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 533 | "\n", 534 | "rpn_cls loss: Variable containing:\n", 535 | "1.00000e-02 *\n", 536 | " 5.8359\n", 537 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 538 | "\n", 539 | "rpn_loc loss: Variable containing:\n", 540 | "1.00000e-02 *\n", 541 | " 4.7706\n", 542 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 543 | "\n", 544 | "--------------------------\n" 545 | ] 546 | }, 547 | { 548 | "name": "stderr", 549 | "output_type": "stream", 550 | "text": [ 551 | "5011it [22:54, 3.90it/s]\n" 552 | ] 553 | }, 554 | { 555 | "name": "stdout", 556 | "output_type": "stream", 557 | 
"text": [ 558 | "--------------------------\n", 559 | "curr epoch: 13\n", 560 | "roi_cls loss: Variable containing:\n", 561 | " 0.1138\n", 562 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 563 | "\n", 564 | "roi_loc loss: Variable containing:\n", 565 | " 0.1598\n", 566 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 567 | "\n", 568 | "rpn_cls loss: Variable containing:\n", 569 | "1.00000e-02 *\n", 570 | " 5.4888\n", 571 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 572 | "\n", 573 | "rpn_loc loss: Variable containing:\n", 574 | "1.00000e-02 *\n", 575 | " 4.7324\n", 576 | "[torch.cuda.FloatTensor of size 1 (GPU 0)]\n", 577 | "\n", 578 | "--------------------------\n" 579 | ] 580 | } 581 | ], 582 | "source": [ 583 | "for epoch in range(14):\n", 584 | " \n", 585 | " loss_list_roi_cls = []\n", 586 | " loss_list_roi_loc = []\n", 587 | " loss_list_rpn_cls = []\n", 588 | " loss_list_rpn_loc = []\n", 589 | " for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):\n", 590 | " \n", 591 | " scale = at.scalar(scale)\n", 592 | " img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()\n", 593 | " img, bbox, label = Variable(img), Variable(bbox), Variable(label)\n", 594 | " loss_list = trainer.train_step(img, bbox, label, scale)\n", 595 | "\n", 596 | " loss_list_roi_cls.append(loss_list.roi_cls_loss)\n", 597 | " loss_list_roi_loc.append(loss_list.roi_loc_loss)\n", 598 | " loss_list_rpn_cls.append(loss_list.rpn_cls_loss)\n", 599 | " loss_list_rpn_loc.append(loss_list.rpn_loc_loss)\n", 600 | " print (\"--------------------------\")\n", 601 | " print (\"curr epoch: \", epoch)\n", 602 | " print (\"roi_cls loss: \", np.array(loss_list_roi_cls).mean())\n", 603 | " print (\"roi_loc loss: \", np.array(loss_list_roi_loc).mean())\n", 604 | " print (\"rpn_cls loss: \", np.array(loss_list_rpn_cls).mean())\n", 605 | " print (\"rpn_loc loss: \", np.array(loss_list_rpn_loc).mean())\n", 606 | " print (\"--------------------------\")" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "# Evaluation" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 8, 619 | "metadata": { 620 | "collapsed": true 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "from utils.eval_tool import eval_detection_voc" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 9, 630 | "metadata": { 631 | "collapsed": true 632 | }, 633 | "outputs": [], 634 | "source": [ 635 | "def eval(dataloader, faster_rcnn, test_num=10000):\n", 636 | " pred_bboxes, pred_labels, pred_scores = list(), list(), list()\n", 637 | " gt_bboxes, gt_labels, gt_difficults = list(), list(), list()\n", 638 | " for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):\n", 639 | " sizes = [sizes[0][0], sizes[1][0]]\n", 640 | " pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])\n", 641 | " gt_bboxes += list(gt_bboxes_.numpy())\n", 642 | " gt_labels += list(gt_labels_.numpy())\n", 643 | " gt_difficults += list(gt_difficults_.numpy())\n", 644 | " pred_bboxes += pred_bboxes_\n", 645 | " pred_labels += pred_labels_\n", 646 | " pred_scores += pred_scores_\n", 647 | " if ii == test_num: break\n", 648 | "\n", 649 | " result = eval_detection_voc(\n", 650 | " pred_bboxes, pred_labels, pred_scores,\n", 651 | " gt_bboxes, gt_labels, gt_difficults,\n", 652 | " use_07_metric=True)\n", 653 | " return result" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 10, 659 | 
"metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "testset = TestDataset(opt)\n", 663 | "test_dataloader = data_.DataLoader(testset,\n", 664 | " batch_size=1,\n", 665 | " num_workers=8,\n", 666 | " shuffle=False, \\\n", 667 | " pin_memory=True\n", 668 | " )" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 11, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "name": "stderr", 678 | "output_type": "stream", 679 | "text": [ 680 | "1000it [07:31, 1.98it/s]" 681 | ] 682 | } 683 | ], 684 | "source": [ 685 | "eval_result = eval(test_dataloader, faster_rcnn, test_num=1000)" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 12, 691 | "metadata": {}, 692 | "outputs": [ 693 | { 694 | "data": { 695 | "text/plain": [ 696 | "{'ap': array([ 0.78703012, 0.80306497, 0.64505543, 0.60716555, 0.49361586,\n", 697 | " 0.8341593 , 0.84566703, 0.78718817, 0.45073836, 0.79517284,\n", 698 | " 0.62107752, 0.6427704 , 0.74794425, 0.79348821, 0.74581093,\n", 699 | " 0.38075758, 0.67097107, 0.49560663, 0.74021585, 0.66774319]),\n", 700 | " 'map': 0.67776216301200209}" 701 | ] 702 | }, 703 | "execution_count": 12, 704 | "metadata": {}, 705 | "output_type": "execute_result" 706 | } 707 | ], 708 | "source": [ 709 | "eval_result" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 26, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "data": { 719 | "text/plain": [ 720 | "{'ap': array([ 0.67399267, 0.58309746, 0.47840597, 0.38835252, 0.32832103,\n", 721 | " 0.65140195, 0.72522413, 0.67210257, 0.261774 , 0.60450732,\n", 722 | " 0.43700676, 0.38783834, 0.67360036, 0.58708554, 0.61982477,\n", 723 | " 0.32746522, 0.40122355, 0.38349585, 0.61877112, 0.59206734]),\n", 724 | " 'map': 0.51977792314647886}" 725 | ] 726 | }, 727 | "execution_count": 26, 728 | "metadata": {}, 729 | "output_type": "execute_result" 730 | } 731 | ], 732 | "source": [ 733 | "eval_result" 734 | ] 735 | } 736 | ], 737 | "metadata": { 738 | "kernelspec": { 739 | "display_name": "Python 3", 740 | "language": "python", 741 | "name": "python3" 742 | }, 743 | "language_info": { 744 | "codemirror_mode": { 745 | "name": "ipython", 746 | "version": 3 747 | }, 748 | "file_extension": ".py", 749 | "mimetype": "text/x-python", 750 | "name": "python", 751 | "nbconvert_exporter": "python", 752 | "pygments_lexer": "ipython3", 753 | "version": "3.6.3" 754 | } 755 | }, 756 | "nbformat": 4, 757 | "nbformat_minor": 2 758 | } 759 | --------------------------------------------------------------------------------