├── data ├── __init__.py ├── dataset.py ├── voc_dataset.py └── util.py ├── model ├── utils │ ├── __init__.py │ ├── roi_cupy.py │ ├── bbox_tools.py │ └── creator_tool.py ├── __init__.py ├── roi_module.py ├── faster_rcnn_vgg16.py ├── region_proposal_network.py └── faster_rcnn.py ├── requirements.txt ├── utils ├── __init__.py ├── array_tool.py ├── config.py ├── vis_tool.py └── eval_tool.py ├── misc └── convert_caffe_pretrain.py ├── LICENSE ├── train.py ├── README.MD └── trainer.py /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | from .faster_rcnn_vgg16 import FasterRCNNVGG16 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | visdom 2 | # torchvision 3 | scikit-image 4 | tqdm 5 | fire 6 | pprint 7 | matplotlib 8 | ipdb 9 | cython 10 | git+https://github.com/pytorch/tnt.git@master 11 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 cy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | -------------------------------------------------------------------------------- /misc/convert_caffe_pretrain.py: -------------------------------------------------------------------------------- 1 | # code from ruotian luo 2 | # https://github.com/ruotianluo/pytorch-faster-rcnn 3 | import torch 4 | from torch.utils.model_zoo import load_url 5 | from torchvision import models 6 | 7 | sd = load_url("https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg16-00b39a1b.pth") 8 | sd['classifier.0.weight'] = sd['classifier.1.weight'] 9 | sd['classifier.0.bias'] = sd['classifier.1.bias'] 10 | del sd['classifier.1.weight'] 11 | del sd['classifier.1.bias'] 12 | 13 | sd['classifier.3.weight'] = sd['classifier.4.weight'] 14 | sd['classifier.3.bias'] = sd['classifier.4.bias'] 15 | del sd['classifier.4.weight'] 16 | del sd['classifier.4.bias'] 17 | 18 | 19 | # speicify the path to save 20 | torch.save(sd, "vgg16_caffe.pth") -------------------------------------------------------------------------------- /utils/array_tool.py: -------------------------------------------------------------------------------- 1 | """ 2 | tools to convert specified type 3 | """ 4 | import torch as t 5 | import numpy as np 6 | 7 | def tonumpy(data): 8 | if isinstance(data, np.ndarray): 9 | return data 10 | if isinstance(data, t._C._TensorBase) and (data.requires_grad==False): 11 | return data.cpu().numpy() 12 | if isinstance(data, t.autograd.Variable): 13 | return tonumpy(data.data) 14 | 15 | 16 | def totensor(data, cuda=True): 17 | if isinstance(data, np.ndarray): 18 | tensor = t.from_numpy(data) 19 | if isinstance(data, t._C._TensorBase) and (data.requires_grad==False): 20 | tensor = data 21 | if isinstance(data, t.autograd.Variable): 22 | tensor = data.data 23 | if cuda: 24 | tensor = tensor.cuda() 25 | return tensor 26 | 27 | 28 | def tovariable(data): 29 | if isinstance(data, np.ndarray): 30 | return tovariable(totensor(data)) 31 | if isinstance(data, t._C._TensorBase) and (data.requires_grad==False): 32 | return t.autograd.Variable(data) 33 | if isinstance(data, t.autograd.Variable): 34 | return data 35 | else: 36 | raise ValueError("UnKnow data type: %s, input should be {np.ndarray,Tensor,Variable}" %type(data)) 37 | 38 | 39 | def scalar(data): 40 | if isinstance(data, np.ndarray): 41 | return data.reshape(1)[0] 42 | if isinstance(data, t._C._TensorBase) and (data.requires_grad==False): 43 | return data.view(1)[0] 44 | if isinstance(data, t.autograd.Variable): 45 | return data.data.view(1)[0] 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 Yun Chen 4 | 5 | Original works by: 6 | -------------------------------------------------------- 7 | chainer/chainercv 8 | Copyright (c) 2017 Yusuke Niitani 9 | Licensed under The MIT License 10 | https://github.com/chainer/chainercv/blob/master/LICENSE 11 | -------------------------------------------------------- 12 | Faster R-CNN 13 | Copyright (c) 2015 Microsoft 14 | Licensed under The MIT License 15 | https://github.com/rbgirshick/py-faster-rcnn/blob/master/LICENSE 16 | -------------------------------------------------------- 17 | 18 | Permission is hereby granted, free of charge, to any person obtaining a copy 19 | of this software and associated documentation files (the "Software"), to deal 20 | in the Software without restriction, including without limitation the rights 21 | to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell 22 | copies of the Software, and to permit persons to whom the Software is 23 | furnished to do so, subject to the following conditions: 24 | 25 | The above copyright notice and this permission notice shall be included in 26 | all copies or substantial portions of the Software. 27 | 28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 31 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 34 | THE SOFTWARE. -------------------------------------------------------------------------------- /utils/config.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | 3 | 4 | # Default Configs for training 5 | # NOTE that, config items could be overwriten by passing argument through command line. 6 | # e.g. --voc-data-dir='./data/' 7 | 8 | 9 | class Config: 10 | 11 | # probability threshold when using px and py to generate predicting box 12 | prob_thre = 0.5 13 | 14 | # data 15 | # voc_data_dir = '/home/cy/.chainer/dataset/pfnet/chainercv/voc/VOCdevkit/VOC2007/' 16 | # voc_data_dir = '/home/cvdev/Faster-RCNN-LocNet/VOCdevkit/VOC2007_traindev' 17 | voc_data_dir = '/home/cvdev/Faster-RCNN-LocNet/question_dataset' 18 | 19 | min_size = 600 # image resize 20 | max_size = 1000 # image resize 21 | num_workers = 8 22 | test_num_workers = 8 23 | 24 | # sigma for l1_smooth_loss 25 | rpn_sigma = 3. 26 | roi_sigma = 1e-5 27 | 28 | # param for optimizer 29 | # 0.0005 in origin paper but 0.0001 in tf-faster-rcnn 30 | weight_decay = 0.0005 31 | lr_decay = 0.1 # 1e-3 -> 1e-4 32 | lr = 1e-4 33 | 34 | 35 | # visualization 36 | env = 'faster-rcnn' # visdom env 37 | port = 8097 38 | plot_every = 40 # vis every N iter 39 | 40 | # preset 41 | data = 'voc' 42 | pretrained_model = 'vgg16' 43 | 44 | # training 45 | epoch = 50 46 | 47 | 48 | use_adam = False # Use Adam optimizer 49 | use_chainer = False # try match everything as chainer 50 | use_drop = False # use dropout in RoIHead 51 | # debug 52 | debug_file = '/tmp/debugf' 53 | 54 | test_num = 10000 55 | # model 56 | load_path = "/home/cvdev/Faster-RCNN-LocNet/checkpoints/best_model" 57 | 58 | caffe_pretrain = False # use caffe pretrained model instead of torchvision 59 | caffe_pretrain_path = 'checkpoints/vgg16-caffe.pth' 60 | 61 | def _parse(self, kwargs): 62 | state_dict = self._state_dict() 63 | for k, v in kwargs.items(): 64 | if k not in state_dict: 65 | raise ValueError('UnKnown Option: "--%s"' % k) 66 | setattr(self, k, v) 67 | 68 | print('======user config========') 69 | pprint(self._state_dict()) 70 | print('==========end============') 71 | 72 | def _state_dict(self): 73 | return {k: getattr(self, k) for k, _ in Config.__dict__.items() \ 74 | if not k.startswith('_')} 75 | 76 | 77 | opt = Config() 78 | -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | from .voc_dataset import VOCBboxDataset 3 | from skimage import transform as sktsf 4 | from torchvision import transforms as tvtsf 5 | from . 
import util 6 | import numpy as np 7 | from utils.config import opt 8 | 9 | 10 | def inverse_normalize(img): 11 | if opt.caffe_pretrain: 12 | img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)) 13 | return img[::-1, :, :] 14 | # approximate un-normalize for visualize 15 | return (img * 0.225 + 0.45).clip(min=0, max=1) * 255 16 | 17 | 18 | def pytorch_normalze(img): 19 | """ 20 | https://github.com/pytorch/vision/issues/223 21 | return appr -1~1 RGB 22 | """ 23 | normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406], 24 | std=[0.229, 0.224, 0.225]) 25 | img = normalize(t.from_numpy(img)) 26 | return img.numpy() 27 | 28 | 29 | def caffe_normalize(img): 30 | """ 31 | return appr -125-125 BGR 32 | """ 33 | img = img[[2, 1, 0], :, :] # RGB-BGR 34 | img = img * 255 35 | mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1) 36 | img = (img - mean).astype(np.float32, copy=True) 37 | return img 38 | 39 | 40 | def preprocess(img, min_size=600, max_size=1000): 41 | """Preprocess an image for feature extraction. 42 | 43 | The length of the shorter edge is scaled to :obj:`self.min_size`. 44 | After the scaling, if the length of the longer edge is longer than 45 | :param min_size: 46 | :obj:`self.max_size`, the image is scaled to fit the longer edge 47 | to :obj:`self.max_size`. 48 | 49 | After resizing the image, the image is subtracted by a mean image value 50 | :obj:`self.mean`. 51 | 52 | Args: 53 | img (~numpy.ndarray): An image. This is in CHW and RGB format. 54 | The range of its value is :math:`[0, 255]`. 55 | 56 | Returns: 57 | ~numpy.ndarray: A preprocessed image. 58 | 59 | """ 60 | C, H, W = img.shape 61 | scale1 = min_size / min(H, W) 62 | scale2 = max_size / max(H, W) 63 | scale = min(scale1, scale2) 64 | img = img / 255. 65 | img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect') 66 | # both the longer and shorter should be less than 67 | # max_size and min_size 68 | 69 | if opt.caffe_pretrain: 70 | normalize = caffe_normalize 71 | else: 72 | normalize = pytorch_normalze 73 | return normalize(img) 74 | 75 | 76 | class Transform(object): 77 | ''' 78 | This is a self-defined transform class. 79 | ''' 80 | 81 | def __init__(self, min_size=600, max_size=1000): 82 | self.min_size = min_size 83 | self.max_size = max_size 84 | 85 | def __call__(self, in_data): 86 | img, bbox, label = in_data 87 | _, H, W = img.shape 88 | img = preprocess(img, self.min_size, self.max_size) 89 | _, o_H, o_W = img.shape 90 | scale = o_H / H 91 | bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W)) 92 | 93 | # horizontally flip 94 | img, params = util.random_flip( 95 | img, x_random=True, return_param=True) 96 | bbox = util.flip_bbox( 97 | bbox, (o_H, o_W), x_flip=params['x_flip']) 98 | 99 | return img, bbox, label, scale 100 | 101 | 102 | class Dataset: 103 | def __init__(self, opt): 104 | self.opt = opt 105 | self.db = VOCBboxDataset(opt.voc_data_dir) 106 | self.tsf = Transform(opt.min_size, opt.max_size) 107 | 108 | def __getitem__(self, idx): 109 | ori_img, bbox, label, difficult = self.db.get_example(idx) 110 | 111 | img, bbox, label, scale = self.tsf((ori_img, bbox, label)) 112 | # TODO: check whose stride is negative to fix this instead copy all 113 | # some of the strides of a given numpy array are negative. 
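        # (added note) util.random_flip likely returns a reversed view (e.g. img[..., ::-1]),
        # and reversed views have negative strides, which torch.from_numpy cannot wrap;
        # the .copy() calls below make the arrays contiguous before the DataLoader collates them.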
114 | return img.copy(), bbox.copy(), label.copy(), scale 115 | 116 | def __len__(self): 117 | return len(self.db) 118 | 119 | 120 | class TestDataset: 121 | def __init__(self, opt, split='test', use_difficult=True): 122 | self.opt = opt 123 | self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult) 124 | 125 | def __getitem__(self, idx): 126 | ori_img, bbox, label, difficult = self.db.get_example(idx) 127 | img = preprocess(ori_img) 128 | return img, ori_img.shape[1:], bbox, label, difficult # the original shape of the image is passed. 129 | 130 | def __len__(self): 131 | return len(self.db) 132 | -------------------------------------------------------------------------------- /model/roi_module.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from string import Template 3 | 4 | import cupy, torch 5 | import cupy as cp 6 | import torch as t 7 | from torch.autograd import Function 8 | 9 | from model.utils.roi_cupy import kernel_backward, kernel_forward 10 | 11 | 12 | Stream = namedtuple('Stream', ['ptr']) 13 | 14 | 15 | @cupy.util.memoize(for_each_device=True) 16 | def load_kernel(kernel_name, code, **kwargs): 17 | cp.cuda.runtime.free(0) 18 | code = Template(code).substitute(**kwargs) 19 | kernel_code = cupy.cuda.compile_with_cache(code) 20 | return kernel_code.get_function(kernel_name) 21 | 22 | 23 | CUDA_NUM_THREADS = 1024 24 | 25 | 26 | def GET_BLOCKS(N, K=CUDA_NUM_THREADS): 27 | return (N + K - 1) // K 28 | 29 | 30 | class RoI(Function): 31 | """ 32 | NOTE:only CUDA-compatible 33 | """ 34 | 35 | def __init__(self, outh, outw, spatial_scale): 36 | self.forward_fn = load_kernel('roi_forward', kernel_forward) 37 | self.backward_fn = load_kernel('roi_backward', kernel_backward) 38 | self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale 39 | 40 | def forward(self, x, rois): 41 | # NOTE: MAKE SURE input is contiguous too 42 | x = x.contiguous() 43 | rois = rois.contiguous() 44 | self.in_size = B, C, H, W = x.size() 45 | self.N = N = rois.size(0) 46 | output = t.zeros(N, C, self.outh, self.outw).cuda() 47 | self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda() 48 | self.rois = rois 49 | args = [x.data_ptr(), rois.data_ptr(), 50 | output.data_ptr(), 51 | self.argmax_data.data_ptr(), 52 | self.spatial_scale, C, H, W, 53 | self.outh, self.outw, 54 | output.numel()] 55 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 56 | self.forward_fn(args=args, 57 | block=(CUDA_NUM_THREADS, 1, 1), 58 | grid=(GET_BLOCKS(output.numel()), 1, 1), 59 | stream=stream) 60 | return output 61 | 62 | def backward(self, grad_output): 63 | ##NOTE: IMPORTANT CONTIGUOUS 64 | # TODO: input 65 | grad_output = grad_output.contiguous() 66 | B, C, H, W = self.in_size 67 | grad_input = t.zeros(self.in_size).cuda() 68 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 69 | args = [grad_output.data_ptr(), 70 | self.argmax_data.data_ptr(), 71 | self.rois.data_ptr(), 72 | grad_input.data_ptr(), 73 | self.N, self.spatial_scale, C, H, W, self.outh, self.outw, 74 | grad_input.numel()] 75 | self.backward_fn(args=args, 76 | block=(CUDA_NUM_THREADS, 1, 1), 77 | grid=(GET_BLOCKS(grad_input.numel()), 1, 1), 78 | stream=stream 79 | ) 80 | return grad_input, None 81 | 82 | 83 | class RoIPooling2D(t.nn.Module): 84 | 85 | def __init__(self, outh, outw, spatial_scale): 86 | super(RoIPooling2D, self).__init__() 87 | self.RoI = RoI(outh, outw, spatial_scale) 88 | 89 | def forward(self, x, 
rois): 90 | return self.RoI(x, rois) 91 | 92 | 93 | def test_roi_module(): 94 | ## fake data### 95 | B, N, C, H, W, PH, PW = 2, 8, 4, 32, 32, 7, 7 96 | 97 | bottom_data = t.randn(B, C, H, W).cuda() 98 | bottom_rois = t.randn(N, 5) 99 | bottom_rois[:int(N / 2), 0] = 0 100 | bottom_rois[int(N / 2):, 0] = 1 101 | bottom_rois[:, 1:] = (t.rand(N, 4) * 100).float() 102 | bottom_rois = bottom_rois.cuda() 103 | spatial_scale = 1. / 16 104 | outh, outw = PH, PW 105 | 106 | # pytorch version 107 | module = RoIPooling2D(outh, outw, spatial_scale) 108 | x = t.autograd.Variable(bottom_data, requires_grad=True) 109 | rois = t.autograd.Variable(bottom_rois) 110 | output = module(x, rois) 111 | output.sum().backward() 112 | 113 | def t2c(variable): 114 | npa = variable.data.cpu().numpy() 115 | return cp.array(npa) 116 | 117 | def test_eq(variable, array, info): 118 | cc = cp.asnumpy(array) 119 | neq = (cc != variable.data.cpu().numpy()) 120 | assert neq.sum() == 0, 'test failed: %s' % info 121 | 122 | # chainer version,if you're going to run this 123 | # pip install chainer 124 | import chainer.functions as F 125 | from chainer import Variable 126 | x_cn = Variable(t2c(x)) 127 | 128 | o_cn = F.roi_pooling_2d(x_cn, t2c(rois), outh, outw, spatial_scale) 129 | test_eq(output, o_cn.array, 'forward') 130 | F.sum(o_cn).backward() 131 | test_eq(x.grad, x_cn.grad, 'backward') 132 | print('test pass') 133 | -------------------------------------------------------------------------------- /data/voc_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | 4 | import numpy as np 5 | 6 | from .util import read_image 7 | 8 | 9 | 10 | class VOCBboxDataset: 11 | """Bounding box dataset for PASCAL `VOC`_. 12 | 13 | .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/ 14 | 15 | The index corresponds to each image. 16 | 17 | When queried by an index, if :obj:`return_difficult == False`, 18 | this dataset returns a corresponding 19 | :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels. 20 | This is the default behaviour. 21 | If :obj:`return_difficult == True`, this dataset returns corresponding 22 | :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array 23 | that indicates whether bounding boxes are labeled as difficult or not. 24 | 25 | The bounding boxes are packed into a two dimensional tensor of shape 26 | :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in 27 | the image. The second axis represents attributes of the bounding box. 28 | They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the 29 | four attributes are coordinates of the top left and the bottom right 30 | vertices. 31 | 32 | The labels are packed into a one dimensional tensor of shape :math:`(R,)`. 33 | :math:`R` is the number of bounding boxes in the image. 34 | The class name of the label :math:`l` is :math:`l` th element of 35 | :obj:`VOC_BBOX_LABEL_NAMES`. 36 | 37 | The array :obj:`difficult` is a one dimensional boolean array of shape 38 | :math:`(R,)`. :math:`R` is the number of bounding boxes in the image. 39 | If :obj:`use_difficult` is :obj:`False`, this array is 40 | a boolean array with all :obj:`False`. 41 | 42 | The type of the image, the bounding boxes and the labels are as follows. 
43 | 44 | * :obj:`img.dtype == numpy.float32` 45 | * :obj:`bbox.dtype == numpy.float32` 46 | * :obj:`label.dtype == numpy.int32` 47 | * :obj:`difficult.dtype == numpy.bool` 48 | 49 | Args: 50 | data_dir (string): Path to the root of the training data. 51 | i.e. "/data/image/voc/VOCdevkit/VOC2007/" 52 | split ({'train', 'val', 'trainval', 'test'}): Select a split of the 53 | dataset. :obj:`test` split is only available for 54 | 2007 dataset. 55 | year ({'2007', '2012'}): Use a dataset prepared for a challenge 56 | held in :obj:`year`. 57 | use_difficult (bool): If :obj:`True`, use images that are labeled as 58 | difficult in the original annotation. 59 | return_difficult (bool): If :obj:`True`, this dataset returns 60 | a boolean array 61 | that indicates whether bounding boxes are labeled as difficult 62 | or not. The default value is :obj:`False`. 63 | 64 | """ 65 | 66 | def __init__(self, data_dir, split='trainval', 67 | use_difficult=False, return_difficult=False, 68 | ): 69 | 70 | # if split not in ['train', 'trainval', 'val']: 71 | # if not (split == 'test' and year == '2007'): 72 | # warnings.warn( 73 | # 'please pick split from \'train\', \'trainval\', \'val\'' 74 | # 'for 2012 dataset. For 2007 dataset, you can pick \'test\'' 75 | # ' in addition to the above mentioned splits.' 76 | # ) 77 | id_list_file = os.path.join( 78 | data_dir, 'ImageSets/Main/{0}.txt'.format(split)) 79 | 80 | self.ids = [id_.strip() for id_ in open(id_list_file)] 81 | self.data_dir = data_dir 82 | self.use_difficult = use_difficult 83 | self.return_difficult = return_difficult 84 | self.label_names = VOC_BBOX_LABEL_NAMES 85 | 86 | def __len__(self): 87 | return len(self.ids) 88 | 89 | def get_example(self, i): 90 | """Returns the i-th example. 91 | 92 | Returns a color image and bounding boxes. The image is in CHW format. 93 | The returned image is RGB. 94 | 95 | Args: 96 | i (int): The index of the example. 97 | 98 | Returns: 99 | tuple of an image and bounding boxes 100 | 101 | """ 102 | id_ = self.ids[i] 103 | anno = ET.parse( 104 | os.path.join(self.data_dir, 'Annotations', id_ + '.xml')) 105 | bbox = list() 106 | label = list() 107 | difficult = list() 108 | for obj in anno.findall('object'): 109 | 110 | # when in not using difficult split, and the object is 111 | # difficult, skipt it. 112 | # if not self.use_difficult and int(obj.find('difficult').text) == 1: 113 | # continue 114 | 115 | # difficult.append(int(obj.find('difficult').text)) 116 | # difficulty all set to 0 when using question dataset 117 | difficult.append(0) 118 | 119 | bndbox_anno = obj.find('bndbox') 120 | # subtract 1 to make pixel indexes 0-based 121 | bbox.append([ 122 | int(bndbox_anno.find(tag).text) - 1 123 | for tag in ('ymin', 'xmin', 'ymax', 'xmax')]) 124 | name = obj.find('name').text.lower().strip() 125 | label.append(VOC_BBOX_LABEL_NAMES.index(name)) 126 | bbox = np.stack(bbox).astype(np.float32) 127 | label = np.stack(label).astype(np.int32) 128 | # When `use_difficult==False`, all elements in `difficult` are False. 
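        # (added note) np.bool is only an alias for the built-in bool and was removed in
        # NumPy >= 1.24; dtype=bool (or np.bool_) is the forward-compatible spelling for
        # the conversion on the next line.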
129 | difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch don't support np.bool 130 | 131 | # Load a image 132 | # img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg') 133 | img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpeg') 134 | img = read_image(img_file, color=True) 135 | 136 | # if self.return_difficult: 137 | # return img, bbox, label, difficult 138 | return img, bbox, label, difficult 139 | 140 | __getitem__ = get_example 141 | 142 | 143 | # VOC_BBOX_LABEL_NAMES = ( 144 | # 'aeroplane', 145 | # 'bicycle', 146 | # 'bird', 147 | # 'boat', 148 | # 'bottle', 149 | # 'bus', 150 | # 'car', 151 | # 'cat', 152 | # 'chair', 153 | # 'cow', 154 | # 'diningtable', 155 | # 'dog', 156 | # 'horse', 157 | # 'motorbike', 158 | # 'person', 159 | # 'pottedplant', 160 | # 'sheep', 161 | # 'sofa', 162 | # 'train', 163 | # 'tvmonitor') 164 | 165 | VOC_BBOX_LABEL_NAMES = ('text') -------------------------------------------------------------------------------- /model/utils/roi_cupy.py: -------------------------------------------------------------------------------- 1 | kernel_forward = ''' 2 | extern "C" 3 | __global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois, 4 | float* top_data, int* argmax_data, 5 | const double spatial_scale,const int channels,const int height, 6 | const int width, const int pooled_height, 7 | const int pooled_width,const int NN 8 | ){ 9 | 10 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 11 | if(idx>=NN) 12 | return; 13 | const int pw = idx % pooled_width; 14 | const int ph = (idx / pooled_width) % pooled_height; 15 | const int c = (idx / pooled_width / pooled_height) % channels; 16 | int num = idx / pooled_width / pooled_height / channels; 17 | const int roi_batch_ind = bottom_rois[num * 5 + 0]; 18 | const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale); 19 | const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale); 20 | const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale); 21 | const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale); 22 | // Force malformed ROIs to be 1x1 23 | const int roi_width = max(roi_end_w - roi_start_w + 1, 1); 24 | const int roi_height = max(roi_end_h - roi_start_h + 1, 1); 25 | const float bin_size_h = static_cast(roi_height) 26 | / static_cast(pooled_height); 27 | const float bin_size_w = static_cast(roi_width) 28 | / static_cast(pooled_width); 29 | 30 | int hstart = static_cast(floor(static_cast(ph) 31 | * bin_size_h)); 32 | int wstart = static_cast(floor(static_cast(pw) 33 | * bin_size_w)); 34 | int hend = static_cast(ceil(static_cast(ph + 1) 35 | * bin_size_h)); 36 | int wend = static_cast(ceil(static_cast(pw + 1) 37 | * bin_size_w)); 38 | 39 | // Add roi offsets and clip to input boundaries 40 | hstart = min(max(hstart + roi_start_h, 0), height); 41 | hend = min(max(hend + roi_start_h, 0), height); 42 | wstart = min(max(wstart + roi_start_w, 0), width); 43 | wend = min(max(wend + roi_start_w, 0), width); 44 | bool is_empty = (hend <= hstart) || (wend <= wstart); 45 | 46 | // Define an empty pooling region to be zero 47 | float maxval = is_empty ? 
0 : -1E+37; 48 | // If nothing is pooled, argmax=-1 causes nothing to be backprop'd 49 | int maxidx = -1; 50 | const int data_offset = (roi_batch_ind * channels + c) * height * width; 51 | for (int h = hstart; h < hend; ++h) { 52 | for (int w = wstart; w < wend; ++w) { 53 | int bottom_index = h * width + w; 54 | if (bottom_data[data_offset + bottom_index] > maxval) { 55 | maxval = bottom_data[data_offset + bottom_index]; 56 | maxidx = bottom_index; 57 | } 58 | } 59 | } 60 | top_data[idx]=maxval; 61 | argmax_data[idx]=maxidx; 62 | } 63 | ''' 64 | kernel_backward = ''' 65 | extern "C" 66 | __global__ void roi_backward(const float* const top_diff, 67 | const int* const argmax_data,const float* const bottom_rois, 68 | float* bottom_diff, const int num_rois, 69 | const double spatial_scale, int channels, 70 | int height, int width, int pooled_height, 71 | int pooled_width,const int NN) 72 | { 73 | 74 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 75 | ////Importtan >= instead of > 76 | if(idx>=NN) 77 | return; 78 | int w = idx % width; 79 | int h = (idx / width) % height; 80 | int c = (idx/ (width * height)) % channels; 81 | int num = idx / (width * height * channels); 82 | 83 | float gradient = 0; 84 | // Accumulate gradient over all ROIs that pooled this element 85 | for (int roi_n = 0; roi_n < num_rois; ++roi_n) { 86 | // Skip if ROI's batch index doesn't match num 87 | if (num != static_cast(bottom_rois[roi_n * 5])) { 88 | continue; 89 | } 90 | 91 | int roi_start_w = round(bottom_rois[roi_n * 5 + 1] 92 | * spatial_scale); 93 | int roi_start_h = round(bottom_rois[roi_n * 5 + 2] 94 | * spatial_scale); 95 | int roi_end_w = round(bottom_rois[roi_n * 5 + 3] 96 | * spatial_scale); 97 | int roi_end_h = round(bottom_rois[roi_n * 5 + 4] 98 | * spatial_scale); 99 | 100 | // Skip if ROI doesn't include (h, w) 101 | const bool in_roi = (w >= roi_start_w && w <= roi_end_w && 102 | h >= roi_start_h && h <= roi_end_h); 103 | if (!in_roi) { 104 | continue; 105 | } 106 | 107 | int offset = (roi_n * channels + c) * pooled_height 108 | * pooled_width; 109 | 110 | // Compute feasible set of pooled units that could have pooled 111 | // this bottom unit 112 | 113 | // Force malformed ROIs to be 1x1 114 | int roi_width = max(roi_end_w - roi_start_w + 1, 1); 115 | int roi_height = max(roi_end_h - roi_start_h + 1, 1); 116 | 117 | float bin_size_h = static_cast(roi_height) 118 | / static_cast(pooled_height); 119 | float bin_size_w = static_cast(roi_width) 120 | / static_cast(pooled_width); 121 | 122 | int phstart = floor(static_cast(h - roi_start_h) 123 | / bin_size_h); 124 | int phend = ceil(static_cast(h - roi_start_h + 1) 125 | / bin_size_h); 126 | int pwstart = floor(static_cast(w - roi_start_w) 127 | / bin_size_w); 128 | int pwend = ceil(static_cast(w - roi_start_w + 1) 129 | / bin_size_w); 130 | 131 | phstart = min(max(phstart, 0), pooled_height); 132 | phend = min(max(phend, 0), pooled_height); 133 | pwstart = min(max(pwstart, 0), pooled_width); 134 | pwend = min(max(pwend, 0), pooled_width); 135 | for (int ph = phstart; ph < phend; ++ph) { 136 | for (int pw = pwstart; pw < pwend; ++pw) { 137 | int index_ = ph * pooled_width + pw + offset; 138 | if (argmax_data[index_] == (h * width + w)) { 139 | gradient += top_diff[index_]; 140 | } 141 | } 142 | } 143 | } 144 | bottom_diff[idx] = gradient; 145 | } 146 | ''' 147 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 
| import numpy as np 4 | import ipdb 5 | import matplotlib 6 | from tqdm import tqdm 7 | 8 | from utils.config import opt 9 | from data.dataset import Dataset, TestDataset, inverse_normalize 10 | from model import FasterRCNNVGG16 11 | from torch.autograd import Variable 12 | from torch.utils import data as data_ 13 | from trainer import FasterRCNNTrainer 14 | from utils import array_tool as at 15 | from utils.vis_tool import visdom_bbox 16 | from utils.eval_tool import eval_detection_voc 17 | 18 | from model.utils.bbox_tools import bbox_iou 19 | 20 | 21 | import resource 22 | 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | 25 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 26 | resource.setrlimit(resource.RLIMIT_NOFILE, (20480, rlimit[1])) 27 | 28 | matplotlib.use('agg') 29 | 30 | 31 | def eval(dataloader, faster_rcnn, test_num=10000, prob_thre=0.7): 32 | pred_bboxes, pred_labels, pred_scores = list(), list(), list() 33 | gt_bboxes, gt_labels, gt_difficults = list(), list(), list() 34 | for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)): 35 | # imgs here are reshaped images 36 | # sizes here are the original shape of images 37 | sizes = [sizes[0][0], sizes[1][0]] 38 | pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes], prob_thre=prob_thre) 39 | 40 | gt_bboxes += list(gt_bboxes_.numpy()) 41 | gt_labels += list(gt_labels_.numpy()) 42 | gt_difficults += list(gt_difficults_.numpy()) 43 | 44 | pred_bboxes += pred_bboxes_ 45 | pred_labels += pred_labels_ 46 | pred_scores += pred_scores_ 47 | if ii == test_num: break 48 | 49 | result = eval_detection_voc( 50 | pred_bboxes, pred_labels, pred_scores, 51 | gt_bboxes, gt_labels, gt_difficults, 52 | use_07_metric=True) 53 | return result 54 | 55 | 56 | def train(**kwargs): 57 | opt._parse(kwargs) 58 | 59 | dataset = Dataset(opt) 60 | 61 | # data 62 | print('load data') 63 | dataloader = data_.DataLoader(dataset, 64 | batch_size=1, 65 | shuffle=False, 66 | # pin_memory=True, 67 | num_workers=opt.num_workers) 68 | testset = TestDataset(opt) 69 | test_dataloader = data_.DataLoader(testset, 70 | batch_size=1, 71 | num_workers=opt.test_num_workers, 72 | shuffle=False, 73 | pin_memory=True 74 | ) 75 | 76 | # model and trainer 77 | faster_rcnn = FasterRCNNVGG16() 78 | print('model construct completed') 79 | 80 | trainer = FasterRCNNTrainer(faster_rcnn).cuda() 81 | 82 | if opt.load_path: 83 | trainer.load(opt.load_path) 84 | print('load pretrained model from %s' % opt.load_path) 85 | 86 | trainer.vis.text(dataset.db.label_names, win='labels') 87 | best_map = 0 88 | lr_ = opt.lr 89 | for epoch in range(opt.epoch): 90 | trainer.reset_meters() 91 | for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)): 92 | scale = at.scalar(scale) 93 | img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda() 94 | img, bbox, label = Variable(img), Variable(bbox), Variable(label) 95 | 96 | # print(label) 97 | 98 | # all the input data for one training are : img, bbox, label, scale 99 | trainer.train_step(img, bbox, label, scale) 100 | # training code stop here. 
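                # (added note) trainer.py is not shown in this section; in the base
                # implementation this repo builds on, train_step() zeroes the gradients,
                # runs one forward pass to collect the RPN losses and the head losses
                # (here the LocNet px/py losses), sums them, backpropagates, and takes a
                # single optimizer step while updating the loss meters.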
101 | 102 | 103 | if (ii + 1) % opt.plot_every == 0: 104 | if os.path.exists(opt.debug_file): 105 | ipdb.set_trace() 106 | 107 | # plot loss 108 | trainer.vis.plot_many(trainer.get_meter_data()) 109 | 110 | # plot groud truth bboxes 111 | ori_img_ = inverse_normalize(at.tonumpy(img[0])) 112 | gt_img = visdom_bbox(ori_img_, 113 | at.tonumpy(bbox_[0]), 114 | at.tonumpy(label_[0])) 115 | trainer.vis.img('gt_img', gt_img) 116 | 117 | # plot predicti bboxes 118 | _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True) 119 | 120 | pred_img = visdom_bbox(ori_img_, 121 | at.tonumpy(_bboxes[0]), 122 | at.tonumpy(_labels[0]).reshape(-1), 123 | at.tonumpy(_scores[0])) 124 | trainer.vis.img('pred_img', pred_img) 125 | 126 | # rpn confusion matrix(meter) 127 | trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm') 128 | # roi confusion matrix 129 | trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float()) 130 | 131 | # use the test dataset to eval 132 | eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num, prob_thre=opt.prob_thre) 133 | 134 | print("eval_result", eval_result) 135 | 136 | if eval_result['map'] > best_map: 137 | best_map = eval_result['map'] 138 | best_path = trainer.save(best_map=best_map) 139 | if epoch == 9: 140 | trainer.load(best_path) 141 | trainer.faster_rcnn.scale_lr(opt.lr_decay) 142 | lr_ = lr_ * opt.lr_decay 143 | 144 | trainer.vis.plot('test_map', eval_result['map']) 145 | log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_), 146 | str(eval_result['map']), 147 | str(trainer.get_meter_data())) 148 | trainer.vis.log(log_info) 149 | if epoch == 13: 150 | break 151 | 152 | 153 | 154 | def eval_prob_thre(**kwargs): 155 | ''' 156 | Use the best trained model to find out the best prob_thre, \ 157 | which is used when generating prediction box using px and py. 158 | ''' 159 | opt._parse(kwargs) 160 | 161 | testset = TestDataset(opt) 162 | test_dataloader = data_.DataLoader(testset, 163 | batch_size=1, 164 | num_workers=opt.test_num_workers, 165 | shuffle=False, 166 | pin_memory=True 167 | ) 168 | 169 | # model and trainer 170 | faster_rcnn = FasterRCNNVGG16() 171 | print('model construct completed') 172 | 173 | trainer = FasterRCNNTrainer(faster_rcnn).cuda() 174 | 175 | if opt.load_path: 176 | trainer.load(opt.load_path) 177 | print('load pretrained model from %s' % opt.load_path) 178 | 179 | best_map = 0 180 | best_prob_thre = 0 181 | 182 | for prob_thre in np.linspace(0.3,0.9,7): 183 | 184 | # use the test dataset to eval 185 | eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num, prob_thre=prob_thre) 186 | print("eval_result", eval_result) 187 | if eval_result['map'] > best_map: 188 | best_map = eval_result['map'] 189 | best_prob_thre = prob_thre 190 | 191 | print("best_map is ", best_map) 192 | print("best prob_thre is ", best_prob_thre) 193 | 194 | 195 | 196 | 197 | if __name__ == '__main__': 198 | import fire 199 | 200 | fire.Fire() 201 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # Improved Localization Accuracy by LocNet for Faster R-CNN 2 | 3 | ## 1. 
Introduction 4 | 5 | ![](http://p3rz3gu1u.bkt.clouddn.com/2018-06-22-LocNet-FasterRCNN.001.jpeg) 6 | 7 | This project is a simplified Faster R-CNN **improved by LocNet** (**Loc-Faster-RCNN** for short), built on [Faster R-CNN by chenyuntc](https://github.com/chenyuntc/simple-faster-rcnn-pytorch). It aims to: 8 | 9 | - Improve the localization accuracy of Faster R-CNN by using LocNet in the Fast R-CNN part. 10 | - Provide the first public implementation of the [original paper](https://ieeexplore.ieee.org/abstract/document/8270086/); the authors did not release their version. 11 | - Match the performance reported in the original paper. 12 | 13 | It has the following feature: 14 | 15 | - It can be run as pure Python code, with no build step (the CUDA code is handled by CuPy; Cython acceleration is optional). 16 | 17 | This implementation differs slightly from the original paper: 18 | 19 | - Skip pooling is not used here. Information from the conv5_3 layer (the feature map used by the original Faster R-CNN) is enough for my task, so skip pooling is dropped in this repo. Moreover, with the advent of newer methods such as [Feature Pyramid Networks](https://arxiv.org/abs/1612.03144), skip pooling seems to be obsolete :) 20 | - The RPN is exactly the same as in Faster R-CNN, i.e. only a 3x3 conv is applied, rather than the 3x3 and 5x5 convs of the original paper. 21 | - Training strategy. The original paper trains the RPN and LocNet alternately, whereas this repo backpropagates the RPN and LocNet losses at the same time. 22 | 23 | **prob_thre**: 24 | 25 | - Hyperparameters in Loc-Faster-RCNN are mostly the same as in Faster R-CNN, except for **prob_thre**. 26 | - prob_thre is the probability threshold used when predicting a bounding box: if px or py is greater than prob_thre, that column or row is considered to be part of some object (a small decoding sketch is shown below, after the comparison table in section 2.1.2). 27 | - Different detection tasks may need different values of prob_thre for the best performance. If most objects in your task are dense blocks, a higher prob_thre may work better. 28 | - Choose prob_thre according to the characteristics of your task. Use the **eval_prob_thre** function in train.py to find the best prob_thre for your task, and remember to set the **load_path** variable in **utils/config.py** to your best model before calling it. 29 | 30 | ## 2. Performance 31 | 32 | ### 2.1 Pascal VOC 33 | 34 | The training and test sets of Pascal VOC 2007 are used in this repo. 35 | 36 | #### 2.1.1 mAP 37 | 38 | The best prob_thre for Pascal VOC is 0.5. With prob_thre=0.5, the performance of Loc-Faster-RCNN is listed below. On a dataset like Pascal VOC, Loc-Faster-RCNN cannot beat Faster R-CNN; however, on datasets with many small, dense objects it is likely to achieve better performance. 39 | 40 | | Implementation | mAP | 41 | | :-------------: | :----: | 42 | | Loc-Faster-RCNN | 0.6527 | 43 | | Faster R-CNN | 0.7097 | 44 | 45 | #### 2.1.2 Differences between the models' predictions on Pascal VOC 46 | 47 | - The LocNet part improves the localization accuracy of Loc-Faster-RCNN by predicting probabilities rather than coordinates. This helps when the model is used to detect small or inconspicuous objects; as shown in the first two rows below, Loc-Faster-RCNN detected a person (row 1) and a plant (row 2) even though the objects are small and not obvious.
48 | - However, the LocNet part also hinders the model from identifying small parts of objects that blend into the background more than the main body of the object does, such as the tail of a cat or the wings of a bird, as shown in rows 3-5 below. 49 | - Moreover, if objects overlap or are densely packed in the same image, Loc-Faster-RCNN also has difficulty drawing accurate bounding boxes around them, as shown in the last row below. 50 | 51 | | Ground Truth | Loc-Faster-RCNN | Faster R-CNN | 52 | | :---: | :---: | :---: | 53 | | *(six rows of example detection images; the image links are not preserved in this dump)* | | |
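The actual decoding that uses prob_thre lives in model/faster_rcnn.py, which is not included in this section. Purely as an illustration of the rule described in section 1 (columns with px > prob_thre and rows with py > prob_thre are treated as part of the object, and the box is the span of the surviving columns and rows), here is a minimal NumPy sketch. The function name, the (y_min, x_min, y_max, x_max) search-region convention, and the uniform M-bin layout are assumptions made for the example, not the repo's own code.

```python
import numpy as np

def decode_locnet_box(px, py, search_region, prob_thre=0.5):
    # px, py: arrays of shape (M,), in/out probabilities for the M columns / M rows
    #         of the search region (illustrative layout, assumed uniform bins).
    # search_region: (y_min, x_min, y_max, x_max) in image coordinates (assumed convention).
    y_min, x_min, y_max, x_max = search_region
    M = len(px)
    cols = np.where(px > prob_thre)[0]  # columns judged to belong to the object
    rows = np.where(py > prob_thre)[0]  # rows judged to belong to the object
    if len(cols) == 0 or len(rows) == 0:
        return None  # nothing exceeded the threshold, so no box is produced
    bin_w = (x_max - x_min) / M
    bin_h = (y_max - y_min) / M
    return (y_min + rows.min() * bin_h, x_min + cols.min() * bin_w,
            y_min + (rows.max() + 1) * bin_h, x_min + (cols.max() + 1) * bin_w)
```

A higher prob_thre keeps only the most confident rows and columns, which tends to shrink the predicted box; that is why dense, blob-like objects can tolerate, or even benefit from, a larger threshold.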
| 59 | 60 | ### 2.2 Text detection dataset 61 | 62 | ICDAR-2011 and ICDAR-2013 are used in training and eveluating. 63 | 64 | TBD. 65 | 66 | ## 3. Install dependencies 67 | 68 | This repo is built basically on [Faster R-CNN](https://github.com/chenyuntc/simple-faster-rcnn-pytorch). You can check this repo to see dependencies. 69 | 70 | ## 4. Train 71 | 72 | Compared with Faster R-CNN, Loc-Faster-RCNN is a little bit harder to train. If same initinal learning rate of 1e-3 is applied, the model may not converge after several epoches because px pr py would be nan. So if you encounter the same problem when using Loc-Faster-RCNN on your own dataset, maybe a smaller learning rate of 1e-4 or 1e-5 should work. 73 | 74 | ## Troubleshooting 75 | 76 | 77 | 78 | ## More 79 | 80 | - [x] model structure 81 | - [ ] maybe : skip pooling 82 | - [ ] Maybe : conv 3X3 and conv 5X5 in RPN 83 | - [ ] High likely : Feature Pyramid Network as backbone 84 | - [ ] High likely : RoI Align rather than RoI Pooling 85 | 86 | ## Acknowledgement 87 | 88 | This work builds on many excellent works, which include: 89 | 90 | - [Faster R-CNN by chenyuntc](https://github.com/chenyuntc/simple-faster-rcnn-pytorch), on which this repo is built on. The best implementation of Faster R-CNN in Pytorch I've ever seen. 91 | 92 | - [LocNet by the paper author](https://github.com/gidariss/LocNet). 93 | 94 | 95 | 96 | *** 97 | 98 | Licensed under MIT, see the LICENSE for more detail. 99 | 100 | Contribution Welcome. 101 | 102 | If you encounter any problem, feel free to open an issue. 103 | 104 | Correct me if anything is wrong or unclear. -------------------------------------------------------------------------------- /model/faster_rcnn_vgg16.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torchvision.models import vgg16 5 | from model.region_proposal_network import RegionProposalNetwork 6 | from model.faster_rcnn import FasterRCNN 7 | from model.roi_module import RoIPooling2D 8 | from utils import array_tool as at 9 | from utils.config import opt 10 | 11 | 12 | def decom_vgg16(): 13 | # the 30th layer of features is relu of conv5_3 14 | 15 | # use either caffe or pytorch pretrained model 16 | if opt.caffe_pretrain: 17 | model = vgg16(pretrained=False) 18 | if not opt.load_path: 19 | model.load_state_dict(t.load(opt.caffe_pretrain_path)) 20 | else: 21 | model = vgg16(not opt.load_path) # use pretrained torchvision vgg net 22 | 23 | features = list(model.features)[:30] 24 | 25 | # get the classification layer and drop some of them, leave the rest for use 26 | 27 | # classifier defined in pytorch source code. 28 | # self.classifier = nn.Sequential( 29 | # 0 nn.Linear(512 * 7 * 7, 4096), 30 | # 1 nn.ReLU(True), 31 | # 2 nn.Dropout(), 32 | # 3 nn.Linear(4096, 4096), 33 | # 4 nn.ReLU(True), 34 | # 5 nn.Dropout(), 35 | # 6 nn.Linear(4096, num_classes), 36 | # ) 37 | # only two linear and two ReLU layers are kept for classifier. 38 | 39 | classifier = model.classifier 40 | classifier = list(classifier) 41 | del classifier[6] 42 | if not opt.use_drop: 43 | del classifier[5] 44 | del classifier[2] 45 | classifier = nn.Sequential(*classifier) 46 | 47 | # freeze top4 conv 48 | for layer in features[:10]: 49 | for p in layer.parameters(): 50 | p.requires_grad = False 51 | 52 | return nn.Sequential(*features), classifier 53 | 54 | 55 | class FasterRCNNVGG16(FasterRCNN): 56 | """Faster R-CNN based on VGG-16. 
57 | For descriptions on the interface of this model, please refer to 58 | :class:`model.faster_rcnn.FasterRCNN`. 59 | 60 | Args: 61 | n_fg_class (int): The number of classes excluding the background. 62 | ratios (list of floats): This is ratios of width to height of 63 | the anchors. 64 | anchor_scales (list of numbers): This is areas of anchors. 65 | Those areas will be the product of the square of an element in 66 | :obj:`anchor_scales` and the original area of the reference 67 | window. 68 | 69 | """ 70 | 71 | 72 | feat_stride = 16 # downsample 16x for output of conv5 in vgg16 73 | 74 | def __init__(self, 75 | n_fg_class=20, 76 | ratios=[0.5, 1, 2], 77 | anchor_scales=[8, 16, 32] 78 | ): 79 | # extractor is for base net of faster rcnn and classifier is for the final ROIHead part. 80 | # These are just some layers, not values. 81 | extractor, classifier = decom_vgg16() 82 | 83 | rpn = RegionProposalNetwork( 84 | 512, 512, 85 | ratios=ratios, 86 | anchor_scales=anchor_scales, 87 | feat_stride=self.feat_stride, 88 | ) 89 | 90 | head = VGG16RoIHead( 91 | n_class=n_fg_class + 1, 92 | roi_size=7, 93 | spatial_scale=(1. / self.feat_stride), 94 | M=28, 95 | classifier=classifier 96 | ) 97 | 98 | super(FasterRCNNVGG16, self).__init__( 99 | extractor, 100 | rpn, 101 | head, 102 | ) 103 | 104 | 105 | class VGG16RoIHead(nn.Module): 106 | """Faster R-CNN Head for VGG-16 based implementation. 107 | This class is used as a head for Faster R-CNN. 108 | This outputs class-wise localizations and classification based on feature 109 | maps in the given RoIs. 110 | 111 | Args: 112 | n_class (int): The number of classes possibly including the background. 113 | roi_size (int): Height and width of the feature maps after RoI-pooling. 114 | spatial_scale (float): Scale of the roi is resized. 115 | classifier (nn.Module): Two layer Linear ported from vgg16 116 | 117 | """ 118 | 119 | def __init__(self, n_class, roi_size, spatial_scale, M, 120 | classifier): 121 | # n_class includes the background 122 | super(VGG16RoIHead, self).__init__() 123 | 124 | self.n_class = n_class 125 | self.roi_size = roi_size 126 | self.spatial_scale = spatial_scale 127 | self.M = M 128 | 129 | 130 | # branch_1 131 | self.roi_1 = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale) # roi shape of (N, C, outh, outw) 132 | self.classifier = classifier 133 | self.score = nn.Linear(4096, n_class) 134 | 135 | # branch_2 136 | self.roi_2 = RoIPooling2D(self.roi_size*2, self.roi_size*2, self.spatial_scale) # roi shape of (N, C, outh*2, outw*2) 137 | self.conv_21 = nn.Conv2d(512, 512, (3,3), padding=1) 138 | self.conv_22 = nn.Conv2d(512, 512, (3,3), padding=1) # output shape (1, 512, 14, 14) 139 | self.max_x = nn.MaxPool2d((14,1)) # output shape (1, 512, 1, 14) 140 | self.max_y = nn.MaxPool2d((1,14)) # output shape (1, 512, 14, 1) 141 | self.fc_x = nn.Linear(7168, M) 142 | self.fc_y = nn.Linear(7168, M) 143 | 144 | 145 | normal_init(self.score, 0, 0.01) 146 | normal_init(self.conv_21, 0, 0.01) 147 | normal_init(self.conv_22, 0, 0.01) 148 | normal_init(self.fc_x, 0, 0.01) 149 | normal_init(self.fc_y, 0, 0.01) 150 | 151 | 152 | def forward(self, x, rois, seach_regions, roi_indices): 153 | """Forward the chain. 154 | 155 | We assume that there are :math:`N` batches. 156 | 157 | Args: 158 | x (Variable): 4D image variable. (batch_size, channels, width, height) 159 | rois (Tensor): A bounding box array containing coordinates of 160 | proposal boxes. This is a concatenation of bounding box 161 | arrays from multiple images in the batch. 
162 | Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed 163 | RoIs from the :math:`i` th image, 164 | :math:`R' = \\sum _{i=1} ^ N R_i`. 165 | roi_indices (Tensor): An array containing indices of images to 166 | which bounding boxes correspond to. Its shape is :math:`(R',)`. 167 | 168 | """ 169 | # in case roi_indices is ndarray 170 | 171 | roi_indices = at.totensor(roi_indices).float() 172 | 173 | rois = at.totensor(rois).float() 174 | indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1) 175 | # NOTE: important: yx->xy 176 | xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]] 177 | indices_and_rois = t.autograd.Variable(xy_indices_and_rois.contiguous()) # [index, x1, y1, x2, y2] now 178 | 179 | seach_regions = at.totensor(seach_regions).float() 180 | indices_and_search_regions = t.cat([roi_indices[:, None], seach_regions], dim=1) 181 | # NOTE: important: yx->xy 182 | xy_indices_and_search_regions = indices_and_search_regions[:, [0, 2, 1, 4, 3]] 183 | indices_and_search_regions = t.autograd.Variable(xy_indices_and_search_regions.contiguous()) # [index, x1, y1, x2, y2] now 184 | 185 | # branch_1 186 | pool_1 = self.roi_1(x, indices_and_rois) # get all the ROI pooling, shape of (N, C, outh, outw) 187 | pool_1 = pool_1.view(pool_1.size(0), -1) # shape of shape of (N, C * outh * outw) where C=512 188 | fc7 = self.classifier(pool_1) 189 | roi_scores = self.score(fc7) 190 | 191 | # branch_2 192 | pool_2 = self.roi_2(x, indices_and_search_regions) 193 | conv_1 = self.conv_21(pool_2) 194 | conv_2 = self.conv_22(conv_1) 195 | max_x_ = self.max_x(conv_2) 196 | max_y_ = self.max_y(conv_2) 197 | max_x_ = max_x_.view(max_x_.size(0), -1) 198 | max_y_ = max_y_.view(max_y_.size(0), -1) 199 | px = F.sigmoid(self.fc_x(max_x_)) 200 | py = F.sigmoid(self.fc_y(max_y_)) 201 | 202 | return (px, py), roi_scores 203 | 204 | 205 | def normal_init(m, mean, stddev, truncated=False): 206 | """ 207 | weight initalizer: truncated normal and random normal. 208 | """ 209 | # x is a parameter 210 | if truncated: 211 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 212 | else: 213 | m.weight.data.normal_(mean, stddev) 214 | m.bias.data.zero_() 215 | -------------------------------------------------------------------------------- /utils/vis_tool.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import matplotlib 5 | import torch as t 6 | import visdom 7 | 8 | matplotlib.use('Agg') 9 | from matplotlib import pyplot as plot 10 | 11 | # from data.voc_dataset import VOC_BBOX_LABEL_NAMES 12 | 13 | 14 | VOC_BBOX_LABEL_NAMES = ( 15 | 'fly', 16 | 'bike', 17 | 'bird', 18 | 'boat', 19 | 'pin', 20 | 'bus', 21 | 'c', 22 | 'cat', 23 | 'chair', 24 | 'cow', 25 | 'table', 26 | 'dog', 27 | 'horse', 28 | 'moto', 29 | 'p', 30 | 'plant', 31 | 'shep', 32 | 'sofa', 33 | 'train', 34 | 'tv', 35 | ) 36 | 37 | 38 | def vis_image(img, ax=None): 39 | """Visualize a color image. 40 | 41 | Args: 42 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`. 43 | This is in RGB format and the range of its value is 44 | :math:`[0, 255]`. 45 | ax (matplotlib.axes.Axis): The visualization is displayed on this 46 | axis. If this is :obj:`None` (default), a new axis is created. 47 | 48 | Returns: 49 | ~matploblib.axes.Axes: 50 | Returns the Axes object with the plot for further tweaking. 
51 | 52 | """ 53 | 54 | if ax is None: 55 | fig = plot.figure() 56 | ax = fig.add_subplot(1, 1, 1) 57 | # CHW -> HWC 58 | img = img.transpose((1, 2, 0)) 59 | ax.imshow(img.astype(np.uint8)) 60 | return ax 61 | 62 | 63 | def vis_bbox(img, bbox, label=None, score=None, ax=None): 64 | """Visualize bounding boxes inside image. 65 | 66 | Args: 67 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`. 68 | This is in RGB format and the range of its value is 69 | :math:`[0, 255]`. 70 | bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where 71 | :math:`R` is the number of bounding boxes in the image. 72 | Each element is organized 73 | by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis. 74 | label (~numpy.ndarray): An integer array of shape :math:`(R,)`. 75 | The values correspond to id for label names stored in 76 | :obj:`label_names`. This is optional. 77 | score (~numpy.ndarray): A float array of shape :math:`(R,)`. 78 | Each value indicates how confident the prediction is. 79 | This is optional. 80 | label_names (iterable of strings): Name of labels ordered according 81 | to label ids. If this is :obj:`None`, labels will be skipped. 82 | ax (matplotlib.axes.Axis): The visualization is displayed on this 83 | axis. If this is :obj:`None` (default), a new axis is created. 84 | 85 | Returns: 86 | ~matploblib.axes.Axes: 87 | Returns the Axes object with the plot for further tweaking. 88 | 89 | """ 90 | 91 | label_names = list(VOC_BBOX_LABEL_NAMES) + ['bg'] 92 | # add for index `-1` 93 | if label is not None and not len(bbox) == len(label): 94 | raise ValueError('The length of label must be same as that of bbox') 95 | if score is not None and not len(bbox) == len(score): 96 | raise ValueError('The length of score must be same as that of bbox') 97 | 98 | # Returns newly instantiated matplotlib.axes.Axes object if ax is None 99 | ax = vis_image(img, ax=ax) 100 | 101 | # If there is no bounding box to display, visualize the image and exit. 102 | if len(bbox) == 0: 103 | return ax 104 | 105 | for i, bb in enumerate(bbox): 106 | xy = (bb[1], bb[0]) 107 | height = bb[2] - bb[0] 108 | width = bb[3] - bb[1] 109 | ax.add_patch(plot.Rectangle( 110 | xy, width, height, fill=False, edgecolor='red', linewidth=2)) 111 | 112 | caption = list() 113 | 114 | if label is not None and label_names is not None: 115 | lb = label[i] 116 | if not (-1 <= lb < len(label_names)): # modfy here to add backgroud 117 | raise ValueError('No corresponding name is given') 118 | caption.append(label_names[lb]) 119 | if score is not None: 120 | sc = score[i] 121 | caption.append('{:.2f}'.format(sc)) 122 | 123 | if len(caption) > 0: 124 | ax.text(bb[1], bb[0], 125 | ': '.join(caption), 126 | style='italic', 127 | bbox={'facecolor': 'white', 'alpha': 0.5, 'pad': 0}) 128 | return ax 129 | 130 | 131 | def fig2data(fig): 132 | """ 133 | brief Convert a Matplotlib figure to a 4D numpy array with RGBA 134 | channels and return it 135 | 136 | @param fig: a matplotlib figure 137 | @return a numpy 3D array of RGBA values 138 | """ 139 | # draw the renderer 140 | fig.canvas.draw() 141 | 142 | # Get the RGBA buffer from the figure 143 | w, h = fig.canvas.get_width_height() 144 | buf = np.fromstring(fig.canvas.tostring_argb(), dtype=np.uint8) 145 | buf.shape = (w, h, 4) 146 | 147 | # canvas.tostring_argb give pixmap in ARGB mode. 
Roll the ALPHA channel to have it in RGBA mode 148 | buf = np.roll(buf, 3, axis=2) 149 | return buf.reshape(h, w, 4) 150 | 151 | 152 | def fig4vis(fig): 153 | """ 154 | convert figure to ndarray 155 | """ 156 | ax = fig.get_figure() 157 | img_data = fig2data(ax).astype(np.int32) 158 | plot.close() 159 | # HWC->CHW 160 | return img_data[:, :, :3].transpose((2, 0, 1)) / 255. 161 | 162 | 163 | def visdom_bbox(*args, **kwargs): 164 | fig = vis_bbox(*args, **kwargs) 165 | data = fig4vis(fig) 166 | return data 167 | 168 | 169 | class Visualizer(object): 170 | """ 171 | wrapper for visdom 172 | you can still access naive visdom function by 173 | self.line, self.scater,self._send,etc. 174 | due to the implementation of `__getattr__` 175 | """ 176 | 177 | def __init__(self, env='default', **kwargs): 178 | self.vis = visdom.Visdom(env=env, **kwargs) 179 | self._vis_kw = kwargs 180 | 181 | # e.g.(’loss',23) the 23th value of loss 182 | self.index = {} 183 | self.log_text = '' 184 | 185 | def reinit(self, env='default', **kwargs): 186 | """ 187 | change the config of visdom 188 | """ 189 | self.vis = visdom.Visdom(env=env, **kwargs) 190 | return self 191 | 192 | def plot_many(self, d): 193 | """ 194 | plot multi values 195 | @params d: dict (name,value) i.e. ('loss',0.11) 196 | """ 197 | for k, v in d.items(): 198 | if v is not None: 199 | self.plot(k, v) 200 | 201 | def img_many(self, d): 202 | for k, v in d.items(): 203 | self.img(k, v) 204 | 205 | def plot(self, name, y, **kwargs): 206 | """ 207 | self.plot('loss',1.00) 208 | """ 209 | x = self.index.get(name, 0) 210 | self.vis.line(Y=np.array([y]), X=np.array([x]), 211 | win=name, 212 | opts=dict(title=name), 213 | update=None if x == 0 else 'append', 214 | **kwargs 215 | ) 216 | self.index[name] = x + 1 217 | 218 | def img(self, name, img_, **kwargs): 219 | """ 220 | self.img('input_img',t.Tensor(64,64)) 221 | self.img('input_imgs',t.Tensor(3,64,64)) 222 | self.img('input_imgs',t.Tensor(100,1,64,64)) 223 | self.img('input_imgs',t.Tensor(100,3,64,64),nrows=10) 224 | !!!don‘t ~~self.img('input_imgs',t.Tensor(100,64,64),nrows=10)~~!!! 225 | """ 226 | self.vis.images(t.Tensor(img_).cpu().numpy(), 227 | win=name, 228 | opts=dict(title=name), 229 | **kwargs 230 | ) 231 | 232 | def log(self, info, win='log_text'): 233 | """ 234 | self.log({'loss':1,'lr':0.0001}) 235 | """ 236 | self.log_text += ('[{time}] {info}
'.format( 237 | time=time.strftime('%m%d_%H%M%S'), \ 238 | info=info)) 239 | self.vis.text(self.log_text, win) 240 | 241 | def __getattr__(self, name): 242 | return getattr(self.vis, name) 243 | 244 | def state_dict(self): 245 | return { 246 | 'index': self.index, 247 | 'vis_kw': self._vis_kw, 248 | 'log_text': self.log_text, 249 | 'env': self.vis.env 250 | } 251 | 252 | def load_state_dict(self, d): 253 | self.vis = visdom.Visdom(env=d.get('env', self.vis.env), **(self.d.get('vis_kw'))) 254 | self.log_text = d.get('log_text', '') 255 | self.index = d.get('index', dict()) 256 | return self 257 | -------------------------------------------------------------------------------- /model/region_proposal_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.nn import functional as F 3 | import torch as t 4 | from torch import nn 5 | 6 | from model.utils.bbox_tools import generate_anchor_base 7 | from model.utils.creator_tool import ProposalCreator 8 | 9 | 10 | class RegionProposalNetwork(nn.Module): 11 | """Region Proposal Network introduced in Faster R-CNN. 12 | 13 | This is Region Proposal Network introduced in Faster R-CNN [#]_. 14 | This takes features extracted from images and propose 15 | class agnostic bounding boxes around "objects". 16 | 17 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 18 | Faster R-CNN: Towards Real-Time Object Detection with \ 19 | Region Proposal Networks. NIPS 2015. 20 | 21 | Args: 22 | in_channels (int): The channel size of input. 23 | mid_channels (int): The channel size of the intermediate tensor. 24 | ratios (list of floats): This is ratios of width to height of 25 | the anchors. 26 | anchor_scales (list of numbers): This is areas of anchors. 27 | Those areas will be the product of the square of an element in 28 | :obj:`anchor_scales` and the original area of the reference 29 | window. 30 | feat_stride (int): Stride size after extracting features from an 31 | image. 32 | initialW (callable): Initial weight value. If :obj:`None` then this 33 | function uses Gaussian distribution scaled by 0.1 to 34 | initialize weight. 35 | May also be a callable that takes an array and edits its values. 36 | proposal_creator_params (dict): Key valued paramters for 37 | :class:`model.utils.creator_tools.ProposalCreator`. 38 | 39 | .. seealso:: 40 | :class:`~model.utils.creator_tools.ProposalCreator` 41 | 42 | """ 43 | 44 | def __init__( 45 | self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2], 46 | anchor_scales=[8, 16, 32], feat_stride=16, 47 | proposal_creator_params=dict(), 48 | ): 49 | super(RegionProposalNetwork, self).__init__() 50 | 51 | # 在reshaped image的尺度上,以feature map上一个点对应的一个16*16的左上角点为原点,计算得到的所有anchorbox的角点的相对坐标 52 | # 为了后边计算reshaped image上的所有anchor box做准备 53 | self.anchor_base = generate_anchor_base(anchor_scales=anchor_scales, ratios=ratios) 54 | self.feat_stride = feat_stride 55 | self.proposal_layer = ProposalCreator(self, **proposal_creator_params) # parent_model = instance of RegionProposalNetwork, and use other default parameters 56 | n_anchor = self.anchor_base.shape[0] 57 | self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1) 58 | self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0) 59 | self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0) 60 | normal_init(self.conv1, 0, 0.01) 61 | normal_init(self.score, 0, 0.01) 62 | normal_init(self.loc, 0, 0.01) 63 | 64 | def forward(self, x, img_size, scale=1.): 65 | """Forward Region Proposal Network. 
66 | 67 | Here are notations. 68 | 69 | * :math:`N` is batch size. 70 | * :math:`C` channel size of the input. 71 | * :math:`H` and :math:`W` are height and witdh of the input feature. 72 | * :math:`A` is number of anchors assigned to each pixel. 73 | 74 | Args: 75 | x (~torch.autograd.Variable): The Features extracted from images. 76 | Its shape is :math:`(N, C, H, W)`. 77 | img_size (tuple of ints): A tuple :obj:`height, width`, 78 | which contains image size after scaling. 79 | scale (float): The amount of scaling done to the input images after 80 | reading them from files. 81 | 82 | Returns: 83 | (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array): 84 | 85 | This is a tuple of five following values. 86 | 87 | * **rpn_locs**: Predicted bounding box offsets and scales for anchors. Its shape is :math:`(N, H W A, 4)`. 88 | * **rpn_scores**: Predicted foreground scores for anchors. Its shape is :math:`(N, H W A, 2)`. 89 | * **rois**: A bounding box array containing coordinates of proposal boxes. This is a concatenation of bounding box \ 90 | arrays from multiple images in the batch. Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \ 91 | bounding boxes from the :math:`i` th image, :math:`R' = \\sum _{i=1} ^ N R_i`. 92 | * **roi_indices**: An array containing indices of images to which RoIs correspond to. Its shape is :math:`(R',)`. 93 | * **anchor**: Coordinates of enumerated shifted anchors. Its shape is :math:`(H W A, 4)`. 94 | 95 | """ 96 | n, _, hh, ww = x.shape # n is always 1 here. 97 | 98 | # reshaped image中的所有anchor box, shape (hh*ww*n_anchor, 4) 99 | anchor = _enumerate_shifted_anchor(np.array(self.anchor_base), 100 | self.feat_stride, hh, ww) 101 | 102 | n_anchor = anchor.shape[0] // (hh * ww) # feature map中每一个点上的anchor box数量 103 | h = F.relu(self.conv1(x)) 104 | 105 | rpn_locs = self.loc(h) 106 | rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4) # shape (n, hh*ww*n_anchor, 4) 107 | 108 | rpn_scores = self.score(h) 109 | rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous() 110 | rpn_scores = rpn_scores.view(n, -1, 2) # shape (n, hh*ww*n_anchor, 2) 111 | 112 | rpn_fg_scores = rpn_scores.view(n, hh, ww, n_anchor, 2)[:, :, :, :, 1].contiguous() # 该anchor是前景的概率 113 | rpn_fg_scores = rpn_fg_scores.view(n, -1) # shape (n, hh*ww*n_anchor) 114 | 115 | rois = list() 116 | search_regions = list() 117 | roi_indices = list() 118 | for i in range(n): 119 | roi, search_region = self.proposal_layer(rpn_locs[i].cpu().data.numpy(), 120 | rpn_fg_scores[i].cpu().data.numpy(), 121 | anchor, img_size, 122 | scale=scale) 123 | batch_index = i * np.ones((len(roi),), dtype=np.int32) 124 | rois.append(roi) 125 | search_regions.append(search_region) 126 | roi_indices.append(batch_index) 127 | 128 | rois = np.concatenate(rois, axis=0) # shape (num_rois, 4) 129 | search_regions = np.concatenate(search_regions, axis=0) # shape (num_rois, 4) 130 | roi_indices = np.concatenate(roi_indices, axis=0) # shape (num_rois,) 131 | return rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor 132 | 133 | 134 | 135 | def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width): 136 | # Enumerate all shifted anchors: 137 | # 138 | # add A anchors (1, A, 4) to 139 | # cell K shifts (K, 1, 4) to get 140 | # shift anchors (K, A, 4) 141 | # reshape to (K*A, 4) shifted anchors 142 | # return (K*A, 4) 143 | 144 | # !TODO: add support for torch.CudaTensor 145 | # xp = cuda.get_array_module(anchor_base) 146 | # it seems that it can't be boosed using GPU 147 | 
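    # Worked example with illustrative numbers (not taken from the repo): for
    # feat_stride=16 and a 37x50 feature map, shift_x/shift_y enumerate the
    # top-left corner of every 16x16 cell, giving K = 37*50 = 1850 shifts.
    # Broadcasting the (1, A, 4) anchor_base against the (K, 1, 4) shifts and
    # reshaping yields K*A anchors; with the default A = 9 base anchors that
    # is 16650 (y_min, x_min, y_max, x_max) boxes in resized-image coordinates.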
import numpy as xp 148 | shift_y = xp.arange(0, height * feat_stride, feat_stride) 149 | shift_x = xp.arange(0, width * feat_stride, feat_stride) 150 | shift_x, shift_y = xp.meshgrid(shift_x, shift_y) 151 | shift = xp.stack((shift_y.ravel(), shift_x.ravel(), 152 | shift_y.ravel(), shift_x.ravel()), axis=1) 153 | 154 | A = anchor_base.shape[0] 155 | K = shift.shape[0] 156 | anchor = anchor_base.reshape((1, A, 4)) + \ 157 | shift.reshape((1, K, 4)).transpose((1, 0, 2)) 158 | anchor = anchor.reshape((K * A, 4)).astype(np.float32) 159 | return anchor 160 | 161 | 162 | def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width): 163 | # Enumerate all shifted anchors: 164 | # 165 | # add A anchors (1, A, 4) to 166 | # cell K shifts (K, 1, 4) to get 167 | # shift anchors (K, A, 4) 168 | # reshape to (K*A, 4) shifted anchors 169 | # return (K*A, 4) 170 | 171 | # !TODO: add support for torch.CudaTensor 172 | # xp = cuda.get_array_module(anchor_base) 173 | import torch as t 174 | shift_y = t.arange(0, height * feat_stride, feat_stride) 175 | shift_x = t.arange(0, width * feat_stride, feat_stride) 176 | shift_x, shift_y = xp.meshgrid(shift_x, shift_y) 177 | shift = xp.stack((shift_y.ravel(), shift_x.ravel(), 178 | shift_y.ravel(), shift_x.ravel()), axis=1) 179 | 180 | A = anchor_base.shape[0] 181 | K = shift.shape[0] 182 | anchor = anchor_base.reshape((1, A, 4)) + \ 183 | shift.reshape((1, K, 4)).transpose((1, 0, 2)) 184 | anchor = anchor.reshape((K * A, 4)).astype(np.float32) 185 | return anchor 186 | 187 | 188 | def normal_init(m, mean, stddev, truncated=False): 189 | """ 190 | weight initalizer: truncated normal and random normal. 191 | """ 192 | # x is a parameter 193 | if truncated: 194 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 195 | else: 196 | m.weight.data.normal_(mean, stddev) 197 | m.bias.data.zero_() 198 | -------------------------------------------------------------------------------- /data/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import random 4 | 5 | 6 | def read_image(path, dtype=np.float32, color=True): 7 | """Read an image from a file. 8 | 9 | This function reads an image from given file. The image is CHW format and 10 | the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the 11 | order of the channels is RGB. 12 | 13 | Args: 14 | path (str): A path of image file. 15 | dtype: The type of array. The default value is :obj:`~numpy.float32`. 16 | color (bool): This option determines the number of channels. 17 | If :obj:`True`, the number of channels is three. In this case, 18 | the order of the channels is RGB. This is the default behaviour. 19 | If :obj:`False`, this function returns a grayscale image. 20 | 21 | Returns: 22 | ~numpy.ndarray: An image. 23 | """ 24 | 25 | f = Image.open(path) 26 | try: 27 | if color: 28 | img = f.convert('RGB') 29 | else: 30 | img = f.convert('P') 31 | img = np.asarray(img, dtype=dtype) 32 | finally: 33 | if hasattr(f, 'close'): 34 | f.close() 35 | 36 | if img.ndim == 2: 37 | # reshape (H, W) -> (1, H, W) 38 | return img[np.newaxis] 39 | else: 40 | # transpose (H, W, C) -> (C, H, W) 41 | return img.transpose((2, 0, 1)) 42 | 43 | 44 | def resize_bbox(bbox, in_size, out_size): 45 | """Resize bounding boxes according to image resize. 
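
    A quick sketch with invented numbers: resizing from in_size=(600, 800)
    to out_size=(300, 400) gives y_scale = x_scale = 0.5, so a box
    (100, 200, 300, 400) becomes (50, 100, 150, 200).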
46 | 47 | The bounding boxes are expected to be packed into a two dimensional 48 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 49 | bounding boxes in the image. The second axis represents attributes of 50 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 51 | where the four attributes are coordinates of the top left and the 52 | bottom right vertices. 53 | 54 | Args: 55 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 56 | :math:`R` is the number of bounding boxes. 57 | in_size (tuple): A tuple of length 2. The height and the width 58 | of the image before resized. 59 | out_size (tuple): A tuple of length 2. The height and the width 60 | of the image after resized. 61 | 62 | Returns: 63 | ~numpy.ndarray: 64 | Bounding boxes rescaled according to the given image shapes. 65 | 66 | """ 67 | bbox = bbox.copy() 68 | y_scale = float(out_size[0]) / in_size[0] 69 | x_scale = float(out_size[1]) / in_size[1] 70 | bbox[:, 0] = y_scale * bbox[:, 0] 71 | bbox[:, 2] = y_scale * bbox[:, 2] 72 | bbox[:, 1] = x_scale * bbox[:, 1] 73 | bbox[:, 3] = x_scale * bbox[:, 3] 74 | return bbox 75 | 76 | 77 | def flip_bbox(bbox, size, y_flip=False, x_flip=False): 78 | """Flip bounding boxes accordingly. 79 | 80 | The bounding boxes are expected to be packed into a two dimensional 81 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 82 | bounding boxes in the image. The second axis represents attributes of 83 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 84 | where the four attributes are coordinates of the top left and the 85 | bottom right vertices. 86 | 87 | Args: 88 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 89 | :math:`R` is the number of bounding boxes. 90 | size (tuple): A tuple of length 2. The height and the width 91 | of the image before resized. 92 | y_flip (bool): Flip bounding box according to a vertical flip of 93 | an image. 94 | x_flip (bool): Flip bounding box according to a horizontal flip of 95 | an image. 96 | 97 | Returns: 98 | ~numpy.ndarray: 99 | Bounding boxes flipped according to the given flips. 100 | 101 | """ 102 | H, W = size 103 | bbox = bbox.copy() 104 | if y_flip: 105 | y_max = H - bbox[:, 0] 106 | y_min = H - bbox[:, 2] 107 | bbox[:, 0] = y_min 108 | bbox[:, 2] = y_max 109 | if x_flip: 110 | x_max = W - bbox[:, 1] 111 | x_min = W - bbox[:, 3] 112 | bbox[:, 1] = x_min 113 | bbox[:, 3] = x_max 114 | return bbox 115 | 116 | 117 | def crop_bbox( 118 | bbox, y_slice=None, x_slice=None, 119 | allow_outside_center=True, return_param=False): 120 | """Translate bounding boxes to fit within the cropped area of an image. 121 | 122 | This method is mainly used together with image cropping. 123 | This method translates the coordinates of bounding boxes like 124 | :func:`data.util.translate_bbox`. In addition, 125 | this function truncates the bounding boxes to fit within the cropped area. 126 | If a bounding box does not overlap with the cropped area, 127 | this bounding box will be removed. 128 | 129 | The bounding boxes are expected to be packed into a two dimensional 130 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 131 | bounding boxes in the image. The second axis represents attributes of 132 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 133 | where the four attributes are coordinates of the top left and the 134 | bottom right vertices. 135 | 136 | Args: 137 | bbox (~numpy.ndarray): Bounding boxes to be transformed. 
The shape is 138 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 139 | y_slice (slice): The slice of y axis. 140 | x_slice (slice): The slice of x axis. 141 | allow_outside_center (bool): If this argument is :obj:`False`, 142 | bounding boxes whose centers are outside of the cropped area 143 | are removed. The default value is :obj:`True`. 144 | return_param (bool): If :obj:`True`, this function returns 145 | indices of kept bounding boxes. 146 | 147 | Returns: 148 | ~numpy.ndarray or (~numpy.ndarray, dict): 149 | 150 | If :obj:`return_param = False`, returns an array :obj:`bbox`. 151 | 152 | If :obj:`return_param = True`, 153 | returns a tuple whose elements are :obj:`bbox, param`. 154 | :obj:`param` is a dictionary of intermediate parameters whose 155 | contents are listed below with key, value-type and the description 156 | of the value. 157 | 158 | * **index** (*numpy.ndarray*): An array holding indices of used \ 159 | bounding boxes. 160 | 161 | """ 162 | 163 | t, b = _slice_to_bounds(y_slice) 164 | l, r = _slice_to_bounds(x_slice) 165 | crop_bb = np.array((t, l, b, r)) 166 | 167 | if allow_outside_center: 168 | mask = np.ones(bbox.shape[0], dtype=bool) 169 | else: 170 | center = (bbox[:, :2] + bbox[:, 2:]) / 2 171 | mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \ 172 | .all(axis=1) 173 | 174 | bbox = bbox.copy() 175 | bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2]) 176 | bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:]) 177 | bbox[:, :2] -= crop_bb[:2] 178 | bbox[:, 2:] -= crop_bb[:2] 179 | 180 | mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1)) 181 | bbox = bbox[mask] 182 | 183 | if return_param: 184 | return bbox, {'index': np.flatnonzero(mask)} 185 | else: 186 | return bbox 187 | 188 | 189 | def _slice_to_bounds(slice_): 190 | if slice_ is None: 191 | return 0, np.inf 192 | 193 | if slice_.start is None: 194 | l = 0 195 | else: 196 | l = slice_.start 197 | 198 | if slice_.stop is None: 199 | u = np.inf 200 | else: 201 | u = slice_.stop 202 | 203 | return l, u 204 | 205 | 206 | def translate_bbox(bbox, y_offset=0, x_offset=0): 207 | """Translate bounding boxes. 208 | 209 | This method is mainly used together with image transforms, such as padding 210 | and cropping, which translates the left top point of the image from 211 | coordinate :math:`(0, 0)` to coordinate 212 | :math:`(y, x) = (y_{offset}, x_{offset})`. 213 | 214 | The bounding boxes are expected to be packed into a two dimensional 215 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 216 | bounding boxes in the image. The second axis represents attributes of 217 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 218 | where the four attributes are coordinates of the top left and the 219 | bottom right vertices. 220 | 221 | Args: 222 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is 223 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 224 | y_offset (int or float): The offset along y axis. 225 | x_offset (int or float): The offset along x axis. 226 | 227 | Returns: 228 | ~numpy.ndarray: 229 | Bounding boxes translated according to the given offsets. 230 | 231 | """ 232 | 233 | out_bbox = bbox.copy() 234 | out_bbox[:, :2] += (y_offset, x_offset) 235 | out_bbox[:, 2:] += (y_offset, x_offset) 236 | 237 | return out_bbox 238 | 239 | 240 | def random_flip(img, y_random=False, x_random=False, 241 | return_param=False, copy=False): 242 | """Randomly flip an image in vertical or horizontal direction. 
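
    A typical pairing (illustrative, assuming a CHW image and the flip_bbox
    helper above): img, param = random_flip(img, x_random=True,
    return_param=True) followed by
    bbox = flip_bbox(bbox, img.shape[1:], x_flip=param['x_flip'])
    keeps the boxes aligned with the flipped image.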
243 | 244 | Args: 245 | img (~numpy.ndarray): An array that gets flipped. This is in 246 | CHW format. 247 | y_random (bool): Randomly flip in vertical direction. 248 | x_random (bool): Randomly flip in horizontal direction. 249 | return_param (bool): Returns information of flip. 250 | copy (bool): If False, a view of :obj:`img` will be returned. 251 | 252 | Returns: 253 | ~numpy.ndarray or (~numpy.ndarray, dict): 254 | 255 | If :obj:`return_param = False`, 256 | returns an array :obj:`out_img` that is the result of flipping. 257 | 258 | If :obj:`return_param = True`, 259 | returns a tuple whose elements are :obj:`out_img, param`. 260 | :obj:`param` is a dictionary of intermediate parameters whose 261 | contents are listed below with key, value-type and the description 262 | of the value. 263 | 264 | * **y_flip** (*bool*): Whether the image was flipped in the\ 265 | vertical direction or not. 266 | * **x_flip** (*bool*): Whether the image was flipped in the\ 267 | horizontal direction or not. 268 | 269 | """ 270 | y_flip, x_flip = False, False 271 | if y_random: 272 | y_flip = random.choice([True, False]) 273 | if x_random: 274 | x_flip = random.choice([True, False]) 275 | 276 | if y_flip: 277 | img = img[:, ::-1, :] 278 | if x_flip: 279 | img = img[:, :, ::-1] 280 | 281 | if copy: 282 | img = img.copy() 283 | 284 | if return_param: 285 | return img, {'y_flip': y_flip, 'x_flip': x_flip} 286 | else: 287 | return img 288 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import time 3 | from torch.nn import functional as F 4 | from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator 5 | 6 | from torch import nn 7 | import torch as t 8 | from torch.autograd import Variable 9 | from utils import array_tool as at 10 | from utils.vis_tool import Visualizer 11 | 12 | from utils.config import opt 13 | from torchnet.meter import ConfusionMeter, AverageValueMeter 14 | 15 | 16 | import numpy as np 17 | 18 | 19 | 20 | 21 | # create a namedtuple 22 | LossTuple = namedtuple('LossTuple', 23 | ['rpn_loc_loss', 24 | 'rpn_cls_loss', 25 | 'roi_loc_loss', 26 | 'roi_cls_loss', 27 | 'total_loss' 28 | ]) 29 | 30 | 31 | class FasterRCNNTrainer(nn.Module): 32 | """wrapper for conveniently training. return losses 33 | 34 | The losses include: 35 | 36 | * :obj:`rpn_loc_loss`: The localization loss for Region Proposal Network (RPN). 37 | * :obj:`rpn_cls_loss`: The classification loss for RPN. 38 | * :obj:`roi_loc_loss`: The localization loss for the head module. 39 | * :obj:`roi_cls_loss`: The classification loss for the head module. 40 | * :obj:`total_loss`: The sum of 4 loss above. 41 | 42 | Args: 43 | faster_rcnn (model.FasterRCNN): 44 | A Faster R-CNN model that is going to be trained. 45 | """ 46 | 47 | def __init__(self, faster_rcnn): 48 | super(FasterRCNNTrainer, self).__init__() 49 | 50 | self.faster_rcnn = faster_rcnn 51 | self.rpn_sigma = opt.rpn_sigma 52 | self.roi_sigma = opt.roi_sigma 53 | 54 | # target creator create gt_bbox gt_label etc as training targets. 
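        # AnchorTargetCreator builds (loc, label) targets for the RPN losses;
        # ProposalTargetCreator samples RoIs, pairs each with its search
        # region and produces the (Tx, Ty) inside/outside targets consumed by
        # the LocNet-style head loss further down in forward().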
55 | self.anchor_target_creator = AnchorTargetCreator() 56 | self.proposal_target_creator = ProposalTargetCreator() 57 | 58 | self.optimizer = self.faster_rcnn.get_optimizer() 59 | # visdom wrapper 60 | self.vis = Visualizer(env=opt.env) 61 | 62 | # indicators for training status 63 | self.rpn_cm = ConfusionMeter(2) 64 | self.roi_cm = ConfusionMeter(21) 65 | self.meters = {k: AverageValueMeter() for k in LossTuple._fields} # average loss 66 | 67 | def forward(self, imgs, bboxes, labels, scale): 68 | """Forward Faster R-CNN and calculate losses. 69 | 70 | Here are notations used. 71 | 72 | * :math:`N` is the batch size. 73 | * :math:`R` is the number of bounding boxes per image. 74 | 75 | Currently, only :math:`N=1` is supported. 76 | 77 | Args: 78 | imgs (~torch.autograd.Variable): A variable with a batch of images. 79 | bboxes (~torch.autograd.Variable): A batch of bounding boxes. 80 | Its shape is :math:`(N, R, 4)`. 81 | labels (~torch.autograd..Variable): A batch of labels. 82 | Its shape is :math:`(N, R)`. The background is excluded from 83 | the definition, which means that the range of the value 84 | is :math:`[0, L - 1]`. :math:`L` is the number of foreground 85 | classes. 86 | scale (float): Amount of scaling applied to 87 | the raw image during preprocessing. 88 | 89 | Returns: 90 | namedtuple of 5 losses 91 | """ 92 | n = bboxes.shape[0] # number of input images one time 93 | if n != 1: 94 | raise ValueError('Currently only batch size 1 is supported.') 95 | 96 | _, _, H, W = imgs.shape # should be (1,3,H,W) 97 | img_size = (H, W) 98 | 99 | # need more feature maps here when you are trying to use features of different scale 100 | features = self.faster_rcnn.extractor(imgs) 101 | 102 | rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = self.faster_rcnn.rpn(features, img_size, scale) 103 | 104 | # Since batch size is one, convert variables to singular form 105 | # different parameters here : 106 | # num_boxes : number of ground truth bounding boxes in a image. 107 | # num_anchors : number of anchors in images(or to say in a feature map). 108 | # num_rois : number of ROIs that are generated by RPN, which will be used in Fast RCNN. 109 | bbox = bboxes[0] # shape (num_boxes, 4) 110 | label = labels[0] # shape (num_boxes,) 111 | rpn_score = rpn_scores[0] # shape (num_anchors,) 112 | rpn_loc = rpn_locs[0] # shape (num_anchors, 4) 113 | roi = rois # shape (num_rois, 4) 114 | search_region = search_regions # shape (num_rois, 4) 115 | 116 | # Sample RoIs and forward 117 | # it's fine to break the computation graph of rois, 118 | # consider them as constant input 119 | sample_roi, sample_search_region, (Tx,Ty), gt_roi_label = self.proposal_target_creator(roi, 120 | search_region, 121 | at.tonumpy(bbox), 122 | at.tonumpy(label)) 123 | 124 | # NOTE it's all zero because now it only support for batch=1 now 125 | sample_roi_index = t.zeros(len(sample_roi)) 126 | (px, py), roi_score = self.faster_rcnn.head(features, 127 | sample_roi, 128 | sample_search_region, 129 | sample_roi_index) 130 | 131 | 132 | # ------------------ RPN losses -------------------# 133 | gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(at.tonumpy(bbox), 134 | anchor, 135 | img_size) 136 | gt_rpn_label = at.tovariable(gt_rpn_label).long() 137 | gt_rpn_loc = at.tovariable(gt_rpn_loc) 138 | rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, 139 | gt_rpn_loc, 140 | gt_rpn_label.data, 141 | self.rpn_sigma) 142 | 143 | # NOTE: default value of ignore_index is -100 ... 
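        # gt_rpn_label marks anchors to be skipped with -1, so ignore_index is
        # switched from the cross_entropy default (-100) to -1: only labels 0
        # (background) and 1 (foreground) contribute to the RPN class loss.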
144 | rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1) 145 | _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1] 146 | _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1] 147 | self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long()) 148 | 149 | 150 | # ------------------ ROI losses (fast rcnn loss) -------------------# 151 | n_sample = px.shape[0] 152 | # (px, py) and (Tx, Ty) are to be used to caculate loss :roi_loc_loss 153 | 154 | Tx = at.tovariable(Tx).float() 155 | Ty = at.tovariable(Ty).float() 156 | 157 | print("px is ", px) 158 | # print("max of px is ", t.max(px)) 159 | # print("min of px is ", t.min(px)) 160 | # print(t.max(Tx)) 161 | # print(t.max(Ty)) 162 | # print(Tx.shape, Ty.shape, px.shape, py.shape) 163 | 164 | roi_loc_loss = _LocNet_loss(Tx, Ty, px, py, gt_roi_label.data, self.roi_sigma) 165 | 166 | 167 | gt_roi_label = at.tovariable(gt_roi_label).long() 168 | roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda()) 169 | 170 | self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long()) 171 | 172 | 173 | losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss] 174 | 175 | print("losses", losses) 176 | 177 | losses = losses + [sum(losses)] 178 | 179 | return LossTuple(*losses) # return a namedtuple 180 | 181 | def train_step(self, imgs, bboxes, labels, scale): 182 | self.optimizer.zero_grad() 183 | losses = self.forward(imgs, bboxes, labels, scale) # losses is a namedtuple 184 | losses.total_loss.backward() # use total_loss to backprop 185 | self.optimizer.step() 186 | self.update_meters(losses) 187 | return losses 188 | 189 | def save(self, save_optimizer=False, save_path=None, **kwargs): 190 | """serialize models include optimizer and other info 191 | return path where the model-file is stored. 192 | 193 | Args: 194 | save_optimizer (bool): whether save optimizer.state_dict(). 195 | save_path (string): where to save model, if it's None, save_path 196 | is generate using time str and info from kwargs. 197 | 198 | Returns: 199 | save_path(str): the path to save models. 
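
        For example (following the path format built below), calling
        save(epoch=3) at 09:30 on Dec 1st would write the checkpoint to
        'checkpoints/fasterrcnn_12010930_3'.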
200 | """ 201 | save_dict = dict() 202 | 203 | save_dict['model'] = self.faster_rcnn.state_dict() 204 | save_dict['config'] = opt._state_dict() 205 | save_dict['other_info'] = kwargs 206 | save_dict['vis_info'] = self.vis.state_dict() 207 | 208 | if save_optimizer: 209 | save_dict['optimizer'] = self.optimizer.state_dict() 210 | 211 | if save_path is None: 212 | timestr = time.strftime('%m%d%H%M') 213 | save_path = 'checkpoints/fasterrcnn_%s' % timestr 214 | for k_, v_ in kwargs.items(): 215 | save_path += '_%s' % v_ 216 | 217 | t.save(save_dict, save_path) 218 | self.vis.save([self.vis.env]) 219 | return save_path 220 | 221 | def load(self, path, load_optimizer=True, parse_opt=False, ): 222 | state_dict = t.load(path) 223 | if 'model' in state_dict: 224 | self.faster_rcnn.load_state_dict(state_dict['model']) 225 | else: # legacy way, for backward compatibility 226 | self.faster_rcnn.load_state_dict(state_dict) 227 | return self 228 | if parse_opt: 229 | opt._parse(state_dict['config']) 230 | if 'optimizer' in state_dict and load_optimizer: 231 | self.optimizer.load_state_dict(state_dict['optimizer']) 232 | return self 233 | 234 | def update_meters(self, losses): 235 | loss_d = {k: at.scalar(v) for k, v in losses._asdict().items()} 236 | for key, meter in self.meters.items(): 237 | meter.add(loss_d[key]) 238 | 239 | def reset_meters(self): 240 | for key, meter in self.meters.items(): 241 | meter.reset() 242 | self.roi_cm.reset() 243 | self.rpn_cm.reset() 244 | 245 | def get_meter_data(self): 246 | return {k: v.value()[0] for k, v in self.meters.items()} 247 | 248 | 249 | def _smooth_l1_loss(x, t, in_weight, sigma): 250 | sigma2 = sigma ** 2 251 | diff = in_weight * (x - t) 252 | abs_diff = diff.abs() 253 | flag = (abs_diff.data < (1. / sigma2)).float() 254 | flag = Variable(flag) 255 | y = (flag * (sigma2 / 2.) * (diff ** 2) + 256 | (1 - flag) * (abs_diff - 0.5 / sigma2)) 257 | return y.sum() 258 | 259 | 260 | def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma): 261 | in_weight = t.zeros(gt_loc.shape).cuda() 262 | # Localization loss is calculated only for positive rois. 263 | # NOTE: unlike origin implementation, 264 | # we don't need inside_weight and outside_weight, they can calculate by gt_label 265 | in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1 266 | loc_loss = _smooth_l1_loss(pred_loc, gt_loc, Variable(in_weight), sigma) 267 | # Normalize by total number of negtive and positive rois. 268 | loc_loss /= (gt_label >= 0).sum().float() # ignore gt_label==-1 for rpn_loss 269 | return loc_loss 270 | 271 | def _LocNet_loss(Tx, Ty, Px, Py, gt_label, sigma): 272 | ''' 273 | Args: 274 | Tx, Ty : ground truth value for all points in all boxes. 275 | shape of (R, M) of which R is the number of boxes used in \ 276 | Head part and M is the number of parts along x and y axis of a box. 277 | Px, Py : predicted value of Tx and Ty, shape of (R, M) 278 | gt_label : class id of the box, 0 means background. shape of (R) 279 | ''' 280 | # print("Tx.shape ", Tx.shape) 281 | # print("Ty shape ", Ty.shape) 282 | # print("Tx max ", t.max(Tx)) 283 | # print("Tx min", t.min(Tx)) 284 | 285 | s = t.sum(Tx * t.log(Px), dim=1) + t.sum((1-Tx) * t.log(1-Px), dim=1) + t.sum(Ty * t.log(Py), dim=1) + t.sum((1-Ty) * t.log(1-Py), dim=1) 286 | s = (-1) * s 287 | 288 | # Localization loss is calculated only for positive rois. 
289 | in_weight = t.zeros(s.shape).cuda() 290 | for i in range(len(gt_label)): 291 | if gt_label[i]>0: 292 | in_weight[i] = 1 293 | in_weight = Variable(in_weight) 294 | 295 | result = sigma * (in_weight * s).sum() 296 | 297 | return result 298 | 299 | 300 | 301 | -------------------------------------------------------------------------------- /model/faster_rcnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch as t 3 | import numpy as np 4 | import cupy as cp 5 | from utils import array_tool as at 6 | from model.utils.bbox_tools import loc2bbox, p2bbox 7 | from model.utils.nms import non_maximum_suppression 8 | 9 | from torch import nn 10 | from data.dataset import preprocess 11 | from torch.nn import functional as F 12 | from utils.config import opt 13 | 14 | 15 | class FasterRCNN(nn.Module): 16 | """Base class for Faster R-CNN. 17 | 18 | This is a base class for Faster R-CNN links supporting object detection 19 | API [#]_. The following three stages constitute Faster R-CNN. 20 | 21 | 1. **Feature extraction**: Images are taken and their \ 22 | feature maps are calculated. 23 | 2. **Region Proposal Networks**: Given the feature maps calculated in \ 24 | the previous stage, produce set of RoIs around objects. 25 | 3. **Localization and Classification Heads**: Using feature maps that \ 26 | belong to the proposed RoIs, classify the categories of the objects \ 27 | in the RoIs and improve localizations. 28 | 29 | Each stage is carried out by one of the callable 30 | :class:`torch.nn.Module` objects :obj:`feature`, :obj:`rpn` and :obj:`head`. 31 | 32 | There are two functions :meth:`predict` and :meth:`__call__` to conduct 33 | object detection. 34 | :meth:`predict` takes images and returns bounding boxes that are converted 35 | to image coordinates. This will be useful for a scenario when 36 | Faster R-CNN is treated as a black box function, for instance. 37 | :meth:`__call__` is provided for a scnerario when intermediate outputs 38 | are needed, for instance, for training and debugging. 39 | 40 | Links that support obejct detection API have method :meth:`predict` with 41 | the same interface. Please refer to :meth:`predict` for 42 | further details. 43 | 44 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 45 | Faster R-CNN: Towards Real-Time Object Detection with \ 46 | Region Proposal Networks. NIPS 2015. 47 | 48 | Args: 49 | extractor (nn.Module): A module that takes a BCHW image 50 | array and returns feature maps. 51 | rpn (nn.Module): A module that has the same interface as 52 | :class:`model.region_proposal_network.RegionProposalNetwork`. 53 | Please refer to the documentation found there. 54 | head (nn.Module): A module that takes 55 | a BCHW variable, RoIs and batch indices for RoIs. This returns class 56 | dependent localization paramters and class scores. 57 | loc_normalize_mean (tuple of four floats): Mean values of 58 | localization estimates. 59 | loc_normalize_std (tupler of four floats): Standard deviation 60 | of localization estimates. 
61 | 62 | """ 63 | 64 | def __init__(self, extractor, rpn, head, 65 | loc_normalize_mean = (0., 0., 0., 0.), 66 | loc_normalize_std = (0.1, 0.1, 0.2, 0.2) 67 | ): 68 | super(FasterRCNN, self).__init__() 69 | self.extractor = extractor 70 | self.rpn = rpn 71 | self.head = head 72 | 73 | # mean and std 74 | self.loc_normalize_mean = loc_normalize_mean 75 | self.loc_normalize_std = loc_normalize_std 76 | self.use_preset('evaluate') 77 | 78 | @property 79 | def n_class(self): 80 | # Total number of classes including the background. 81 | return self.head.n_class 82 | 83 | def forward(self, x, scale=1.): 84 | """Forward Faster R-CNN. 85 | 86 | Scaling paramter :obj:`scale` is used by RPN to determine the 87 | threshold to select small objects, which are going to be 88 | rejected irrespective of their confidence scores. 89 | 90 | Here are notations used. 91 | 92 | * :math:`N` is the number of batch size 93 | * :math:`R'` is the total number of RoIs produced across batches. \ 94 | Given :math:`R_i` proposed RoIs from the :math:`i` th image, \ 95 | :math:`R' = \\sum _{i=1} ^ N R_i`. 96 | * :math:`L` is the number of classes excluding the background. 97 | 98 | Classes are ordered by the background, the first class, ..., and 99 | the :math:`L` th class. 100 | 101 | Args: 102 | x (autograd.Variable): 4D image variable. 103 | scale (float): Amount of scaling applied to the raw image 104 | during preprocessing. 105 | 106 | Returns: 107 | Variable, Variable, array, array: 108 | Returns tuple of four values listed below. 109 | 110 | * **roi_cls_locs**: Offsets and scalings for the proposed RoIs. \ 111 | Its shape is :math:`(R', (L + 1) \\times 4)`. 112 | * **roi_scores**: Class predictions for the proposed RoIs. \ 113 | Its shape is :math:`(R', L + 1)`. 114 | * **rois**: RoIs proposed by RPN. Its shape is \ 115 | :math:`(R', 4)`. 116 | * **roi_indices**: Batch indices of RoIs. Its shape is \ 117 | :math:`(R',)`. 118 | 119 | """ 120 | img_size = x.shape[2:] 121 | 122 | h = self.extractor(x) 123 | rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = self.rpn(h, img_size, scale) 124 | (px, py), roi_scores = self.head(h, rois, search_regions, roi_indices) 125 | return (px, py), roi_scores, rois, search_regions, roi_indices 126 | 127 | 128 | 129 | def use_preset(self, preset): 130 | """Use the given preset during prediction. 131 | 132 | This method changes values of :obj:`self.nms_thresh` and 133 | :obj:`self.score_thresh`. These values are a threshold value 134 | used for non maximum suppression and a threshold value 135 | to discard low confidence proposals in :meth:`predict`, 136 | respectively. 137 | 138 | If the attributes need to be changed to something 139 | other than the values provided in the presets, please modify 140 | them by directly accessing the public attributes. 141 | 142 | Args: 143 | preset ({'visualize', 'evaluate'): A string to determine the 144 | preset to use. 
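
        With the values set below, 'visualize' keeps only confident detections
        (score_thresh=0.7) for cleaner plots, while 'evaluate' keeps nearly
        everything (score_thresh=0.05) so mAP can be computed over
        low-confidence predictions as well; both use nms_thresh=0.3.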
145 | 146 | """ 147 | if preset == 'visualize': 148 | self.nms_thresh = 0.3 149 | self.score_thresh = 0.7 150 | elif preset == 'evaluate': 151 | self.nms_thresh = 0.3 152 | self.score_thresh = 0.05 153 | else: 154 | raise ValueError('preset must be visualize or evaluate') 155 | 156 | def _suppress(self, raw_cls_bbox, raw_prob): 157 | bbox = list() 158 | label = list() 159 | score = list() 160 | # skip cls_id = 0 because it is the background class 161 | for l in range(1, self.n_class): 162 | 163 | cls_bbox_l = raw_cls_bbox 164 | prob_l = raw_prob[:, l] 165 | 166 | mask = prob_l > self.score_thresh 167 | cls_bbox_l = cls_bbox_l[mask] 168 | prob_l = prob_l[mask] 169 | 170 | keep = non_maximum_suppression( 171 | cp.array(cls_bbox_l), self.nms_thresh, prob_l) 172 | keep = cp.asnumpy(keep) 173 | 174 | bbox.append(cls_bbox_l[keep]) 175 | # The labels are in [0, self.n_class - 2]. 176 | label.append((l - 1) * np.ones((len(keep),))) 177 | score.append(prob_l[keep]) 178 | 179 | bbox = np.concatenate(bbox, axis=0).astype(np.float32) 180 | label = np.concatenate(label, axis=0).astype(np.int32) 181 | score = np.concatenate(score, axis=0).astype(np.float32) 182 | 183 | return bbox, label, score 184 | 185 | def predict(self, imgs, sizes=None, visualize=False, prob_thre=0.7): 186 | """Detect objects from images. 187 | 188 | This method predicts objects for each image. 189 | 190 | Args: 191 | imgs (iterable of numpy.ndarray): Arrays holding images. 192 | All images are in CHW and RGB format 193 | and the range of their value is :math:`[0, 255]`. 194 | 195 | Returns: 196 | tuple of lists: 197 | This method returns a tuple of three lists, 198 | :obj:`(bboxes, labels, scores)`. 199 | 200 | * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \ 201 | where :math:`R` is the number of bounding boxes in a image. \ 202 | Each bouding box is organized by \ 203 | :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \ 204 | in the second axis. 205 | * **labels** : A list of integer arrays of shape :math:`(R,)`. \ 206 | Each value indicates the class of the bounding box. \ 207 | Values are in range :math:`[0, L - 1]`, where :math:`L` is the \ 208 | number of the foreground classes. 209 | * **scores** : A list of float arrays of shape :math:`(R,)`. \ 210 | Each value indicates how confident the prediction is. 211 | 212 | """ 213 | self.eval() 214 | 215 | # sizes changes when visualize is set to different values 216 | if visualize: 217 | self.use_preset('visualize') 218 | prepared_imgs = list() 219 | sizes = list() 220 | for img in imgs: 221 | size = img.shape[1:] # reshaped image size 222 | img = preprocess(at.tonumpy(img)) 223 | prepared_imgs.append(img) 224 | sizes.append(size) 225 | else: 226 | prepared_imgs = imgs 227 | 228 | bboxes = list() 229 | labels = list() 230 | scores = list() 231 | 232 | for img, size in zip(prepared_imgs, sizes): 233 | img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True) 234 | 235 | # judge and change type if necessary 236 | if t.is_tensor(size[1]) : 237 | size[1] = int(size[1]) 238 | 239 | if t.is_tensor(img.shape[3]): 240 | img.shape[3] = int(img.shape[3]) 241 | 242 | scale = img.shape[3] / size[1] 243 | 244 | (px, py), roi_scores, rois, search_regions, _ = self(img, scale=scale) 245 | # We are assuming that batch size is 1. 
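            # Per-image pipeline: the head outputs (px, py), the probability
            # that each of the M columns/rows of a search region lies inside
            # the object; p2bbox thresholds these profiles (prob_thre) to
            # recover box coordinates, which are clipped to the image and
            # filtered per class with NMS in _suppress.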
246 | roi_score = roi_scores.data 247 | px = px.data 248 | py = py.data 249 | 250 | roi = at.totensor(rois) / scale 251 | search_regions = at.totensor(search_regions) / scale 252 | 253 | # Convert to numpy array 254 | px = at.tonumpy(px) 255 | py = at.tonumpy(py) 256 | search_regions = at.tonumpy(search_regions) 257 | 258 | # Convert predictions to bounding boxes in image coordinates. 259 | # Bounding boxes are scaled to the scale of the input images. 260 | 261 | # use px, py and search_regions to generate boxes 262 | cls_bbox = p2bbox(px, py, search_regions, threshold=prob_thre) 263 | cls_bbox = at.totensor(cls_bbox) 264 | 265 | # clip bounding box 266 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) 267 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) 268 | 269 | prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1)) 270 | 271 | raw_cls_bbox = at.tonumpy(cls_bbox) 272 | raw_prob = at.tonumpy(prob) 273 | 274 | # print("raw_cls_bbox shape : ", raw_cls_bbox.shape) 275 | # print("raw_prob : ", raw_prob) 276 | 277 | bbox, label, score = self._suppress(raw_cls_bbox, raw_prob) 278 | 279 | bboxes.append(bbox) 280 | labels.append(label) 281 | scores.append(score) 282 | 283 | self.use_preset('evaluate') 284 | self.train() 285 | 286 | return bboxes, labels, scores 287 | 288 | def get_optimizer(self): 289 | """ 290 | return optimizer, It could be overwriten if you want to specify 291 | special optimizer 292 | """ 293 | lr = opt.lr 294 | params = [] 295 | 296 | # different learning rate for different parameters 297 | for key, value in dict(self.named_parameters()).items(): 298 | if value.requires_grad: 299 | if 'bias' in key: 300 | params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}] 301 | else: 302 | params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}] 303 | if opt.use_adam: 304 | self.optimizer = t.optim.Adam(params) 305 | else: 306 | self.optimizer = t.optim.SGD(params, momentum=0.9) 307 | return self.optimizer 308 | 309 | def scale_lr(self, decay=0.1): 310 | for param_group in self.optimizer.param_groups: 311 | param_group['lr'] *= decay 312 | return self.optimizer 313 | 314 | 315 | 316 | 317 | -------------------------------------------------------------------------------- /utils/eval_tool.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | from collections import defaultdict 4 | import itertools 5 | import numpy as np 6 | import six 7 | 8 | from model.utils.bbox_tools import bbox_iou 9 | 10 | 11 | def eval_detection_voc( 12 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 13 | gt_difficults=None, 14 | iou_thresh=0.5, use_07_metric=False): 15 | """Calculate average precisions based on evaluation code of PASCAL VOC. 16 | 17 | This function evaluates predicted bounding boxes obtained from a dataset 18 | which has :math:`N` images by using average precision for each class. 19 | The code is based on the evaluation code used in PASCAL VOC Challenge. 20 | 21 | Args: 22 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 23 | sets of bounding boxes. 24 | Its index corresponds to an index for the base dataset. 25 | Each element of :obj:`pred_bboxes` is a set of coordinates 26 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 27 | where :math:`R` corresponds 28 | to the number of bounding boxes, which may vary among boxes. 
29 | The second axis corresponds to 30 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 31 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 32 | Similar to :obj:`pred_bboxes`, its index corresponds to an 33 | index for the base dataset. Its length is :math:`N`. 34 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 35 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 36 | its index corresponds to an index for the base dataset. 37 | Its length is :math:`N`. 38 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 39 | bounding boxes 40 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 41 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 42 | bounding boxes in each image does not need to be same as the number 43 | of corresponding predicted boxes. 44 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 45 | labels which are organized similarly to :obj:`gt_bboxes`. 46 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 47 | arrays which is organized similarly to :obj:`gt_bboxes`. 48 | This tells whether the 49 | corresponding ground truth bounding box is difficult or not. 50 | By default, this is :obj:`None`. In that case, this function 51 | considers all bounding boxes to be not difficult. 52 | iou_thresh (float): A prediction is correct if its Intersection over 53 | Union with the ground truth is above this value. 54 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 55 | for calculating average precision. The default value is 56 | :obj:`False`. 57 | 58 | Returns: 59 | dict: 60 | 61 | The keys, value-types and the description of the values are listed 62 | below. 63 | 64 | * **ap** (*numpy.ndarray*): An array of average precisions. \ 65 | The :math:`l`-th value corresponds to the average precision \ 66 | for class :math:`l`. If class :math:`l` does not exist in \ 67 | either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \ 68 | value is set to :obj:`numpy.nan`. 69 | * **map** (*float*): The average of Average Precisions over classes. 70 | 71 | """ 72 | 73 | prec, rec = calc_detection_voc_prec_rec( 74 | pred_bboxes, pred_labels, pred_scores, 75 | gt_bboxes, gt_labels, gt_difficults, 76 | iou_thresh=iou_thresh) 77 | 78 | ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) 79 | 80 | return {'ap': ap, 'map': np.nanmean(ap)} 81 | 82 | 83 | def calc_detection_voc_prec_rec( 84 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 85 | gt_difficults=None, 86 | iou_thresh=0.5): 87 | """Calculate precision and recall based on evaluation code of PASCAL VOC. 88 | 89 | This function calculates precision and recall of 90 | predicted bounding boxes obtained from a dataset which has :math:`N` 91 | images. 92 | The code is based on the evaluation code used in PASCAL VOC Challenge. 93 | 94 | Args: 95 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 96 | sets of bounding boxes. 97 | Its index corresponds to an index for the base dataset. 98 | Each element of :obj:`pred_bboxes` is a set of coordinates 99 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 100 | where :math:`R` corresponds 101 | to the number of bounding boxes, which may vary among boxes. 102 | The second axis corresponds to 103 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 104 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 
105 | Similar to :obj:`pred_bboxes`, its index corresponds to an 106 | index for the base dataset. Its length is :math:`N`. 107 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 108 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 109 | its index corresponds to an index for the base dataset. 110 | Its length is :math:`N`. 111 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 112 | bounding boxes 113 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 114 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 115 | bounding boxes in each image does not need to be same as the number 116 | of corresponding predicted boxes. 117 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 118 | labels which are organized similarly to :obj:`gt_bboxes`. 119 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 120 | arrays which is organized similarly to :obj:`gt_bboxes`. 121 | This tells whether the 122 | corresponding ground truth bounding box is difficult or not. 123 | By default, this is :obj:`None`. In that case, this function 124 | considers all bounding boxes to be not difficult. 125 | iou_thresh (float): A prediction is correct if its Intersection over 126 | Union with the ground truth is above this value.. 127 | 128 | Returns: 129 | tuple of two lists: 130 | This function returns two lists: :obj:`prec` and :obj:`rec`. 131 | 132 | * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \ 133 | for class :math:`l`. If class :math:`l` does not exist in \ 134 | either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \ 135 | set to :obj:`None`. 136 | * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \ 137 | for class :math:`l`. If class :math:`l` that is not marked as \ 138 | difficult does not exist in \ 139 | :obj:`gt_labels`, :obj:`rec[l]` is \ 140 | set to :obj:`None`. 141 | 142 | """ 143 | 144 | pred_bboxes = iter(pred_bboxes) 145 | pred_labels = iter(pred_labels) 146 | pred_scores = iter(pred_scores) 147 | gt_bboxes = iter(gt_bboxes) 148 | gt_labels = iter(gt_labels) 149 | if gt_difficults is None: 150 | gt_difficults = itertools.repeat(None) 151 | else: 152 | gt_difficults = iter(gt_difficults) 153 | 154 | n_pos = defaultdict(int) 155 | score = defaultdict(list) 156 | match = defaultdict(list) 157 | 158 | for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \ 159 | six.moves.zip( 160 | pred_bboxes, pred_labels, pred_scores, 161 | gt_bboxes, gt_labels, gt_difficults): 162 | 163 | if gt_difficult is None: 164 | gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool) 165 | 166 | for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): 167 | pred_mask_l = pred_label == l 168 | pred_bbox_l = pred_bbox[pred_mask_l] 169 | pred_score_l = pred_score[pred_mask_l] 170 | # sort by score 171 | order = pred_score_l.argsort()[::-1] 172 | pred_bbox_l = pred_bbox_l[order] 173 | pred_score_l = pred_score_l[order] 174 | 175 | gt_mask_l = gt_label == l 176 | gt_bbox_l = gt_bbox[gt_mask_l] 177 | gt_difficult_l = gt_difficult[gt_mask_l] 178 | 179 | n_pos[l] += np.logical_not(gt_difficult_l).sum() 180 | score[l].extend(pred_score_l) 181 | 182 | if len(pred_bbox_l) == 0: 183 | continue 184 | if len(gt_bbox_l) == 0: 185 | match[l].extend((0,) * pred_bbox_l.shape[0]) 186 | continue 187 | 188 | # VOC evaluation follows integer typed bounding boxes. 
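            # PASCAL VOC treats coordinates as inclusive pixel indices, so a
            # box's extent is (max - min + 1); adding 1 to the max coordinates
            # of local copies lets bbox_iou reproduce that convention without
            # touching the original arrays.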
189 | pred_bbox_l = pred_bbox_l.copy() 190 | pred_bbox_l[:, 2:] += 1 191 | gt_bbox_l = gt_bbox_l.copy() 192 | gt_bbox_l[:, 2:] += 1 193 | 194 | iou = bbox_iou(pred_bbox_l, gt_bbox_l) 195 | gt_index = iou.argmax(axis=1) 196 | # set -1 if there is no matching ground truth 197 | gt_index[iou.max(axis=1) < iou_thresh] = -1 198 | del iou 199 | 200 | selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) 201 | for gt_idx in gt_index: 202 | if gt_idx >= 0: 203 | if gt_difficult_l[gt_idx]: 204 | match[l].append(-1) 205 | else: 206 | if not selec[gt_idx]: 207 | match[l].append(1) 208 | else: 209 | match[l].append(0) 210 | selec[gt_idx] = True 211 | else: 212 | match[l].append(0) 213 | 214 | for iter_ in ( 215 | pred_bboxes, pred_labels, pred_scores, 216 | gt_bboxes, gt_labels, gt_difficults): 217 | if next(iter_, None) is not None: 218 | raise ValueError('Length of input iterables need to be same.') 219 | 220 | n_fg_class = max(n_pos.keys()) + 1 221 | prec = [None] * n_fg_class 222 | rec = [None] * n_fg_class 223 | 224 | for l in n_pos.keys(): 225 | score_l = np.array(score[l]) 226 | match_l = np.array(match[l], dtype=np.int8) 227 | 228 | order = score_l.argsort()[::-1] 229 | match_l = match_l[order] 230 | 231 | tp = np.cumsum(match_l == 1) 232 | fp = np.cumsum(match_l == 0) 233 | 234 | # If an element of fp + tp is 0, 235 | # the corresponding element of prec[l] is nan. 236 | prec[l] = tp / (fp + tp) 237 | # If n_pos[l] is 0, rec[l] is None. 238 | if n_pos[l] > 0: 239 | rec[l] = tp / n_pos[l] 240 | 241 | return prec, rec 242 | 243 | 244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False): 245 | """Calculate average precisions based on evaluation code of PASCAL VOC. 246 | 247 | This function calculates average precisions 248 | from given precisions and recalls. 249 | The code is based on the evaluation code used in PASCAL VOC Challenge. 250 | 251 | Args: 252 | prec (list of numpy.array): A list of arrays. 253 | :obj:`prec[l]` indicates precision for class :math:`l`. 254 | If :obj:`prec[l]` is :obj:`None`, this function returns 255 | :obj:`numpy.nan` for class :math:`l`. 256 | rec (list of numpy.array): A list of arrays. 257 | :obj:`rec[l]` indicates recall for class :math:`l`. 258 | If :obj:`rec[l]` is :obj:`None`, this function returns 259 | :obj:`numpy.nan` for class :math:`l`. 260 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 261 | for calculating average precision. The default value is 262 | :obj:`False`. 263 | 264 | Returns: 265 | ~numpy.ndarray: 266 | This function returns an array of average precisions. 267 | The :math:`l`-th value corresponds to the average precision 268 | for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is 269 | :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. 
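
    With the 2007 11-point metric (use_07_metric=True), precision is sampled
    at recall thresholds 0.0, 0.1, ..., 1.0 by taking the maximum precision at
    or beyond each threshold and averaging the eleven values; for instance, if
    the best precision at recall >= 0.5 is 0.6, that threshold contributes
    0.6 / 11 to the AP, and thresholds the recall never reaches contribute 0.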
270 | 271 | """ 272 | 273 | n_fg_class = len(prec) 274 | ap = np.empty(n_fg_class) 275 | for l in six.moves.range(n_fg_class): 276 | if prec[l] is None or rec[l] is None: 277 | ap[l] = np.nan 278 | continue 279 | 280 | if use_07_metric: 281 | # 11 point metric 282 | ap[l] = 0 283 | for t in np.arange(0., 1.1, 0.1): 284 | if np.sum(rec[l] >= t) == 0: 285 | p = 0 286 | else: 287 | p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) 288 | ap[l] += p / 11 289 | else: 290 | # correct AP calculation 291 | # first append sentinel values at the end 292 | mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) 293 | mrec = np.concatenate(([0], rec[l], [1])) 294 | 295 | mpre = np.maximum.accumulate(mpre[::-1])[::-1] 296 | 297 | # to calculate area under PR curve, look for points 298 | # where X axis (recall) changes value 299 | i = np.where(mrec[1:] != mrec[:-1])[0] 300 | 301 | # and sum (\Delta recall) * prec 302 | ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 303 | 304 | return ap 305 | -------------------------------------------------------------------------------- /model/utils/bbox_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy as xp 3 | 4 | import six 5 | from six import __init__ 6 | 7 | 8 | def _area_of_box(box): 9 | ymin, xmin, ymax, xmax = box 10 | 11 | return (ymax-ymin) * (xmax-xmin) 12 | 13 | def p2bbox(px, py, search_regions, threshold=0.5): 14 | ''' 15 | use px, py to get bounding boxes from search_regions. 16 | Args : 17 | px, py : probability of content. shape of (S, M) 18 | search_regions : shape of (S, 4), (ymin, xmin, ymax, xmax) 19 | 20 | Return : 21 | bboxes shape of (S,4) 22 | ''' 23 | boxes = np.zeros(search_regions.shape) 24 | # boxes = [] 25 | 26 | M = px.shape[1] 27 | 28 | for i in range(px.shape[0]): 29 | 30 | x = px[i,:] 31 | y = py[i,:] 32 | 33 | x = np.where(x>threshold) 34 | y = np.where(y>threshold) 35 | 36 | # print("x.shape and y.sghape", x[0].shape, y[0].shape) 37 | # if x[0].any() and y[0].any() : 38 | if len(x[0])>=5 and len(y[0])>=5 : 39 | x_s, x_e = x[0][0], x[0][-1] 40 | y_s, y_e = y[0][0], y[0][-1] 41 | 42 | ymin, xmin, ymax, xmax = search_regions[i,:] 43 | height_block = (ymax - ymin) / M 44 | width_block = (xmax - xmin) / M 45 | 46 | y_start = ymin + y_s * height_block 47 | y_end = ymin + y_e * height_block 48 | x_start = xmin + x_s * width_block 49 | x_end = xmin + x_e * width_block 50 | 51 | boxes[i, :] = [y_start, x_start, y_end, x_end] 52 | # boxes.append([y_start, x_start, y_end, x_end]) 53 | 54 | # boxes = np.array(boxes) 55 | 56 | return boxes 57 | 58 | 59 | def bbox_intersection(box_a, box_b): 60 | ''' 61 | Args: 62 | box_a (array): A array of coordinates of a box. 63 | Its shape is :math:`(4,)`. These coordinates are 64 | :math:`ymin, xmin, ymax, xmax`. 65 | box_b (array): A array of coordinates of a box. 66 | Its shape is :math:`(4,)`. These coordinates are 67 | :math:`ymin, xmin, ymax, xmax`. 68 | 69 | Return: 70 | intersection (array): A array of coordinates of the intersection box. 71 | Its shape is :math:`(4,)`. These coordinates are 72 | :math:`ymin, xmin, ymax, xmax`. 73 | ''' 74 | y_min_a, x_min_a, y_max_a, x_max_a = box_a 75 | y_min_b, x_min_b, y_max_b, x_max_b = box_b 76 | 77 | left = max(x_min_a, x_min_b) 78 | right = min(x_max_a, x_max_b) 79 | bottom = max(y_min_a, y_min_b) 80 | top = min(y_max_a, y_max_b) 81 | 82 | # 两个 box 没有交集 83 | if right= pos_iou_thresh IoU. 
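        # Mini-batch sampling (as implemented below): RoIs whose best IoU with
        # any ground-truth box reaches pos_iou_thresh become foreground, up to
        # pos_roi_per_image of them; the rest of the sample is filled with
        # background RoIs whose IoU falls in [neg_iou_thresh_lo,
        # neg_iou_thresh_hi), and background entries get label 0.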
114 | pos_index = np.where(max_iou >= self.pos_iou_thresh)[0] 115 | 116 | pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size)) 117 | if pos_index.size > 0: 118 | pos_index = np.random.choice( 119 | pos_index, size=pos_roi_per_this_image, replace=False) 120 | 121 | # Select background RoIs as those within 122 | # [neg_iou_thresh_lo, neg_iou_thresh_hi). 123 | neg_index = np.where((max_iou < self.neg_iou_thresh_hi) & 124 | (max_iou >= self.neg_iou_thresh_lo))[0] 125 | neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image 126 | neg_roi_per_this_image = int(min(neg_roi_per_this_image, 127 | neg_index.size)) 128 | if neg_index.size > 0: 129 | neg_index = np.random.choice( 130 | neg_index, size=neg_roi_per_this_image, replace=False) 131 | 132 | # The indices that we're selecting (both positive and negative). 133 | keep_index = np.append(pos_index, neg_index) 134 | 135 | gt_roi_label = gt_roi_label[keep_index] 136 | gt_roi_label[pos_roi_per_this_image:] = 0 # negative labels --> 0 137 | 138 | # print("gt_roi_label ", gt_roi_label) 139 | # print(keep_index.shape) 140 | # print("search_region.shape", search_region.shape) 141 | # print("max of search_region", np.max(search_region)) 142 | # for i in range(search_region.shape[0]): 143 | # print(search_region[i,:]) 144 | # print(roi.shape) 145 | 146 | sample_roi = roi[keep_index] 147 | sample_search_region = search_region[keep_index] 148 | 149 | # use search region and bbox to generate Tx and Ty 150 | # sample_roi 和 sample_bboxes 是一一对应的 151 | # 当然这里可能出现一个 bbox 对应了多个 sample_roi 的情况 152 | # sample_bboxes shape (S,4) 153 | sample_bbox = bbox[gt_assignment[keep_index]] 154 | 155 | # print("", gt_assignment[keep_index]) 156 | # print("sample_bbox ", sample_bbox) 157 | 158 | Tx, Ty = bbox2T(sample_search_region, sample_bbox) 159 | 160 | return sample_roi, sample_search_region, (Tx,Ty), gt_roi_label 161 | 162 | 163 | class AnchorTargetCreator(object): 164 | """Assign the ground truth bounding boxes to anchors. 165 | 166 | Assigns the ground truth bounding boxes to anchors for training Region 167 | Proposal Networks introduced in Faster R-CNN [#]_. 168 | 169 | Offsets and scales to match anchors to the ground truth are 170 | calculated using the encoding scheme of 171 | :func:`model.utils.bbox_tools.bbox2loc`. 172 | 173 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 174 | Faster R-CNN: Towards Real-Time Object Detection with \ 175 | Region Proposal Networks. NIPS 2015. 176 | 177 | Args: 178 | n_sample (int): The number of regions to produce. 179 | pos_iou_thresh (float): Anchors with IoU above this 180 | threshold will be assigned as positive. 181 | neg_iou_thresh (float): Anchors with IoU below this 182 | threshold will be assigned as negative. 183 | pos_ratio (float): Ratio of positive regions in the 184 | sampled regions. 185 | 186 | """ 187 | 188 | def __init__(self, 189 | n_sample=256, 190 | pos_iou_thresh=0.7, 191 | neg_iou_thresh=0.3, 192 | pos_ratio=0.5): 193 | self.n_sample = n_sample 194 | self.pos_iou_thresh = pos_iou_thresh 195 | self.neg_iou_thresh = neg_iou_thresh 196 | self.pos_ratio = pos_ratio 197 | 198 | def __call__(self, bbox, anchor, img_size): 199 | """Assign ground truth supervision to sampled subset of anchors. 200 | 201 | Types of input arrays and output arrays are same. 202 | 203 | Here are notations. 204 | 205 | * :math:`S` is the number of anchors. 206 | * :math:`R` is the number of bounding boxes. 207 | 208 | Args: 209 | bbox (array): Coordinates of bounding boxes. 
Its shape is 210 | :math:`(R, 4)`. 211 | anchor (array): Coordinates of anchors. Its shape is 212 | :math:`(S, 4)`. 213 | img_size (tuple of ints): A tuple :obj:`H, W`, which 214 | is a tuple of height and width of an image. 215 | 216 | Returns: 217 | (array, array): 218 | 219 | #NOTE: it's scale not only offset 220 | * **loc**: Offsets and scales to match the anchors to \ 221 | the ground truth bounding boxes. Its shape is :math:`(S, 4)`. 222 | * **label**: Labels of anchors with values \ 223 | :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape \ 224 | is :math:`(S,)`. 225 | 226 | """ 227 | 228 | img_H, img_W = img_size 229 | 230 | n_anchor = len(anchor) 231 | inside_index = _get_inside_index(anchor, img_H, img_W) 232 | anchor = anchor[inside_index] 233 | argmax_ious, label = self._create_label( 234 | inside_index, anchor, bbox) 235 | 236 | # compute bounding box regression targets 237 | loc = bbox2loc(anchor, bbox[argmax_ious]) 238 | 239 | # map up to original set of anchors 240 | label = _unmap(label, n_anchor, inside_index, fill=-1) 241 | loc = _unmap(loc, n_anchor, inside_index, fill=0) 242 | 243 | return loc, label 244 | 245 | def _create_label(self, inside_index, anchor, bbox): 246 | # label: 1 is positive, 0 is negative, -1 is dont care 247 | label = np.empty((len(inside_index),), dtype=np.int32) 248 | label.fill(-1) 249 | 250 | argmax_ious, max_ious, gt_argmax_ious = \ 251 | self._calc_ious(anchor, bbox, inside_index) 252 | 253 | # assign negative labels first so that positive labels can clobber them 254 | label[max_ious < self.neg_iou_thresh] = 0 255 | 256 | # positive label: for each gt, anchor with highest iou 257 | label[gt_argmax_ious] = 1 258 | 259 | # positive label: above threshold IOU 260 | label[max_ious >= self.pos_iou_thresh] = 1 261 | 262 | # subsample positive labels if we have too many 263 | n_pos = int(self.pos_ratio * self.n_sample) 264 | pos_index = np.where(label == 1)[0] 265 | if len(pos_index) > n_pos: 266 | disable_index = np.random.choice( 267 | pos_index, size=(len(pos_index) - n_pos), replace=False) 268 | label[disable_index] = -1 269 | 270 | # subsample negative labels if we have too many 271 | n_neg = self.n_sample - np.sum(label == 1) 272 | neg_index = np.where(label == 0)[0] 273 | if len(neg_index) > n_neg: 274 | disable_index = np.random.choice( 275 | neg_index, size=(len(neg_index) - n_neg), replace=False) 276 | label[disable_index] = -1 277 | 278 | return argmax_ious, label 279 | 280 | def _calc_ious(self, anchor, bbox, inside_index): 281 | # ious between the anchors and the gt boxes 282 | ious = bbox_iou(anchor, bbox) 283 | argmax_ious = ious.argmax(axis=1) 284 | max_ious = ious[np.arange(len(inside_index)), argmax_ious] 285 | gt_argmax_ious = ious.argmax(axis=0) 286 | gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])] 287 | gt_argmax_ious = np.where(ious == gt_max_ious)[0] 288 | 289 | return argmax_ious, max_ious, gt_argmax_ious 290 | 291 | 292 | def _unmap(data, count, index, fill=0): 293 | # Unmap a subset of item (data) back to the original set of items (of 294 | # size count) 295 | 296 | if len(data.shape) == 1: 297 | ret = np.empty((count,), dtype=data.dtype) 298 | ret.fill(fill) 299 | ret[index] = data 300 | else: 301 | ret = np.empty((count,) + data.shape[1:], dtype=data.dtype) 302 | ret.fill(fill) 303 | ret[index, :] = data 304 | return ret 305 | 306 | 307 | def _get_inside_index(anchor, H, W): 308 | # Calc indicies of anchors which are located completely inside of the image 309 | # whose size is speficied. 
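    # Anchors that cross the image border are excluded before RPN target
    # assignment; AnchorTargetCreator later restores the full anchor set with
    # _unmap, filling the discarded positions with label -1 (ignore) and loc 0.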
    index_inside = np.where(
        (anchor[:, 0] >= 0) &
        (anchor[:, 1] >= 0) &
        (anchor[:, 2] <= H) &
        (anchor[:, 3] <= W)
    )[0]
    return index_inside


class ProposalCreator:
    """
    Changed: search regions are also generated when this object is called.

    Proposal regions are generated by calling this object.

    The :meth:`__call__` of this object outputs object detection proposals by
    applying the estimated bounding box offsets to a set of anchors.

    This class takes parameters to control the number of bounding boxes to
    pass to NMS and to keep after NMS.
    If the parameters are negative, it uses all the bounding boxes supplied
    or keeps all the bounding boxes returned by NMS.

    This class is used for Region Proposal Networks introduced in
    Faster R-CNN [#]_.

    .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
       Faster R-CNN: Towards Real-Time Object Detection with \
       Region Proposal Networks. NIPS 2015.

    Args:
        parent_model: The model that owns this creator. Its :obj:`training`
            flag decides whether the train or test NMS settings are used.
        nms_thresh (float): Threshold value used when calling NMS.
        n_train_pre_nms (int): Number of top scored bounding boxes
            to keep before passing to NMS in train mode.
        n_train_post_nms (int): Number of top scored bounding boxes
            to keep after passing to NMS in train mode.
        n_test_pre_nms (int): Number of top scored bounding boxes
            to keep before passing to NMS in test mode.
        n_test_post_nms (int): Number of top scored bounding boxes
            to keep after passing to NMS in test mode.
        min_size (int): A parameter to determine the threshold for
            discarding bounding boxes based on their sizes.

    """

    def __init__(self,
                 parent_model,
                 nms_thresh=0.7,
                 n_train_pre_nms=12000,
                 n_train_post_nms=2000,
                 n_test_pre_nms=6000,
                 n_test_post_nms=300,
                 min_size=16
                 ):
        self.parent_model = parent_model
        self.nms_thresh = nms_thresh
        self.n_train_pre_nms = n_train_pre_nms
        self.n_train_post_nms = n_train_post_nms
        self.n_test_pre_nms = n_test_pre_nms
        self.n_test_post_nms = n_test_post_nms
        self.min_size = min_size

    def __call__(self, loc, score,
                 anchor, img_size, scale=1.):
        """Propose RoIs. Inputs should be ndarrays.

        Inputs :obj:`loc, score, anchor` refer to the same anchor when indexed
        by the same index.

        On notations, :math:`R` is the total number of anchors. This is equal
        to the product of the height and the width of an image and the number
        of anchor bases per pixel.

        Type of the output is the same as the inputs.

        Args:
            loc (array): Predicted offsets and scaling to anchors.
                Its shape is :math:`(R, 4)`.
            score (array): Predicted foreground probability for anchors.
                Its shape is :math:`(R,)`.
            anchor (array): Coordinates of anchors. Its shape is
                :math:`(R, 4)`.
            img_size (tuple of ints): A tuple :obj:`height, width`,
                which contains the image size after scaling.
            scale (float): The scaling factor used to scale an image after
                reading it from a file.

        Returns:
            (array, array):

            * **roi**: An array of coordinates of proposal boxes.
              Its shape is :math:`(S, 4)`.
            * **search_region**: The search region generated for each
              proposal. Its shape is also :math:`(S, 4)`.
            :math:`S` is less than :obj:`self.n_test_post_nms` at test time
            and less than :obj:`self.n_train_post_nms` at train time.
            :math:`S` depends on the size of the predicted bounding boxes and
            the number of bounding boxes discarded by NMS.

        Steps:
            1. Set different parameters for training and prediction.
            2. Use anchor and loc to calculate box coordinates.
            3. Clip the boxes to the image.
            4. Remove boxes that are too small.
            5. Select the top scoring boxes.

        """
        # NOTE: at test time, remember to call
        # faster_rcnn.eval()
        # so that self.parent_model.training is False.
        if self.parent_model.training:
            n_pre_nms = self.n_train_pre_nms
            n_post_nms = self.n_train_post_nms
        else:
            n_pre_nms = self.n_test_pre_nms
            n_post_nms = self.n_test_post_nms

        # Convert anchors into proposals via bbox transformations:
        # use anchor and loc to compute the concrete coordinates of the boxes
        # predicted by the RPN.
        # roi shape: (R, 4)
        # roi: (ymin, xmin, ymax, xmax)
        roi = loc2bbox(anchor, loc)

        # generate search_region from roi
        search_region = _generate_search_region(roi)

        # Clip predicted boxes to the image:
        # crop roi with the edges of the resized image so that every roi lies
        # inside the resized image.
        roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
        roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])
        # Likewise, clip search_region so that it also lies inside the
        # resized image.
        search_region[:, slice(0, 4, 2)] = np.clip(
            search_region[:, slice(0, 4, 2)], 0, img_size[0])
        search_region[:, slice(1, 4, 2)] = np.clip(
            search_region[:, slice(1, 4, 2)], 0, img_size[1])

        # Remove predicted boxes with either height or width < threshold.
        min_size = self.min_size * scale
        hs = roi[:, 2] - roi[:, 0]
        ws = roi[:, 3] - roi[:, 1]
        keep = np.where((hs >= min_size) & (ws >= min_size))[0]
        roi = roi[keep, :]
        search_region = search_region[keep, :]
        score = score[keep]

        # Sort all (proposal, score) pairs by score from highest to lowest.
        # Take the top pre_nms_topN (e.g. 6000).
        order = score.ravel().argsort()[::-1]
        if n_pre_nms > 0:
            order = order[:n_pre_nms]
        roi = roi[order, :]
        search_region = search_region[order, :]

        # Apply NMS (e.g. threshold = 0.7).
        # Take the top after_nms_topN (e.g. 300).

        # unNOTE: something is wrong here!
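        # NMS keeps only the highest scoring proposal within each group of
        # heavily overlapping boxes; the surviving indices are then applied to
        # both roi and search_region below so the two arrays stay aligned.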
        # TODO: remove cuda.to_gpu
        keep = non_maximum_suppression(cp.ascontiguousarray(cp.asarray(roi)),
                                       thresh=self.nms_thresh)
        if n_post_nms > 0:
            keep = keep[:n_post_nms]

        roi = roi[keep]
        search_region = search_region[keep]

        return roi, search_region


def _generate_search_region(roi, Sh=1.2, Sw=1.2):
    # For every RoI, build a search region with the same center, scaled by
    # Sh along the height and Sw along the width.
    search_region = np.zeros(roi.shape)

    for i in range(roi.shape[0]):
        ymin_roi, xmin_roi, ymax_roi, xmax_roi = roi[i, :]

        y_center = (ymin_roi + ymax_roi) / 2
        x_center = (xmin_roi + xmax_roi) / 2

        height_roi = ymax_roi - ymin_roi
        width_roi = xmax_roi - xmin_roi

        # search region parameters
        height_s = height_roi * Sh
        width_s = width_roi * Sw

        ymin_s = y_center - height_s / 2
        ymax_s = y_center + height_s / 2
        xmin_s = x_center - width_s / 2
        xmax_s = x_center + width_s / 2

        search_region[i, :] = ymin_s, xmin_s, ymax_s, xmax_s

    return search_region
--------------------------------------------------------------------------------
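The creators above are plain callables over NumPy arrays, so they can be exercised in isolation. Below is a minimal sketch of how they are typically wired together for RPN training; the names `rpn`, `rpn_loc`, `rpn_score`, `anchor`, `bbox` and `img_size` are placeholders for the region proposal network and its inputs/outputs rather than objects defined in this file, and the import path assumes this module lives at model/utils/creator_tool.py.

    import numpy as np
    from model.utils.creator_tool import AnchorTargetCreator, ProposalCreator

    # Targets for the RPN losses: one (loc, label) pair per anchor.
    anchor_target_creator = AnchorTargetCreator(n_sample=256)
    gt_rpn_loc, gt_rpn_label = anchor_target_creator(bbox, anchor, img_size)

    # Proposals (and their search regions) for the detection head; the
    # train/test NMS settings follow rpn.training via parent_model.
    proposal_creator = ProposalCreator(parent_model=rpn)
    roi, search_region = proposal_creator(rpn_loc, rpn_score, anchor,
                                          img_size, scale=1.)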