├── LICENSE ├── README.md └── lib ├── Makefile ├── datasets ├── VOCdevkit-matlab-wrapper │ ├── get_voc_opts.m │ ├── voc_eval.m │ └── xVOCap.m ├── __init__.py ├── coco.py ├── ds_utils.py ├── factory.py ├── imdb.py ├── pascal_voc.py ├── tools │ └── mcg_munge.py └── voc_eval.py ├── fast_rcnn ├── __init__.py ├── bbox_transform.py ├── config.py ├── nms_wrapper.py ├── test.py └── train.py ├── nms ├── .gitignore ├── __init__.py ├── __init__.pyc ├── cpu_nms.pyd ├── cpu_nms.pyx ├── gpu_nms.cu ├── gpu_nms.hpp ├── gpu_nms.pyd ├── gpu_nms.pyx ├── nms_kernel.cu └── py_cpu_nms.py ├── pycocotools ├── UPSTREAM_REV ├── __init__.py ├── _mask.pyx ├── coco.py ├── cocoeval.py ├── license.txt ├── mask.py ├── maskApi.c └── maskApi.h ├── roi_data_layer ├── __init__.py ├── layer.py ├── minibatch.py └── roidb.py ├── rpn ├── README.md ├── __init__.py ├── anchor_target_layer.py ├── generate.py ├── generate_anchors.py ├── proposal_layer.py └── proposal_target_layer.py ├── setup.py ├── setup_cuda.py ├── transform ├── __init__.py └── torch_image_transform_layer.py └── utils ├── .gitignore ├── __init__.py ├── bbox.pyx ├── blob.py └── timer.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 MrGF 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py-faster-rcnn-windows 2 | py-faster-rcnn that compiles on Windows directly 3 | 4 | Usage: 5 | 6 | 1. Download py-faster-rcnn from this url: 7 | https://github.com/rbgirshick/py-faster-rcnn 8 | 9 | That version of faster-rcnn compiles on Linux directly. 10 | 11 | 2. For Windows users, download the lib from this url: 12 | https://github.com/MrGF/py-faster-rcnn-windows 13 | 14 | and use it to replace the original lib provided by rbgirshick. 15 | 16 | Then you can run: 17 | python setup.py build_ext --inplace 18 | python setup_cuda.py build_ext --inplace 19 | to compile the lib on Windows directly. Enjoy! 20 | 21 | 22 | Note: 23 | Please see the LICENSE on https://github.com/rbgirshick/py-faster-rcnn for details. 
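A quick way to sanity-check the compiled extensions (an illustrative sketch, not a file from this repo; it assumes the build succeeded and that you run it from inside the lib directory):

    import numpy as np
    from nms.cpu_nms import cpu_nms   # built by setup.py from cpu_nms.pyx

    # Boxes as rows of (x1, y1, x2, y2, score); the first two overlap heavily.
    dets = np.array([[10, 10, 50, 50, 0.9],
                     [12, 12, 52, 52, 0.8],
                     [100, 100, 150, 150, 0.7]], dtype=np.float32)
    print(cpu_nms(dets, 0.3))   # indices kept at IoU threshold 0.3 -> [0, 2]

If the import fails, the .pyd extensions under nms/ were not built or lib is not on your Python path.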
24 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext --inplace 3 | rm -rf build 4 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m: -------------------------------------------------------------------------------- 1 | function VOCopts = get_voc_opts(path) 2 | 3 | tmp = pwd; 4 | cd(path); 5 | try 6 | addpath('VOCcode'); 7 | VOCinit; 8 | catch 9 | rmpath('VOCcode'); 10 | cd(tmp); 11 | error(sprintf('VOCcode directory not found under %s', path)); 12 | end 13 | rmpath('VOCcode'); 14 | cd(tmp); 15 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m: -------------------------------------------------------------------------------- 1 | function res = voc_eval(path, comp_id, test_set, output_dir) 2 | 3 | VOCopts = get_voc_opts(path); 4 | VOCopts.testset = test_set; 5 | 6 | for i = 1:length(VOCopts.classes) 7 | cls = VOCopts.classes{i}; 8 | res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir); 9 | end 10 | 11 | fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); 12 | fprintf('Results:\n'); 13 | aps = [res(:).ap]'; 14 | fprintf('%.1f\n', aps * 100); 15 | fprintf('%.1f\n', mean(aps) * 100); 16 | fprintf('~~~~~~~~~~~~~~~~~~~~\n'); 17 | 18 | function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir) 19 | 20 | test_set = VOCopts.testset; 21 | year = VOCopts.dataset(4:end); 22 | 23 | addpath(fullfile(VOCopts.datadir, 'VOCcode')); 24 | 25 | res_fn = sprintf(VOCopts.detrespath, comp_id, cls); 26 | 27 | recall = []; 28 | prec = []; 29 | ap = 0; 30 | ap_auc = 0; 31 | 32 | do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); 33 | if do_eval 34 | % Bug in VOCevaldet requires that tic has been called first 35 | tic; 36 | [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); 37 | ap_auc = xVOCap(recall, prec); 38 | 39 | % force plot limits 40 | ylim([0 1]); 41 | xlim([0 1]); 42 | 43 | print(gcf, '-djpeg', '-r0', ... 44 | [output_dir '/' cls '_pr.jpg']); 45 | end 46 | fprintf('!!! %s : %.4f %.4f\n', cls, ap, ap_auc); 47 | 48 | res.recall = recall; 49 | res.prec = prec; 50 | res.ap = ap; 51 | res.ap_auc = ap_auc; 52 | 53 | save([output_dir '/' cls '_pr.mat'], ... 
54 | 'res', 'recall', 'prec', 'ap', 'ap_auc'); 55 | 56 | rmpath(fullfile(VOCopts.datadir, 'VOCcode')); 57 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m: -------------------------------------------------------------------------------- 1 | function ap = xVOCap(rec,prec) 2 | % From the PASCAL VOC 2011 devkit 3 | 4 | mrec=[0 ; rec ; 1]; 5 | mpre=[0 ; prec ; 0]; 6 | for i=numel(mpre)-1:-1:1 7 | mpre(i)=max(mpre(i),mpre(i+1)); 8 | end 9 | i=find(mrec(2:end)~=mrec(1:end-1))+1; 10 | ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); 11 | -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/datasets/ds_utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick 5 | # -------------------------------------------------------- 6 | 7 | import numpy as np 8 | 9 | def unique_boxes(boxes, scale=1.0): 10 | """Return indices of unique boxes.""" 11 | v = np.array([1, 1e3, 1e6, 1e9]) 12 | hashes = np.round(boxes * scale).dot(v) 13 | _, index = np.unique(hashes, return_index=True) 14 | return np.sort(index) 15 | 16 | def xywh_to_xyxy(boxes): 17 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 18 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 19 | 20 | def xyxy_to_xywh(boxes): 21 | """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" 22 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) 23 | 24 | def validate_boxes(boxes, width=0, height=0): 25 | """Check that a set of boxes are valid.""" 26 | x1 = boxes[:, 0] 27 | y1 = boxes[:, 1] 28 | x2 = boxes[:, 2] 29 | y2 = boxes[:, 3] 30 | assert (x1 >= 0).all() 31 | assert (y1 >= 0).all() 32 | assert (x2 >= x1).all() 33 | assert (y2 >= y1).all() 34 | assert (x2 < width).all() 35 | assert (y2 < height).all() 36 | 37 | def filter_small_boxes(boxes, min_size): 38 | w = boxes[:, 2] - boxes[:, 0] 39 | h = boxes[:, 3] - boxes[:, 1] 40 | keep = np.where((w >= min_size) & (h > min_size))[0] 41 | return keep 42 | -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting imdbs by name.""" 9 | 10 | __sets = {} 11 | 12 | from datasets.pascal_voc import pascal_voc 13 | from datasets.coco import coco 14 | import numpy as np 15 | 16 | # Set up voc__ using selective search "fast" mode 17 | for year in ['2007', '2012']: 18 | for split in ['train', 'val', 'trainval', 'test']: 19 | name = 'voc_{}_{}'.format(year, split) 20 | 
__sets[name] = (lambda split=split, year=year: pascal_voc(split, year)) 21 | 22 | # Set up coco_2014_ 23 | for year in ['2014']: 24 | for split in ['train', 'val', 'minival', 'valminusminival']: 25 | name = 'coco_{}_{}'.format(year, split) 26 | __sets[name] = (lambda split=split, year=year: coco(split, year)) 27 | 28 | # Set up coco_2015_ 29 | for year in ['2015']: 30 | for split in ['test', 'test-dev']: 31 | name = 'coco_{}_{}'.format(year, split) 32 | __sets[name] = (lambda split=split, year=year: coco(split, year)) 33 | 34 | def get_imdb(name): 35 | """Get an imdb (image database) by name.""" 36 | if not __sets.has_key(name): 37 | raise KeyError('Unknown dataset: {}'.format(name)) 38 | return __sets[name]() 39 | 40 | def list_imdbs(): 41 | """List all registered imdbs.""" 42 | return __sets.keys() 43 | -------------------------------------------------------------------------------- /lib/datasets/imdb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | import os.path as osp 10 | import PIL 11 | from utils.cython_bbox import bbox_overlaps 12 | import numpy as np 13 | import scipy.sparse 14 | from fast_rcnn.config import cfg 15 | 16 | class imdb(object): 17 | """Image database.""" 18 | 19 | def __init__(self, name): 20 | self._name = name 21 | self._num_classes = 0 22 | self._classes = [] 23 | self._image_index = [] 24 | self._obj_proposer = 'selective_search' 25 | self._roidb = None 26 | self._roidb_handler = self.default_roidb 27 | # Use this dict for storing dataset specific config options 28 | self.config = {} 29 | 30 | @property 31 | def name(self): 32 | return self._name 33 | 34 | @property 35 | def num_classes(self): 36 | return len(self._classes) 37 | 38 | @property 39 | def classes(self): 40 | return self._classes 41 | 42 | @property 43 | def image_index(self): 44 | return self._image_index 45 | 46 | @property 47 | def roidb_handler(self): 48 | return self._roidb_handler 49 | 50 | @roidb_handler.setter 51 | def roidb_handler(self, val): 52 | self._roidb_handler = val 53 | 54 | def set_proposal_method(self, method): 55 | method = eval('self.' + method + '_roidb') 56 | self.roidb_handler = method 57 | 58 | @property 59 | def roidb(self): 60 | # A roidb is a list of dictionaries, each with the following keys: 61 | # boxes 62 | # gt_overlaps 63 | # gt_classes 64 | # flipped 65 | if self._roidb is not None: 66 | return self._roidb 67 | self._roidb = self.roidb_handler() 68 | return self._roidb 69 | 70 | @property 71 | def cache_path(self): 72 | cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache')) 73 | if not os.path.exists(cache_path): 74 | os.makedirs(cache_path) 75 | return cache_path 76 | 77 | @property 78 | def num_images(self): 79 | return len(self.image_index) 80 | 81 | def image_path_at(self, i): 82 | raise NotImplementedError 83 | 84 | def default_roidb(self): 85 | raise NotImplementedError 86 | 87 | def evaluate_detections(self, all_boxes, output_dir=None): 88 | """ 89 | all_boxes is a list of length number-of-classes. 90 | Each list element is a list of length number-of-images. 91 | Each of those list elements is either an empty list [] 92 | or a numpy array of detection. 
93 | 94 | all_boxes[class][image] = [] or np.array of shape #dets x 5 95 | """ 96 | raise NotImplementedError 97 | 98 | def _get_widths(self): 99 | return [PIL.Image.open(self.image_path_at(i)).size[0] 100 | for i in xrange(self.num_images)] 101 | 102 | def append_flipped_images(self): 103 | num_images = self.num_images 104 | widths = self._get_widths() 105 | for i in xrange(num_images): 106 | boxes = self.roidb[i]['boxes'].copy() 107 | oldx1 = boxes[:, 0].copy() 108 | oldx2 = boxes[:, 2].copy() 109 | boxes[:, 0] = widths[i] - oldx2 - 1 110 | boxes[:, 2] = widths[i] - oldx1 - 1 111 | assert (boxes[:, 2] >= boxes[:, 0]).all() 112 | entry = {'boxes' : boxes, 113 | 'gt_overlaps' : self.roidb[i]['gt_overlaps'], 114 | 'gt_classes' : self.roidb[i]['gt_classes'], 115 | 'flipped' : True} 116 | self.roidb.append(entry) 117 | self._image_index = self._image_index * 2 118 | 119 | def evaluate_recall(self, candidate_boxes=None, thresholds=None, 120 | area='all', limit=None): 121 | """Evaluate detection proposal recall metrics. 122 | 123 | Returns: 124 | results: dictionary of results with keys 125 | 'ar': average recall 126 | 'recalls': vector recalls at each IoU overlap threshold 127 | 'thresholds': vector of IoU overlap thresholds 128 | 'gt_overlaps': vector of all ground-truth overlaps 129 | """ 130 | # Record max overlap value for each gt box 131 | # Return vector of overlap values 132 | areas = { 'all': 0, 'small': 1, 'medium': 2, 'large': 3, 133 | '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7} 134 | area_ranges = [ [0**2, 1e5**2], # all 135 | [0**2, 32**2], # small 136 | [32**2, 96**2], # medium 137 | [96**2, 1e5**2], # large 138 | [96**2, 128**2], # 96-128 139 | [128**2, 256**2], # 128-256 140 | [256**2, 512**2], # 256-512 141 | [512**2, 1e5**2], # 512-inf 142 | ] 143 | assert areas.has_key(area), 'unknown area range: {}'.format(area) 144 | area_range = area_ranges[areas[area]] 145 | gt_overlaps = np.zeros(0) 146 | num_pos = 0 147 | for i in xrange(self.num_images): 148 | # Checking for max_overlaps == 1 avoids including crowd annotations 149 | # (...pretty hacking :/) 150 | max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1) 151 | gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) & 152 | (max_gt_overlaps == 1))[0] 153 | gt_boxes = self.roidb[i]['boxes'][gt_inds, :] 154 | gt_areas = self.roidb[i]['seg_areas'][gt_inds] 155 | valid_gt_inds = np.where((gt_areas >= area_range[0]) & 156 | (gt_areas <= area_range[1]))[0] 157 | gt_boxes = gt_boxes[valid_gt_inds, :] 158 | num_pos += len(valid_gt_inds) 159 | 160 | if candidate_boxes is None: 161 | # If candidate_boxes is not supplied, the default is to use the 162 | # non-ground-truth boxes from this roidb 163 | non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0] 164 | boxes = self.roidb[i]['boxes'][non_gt_inds, :] 165 | else: 166 | boxes = candidate_boxes[i] 167 | if boxes.shape[0] == 0: 168 | continue 169 | if limit is not None and boxes.shape[0] > limit: 170 | boxes = boxes[:limit, :] 171 | 172 | overlaps = bbox_overlaps(boxes.astype(np.float), 173 | gt_boxes.astype(np.float)) 174 | 175 | _gt_overlaps = np.zeros((gt_boxes.shape[0])) 176 | for j in xrange(gt_boxes.shape[0]): 177 | # find which proposal box maximally covers each gt box 178 | argmax_overlaps = overlaps.argmax(axis=0) 179 | # and get the iou amount of coverage for each gt box 180 | max_overlaps = overlaps.max(axis=0) 181 | # find which gt box is 'best' covered (i.e. 
'best' = most iou) 182 | gt_ind = max_overlaps.argmax() 183 | gt_ovr = max_overlaps.max() 184 | assert(gt_ovr >= 0) 185 | # find the proposal box that covers the best covered gt box 186 | box_ind = argmax_overlaps[gt_ind] 187 | # record the iou coverage of this gt box 188 | _gt_overlaps[j] = overlaps[box_ind, gt_ind] 189 | assert(_gt_overlaps[j] == gt_ovr) 190 | # mark the proposal box and the gt box as used 191 | overlaps[box_ind, :] = -1 192 | overlaps[:, gt_ind] = -1 193 | # append recorded iou coverage level 194 | gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) 195 | 196 | gt_overlaps = np.sort(gt_overlaps) 197 | if thresholds is None: 198 | step = 0.05 199 | thresholds = np.arange(0.5, 0.95 + 1e-5, step) 200 | recalls = np.zeros_like(thresholds) 201 | # compute recall for each iou threshold 202 | for i, t in enumerate(thresholds): 203 | recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) 204 | # ar = 2 * np.trapz(recalls, thresholds) 205 | ar = recalls.mean() 206 | return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, 207 | 'gt_overlaps': gt_overlaps} 208 | 209 | def create_roidb_from_box_list(self, box_list, gt_roidb): 210 | assert len(box_list) == self.num_images, \ 211 | 'Number of boxes must match number of ground-truth images' 212 | roidb = [] 213 | for i in xrange(self.num_images): 214 | boxes = box_list[i] 215 | num_boxes = boxes.shape[0] 216 | overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) 217 | 218 | if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: 219 | gt_boxes = gt_roidb[i]['boxes'] 220 | gt_classes = gt_roidb[i]['gt_classes'] 221 | gt_overlaps = bbox_overlaps(boxes.astype(np.float), 222 | gt_boxes.astype(np.float)) 223 | argmaxes = gt_overlaps.argmax(axis=1) 224 | maxes = gt_overlaps.max(axis=1) 225 | I = np.where(maxes > 0)[0] 226 | overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] 227 | 228 | overlaps = scipy.sparse.csr_matrix(overlaps) 229 | roidb.append({ 230 | 'boxes' : boxes, 231 | 'gt_classes' : np.zeros((num_boxes,), dtype=np.int32), 232 | 'gt_overlaps' : overlaps, 233 | 'flipped' : False, 234 | 'seg_areas' : np.zeros((num_boxes,), dtype=np.float32), 235 | }) 236 | return roidb 237 | 238 | @staticmethod 239 | def merge_roidbs(a, b): 240 | assert len(a) == len(b) 241 | for i in xrange(len(a)): 242 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 243 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 244 | b[i]['gt_classes'])) 245 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 246 | b[i]['gt_overlaps']]) 247 | a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'], 248 | b[i]['seg_areas'])) 249 | return a 250 | 251 | def competition_mode(self, on): 252 | """Turn competition mode on or off.""" 253 | pass 254 | -------------------------------------------------------------------------------- /lib/datasets/pascal_voc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from datasets.imdb import imdb 10 | import datasets.ds_utils as ds_utils 11 | import xml.etree.ElementTree as ET 12 | import numpy as np 13 | import scipy.sparse 14 | import scipy.io as sio 15 | import utils.cython_bbox 16 | import cPickle 17 | import subprocess 18 | import uuid 19 | from voc_eval import 
voc_eval 20 | from fast_rcnn.config import cfg 21 | 22 | class pascal_voc(imdb): 23 | def __init__(self, image_set, year, devkit_path=None): 24 | imdb.__init__(self, 'voc_' + year + '_' + image_set) 25 | self._year = year 26 | self._image_set = image_set 27 | self._devkit_path = self._get_default_path() if devkit_path is None \ 28 | else devkit_path 29 | self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) 30 | self._classes = ('__background__', # always index 0 31 | 'aeroplane', 'bicycle', 'bird', 'boat', 32 | 'bottle', 'bus', 'car', 'cat', 'chair', 33 | 'cow', 'diningtable', 'dog', 'horse', 34 | 'motorbike', 'person', 'pottedplant', 35 | 'sheep', 'sofa', 'train', 'tvmonitor') 36 | self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes))) 37 | self._image_ext = '.jpg' 38 | self._image_index = self._load_image_set_index() 39 | # Default to roidb handler 40 | self._roidb_handler = self.selective_search_roidb 41 | self._salt = str(uuid.uuid4()) 42 | self._comp_id = 'comp4' 43 | 44 | # PASCAL specific config options 45 | self.config = {'cleanup' : True, 46 | 'use_salt' : True, 47 | 'use_diff' : False, 48 | 'matlab_eval' : False, 49 | 'rpn_file' : None, 50 | 'min_size' : 2} 51 | 52 | assert os.path.exists(self._devkit_path), \ 53 | 'VOCdevkit path does not exist: {}'.format(self._devkit_path) 54 | assert os.path.exists(self._data_path), \ 55 | 'Path does not exist: {}'.format(self._data_path) 56 | 57 | def image_path_at(self, i): 58 | """ 59 | Return the absolute path to image i in the image sequence. 60 | """ 61 | return self.image_path_from_index(self._image_index[i]) 62 | 63 | def image_path_from_index(self, index): 64 | """ 65 | Construct an image path from the image's "index" identifier. 66 | """ 67 | image_path = os.path.join(self._data_path, 'JPEGImages', 68 | index + self._image_ext) 69 | assert os.path.exists(image_path), \ 70 | 'Path does not exist: {}'.format(image_path) 71 | return image_path 72 | 73 | def _load_image_set_index(self): 74 | """ 75 | Load the indexes listed in this dataset's image set file. 76 | """ 77 | # Example path to image set file: 78 | # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt 79 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', 80 | self._image_set + '.txt') 81 | assert os.path.exists(image_set_file), \ 82 | 'Path does not exist: {}'.format(image_set_file) 83 | with open(image_set_file) as f: 84 | image_index = [x.strip() for x in f.readlines()] 85 | return image_index 86 | 87 | def _get_default_path(self): 88 | """ 89 | Return the default path where PASCAL VOC is expected to be installed. 90 | """ 91 | return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year) 92 | 93 | def gt_roidb(self): 94 | """ 95 | Return the database of ground-truth regions of interest. 96 | 97 | This function loads/saves from/to a cache file to speed up future calls. 
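        N.B. (added note) the cached roidb is returned unconditionally whenever
        the cache file exists, so delete data/cache/<name>_gt_roidb.pkl after
        changing the image set or the annotations; otherwise the stale copy is
        loaded.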
98 | """ 99 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') 100 | if os.path.exists(cache_file): 101 | with open(cache_file, 'rb') as fid: 102 | roidb = cPickle.load(fid) 103 | print '{} gt roidb loaded from {}'.format(self.name, cache_file) 104 | return roidb 105 | 106 | gt_roidb = [self._load_pascal_annotation(index) 107 | for index in self.image_index] 108 | with open(cache_file, 'wb') as fid: 109 | cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) 110 | print 'wrote gt roidb to {}'.format(cache_file) 111 | 112 | return gt_roidb 113 | 114 | def selective_search_roidb(self): 115 | """ 116 | Return the database of selective search regions of interest. 117 | Ground-truth ROIs are also included. 118 | 119 | This function loads/saves from/to a cache file to speed up future calls. 120 | """ 121 | cache_file = os.path.join(self.cache_path, 122 | self.name + '_selective_search_roidb.pkl') 123 | 124 | if os.path.exists(cache_file): 125 | with open(cache_file, 'rb') as fid: 126 | roidb = cPickle.load(fid) 127 | print '{} ss roidb loaded from {}'.format(self.name, cache_file) 128 | return roidb 129 | 130 | if int(self._year) == 2007 or self._image_set != 'test': 131 | gt_roidb = self.gt_roidb() 132 | ss_roidb = self._load_selective_search_roidb(gt_roidb) 133 | roidb = imdb.merge_roidbs(gt_roidb, ss_roidb) 134 | else: 135 | roidb = self._load_selective_search_roidb(None) 136 | with open(cache_file, 'wb') as fid: 137 | cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL) 138 | print 'wrote ss roidb to {}'.format(cache_file) 139 | 140 | return roidb 141 | 142 | def rpn_roidb(self): 143 | if int(self._year) == 2007 or self._image_set != 'test': 144 | gt_roidb = self.gt_roidb() 145 | rpn_roidb = self._load_rpn_roidb(gt_roidb) 146 | roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb) 147 | else: 148 | roidb = self._load_rpn_roidb(None) 149 | 150 | return roidb 151 | 152 | def _load_rpn_roidb(self, gt_roidb): 153 | filename = self.config['rpn_file'] 154 | print 'loading {}'.format(filename) 155 | assert os.path.exists(filename), \ 156 | 'rpn data not found at: {}'.format(filename) 157 | with open(filename, 'rb') as f: 158 | box_list = cPickle.load(f) 159 | return self.create_roidb_from_box_list(box_list, gt_roidb) 160 | 161 | def _load_selective_search_roidb(self, gt_roidb): 162 | filename = os.path.abspath(os.path.join(cfg.DATA_DIR, 163 | 'selective_search_data', 164 | self.name + '.mat')) 165 | assert os.path.exists(filename), \ 166 | 'Selective search data not found at: {}'.format(filename) 167 | raw_data = sio.loadmat(filename)['boxes'].ravel() 168 | 169 | box_list = [] 170 | for i in xrange(raw_data.shape[0]): 171 | boxes = raw_data[i][:, (1, 0, 3, 2)] - 1 172 | keep = ds_utils.unique_boxes(boxes) 173 | boxes = boxes[keep, :] 174 | keep = ds_utils.filter_small_boxes(boxes, self.config['min_size']) 175 | boxes = boxes[keep, :] 176 | box_list.append(boxes) 177 | 178 | return self.create_roidb_from_box_list(box_list, gt_roidb) 179 | 180 | def _load_pascal_annotation(self, index): 181 | """ 182 | Load image and bounding boxes info from XML file in the PASCAL VOC 183 | format. 
184 | """ 185 | filename = os.path.join(self._data_path, 'Annotations', index + '.xml') 186 | tree = ET.parse(filename) 187 | objs = tree.findall('object') 188 | if not self.config['use_diff']: 189 | # Exclude the samples labeled as difficult 190 | non_diff_objs = [ 191 | obj for obj in objs if int(obj.find('difficult').text) == 0] 192 | # if len(non_diff_objs) != len(objs): 193 | # print 'Removed {} difficult objects'.format( 194 | # len(objs) - len(non_diff_objs)) 195 | objs = non_diff_objs 196 | num_objs = len(objs) 197 | 198 | boxes = np.zeros((num_objs, 4), dtype=np.uint16) 199 | gt_classes = np.zeros((num_objs), dtype=np.int32) 200 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) 201 | # "Seg" area for pascal is just the box area 202 | seg_areas = np.zeros((num_objs), dtype=np.float32) 203 | 204 | # Load object bounding boxes into a data frame. 205 | for ix, obj in enumerate(objs): 206 | bbox = obj.find('bndbox') 207 | # Make pixel indexes 0-based 208 | x1 = float(bbox.find('xmin').text) - 1 209 | y1 = float(bbox.find('ymin').text) - 1 210 | x2 = float(bbox.find('xmax').text) - 1 211 | y2 = float(bbox.find('ymax').text) - 1 212 | cls = self._class_to_ind[obj.find('name').text.lower().strip()] 213 | boxes[ix, :] = [x1, y1, x2, y2] 214 | gt_classes[ix] = cls 215 | overlaps[ix, cls] = 1.0 216 | seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) 217 | 218 | overlaps = scipy.sparse.csr_matrix(overlaps) 219 | 220 | return {'boxes' : boxes, 221 | 'gt_classes': gt_classes, 222 | 'gt_overlaps' : overlaps, 223 | 'flipped' : False, 224 | 'seg_areas' : seg_areas} 225 | 226 | def _get_comp_id(self): 227 | comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt'] 228 | else self._comp_id) 229 | return comp_id 230 | 231 | def _get_voc_results_file_template(self): 232 | # VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt 233 | filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt' 234 | path = os.path.join( 235 | self._devkit_path, 236 | 'results', 237 | 'VOC' + self._year, 238 | 'Main', 239 | filename) 240 | return path 241 | 242 | def _write_voc_results_file(self, all_boxes): 243 | for cls_ind, cls in enumerate(self.classes): 244 | if cls == '__background__': 245 | continue 246 | print 'Writing {} VOC results file'.format(cls) 247 | filename = self._get_voc_results_file_template().format(cls) 248 | with open(filename, 'wt') as f: 249 | for im_ind, index in enumerate(self.image_index): 250 | dets = all_boxes[cls_ind][im_ind] 251 | if dets == []: 252 | continue 253 | # the VOCdevkit expects 1-based indices 254 | for k in xrange(dets.shape[0]): 255 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 256 | format(index, dets[k, -1], 257 | dets[k, 0] + 1, dets[k, 1] + 1, 258 | dets[k, 2] + 1, dets[k, 3] + 1)) 259 | 260 | def _do_python_eval(self, output_dir = 'output'): 261 | annopath = os.path.join( 262 | self._devkit_path, 263 | 'VOC' + self._year, 264 | 'Annotations', 265 | '{:s}.xml') 266 | imagesetfile = os.path.join( 267 | self._devkit_path, 268 | 'VOC' + self._year, 269 | 'ImageSets', 270 | 'Main', 271 | self._image_set + '.txt') 272 | cachedir = os.path.join(self._devkit_path, 'annotations_cache') 273 | aps = [] 274 | # The PASCAL VOC metric changed in 2010 275 | use_07_metric = True if int(self._year) < 2010 else False 276 | print 'VOC07 metric? 
' + ('Yes' if use_07_metric else 'No') 277 | if not os.path.isdir(output_dir): 278 | os.mkdir(output_dir) 279 | for i, cls in enumerate(self._classes): 280 | if cls == '__background__': 281 | continue 282 | filename = self._get_voc_results_file_template().format(cls) 283 | rec, prec, ap = voc_eval( 284 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 285 | use_07_metric=use_07_metric) 286 | aps += [ap] 287 | print('AP for {} = {:.4f}'.format(cls, ap)) 288 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f: 289 | cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 290 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 291 | print('~~~~~~~~') 292 | print('Results:') 293 | for ap in aps: 294 | print('{:.3f}'.format(ap)) 295 | print('{:.3f}'.format(np.mean(aps))) 296 | print('~~~~~~~~') 297 | print('') 298 | print('--------------------------------------------------------------') 299 | print('Results computed with the **unofficial** Python eval code.') 300 | print('Results should be very close to the official MATLAB eval code.') 301 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 302 | print('-- Thanks, The Management') 303 | print('--------------------------------------------------------------') 304 | 305 | def _do_matlab_eval(self, output_dir='output'): 306 | print '-----------------------------------------------------' 307 | print 'Computing results with the official MATLAB eval code.' 308 | print '-----------------------------------------------------' 309 | path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', 310 | 'VOCdevkit-matlab-wrapper') 311 | cmd = 'cd {} && '.format(path) 312 | cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB) 313 | cmd += '-r "dbstop if error; ' 314 | cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \ 315 | .format(self._devkit_path, self._get_comp_id(), 316 | self._image_set, output_dir) 317 | print('Running:\n{}'.format(cmd)) 318 | status = subprocess.call(cmd, shell=True) 319 | 320 | def evaluate_detections(self, all_boxes, output_dir): 321 | self._write_voc_results_file(all_boxes) 322 | self._do_python_eval(output_dir) 323 | if self.config['matlab_eval']: 324 | self._do_matlab_eval(output_dir) 325 | if self.config['cleanup']: 326 | for cls in self._classes: 327 | if cls == '__background__': 328 | continue 329 | filename = self._get_voc_results_file_template().format(cls) 330 | os.remove(filename) 331 | 332 | def competition_mode(self, on): 333 | if on: 334 | self.config['use_salt'] = False 335 | self.config['cleanup'] = False 336 | else: 337 | self.config['use_salt'] = True 338 | self.config['cleanup'] = True 339 | 340 | if __name__ == '__main__': 341 | from datasets.pascal_voc import pascal_voc 342 | d = pascal_voc('trainval', '2007') 343 | res = d.roidb 344 | from IPython import embed; embed() 345 | -------------------------------------------------------------------------------- /lib/datasets/tools/mcg_munge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | """Hacky tool to convert file system layout of MCG boxes downloaded from 5 | http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/ 6 | so that it's consistent with those computed by Jan Hosang (see: 7 | http://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal- 8 | computing/research/object-recognition-and-scene-understanding/how- 9 | good-are-detection-proposals-really/) 10 | 11 | NB: Boxes from the MCG website are in (y1, x1, y2, 
x2) order. 12 | Boxes from Hosang et al. are in (x1, y1, x2, y2) order. 13 | """ 14 | 15 | def munge(src_dir): 16 | # stored as: ./MCG-COCO-val2014-boxes/COCO_val2014_000000193401.mat 17 | # want: ./MCG/mat/COCO_val2014_0/COCO_val2014_000000141/COCO_val2014_000000141334.mat 18 | 19 | files = os.listdir(src_dir) 20 | for fn in files: 21 | base, ext = os.path.splitext(fn) 22 | # first 14 chars / first 22 chars / all chars + .mat 23 | # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat 24 | first = base[:14] 25 | second = base[:22] 26 | dst_dir = os.path.join('MCG', 'mat', first, second) 27 | if not os.path.exists(dst_dir): 28 | os.makedirs(dst_dir) 29 | src = os.path.join(src_dir, fn) 30 | dst = os.path.join(dst_dir, fn) 31 | print 'MV: {} -> {}'.format(src, dst) 32 | os.rename(src, dst) 33 | 34 | if __name__ == '__main__': 35 | # src_dir should look something like: 36 | # src_dir = 'MCG-COCO-val2014-boxes' 37 | src_dir = sys.argv[1] 38 | munge(src_dir) 39 | -------------------------------------------------------------------------------- /lib/datasets/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import cPickle 10 | import numpy as np 11 | 12 | def parse_rec(filename): 13 | """ Parse a PASCAL VOC xml file """ 14 | tree = ET.parse(filename) 15 | objects = [] 16 | for obj in tree.findall('object'): 17 | obj_struct = {} 18 | obj_struct['name'] = obj.find('name').text 19 | obj_struct['pose'] = obj.find('pose').text 20 | obj_struct['truncated'] = int(obj.find('truncated').text) 21 | obj_struct['difficult'] = int(obj.find('difficult').text) 22 | bbox = obj.find('bndbox') 23 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 24 | int(bbox.find('ymin').text), 25 | int(bbox.find('xmax').text), 26 | int(bbox.find('ymax').text)] 27 | objects.append(obj_struct) 28 | 29 | return objects 30 | 31 | def voc_ap(rec, prec, use_07_metric=False): 32 | """ ap = voc_ap(rec, prec, [use_07_metric]) 33 | Compute VOC AP given precision and recall. 34 | If use_07_metric is true, uses the 35 | VOC 07 11 point method (default:False). 36 | """ 37 | if use_07_metric: 38 | # 11 point metric 39 | ap = 0. 40 | for t in np.arange(0., 1.1, 0.1): 41 | if np.sum(rec >= t) == 0: 42 | p = 0 43 | else: 44 | p = np.max(prec[rec >= t]) 45 | ap = ap + p / 11. 46 | else: 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], rec, [1.])) 50 | mpre = np.concatenate(([0.], prec, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | def voc_eval(detpath, 65 | annopath, 66 | imagesetfile, 67 | classname, 68 | cachedir, 69 | ovthresh=0.5, 70 | use_07_metric=False): 71 | """rec, prec, ap = voc_eval(detpath, 72 | annopath, 73 | imagesetfile, 74 | classname, 75 | [ovthresh], 76 | [use_07_metric]) 77 | 78 | Top level function that does the PASCAL VOC evaluation. 
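    Illustrative call (a sketch mirroring pascal_voc._do_python_eval; the
    format strings are placeholders that voc_eval fills in per class/image):

        rec, prec, ap = voc_eval('results/comp4_det_test_{:s}.txt',
                                 'VOCdevkit2007/VOC2007/Annotations/{:s}.xml',
                                 'VOCdevkit2007/VOC2007/ImageSets/Main/test.txt',
                                 'car', 'annotations_cache',
                                 ovthresh=0.5, use_07_metric=True)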
79 | 80 | detpath: Path to detections 81 | detpath.format(classname) should produce the detection results file. 82 | annopath: Path to annotations 83 | annopath.format(imagename) should be the xml annotations file. 84 | imagesetfile: Text file containing the list of images, one image per line. 85 | classname: Category name (duh) 86 | cachedir: Directory for caching the annotations 87 | [ovthresh]: Overlap threshold (default = 0.5) 88 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 89 | (default False) 90 | """ 91 | # assumes detections are in detpath.format(classname) 92 | # assumes annotations are in annopath.format(imagename) 93 | # assumes imagesetfile is a text file with each line an image name 94 | # cachedir caches the annotations in a pickle file 95 | 96 | # first load gt 97 | if not os.path.isdir(cachedir): 98 | os.mkdir(cachedir) 99 | cachefile = os.path.join(cachedir, 'annots.pkl') 100 | # read list of images 101 | with open(imagesetfile, 'r') as f: 102 | lines = f.readlines() 103 | imagenames = [x.strip() for x in lines] 104 | 105 | if not os.path.isfile(cachefile): 106 | # load annots 107 | recs = {} 108 | for i, imagename in enumerate(imagenames): 109 | recs[imagename] = parse_rec(annopath.format(imagename)) 110 | if i % 100 == 0: 111 | print 'Reading annotation for {:d}/{:d}'.format( 112 | i + 1, len(imagenames)) 113 | # save 114 | print 'Saving cached annotations to {:s}'.format(cachefile) 115 | with open(cachefile, 'w') as f: 116 | cPickle.dump(recs, f) 117 | else: 118 | # load 119 | with open(cachefile, 'r') as f: 120 | recs = cPickle.load(f) 121 | 122 | # extract gt objects for this class 123 | class_recs = {} 124 | npos = 0 125 | for imagename in imagenames: 126 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 127 | bbox = np.array([x['bbox'] for x in R]) 128 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 129 | det = [False] * len(R) 130 | npos = npos + sum(~difficult) 131 | class_recs[imagename] = {'bbox': bbox, 132 | 'difficult': difficult, 133 | 'det': det} 134 | 135 | # read dets 136 | detfile = detpath.format(classname) 137 | with open(detfile, 'r') as f: 138 | lines = f.readlines() 139 | 140 | splitlines = [x.strip().split(' ') for x in lines] 141 | image_ids = [x[0] for x in splitlines] 142 | confidence = np.array([float(x[1]) for x in splitlines]) 143 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 144 | 145 | # sort by confidence 146 | sorted_ind = np.argsort(-confidence) 147 | sorted_scores = np.sort(-confidence) 148 | BB = BB[sorted_ind, :] 149 | image_ids = [image_ids[x] for x in sorted_ind] 150 | 151 | # go down dets and mark TPs and FPs 152 | nd = len(image_ids) 153 | tp = np.zeros(nd) 154 | fp = np.zeros(nd) 155 | for d in range(nd): 156 | R = class_recs[image_ids[d]] 157 | bb = BB[d, :].astype(float) 158 | ovmax = -np.inf 159 | BBGT = R['bbox'].astype(float) 160 | 161 | if BBGT.size > 0: 162 | # compute overlaps 163 | # intersection 164 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 165 | iymin = np.maximum(BBGT[:, 1], bb[1]) 166 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 167 | iymax = np.minimum(BBGT[:, 3], bb[3]) 168 | iw = np.maximum(ixmax - ixmin + 1., 0.) 169 | ih = np.maximum(iymax - iymin + 1., 0.) 170 | inters = iw * ih 171 | 172 | # union 173 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 174 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 175 | (BBGT[:, 3] - BBGT[:, 1] + 1.) 
- inters) 176 | 177 | overlaps = inters / uni 178 | ovmax = np.max(overlaps) 179 | jmax = np.argmax(overlaps) 180 | 181 | if ovmax > ovthresh: 182 | if not R['difficult'][jmax]: 183 | if not R['det'][jmax]: 184 | tp[d] = 1. 185 | R['det'][jmax] = 1 186 | else: 187 | fp[d] = 1. 188 | else: 189 | fp[d] = 1. 190 | 191 | # compute precision recall 192 | fp = np.cumsum(fp) 193 | tp = np.cumsum(tp) 194 | rec = tp / float(npos) 195 | # avoid divide by zero in case the first detection matches a difficult 196 | # ground truth 197 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 198 | ap = voc_ap(rec, prec, use_07_metric) 199 | 200 | return rec, prec, ap 201 | -------------------------------------------------------------------------------- /lib/fast_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/fast_rcnn/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def bbox_transform(ex_rois, gt_rois): 11 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 12 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 13 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 14 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 15 | 16 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 17 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 18 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 19 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 20 | 21 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 22 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 23 | targets_dw = np.log(gt_widths / ex_widths) 24 | targets_dh = np.log(gt_heights / ex_heights) 25 | 26 | targets = np.vstack( 27 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 28 | return targets 29 | 30 | def bbox_transform_inv(boxes, deltas): 31 | if boxes.shape[0] == 0: 32 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 33 | 34 | boxes = boxes.astype(deltas.dtype, copy=False) 35 | 36 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 37 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 38 | ctr_x = boxes[:, 0] + 0.5 * widths 39 | ctr_y = boxes[:, 1] + 0.5 * heights 40 | 41 | dx = deltas[:, 0::4] 42 | dy = deltas[:, 1::4] 43 | dw = deltas[:, 2::4] 44 | dh = deltas[:, 3::4] 45 | 46 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 47 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 48 | pred_w = np.exp(dw) * widths[:, np.newaxis] 49 | pred_h = np.exp(dh) * heights[:, np.newaxis] 50 | 51 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 52 | # x1 53 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 54 | # y1 55 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 56 | # x2 57 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 58 | # y2 59 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 60 | 61 | return pred_boxes 62 | 63 | def clip_boxes(boxes, im_shape): 64 | """ 65 | Clip 
boxes to image boundaries. 66 | """ 67 | 68 | # x1 >= 0 69 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 70 | # y1 >= 0 71 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 72 | # x2 < im_shape[1] 73 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 74 | # y2 < im_shape[0] 75 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 76 | return boxes 77 | -------------------------------------------------------------------------------- /lib/fast_rcnn/config.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Fast R-CNN config system. 9 | 10 | This file specifies default config options for Fast R-CNN. You should not 11 | change values in this file. Instead, you should write a config file (in yaml) 12 | and use cfg_from_file(yaml_file) to load it and override the default options. 13 | 14 | Most tools in $ROOT/tools take a --cfg option to specify an override file. 15 | - See tools/{train,test}_net.py for example code that uses cfg_from_file() 16 | - See experiments/cfgs/*.yml for example YAML config override files 17 | """ 18 | 19 | import os 20 | import os.path as osp 21 | import numpy as np 22 | # `pip install easydict` if you don't have it 23 | from easydict import EasyDict as edict 24 | 25 | __C = edict() 26 | # Consumers can get config by: 27 | # from fast_rcnn_config import cfg 28 | cfg = __C 29 | 30 | # 31 | # Training options 32 | # 33 | 34 | __C.TRAIN = edict() 35 | 36 | # Scales to use during training (can list multiple scales) 37 | # Each scale is the pixel size of an image's shortest side 38 | __C.TRAIN.SCALES = (600,) 39 | 40 | # Max pixel size of the longest side of a scaled input image 41 | __C.TRAIN.MAX_SIZE = 1000 42 | 43 | # Images to use per minibatch 44 | __C.TRAIN.IMS_PER_BATCH = 2 45 | 46 | # Minibatch size (number of regions of interest [ROIs]) 47 | __C.TRAIN.BATCH_SIZE = 128 48 | 49 | # Fraction of minibatch that is labeled foreground (i.e. class > 0) 50 | __C.TRAIN.FG_FRACTION = 0.25 51 | 52 | # Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) 53 | __C.TRAIN.FG_THRESH = 0.5 54 | 55 | # Overlap threshold for a ROI to be considered background (class = 0 if 56 | # overlap in [LO, HI)) 57 | __C.TRAIN.BG_THRESH_HI = 0.5 58 | __C.TRAIN.BG_THRESH_LO = 0.1 59 | 60 | # Use horizontally-flipped images during training? 
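# (Flipping is implemented by imdb.append_flipped_images(), which doubles the
#  image index and mirrors each box: x1' = width - x2 - 1, x2' = width - x1 - 1.)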
61 | __C.TRAIN.USE_FLIPPED = True 62 | 63 | # Train bounding-box regressors 64 | __C.TRAIN.BBOX_REG = True 65 | 66 | # Overlap required between a ROI and ground-truth box in order for that ROI to 67 | # be used as a bounding-box regression training example 68 | __C.TRAIN.BBOX_THRESH = 0.5 69 | 70 | # Iterations between snapshots 71 | __C.TRAIN.SNAPSHOT_ITERS = 10000 72 | 73 | # solver.prototxt specifies the snapshot path prefix; this adds an optional 74 | # infix to yield the path: <prefix>[_<infix>]_iters_XYZ.caffemodel 75 | __C.TRAIN.SNAPSHOT_INFIX = '' 76 | 77 | # Use a prefetch thread in roi_data_layer.layer 78 | # So far I haven't found this useful; likely more engineering work is required 79 | __C.TRAIN.USE_PREFETCH = False 80 | 81 | # Normalize the targets (subtract empirical mean, divide by empirical stddev) 82 | __C.TRAIN.BBOX_NORMALIZE_TARGETS = True 83 | # Deprecated (inside weights) 84 | __C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 85 | # Normalize the targets using "precomputed" (or made up) means and stdevs 86 | # (BBOX_NORMALIZE_TARGETS must also be True) 87 | __C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = False 88 | __C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0) 89 | __C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2) 90 | 91 | # Train using these proposals 92 | __C.TRAIN.PROPOSAL_METHOD = 'selective_search' 93 | 94 | # Make minibatches from images that have similar aspect ratios (i.e. both 95 | # tall and thin or both short and wide) in order to avoid wasting computation 96 | # on zero-padding. 97 | __C.TRAIN.ASPECT_GROUPING = True 98 | 99 | # Use RPN to detect objects 100 | __C.TRAIN.HAS_RPN = False 101 | # IOU >= thresh: positive example 102 | __C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 103 | # IOU < thresh: negative example 104 | __C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 105 | # If an anchor satisfies both the positive and negative conditions, set it to negative 106 | __C.TRAIN.RPN_CLOBBER_POSITIVES = False 107 | # Max number of foreground examples 108 | __C.TRAIN.RPN_FG_FRACTION = 0.5 109 | # Total number of examples 110 | __C.TRAIN.RPN_BATCHSIZE = 256 111 | # NMS threshold used on RPN proposals 112 | __C.TRAIN.RPN_NMS_THRESH = 0.7 113 | # Number of top scoring boxes to keep before applying NMS to RPN proposals 114 | __C.TRAIN.RPN_PRE_NMS_TOP_N = 12000 115 | # Number of top scoring boxes to keep after applying NMS to RPN proposals 116 | __C.TRAIN.RPN_POST_NMS_TOP_N = 2000 117 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) 118 | __C.TRAIN.RPN_MIN_SIZE = 16 119 | # Deprecated (outside weights) 120 | __C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 121 | # Give the positive RPN examples weight of p * 1 / {num positives} 122 | # and give negatives a weight of (1 - p) 123 | # Set to -1.0 to use uniform example weighting 124 | __C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 125 | 126 | 127 | # 128 | # Testing options 129 | # 130 | 131 | __C.TEST = edict() 132 | 133 | # Scales to use during testing (can list multiple scales) 134 | # Each scale is the pixel size of an image's shortest side 135 | __C.TEST.SCALES = (600,) 136 | 137 | # Max pixel size of the longest side of a scaled input image 138 | __C.TEST.MAX_SIZE = 1000 139 | 140 | # Overlap threshold used for non-maximum suppression (suppress boxes with 141 | # IoU >= this threshold) 142 | __C.TEST.NMS = 0.3 143 | 144 | # Experimental: treat the (K+1) units in the cls_score layer as linear 145 | # predictors (trained, e.g., with one-vs-rest SVMs). 
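# (When True, im_detect in fast_rcnn/test.py reads the raw 'cls_score' blob
#  instead of the softmax output 'cls_prob'.)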
146 | __C.TEST.SVM = False 147 | 148 | # Test using bounding-box regressors 149 | __C.TEST.BBOX_REG = True 150 | 151 | # Propose boxes 152 | __C.TEST.HAS_RPN = False 153 | 154 | # Test using these proposals 155 | __C.TEST.PROPOSAL_METHOD = 'selective_search' 156 | 157 | ## NMS threshold used on RPN proposals 158 | __C.TEST.RPN_NMS_THRESH = 0.7 159 | ## Number of top scoring boxes to keep before applying NMS to RPN proposals 160 | __C.TEST.RPN_PRE_NMS_TOP_N = 6000 161 | ## Number of top scoring boxes to keep after applying NMS to RPN proposals 162 | __C.TEST.RPN_POST_NMS_TOP_N = 300 163 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) 164 | __C.TEST.RPN_MIN_SIZE = 16 165 | 166 | 167 | # 168 | # MISC 169 | # 170 | 171 | # The mapping from image coordinates to feature map coordinates might cause 172 | # some boxes that are distinct in image space to become identical in feature 173 | # coordinates. If DEDUP_BOXES > 0, then DEDUP_BOXES is used as the scale factor 174 | # for identifying duplicate boxes. 175 | # 1/16 is correct for {Alex,Caffe}Net, VGG_CNN_M_1024, and VGG16 176 | __C.DEDUP_BOXES = 1./16. 177 | 178 | # Pixel mean values (BGR order) as a (1, 1, 3) array 179 | # We use the same pixel mean for all networks even though it's not exactly what 180 | # they were trained with 181 | __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 182 | 183 | # For reproducibility 184 | __C.RNG_SEED = 3 185 | 186 | # A small number that's used many times 187 | __C.EPS = 1e-14 188 | 189 | # Root directory of project 190 | __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 191 | 192 | # Data directory 193 | __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) 194 | 195 | # Model directory 196 | __C.MODELS_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'models', 'pascal_voc')) 197 | 198 | # Name of (or path to) the matlab executable 199 | __C.MATLAB = 'matlab' 200 | 201 | # Place outputs under an experiments directory 202 | __C.EXP_DIR = 'default' 203 | 204 | # Use GPU implementation of non-maximum suppression 205 | __C.USE_GPU_NMS = True 206 | 207 | # Default GPU device id 208 | __C.GPU_ID = 0 209 | 210 | 211 | def get_output_dir(imdb, net=None): 212 | """Return the directory where experimental artifacts are placed. 213 | If the directory does not exist, it is created. 214 | 215 | A canonical path is built using the name from an imdb and a network 216 | (if not None). 217 | """ 218 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) 219 | if net is not None: 220 | outdir = osp.join(outdir, net.name) 221 | if not os.path.exists(outdir): 222 | os.makedirs(outdir) 223 | return outdir 224 | 225 | def _merge_a_into_b(a, b): 226 | """Merge config dictionary a into config dictionary b, clobbering the 227 | options in b whenever they are also specified in a. 228 | """ 229 | if type(a) is not edict: 230 | return 231 | 232 | for k, v in a.iteritems(): 233 | # a must specify keys that are in b 234 | if not b.has_key(k): 235 | raise KeyError('{} is not a valid config key'.format(k)) 236 | 237 | # the types must match, too 238 | old_type = type(b[k]) 239 | if old_type is not type(v): 240 | if isinstance(b[k], np.ndarray): 241 | v = np.array(v, dtype=b[k].dtype) 242 | else: 243 | raise ValueError(('Type mismatch ({} vs. 
{}) ' 244 | 'for config key: {}').format(type(b[k]), 245 | type(v), k)) 246 | 247 | # recursively merge dicts 248 | if type(v) is edict: 249 | try: 250 | _merge_a_into_b(a[k], b[k]) 251 | except: 252 | print('Error under config key: {}'.format(k)) 253 | raise 254 | else: 255 | b[k] = v 256 | 257 | def cfg_from_file(filename): 258 | """Load a config file and merge it into the default options.""" 259 | import yaml 260 | with open(filename, 'r') as f: 261 | yaml_cfg = edict(yaml.load(f)) 262 | 263 | _merge_a_into_b(yaml_cfg, __C) 264 | 265 | def cfg_from_list(cfg_list): 266 | """Set config keys via list (e.g., from command line).""" 267 | from ast import literal_eval 268 | assert len(cfg_list) % 2 == 0 269 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 270 | key_list = k.split('.') 271 | d = __C 272 | for subkey in key_list[:-1]: 273 | assert d.has_key(subkey) 274 | d = d[subkey] 275 | subkey = key_list[-1] 276 | assert d.has_key(subkey) 277 | try: 278 | value = literal_eval(v) 279 | except: 280 | # handle the case when v is a string literal 281 | value = v 282 | assert type(value) == type(d[subkey]), \ 283 | 'type {} does not match original type {}'.format( 284 | type(value), type(d[subkey])) 285 | d[subkey] = value 286 | -------------------------------------------------------------------------------- /lib/fast_rcnn/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from fast_rcnn.config import cfg 9 | from nms.gpu_nms import gpu_nms 10 | from nms.cpu_nms import cpu_nms 11 | 12 | def nms(dets, thresh, force_cpu=False): 13 | """Dispatch to either CPU or GPU NMS implementations.""" 14 | 15 | if dets.shape[0] == 0: 16 | return [] 17 | if cfg.USE_GPU_NMS and not force_cpu: 18 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 19 | else: 20 | return cpu_nms(dets, thresh) 21 | -------------------------------------------------------------------------------- /lib/fast_rcnn/test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Test a Fast R-CNN network on an imdb (image database).""" 9 | 10 | from fast_rcnn.config import cfg, get_output_dir 11 | from fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv 12 | import argparse 13 | from utils.timer import Timer 14 | import numpy as np 15 | import cv2 16 | import caffe 17 | from fast_rcnn.nms_wrapper import nms 18 | import cPickle 19 | from utils.blob import im_list_to_blob 20 | import os 21 | 22 | def _get_image_blob(im): 23 | """Converts an image into a network input. 
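    For example (illustrative arithmetic): with cfg.TEST.SCALES = (600,) and
    cfg.TEST.MAX_SIZE = 1000, a 480x640 image is resized by 600/480 = 1.25 to
    600x800 (640 * 1.25 = 800 <= 1000, so the MAX_SIZE cap is not hit) and
    im_scale_factors is [1.25].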
24 | 25 | Arguments: 26 | im (ndarray): a color image in BGR order 27 | 28 | Returns: 29 | blob (ndarray): a data blob holding an image pyramid 30 | im_scale_factors (list): list of image scales (relative to im) used 31 | in the image pyramid 32 | """ 33 | im_orig = im.astype(np.float32, copy=True) 34 | im_orig -= cfg.PIXEL_MEANS 35 | 36 | im_shape = im_orig.shape 37 | im_size_min = np.min(im_shape[0:2]) 38 | im_size_max = np.max(im_shape[0:2]) 39 | 40 | processed_ims = [] 41 | im_scale_factors = [] 42 | 43 | for target_size in cfg.TEST.SCALES: 44 | im_scale = float(target_size) / float(im_size_min) 45 | # Prevent the biggest axis from being more than MAX_SIZE 46 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 47 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 48 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 49 | interpolation=cv2.INTER_LINEAR) 50 | im_scale_factors.append(im_scale) 51 | processed_ims.append(im) 52 | 53 | # Create a blob to hold the input images 54 | blob = im_list_to_blob(processed_ims) 55 | 56 | return blob, np.array(im_scale_factors) 57 | 58 | def _get_rois_blob(im_rois, im_scale_factors): 59 | """Converts RoIs into network inputs. 60 | 61 | Arguments: 62 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 63 | im_scale_factors (list): scale factors as returned by _get_image_blob 64 | 65 | Returns: 66 | blob (ndarray): R x 5 matrix of RoIs in the image pyramid 67 | """ 68 | rois, levels = _project_im_rois(im_rois, im_scale_factors) 69 | rois_blob = np.hstack((levels, rois)) 70 | return rois_blob.astype(np.float32, copy=False) 71 | 72 | def _project_im_rois(im_rois, scales): 73 | """Project image RoIs into the image pyramid built by _get_image_blob. 74 | 75 | Arguments: 76 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 77 | scales (list): scale factors as returned by _get_image_blob 78 | 79 | Returns: 80 | rois (ndarray): R x 4 matrix of projected RoI coordinates 81 | levels (list): image pyramid levels used by each projected RoI 82 | """ 83 | im_rois = im_rois.astype(np.float, copy=False) 84 | 85 | if len(scales) > 1: 86 | widths = im_rois[:, 2] - im_rois[:, 0] + 1 87 | heights = im_rois[:, 3] - im_rois[:, 1] + 1 88 | 89 | areas = widths * heights 90 | scaled_areas = areas[:, np.newaxis] * (scales[np.newaxis, :] ** 2) 91 | diff_areas = np.abs(scaled_areas - 224 * 224) 92 | levels = diff_areas.argmin(axis=1)[:, np.newaxis] 93 | else: 94 | levels = np.zeros((im_rois.shape[0], 1), dtype=np.int) 95 | 96 | rois = im_rois * scales[levels] 97 | 98 | return rois, levels 99 | 100 | def _get_blobs(im, rois): 101 | """Convert an image and RoIs within that image into network inputs.""" 102 | blobs = {'data' : None, 'rois' : None} 103 | blobs['data'], im_scale_factors = _get_image_blob(im) 104 | if not cfg.TEST.HAS_RPN: 105 | blobs['rois'] = _get_rois_blob(rois, im_scale_factors) 106 | return blobs, im_scale_factors 107 | 108 | def im_detect(net, im, boxes=None): 109 | """Detect object classes in an image given object proposals. 
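    Typical use (a sketch; assumes a loaded caffe.Net trained for Faster
    R-CNN and cfg.TEST.HAS_RPN = True, so proposals come from the RPN and
    boxes may be left as None):

        im = cv2.imread('000001.jpg')           # BGR, as loaded by OpenCV
        scores, boxes = im_detect(net, im)
        cls_boxes = boxes[:, 4*cls_ind:4*(cls_ind+1)]   # one class's boxes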
110 | 111 | Arguments: 112 | net (caffe.Net): Fast R-CNN network to use 113 | im (ndarray): color image to test (in BGR order) 114 | boxes (ndarray): R x 4 array of object proposals or None (for RPN) 115 | 116 | Returns: 117 | scores (ndarray): R x K array of object class scores (K includes 118 | background as object category 0) 119 | boxes (ndarray): R x (4*K) array of predicted bounding boxes 120 | """ 121 | blobs, im_scales = _get_blobs(im, boxes) 122 | 123 | # When mapping from image ROIs to feature map ROIs, there's some aliasing 124 | # (some distinct image ROIs get mapped to the same feature ROI). 125 | # Here, we identify duplicate feature ROIs, so we only compute features 126 | # on the unique subset. 127 | if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: 128 | v = np.array([1, 1e3, 1e6, 1e9, 1e12]) 129 | hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v) 130 | _, index, inv_index = np.unique(hashes, return_index=True, 131 | return_inverse=True) 132 | blobs['rois'] = blobs['rois'][index, :] 133 | boxes = boxes[index, :] 134 | 135 | if cfg.TEST.HAS_RPN: 136 | im_blob = blobs['data'] 137 | blobs['im_info'] = np.array( 138 | [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 139 | dtype=np.float32) 140 | 141 | # reshape network inputs 142 | net.blobs['data'].reshape(*(blobs['data'].shape)) 143 | if cfg.TEST.HAS_RPN: 144 | net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) 145 | else: 146 | net.blobs['rois'].reshape(*(blobs['rois'].shape)) 147 | 148 | # do forward 149 | forward_kwargs = {'data': blobs['data'].astype(np.float32, copy=False)} 150 | if cfg.TEST.HAS_RPN: 151 | forward_kwargs['im_info'] = blobs['im_info'].astype(np.float32, copy=False) 152 | else: 153 | forward_kwargs['rois'] = blobs['rois'].astype(np.float32, copy=False) 154 | blobs_out = net.forward(**forward_kwargs) 155 | 156 | if cfg.TEST.HAS_RPN: 157 | assert len(im_scales) == 1, "Only single-image batch implemented" 158 | rois = net.blobs['rois'].data.copy() 159 | # unscale back to raw image space 160 | boxes = rois[:, 1:5] / im_scales[0] 161 | 162 | if cfg.TEST.SVM: 163 | # use the raw scores before softmax under the assumption they 164 | # were trained as linear SVMs 165 | scores = net.blobs['cls_score'].data 166 | else: 167 | # use softmax estimated probabilities 168 | scores = blobs_out['cls_prob'] 169 | 170 | if cfg.TEST.BBOX_REG: 171 | # Apply bounding-box regression deltas 172 | box_deltas = blobs_out['bbox_pred'] 173 | pred_boxes = bbox_transform_inv(boxes, box_deltas) 174 | pred_boxes = clip_boxes(pred_boxes, im.shape) 175 | else: 176 | # Simply repeat the boxes, once for each class 177 | pred_boxes = np.tile(boxes, (1, scores.shape[1])) 178 | 179 | if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: 180 | # Map scores and predictions back to the original set of boxes 181 | scores = scores[inv_index, :] 182 | pred_boxes = pred_boxes[inv_index, :] 183 | 184 | return scores, pred_boxes 185 | 186 | def vis_detections(im, class_name, dets, thresh=0.3): 187 | """Visual debugging of detections.""" 188 | import matplotlib.pyplot as plt 189 | im = im[:, :, (2, 1, 0)] 190 | for i in xrange(np.minimum(10, dets.shape[0])): 191 | bbox = dets[i, :4] 192 | score = dets[i, -1] 193 | if score > thresh: 194 | plt.cla() 195 | plt.imshow(im) 196 | plt.gca().add_patch( 197 | plt.Rectangle((bbox[0], bbox[1]), 198 | bbox[2] - bbox[0], 199 | bbox[3] - bbox[1], fill=False, 200 | edgecolor='g', linewidth=3) 201 | ) 202 | plt.title('{} {:.3f}'.format(class_name, score)) 203 | plt.show() 204 | 205 | def 
apply_nms(all_boxes, thresh): 206 | """Apply non-maximum suppression to all predicted boxes output by the 207 | test_net method. 208 | """ 209 | num_classes = len(all_boxes) 210 | num_images = len(all_boxes[0]) 211 | nms_boxes = [[[] for _ in xrange(num_images)] 212 | for _ in xrange(num_classes)] 213 | for cls_ind in xrange(num_classes): 214 | for im_ind in xrange(num_images): 215 | dets = all_boxes[cls_ind][im_ind] 216 | if dets == []: 217 | continue 218 | # CPU NMS is much faster than GPU NMS when the number of boxes 219 | # is relatively small (e.g., < 10k) 220 | # TODO(rbg): autotune NMS dispatch 221 | keep = nms(dets, thresh, force_cpu=True) 222 | if len(keep) == 0: 223 | continue 224 | nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() 225 | return nms_boxes 226 | 227 | def test_net(net, imdb, max_per_image=100, thresh=0.05, vis=False): 228 | """Test a Fast R-CNN network on an image database.""" 229 | num_images = len(imdb.image_index) 230 | # all detections are collected into: 231 | # all_boxes[cls][image] = N x 5 array of detections in 232 | # (x1, y1, x2, y2, score) 233 | all_boxes = [[[] for _ in xrange(num_images)] 234 | for _ in xrange(imdb.num_classes)] 235 | 236 | output_dir = get_output_dir(imdb, net) 237 | 238 | # timers 239 | _t = {'im_detect' : Timer(), 'misc' : Timer()} 240 | 241 | if not cfg.TEST.HAS_RPN: 242 | roidb = imdb.roidb 243 | 244 | for i in xrange(num_images): 245 | # filter out any ground truth boxes 246 | if cfg.TEST.HAS_RPN: 247 | box_proposals = None 248 | else: 249 | # The roidb may contain ground-truth rois (for example, if the roidb 250 | # comes from the training or val split). We only want to evaluate 251 | # detection on the *non*-ground-truth rois. We select those rois 252 | # that have the gt_classes field set to 0, which means there's no 253 | # ground truth. 
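
The DEDUP_BOXES branch of im_detect above is easy to miss: each RoI row is quantized and collapsed into one scalar hash, and np.unique then yields both the unique subset to forward through the net and the inverse map that scatters results back. A standalone sketch; the weight vector here has 4 entries because these toy rows are bare x1,y1,x2,y2, whereas the real rois blob carries a leading level column and hence 5 weights:

    import numpy as np

    rois = np.array([[0, 0, 10, 10],
                     [0, 0, 10, 10],          # exact duplicate of row 0
                     [5, 5, 20, 20]], dtype=np.float32)
    v = np.array([1, 1e3, 1e6, 1e9])          # one weight per column
    hashes = np.round(rois * 0.0625).dot(v)   # 0.0625 stands in for cfg.DEDUP_BOXES
    _, index, inv_index = np.unique(hashes, return_index=True,
                                    return_inverse=True)
    unique_rois = rois[index, :]              # only these rows hit the network
    restored = unique_rois[inv_index, :]      # scatter results back to all inputs
    assert (restored == rois).all()
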
254 | box_proposals = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0] 255 | 256 | im = cv2.imread(imdb.image_path_at(i)) 257 | _t['im_detect'].tic() 258 | scores, boxes = im_detect(net, im, box_proposals) 259 | _t['im_detect'].toc() 260 | 261 | _t['misc'].tic() 262 | # skip j = 0, because it's the background class 263 | for j in xrange(1, imdb.num_classes): 264 | inds = np.where(scores[:, j] > thresh)[0] 265 | cls_scores = scores[inds, j] 266 | cls_boxes = boxes[inds, j*4:(j+1)*4] 267 | cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ 268 | .astype(np.float32, copy=False) 269 | keep = nms(cls_dets, cfg.TEST.NMS) 270 | cls_dets = cls_dets[keep, :] 271 | if vis: 272 | vis_detections(im, imdb.classes[j], cls_dets) 273 | all_boxes[j][i] = cls_dets 274 | 275 | # Limit to max_per_image detections *over all classes* 276 | if max_per_image > 0: 277 | image_scores = np.hstack([all_boxes[j][i][:, -1] 278 | for j in xrange(1, imdb.num_classes)]) 279 | if len(image_scores) > max_per_image: 280 | image_thresh = np.sort(image_scores)[-max_per_image] 281 | for j in xrange(1, imdb.num_classes): 282 | keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] 283 | all_boxes[j][i] = all_boxes[j][i][keep, :] 284 | _t['misc'].toc() 285 | 286 | print 'im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \ 287 | .format(i + 1, num_images, _t['im_detect'].average_time, 288 | _t['misc'].average_time) 289 | 290 | det_file = os.path.join(output_dir, 'detections.pkl') 291 | with open(det_file, 'wb') as f: 292 | cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL) 293 | 294 | print 'Evaluating detections' 295 | imdb.evaluate_detections(all_boxes, output_dir) 296 | -------------------------------------------------------------------------------- /lib/fast_rcnn/train.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Train a Fast R-CNN network.""" 9 | 10 | import caffe 11 | from fast_rcnn.config import cfg 12 | import roi_data_layer.roidb as rdl_roidb 13 | from utils.timer import Timer 14 | import numpy as np 15 | import os 16 | 17 | from caffe.proto import caffe_pb2 18 | import google.protobuf as pb2 19 | 20 | class SolverWrapper(object): 21 | """A simple wrapper around Caffe's solver. 22 | This wrapper gives us control over the snapshotting process, which we 23 | use to unnormalize the learned bounding-box regression weights. 24 | """ 25 | 26 | def __init__(self, solver_prototxt, roidb, output_dir, 27 | pretrained_model=None): 28 | """Initialize the SolverWrapper.""" 29 | self.output_dir = output_dir 30 | 31 | if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and 32 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS): 33 | # RPN can only use precomputed normalization because there are no 34 | # fixed statistics to compute a priori 35 | assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED 36 | 37 | if cfg.TRAIN.BBOX_REG: 38 | print 'Computing bounding-box regression targets...' 
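
The per-image cap at the end of test_net above pools scores across all foreground classes and thresholds at the score of the max_per_image-th best detection. A numeric check with toy scores:

    import numpy as np

    max_per_image = 3
    image_scores = np.hstack([np.array([0.9, 0.2]),        # class 1 scores
                              np.array([0.8, 0.6, 0.1])])  # class 2 scores
    if len(image_scores) > max_per_image:
        image_thresh = np.sort(image_scores)[-max_per_image]
        print(image_thresh)   # 0.6: the 0.2 and 0.1 detections get dropped
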
39 | self.bbox_means, self.bbox_stds = \ 40 | rdl_roidb.add_bbox_regression_targets(roidb) 41 | print 'done' 42 | 43 | self.solver = caffe.SGDSolver(solver_prototxt) 44 | if pretrained_model is not None: 45 | print ('Loading pretrained model ' 46 | 'weights from {:s}').format(pretrained_model) 47 | self.solver.net.copy_from(pretrained_model) 48 | 49 | self.solver_param = caffe_pb2.SolverParameter() 50 | with open(solver_prototxt, 'rt') as f: 51 | pb2.text_format.Merge(f.read(), self.solver_param) 52 | 53 | self.solver.net.layers[0].set_roidb(roidb) 54 | 55 | def snapshot(self): 56 | """Take a snapshot of the network after unnormalizing the learned 57 | bounding-box regression weights. This enables easy use at test-time. 58 | """ 59 | net = self.solver.net 60 | 61 | scale_bbox_params = (cfg.TRAIN.BBOX_REG and 62 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS and 63 | net.params.has_key('bbox_pred')) 64 | 65 | if scale_bbox_params: 66 | # save original values 67 | orig_0 = net.params['bbox_pred'][0].data.copy() 68 | orig_1 = net.params['bbox_pred'][1].data.copy() 69 | 70 | # scale and shift with bbox reg unnormalization; then save snapshot 71 | net.params['bbox_pred'][0].data[...] = \ 72 | (net.params['bbox_pred'][0].data * 73 | self.bbox_stds[:, np.newaxis]) 74 | net.params['bbox_pred'][1].data[...] = \ 75 | (net.params['bbox_pred'][1].data * 76 | self.bbox_stds + self.bbox_means) 77 | 78 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 79 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 80 | filename = (self.solver_param.snapshot_prefix + infix + 81 | '_iter_{:d}'.format(self.solver.iter) + '.caffemodel') 82 | filename = os.path.join(self.output_dir, filename) 83 | 84 | net.save(str(filename)) 85 | print 'Wrote snapshot to: {:s}'.format(filename) 86 | 87 | if scale_bbox_params: 88 | # restore net to original state 89 | net.params['bbox_pred'][0].data[...] = orig_0 90 | net.params['bbox_pred'][1].data[...] = orig_1 91 | return filename 92 | 93 | def train_model(self, max_iters): 94 | """Network training loop.""" 95 | last_snapshot_iter = -1 96 | timer = Timer() 97 | model_paths = [] 98 | while self.solver.iter < max_iters: 99 | # Make one SGD update 100 | timer.tic() 101 | self.solver.step(1) 102 | timer.toc() 103 | if self.solver.iter % (10 * self.solver_param.display) == 0: 104 | print 'speed: {:.3f}s / iter'.format(timer.average_time) 105 | 106 | if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0: 107 | last_snapshot_iter = self.solver.iter 108 | model_paths.append(self.snapshot()) 109 | 110 | if last_snapshot_iter != self.solver.iter: 111 | model_paths.append(self.snapshot()) 112 | return model_paths 113 | 114 | def get_training_roidb(imdb): 115 | """Returns a roidb (Region of Interest database) for use in training.""" 116 | if cfg.TRAIN.USE_FLIPPED: 117 | print 'Appending horizontally-flipped training examples...' 118 | imdb.append_flipped_images() 119 | print 'done' 120 | 121 | print 'Preparing training data...' 
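
The snapshot unnormalization above rests on a small identity: if regression targets were normalized as t' = (t - mean) / std during training, then scaling the bbox_pred weights by std and shifting the bias by std and mean makes the saved net emit raw deltas at test time. A one-dimensional check with toy numbers:

    import numpy as np

    std, mean = 0.2, 0.1
    w, b = np.array([2.0]), np.array([0.5])   # toy bbox_pred weight and bias
    w_un = w * std                            # scale the weight rows
    b_un = b * std + mean                     # scale and shift the bias
    x = 1.0                                   # any input feature
    assert np.allclose(w_un * x + b_un, (w * x + b) * std + mean)
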
122 | rdl_roidb.prepare_roidb(imdb) 123 | print 'done' 124 | 125 | return imdb.roidb 126 | 127 | def filter_roidb(roidb): 128 | """Remove roidb entries that have no usable RoIs.""" 129 | 130 | def is_valid(entry): 131 | # Valid images have: 132 | # (1) At least one foreground RoI OR 133 | # (2) At least one background RoI 134 | overlaps = entry['max_overlaps'] 135 | # find boxes with sufficient overlap 136 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 137 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 138 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 139 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 140 | # image is only valid if such boxes exist 141 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 142 | return valid 143 | 144 | num = len(roidb) 145 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 146 | num_after = len(filtered_roidb) 147 | print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after, 148 | num, num_after) 149 | return filtered_roidb 150 | 151 | def train_net(solver_prototxt, roidb, output_dir, 152 | pretrained_model=None, max_iters=40000): 153 | """Train a Fast R-CNN network.""" 154 | 155 | roidb = filter_roidb(roidb) 156 | sw = SolverWrapper(solver_prototxt, roidb, output_dir, 157 | pretrained_model=pretrained_model) 158 | 159 | print 'Solving...' 160 | model_paths = sw.train_model(max_iters) 161 | print 'done solving' 162 | return model_paths 163 | -------------------------------------------------------------------------------- /lib/nms/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MrGF/py-faster-rcnn-windows/12e11924217e79fd7124d05a18baa49b9908340d/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MrGF/py-faster-rcnn-windows/12e11924217e79fd7124d05a18baa49b9908340d/lib/nms/__init__.pyc -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MrGF/py-faster-rcnn-windows/12e11924217e79fd7124d05a18baa49b9908340d/lib/nms/cpu_nms.pyd -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = 
dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | #cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] #20160531, by MrX 26 | cdef np.ndarray[np.intp_t, ndim=1] order = scores.argsort()[::-1] 27 | 28 | cdef int ndets = dets.shape[0] 29 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 30 | np.zeros((ndets), dtype=np.int) 31 | 32 | # nominal indices 33 | cdef int _i, _j 34 | # sorted indices 35 | cdef int i, j 36 | # temp variables for box i's (the box currently under consideration) 37 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 38 | # variables for computing overlap with box j (lower scoring box) 39 | cdef np.float32_t xx1, yy1, xx2, yy2 40 | cdef np.float32_t w, h 41 | cdef np.float32_t inter, ovr 42 | 43 | keep = [] 44 | for _i in range(ndets): 45 | i = order[_i] 46 | if suppressed[i] == 1: 47 | continue 48 | keep.append(i) 49 | ix1 = x1[i] 50 | iy1 = y1[i] 51 | ix2 = x2[i] 52 | iy2 = y2[i] 53 | iarea = areas[i] 54 | for _j in range(_i + 1, ndets): 55 | j = order[_j] 56 | if suppressed[j] == 1: 57 | continue 58 | xx1 = max(ix1, x1[j]) 59 | yy1 = max(iy1, y1[j]) 60 | xx2 = min(ix2, x2[j]) 61 | yy2 = min(iy2, y2[j]) 62 | w = max(0.0, xx2 - xx1 + 1) 63 | h = max(0.0, yy2 - yy1 + 1) 64 | inter = w * h 65 | ovr = inter / (iarea + areas[j] - inter) 66 | if ovr >= thresh: 67 | suppressed[j] = 1 68 | 69 | return keep 70 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(long* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MrGF/py-faster-rcnn-windows/12e11924217e79fd7124d05a18baa49b9908340d/lib/nms/gpu_nms.pyd -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | #cdef np.ndarray[np.int_t, ndim=1] \ // 20160601, by MrX 26 | # order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.intp_t, ndim=1] \ 28 | order = scores.argsort()[::-1] 29 | cdef np.ndarray[np.float32_t, ndim=2] \ 30 | sorted_dets = dets[order, :] 31 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, 
device_id) 32 | keep = keep[:num_out] 33 | return list(order[keep]) 34 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(&current_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
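
The device-side kernel above only fills a boxes_num x col_blocks table of 64-bit suppression masks; the actual keep/drop decision happens in the serial loop near the end of _nms below. A pure-Python emulation of that host-side reduction, assuming mask[i][j] holds box i's suppression bitset for block j and boxes are already sorted by score:

    def reduce_mask(mask, n_boxes, block=64):
        col_blocks = (n_boxes + block - 1) // block
        remv = [0] * col_blocks          # bitset of already-suppressed boxes
        keep = []
        for i in range(n_boxes):         # highest-scoring boxes first
            nblock, inblock = divmod(i, block)
            if not (remv[nblock] >> inblock) & 1:
                keep.append(i)           # not suppressed by any kept box
                for j in range(nblock, col_blocks):
                    remv[j] |= mask[i][j]
        return keep

    # reduce_mask([[0b10], [0b00]], 2) -> [0]: box 0 suppresses box 1
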
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(long* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /lib/pycocotools/UPSTREAM_REV: -------------------------------------------------------------------------------- 1 | https://github.com/pdollar/coco/commit/3ac47c77ebd5a1ed4254a98b7fbf2ef4765a3574 2 | -------------------------------------------------------------------------------- /lib/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | 
__author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocotools/_mask.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language = c 2 | # distutils: sources = ../MatlabAPI/private/maskApi.c 3 | 4 | #************************************************************************** 5 | # Microsoft COCO Toolbox. version 2.0 6 | # Data, paper, and tutorials available at: http://mscoco.org/ 7 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 8 | # Licensed under the Simplified BSD License [see coco/license.txt] 9 | #************************************************************************** 10 | 11 | __author__ = 'tsungyi' 12 | 13 | # import both Python-level and C-level symbols of Numpy 14 | # the API uses Numpy to interface C and Python 15 | import numpy as np 16 | cimport numpy as np 17 | from libc.stdlib cimport malloc, free 18 | 19 | # intialized Numpy. must do. 20 | np.import_array() 21 | 22 | # import numpy C function 23 | # we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management 24 | cdef extern from "numpy/arrayobject.h": 25 | void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) 26 | 27 | # Declare the prototype of the C functions in MaskApi.h 28 | cdef extern from "maskApi.h": 29 | ctypedef unsigned int uint 30 | ctypedef unsigned long siz 31 | ctypedef unsigned char byte 32 | ctypedef double* BB 33 | ctypedef struct RLE: 34 | siz h, 35 | siz w, 36 | siz m, 37 | uint* cnts, 38 | void rlesInit( RLE **R, siz n ) 39 | void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) 40 | void rleDecode( const RLE *R, byte *mask, siz n ) 41 | void rleMerge( const RLE *R, RLE *M, siz n, bint intersect ) 42 | void rleArea( const RLE *R, siz n, uint *a ) 43 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) 44 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) 45 | void rleToBbox( const RLE *R, BB bb, siz n ) 46 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) 47 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) 48 | char* rleToString( const RLE *R ) 49 | void rleFrString( RLE *R, char *s, siz h, siz w ) 50 | 51 | # python class to wrap RLE array in C 52 | # the class handles the memory allocation and deallocation 53 | cdef class RLEs: 54 | cdef RLE *_R 55 | cdef siz _n 56 | 57 | def __cinit__(self, siz n =0): 58 | rlesInit(&self._R, n) 59 | self._n = n 60 | 61 | # free the RLE array here 62 | def __dealloc__(self): 63 | if self._R is not NULL: 64 | for i in range(self._n): 65 | free(self._R[i].cnts) 66 | free(self._R) 67 | def __getattr__(self, key): 68 | if key == 'n': 69 | return self._n 70 | raise AttributeError(key) 71 | 72 | # python class to wrap Mask array in C 73 | # the class handles the memory allocation and deallocation 74 | cdef class Masks: 75 | cdef byte *_mask 76 | cdef siz _h 77 | cdef siz _w 78 | cdef siz _n 79 | 80 | def __cinit__(self, h, w, n): 81 | self._mask = malloc(h*w*n* sizeof(byte)) 82 | self._h = h 83 | self._w = w 84 | self._n = n 85 | # def __dealloc__(self): 86 | # the memory management of _mask has been passed to np.ndarray 87 | # it doesn't need to be freed here 88 | 89 | # called when passing into np.array() and return an np.ndarray in column-major order 90 | def __array__(self): 91 | cdef np.npy_intp shape[1] 92 | shape[0] = self._h*self._w*self._n 93 | # Create a 1D array, and reshape it to fortran/Matlab column-major array 94 | ndarray = 
np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') 95 | # The _mask allocated by Masks is now handled by ndarray 96 | PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) 97 | return ndarray 98 | 99 | # internal conversion from Python RLEs object to compressed RLE format 100 | def _toString(RLEs Rs): 101 | cdef siz n = Rs.n 102 | cdef bytes py_string 103 | cdef char* c_string 104 | objs = [] 105 | for i in range(n): 106 | c_string = rleToString( &Rs._R[i] ) 107 | py_string = c_string 108 | objs.append({ 109 | 'size': [Rs._R[i].h, Rs._R[i].w], 110 | 'counts': py_string 111 | }) 112 | free(c_string) 113 | return objs 114 | 115 | # internal conversion from compressed RLE format to Python RLEs object 116 | def _frString(rleObjs): 117 | cdef siz n = len(rleObjs) 118 | Rs = RLEs(n) 119 | cdef bytes py_string 120 | cdef char* c_string 121 | for i, obj in enumerate(rleObjs): 122 | py_string = str(obj['counts']) 123 | c_string = py_string 124 | rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) 125 | return Rs 126 | 127 | # encode mask to RLEs objects 128 | # list of RLE string can be generated by RLEs member function 129 | def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): 130 | h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] 131 | cdef RLEs Rs = RLEs(n) 132 | rleEncode(Rs._R,mask.data,h,w,n) 133 | objs = _toString(Rs) 134 | return objs 135 | 136 | # decode mask from compressed list of RLE string or RLEs object 137 | def decode(rleObjs): 138 | cdef RLEs Rs = _frString(rleObjs) 139 | h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n 140 | masks = Masks(h, w, n) 141 | rleDecode( Rs._R, masks._mask, n ); 142 | return np.array(masks) 143 | 144 | def merge(rleObjs, bint intersect=0): 145 | cdef RLEs Rs = _frString(rleObjs) 146 | cdef RLEs R = RLEs(1) 147 | rleMerge(Rs._R, R._R, Rs._n, intersect) 148 | obj = _toString(R)[0] 149 | return obj 150 | 151 | def area(rleObjs): 152 | cdef RLEs Rs = _frString(rleObjs) 153 | cdef uint* _a = malloc(Rs._n* sizeof(uint)) 154 | rleArea(Rs._R, Rs._n, _a) 155 | cdef np.npy_intp shape[1] 156 | shape[0] = Rs._n 157 | a = np.array((Rs._n, ), dtype=np.uint8) 158 | a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) 159 | PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) 160 | return a 161 | 162 | # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 163 | def iou( dt, gt, pyiscrowd ): 164 | def _preproc(objs): 165 | if len(objs) == 0: 166 | return objs 167 | if type(objs) == np.ndarray: 168 | if len(objs.shape) == 1: 169 | objs = objs.reshape((objs[0], 1)) 170 | # check if it's Nx4 bbox 171 | if not len(objs.shape) == 2 or not objs.shape[1] == 4: 172 | raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') 173 | objs = objs.astype(np.double) 174 | elif type(objs) == list: 175 | # check if list is in box format and convert it to np.ndarray 176 | isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) 177 | isrle = np.all(np.array([type(obj) == dict for obj in objs])) 178 | if isbox: 179 | objs = np.array(objs, dtype=np.double) 180 | if len(objs.shape) == 1: 181 | objs = objs.reshape((1,objs.shape[0])) 182 | elif isrle: 183 | objs = _frString(objs) 184 | else: 185 | raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') 186 | else: 187 | raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') 188 | return objs 189 | def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 190 | rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) 191 | def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 192 | bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) 193 | def _len(obj): 194 | cdef siz N = 0 195 | if type(obj) == RLEs: 196 | N = obj.n 197 | elif len(obj)==0: 198 | pass 199 | elif type(obj) == np.ndarray: 200 | N = obj.shape[0] 201 | return N 202 | # convert iscrowd to numpy array 203 | cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) 204 | # simple type checking 205 | cdef siz m, n 206 | dt = _preproc(dt) 207 | gt = _preproc(gt) 208 | m = _len(dt) 209 | n = _len(gt) 210 | if m == 0 or n == 0: 211 | return [] 212 | if not type(dt) == type(gt): 213 | raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') 214 | 215 | # define local variables 216 | cdef double* _iou = 0 217 | cdef np.npy_intp shape[1] 218 | # check type and assign iou function 219 | if type(dt) == RLEs: 220 | _iouFun = _rleIou 221 | elif type(dt) == np.ndarray: 222 | _iouFun = _bbIou 223 | else: 224 | raise Exception('input data type not allowed.') 225 | _iou = malloc(m*n* sizeof(double)) 226 | iou = np.zeros((m*n, ), dtype=np.double) 227 | shape[0] = m*n 228 | iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) 229 | PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) 230 | _iouFun(dt, gt, iscrowd, m, n, iou) 231 | return iou.reshape((m,n), order='F') 232 | 233 | def toBbox( rleObjs ): 234 | cdef RLEs Rs = _frString(rleObjs) 235 | cdef siz n = Rs.n 236 | cdef BB _bb = malloc(4*n* sizeof(double)) 237 | rleToBbox( Rs._R, _bb, n ) 238 | cdef np.npy_intp shape[1] 239 | shape[0] = 4*n 240 | bb = np.array((1,4*n), dtype=np.double) 241 | bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) 242 | PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) 243 | return bb 244 | 245 | def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): 246 | cdef siz n = bb.shape[0] 247 | Rs = RLEs(n) 248 | rleFrBbox( Rs._R, bb.data, h, w, n ) 249 | objs = _toString(Rs) 250 | return objs 251 | 252 | def frPoly( poly, siz h, siz w ): 253 | cdef np.ndarray[np.double_t, ndim=1] np_poly 254 | n = len(poly) 255 | Rs = RLEs(n) 256 | for i, p in enumerate(poly): 257 | np_poly = np.array(p, dtype=np.double, order='F') 258 | rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) 259 | objs = _toString(Rs) 260 | return objs 261 | 262 | def frUncompressedRLE(ucRles, siz h, siz w): 263 | cdef np.ndarray[np.uint32_t, ndim=1] cnts 264 | cdef RLE R 265 | cdef uint *data 266 | n = len(ucRles) 267 | objs = [] 268 | for i in range(n): 269 | Rs = RLEs(1) 270 | cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) 271 | # time for malloc can be saved here but it's fine 272 | data = malloc(len(cnts)* sizeof(uint)) 273 | for j in range(len(cnts)): 274 | data[j] = cnts[j] 275 | R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) 276 | Rs._R[0] = R 277 | objs.append(_toString(Rs)[0]) 278 | return objs 279 | 280 | def frPyObjects(pyobj, siz h, w): 281 | if type(pyobj) == np.ndarray: 282 | objs = frBbox(pyobj, h, w ) 283 | elif type(pyobj) == list and len(pyobj[0]) == 4: 284 | objs = 
frBbox(pyobj, h, w ) 285 | elif type(pyobj) == list and len(pyobj[0]) > 4: 286 | objs = frPoly(pyobj, h, w ) 287 | elif type(pyobj) == list and type(pyobj[0]) == dict: 288 | objs = frUncompressedRLE(pyobj, h, w) 289 | else: 290 | raise Exception('input type is not supported.') 291 | return objs 292 | -------------------------------------------------------------------------------- /lib/pycocotools/coco.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | __version__ = '1.0.1' 3 | # Interface for accessing the Microsoft COCO dataset. 4 | 5 | # Microsoft COCO is a large image dataset designed for object detection, 6 | # segmentation, and caption generation. pycocotools is a Python API that 7 | # assists in loading, parsing and visualizing the annotations in COCO. 8 | # Please visit http://mscoco.org/ for more information on COCO, including 9 | # for the data, paper, and tutorials. The exact format of the annotations 10 | # is also described on the COCO website. For example usage of the pycocotools 11 | # please see pycocotools_demo.ipynb. In addition to this API, please download both 12 | # the COCO images and annotations in order to run the demo. 13 | 14 | # An alternative to using the API is to load the annotations directly 15 | # into Python dictionary 16 | # Using the API provides additional utility functions. Note that this API 17 | # supports both *instance* and *caption* annotations. In the case of 18 | # captions not all functions are defined (e.g. categories are undefined). 19 | 20 | # The following API functions are defined: 21 | # COCO - COCO api class that loads COCO annotation file and prepare data structures. 22 | # decodeMask - Decode binary mask M encoded via run-length encoding. 23 | # encodeMask - Encode binary mask M using run-length encoding. 24 | # getAnnIds - Get ann ids that satisfy given filter conditions. 25 | # getCatIds - Get cat ids that satisfy given filter conditions. 26 | # getImgIds - Get img ids that satisfy given filter conditions. 27 | # loadAnns - Load anns with the specified ids. 28 | # loadCats - Load cats with the specified ids. 29 | # loadImgs - Load imgs with the specified ids. 30 | # segToMask - Convert polygon segmentation to binary mask. 31 | # showAnns - Display the specified annotations. 32 | # loadRes - Load algorithm results and create API for accessing them. 33 | # download - Download COCO images from mscoco.org server. 34 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. 35 | # Help on each functions can be accessed by: "help COCO>function". 36 | 37 | # See also COCO>decodeMask, 38 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, 39 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, 40 | # COCO>loadImgs, COCO>segToMask, COCO>showAnns 41 | 42 | # Microsoft COCO Toolbox. version 2.0 43 | # Data, paper, and tutorials available at: http://mscoco.org/ 44 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. 
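
The Cython module just listed is what pycocotools/mask.py further below re-exports. A hypothetical round trip through it, assuming the extension has been built; note that encode requires a Fortran-order uint8 array of shape H x W x N:

    import numpy as np
    import pycocotools.mask as mask_util

    m = np.zeros((4, 4, 1), dtype=np.uint8, order='F')
    m[1:3, 1:3, 0] = 1                  # a 2x2 square of foreground
    rles = mask_util.encode(m)          # list of {'size', 'counts'} dicts
    print(mask_util.area(rles))         # [4]
    print(mask_util.toBbox(rles))       # [[1. 1. 2. 2.]] as [x y w h]
    assert (mask_util.decode(rles) == m).all()
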
45 | # Licensed under the Simplified BSD License [see bsd.txt] 46 | 47 | import json 48 | import datetime 49 | import time 50 | import matplotlib.pyplot as plt 51 | from matplotlib.collections import PatchCollection 52 | from matplotlib.patches import Polygon 53 | import numpy as np 54 | from skimage.draw import polygon 55 | import urllib 56 | import copy 57 | import itertools 58 | import mask 59 | import os 60 | 61 | class COCO: 62 | def __init__(self, annotation_file=None): 63 | """ 64 | Constructor of Microsoft COCO helper class for reading and visualizing annotations. 65 | :param annotation_file (str): location of annotation file 66 | :param image_folder (str): location to the folder that hosts images. 67 | :return: 68 | """ 69 | # load dataset 70 | self.dataset = {} 71 | self.anns = [] 72 | self.imgToAnns = {} 73 | self.catToImgs = {} 74 | self.imgs = {} 75 | self.cats = {} 76 | if not annotation_file == None: 77 | print 'loading annotations into memory...' 78 | tic = time.time() 79 | dataset = json.load(open(annotation_file, 'r')) 80 | print 'Done (t=%0.2fs)'%(time.time()- tic) 81 | self.dataset = dataset 82 | self.createIndex() 83 | 84 | def createIndex(self): 85 | # create index 86 | print 'creating index...' 87 | anns = {} 88 | imgToAnns = {} 89 | catToImgs = {} 90 | cats = {} 91 | imgs = {} 92 | if 'annotations' in self.dataset: 93 | imgToAnns = {ann['image_id']: [] for ann in self.dataset['annotations']} 94 | anns = {ann['id']: [] for ann in self.dataset['annotations']} 95 | for ann in self.dataset['annotations']: 96 | imgToAnns[ann['image_id']] += [ann] 97 | anns[ann['id']] = ann 98 | 99 | if 'images' in self.dataset: 100 | imgs = {im['id']: {} for im in self.dataset['images']} 101 | for img in self.dataset['images']: 102 | imgs[img['id']] = img 103 | 104 | if 'categories' in self.dataset: 105 | cats = {cat['id']: [] for cat in self.dataset['categories']} 106 | for cat in self.dataset['categories']: 107 | cats[cat['id']] = cat 108 | catToImgs = {cat['id']: [] for cat in self.dataset['categories']} 109 | if 'annotations' in self.dataset: 110 | for ann in self.dataset['annotations']: 111 | catToImgs[ann['category_id']] += [ann['image_id']] 112 | 113 | print 'index created!' 114 | 115 | # create class members 116 | self.anns = anns 117 | self.imgToAnns = imgToAnns 118 | self.catToImgs = catToImgs 119 | self.imgs = imgs 120 | self.cats = cats 121 | 122 | def info(self): 123 | """ 124 | Print information about the annotation file. 125 | :return: 126 | """ 127 | for key, value in self.dataset['info'].items(): 128 | print '%s: %s'%(key, value) 129 | 130 | def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): 131 | """ 132 | Get ann ids that satisfy given filter conditions. default skips that filter 133 | :param imgIds (int array) : get anns for given imgs 134 | catIds (int array) : get anns for given cats 135 | areaRng (float array) : get anns for given area range (e.g. 
[0 inf]) 136 | iscrowd (boolean) : get anns for given crowd label (False or True) 137 | :return: ids (int array) : integer array of ann ids 138 | """ 139 | imgIds = imgIds if type(imgIds) == list else [imgIds] 140 | catIds = catIds if type(catIds) == list else [catIds] 141 | 142 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 143 | anns = self.dataset['annotations'] 144 | else: 145 | if not len(imgIds) == 0: 146 | # this can be changed by defaultdict 147 | lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] 148 | anns = list(itertools.chain.from_iterable(lists)) 149 | else: 150 | anns = self.dataset['annotations'] 151 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 152 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 153 | if not iscrowd == None: 154 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 155 | else: 156 | ids = [ann['id'] for ann in anns] 157 | return ids 158 | 159 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 160 | """ 161 | filtering parameters. default skips that filter. 162 | :param catNms (str array) : get cats for given cat names 163 | :param supNms (str array) : get cats for given supercategory names 164 | :param catIds (int array) : get cats for given cat ids 165 | :return: ids (int array) : integer array of cat ids 166 | """ 167 | catNms = catNms if type(catNms) == list else [catNms] 168 | supNms = supNms if type(supNms) == list else [supNms] 169 | catIds = catIds if type(catIds) == list else [catIds] 170 | 171 | if len(catNms) == len(supNms) == len(catIds) == 0: 172 | cats = self.dataset['categories'] 173 | else: 174 | cats = self.dataset['categories'] 175 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 176 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 177 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 178 | ids = [cat['id'] for cat in cats] 179 | return ids 180 | 181 | def getImgIds(self, imgIds=[], catIds=[]): 182 | ''' 183 | Get img ids that satisfy given filter conditions. 184 | :param imgIds (int array) : get imgs for given ids 185 | :param catIds (int array) : get imgs with all given cats 186 | :return: ids (int array) : integer array of img ids 187 | ''' 188 | imgIds = imgIds if type(imgIds) == list else [imgIds] 189 | catIds = catIds if type(catIds) == list else [catIds] 190 | 191 | if len(imgIds) == len(catIds) == 0: 192 | ids = self.imgs.keys() 193 | else: 194 | ids = set(imgIds) 195 | for i, catId in enumerate(catIds): 196 | if i == 0 and len(ids) == 0: 197 | ids = set(self.catToImgs[catId]) 198 | else: 199 | ids &= set(self.catToImgs[catId]) 200 | return list(ids) 201 | 202 | def loadAnns(self, ids=[]): 203 | """ 204 | Load anns with the specified ids. 205 | :param ids (int array) : integer ids specifying anns 206 | :return: anns (object array) : loaded ann objects 207 | """ 208 | if type(ids) == list: 209 | return [self.anns[id] for id in ids] 210 | elif type(ids) == int: 211 | return [self.anns[ids]] 212 | 213 | def loadCats(self, ids=[]): 214 | """ 215 | Load cats with the specified ids. 
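
The id-based lookups above compose naturally into a query chain. A hypothetical session; the annotation path is an assumption, and any COCO-format instances file works:

    from pycocotools.coco import COCO

    coco = COCO('annotations/instances_val2014.json')
    cat_ids = coco.getCatIds(catNms=['person'])       # names -> category ids
    img_ids = coco.getImgIds(catIds=cat_ids)          # images containing people
    ann_ids = coco.getAnnIds(imgIds=img_ids[:1], catIds=cat_ids, iscrowd=None)
    anns = coco.loadAnns(ann_ids)                     # full annotation dicts
    print(len(anns))
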
216 | :param ids (int array) : integer ids specifying cats 217 | :return: cats (object array) : loaded cat objects 218 | """ 219 | if type(ids) == list: 220 | return [self.cats[id] for id in ids] 221 | elif type(ids) == int: 222 | return [self.cats[ids]] 223 | 224 | def loadImgs(self, ids=[]): 225 | """ 226 | Load anns with the specified ids. 227 | :param ids (int array) : integer ids specifying img 228 | :return: imgs (object array) : loaded img objects 229 | """ 230 | if type(ids) == list: 231 | return [self.imgs[id] for id in ids] 232 | elif type(ids) == int: 233 | return [self.imgs[ids]] 234 | 235 | def showAnns(self, anns): 236 | """ 237 | Display the specified annotations. 238 | :param anns (array of object): annotations to display 239 | :return: None 240 | """ 241 | if len(anns) == 0: 242 | return 0 243 | if 'segmentation' in anns[0]: 244 | datasetType = 'instances' 245 | elif 'caption' in anns[0]: 246 | datasetType = 'captions' 247 | if datasetType == 'instances': 248 | ax = plt.gca() 249 | polygons = [] 250 | color = [] 251 | for ann in anns: 252 | c = np.random.random((1, 3)).tolist()[0] 253 | if type(ann['segmentation']) == list: 254 | # polygon 255 | for seg in ann['segmentation']: 256 | poly = np.array(seg).reshape((len(seg)/2, 2)) 257 | polygons.append(Polygon(poly, True,alpha=0.4)) 258 | color.append(c) 259 | else: 260 | # mask 261 | t = self.imgs[ann['image_id']] 262 | if type(ann['segmentation']['counts']) == list: 263 | rle = mask.frPyObjects([ann['segmentation']], t['height'], t['width']) 264 | else: 265 | rle = [ann['segmentation']] 266 | m = mask.decode(rle) 267 | img = np.ones( (m.shape[0], m.shape[1], 3) ) 268 | if ann['iscrowd'] == 1: 269 | color_mask = np.array([2.0,166.0,101.0])/255 270 | if ann['iscrowd'] == 0: 271 | color_mask = np.random.random((1, 3)).tolist()[0] 272 | for i in range(3): 273 | img[:,:,i] = color_mask[i] 274 | ax.imshow(np.dstack( (img, m*0.5) )) 275 | p = PatchCollection(polygons, facecolors=color, edgecolors=(0,0,0,1), linewidths=3, alpha=0.4) 276 | ax.add_collection(p) 277 | elif datasetType == 'captions': 278 | for ann in anns: 279 | print ann['caption'] 280 | 281 | def loadRes(self, resFile): 282 | """ 283 | Load result file and return a result api object. 284 | :param resFile (str) : file name of result file 285 | :return: res (obj) : result api object 286 | """ 287 | res = COCO() 288 | res.dataset['images'] = [img for img in self.dataset['images']] 289 | # res.dataset['info'] = copy.deepcopy(self.dataset['info']) 290 | # res.dataset['licenses'] = copy.deepcopy(self.dataset['licenses']) 291 | 292 | print 'Loading and preparing results... 
' 293 | tic = time.time() 294 | anns = json.load(open(resFile)) 295 | assert type(anns) == list, 'results in not an array of objects' 296 | annsImgIds = [ann['image_id'] for ann in anns] 297 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 298 | 'Results do not correspond to current coco set' 299 | if 'caption' in anns[0]: 300 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 301 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 302 | for id, ann in enumerate(anns): 303 | ann['id'] = id+1 304 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 305 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 306 | for id, ann in enumerate(anns): 307 | bb = ann['bbox'] 308 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] 309 | if not 'segmentation' in ann: 310 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 311 | ann['area'] = bb[2]*bb[3] 312 | ann['id'] = id+1 313 | ann['iscrowd'] = 0 314 | elif 'segmentation' in anns[0]: 315 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 316 | for id, ann in enumerate(anns): 317 | # now only support compressed RLE format as segmentation results 318 | ann['area'] = mask.area([ann['segmentation']])[0] 319 | if not 'bbox' in ann: 320 | ann['bbox'] = mask.toBbox([ann['segmentation']])[0] 321 | ann['id'] = id+1 322 | ann['iscrowd'] = 0 323 | print 'DONE (t=%0.2fs)'%(time.time()- tic) 324 | 325 | res.dataset['annotations'] = anns 326 | res.createIndex() 327 | return res 328 | 329 | def download( self, tarDir = None, imgIds = [] ): 330 | ''' 331 | Download COCO images from mscoco.org server. 332 | :param tarDir (str): COCO results directory name 333 | imgIds (list): images to be downloaded 334 | :return: 335 | ''' 336 | if tarDir is None: 337 | print 'Please specify target directory' 338 | return -1 339 | if len(imgIds) == 0: 340 | imgs = self.imgs.values() 341 | else: 342 | imgs = self.loadImgs(imgIds) 343 | N = len(imgs) 344 | if not os.path.exists(tarDir): 345 | os.makedirs(tarDir) 346 | for i, img in enumerate(imgs): 347 | tic = time.time() 348 | fname = os.path.join(tarDir, img['file_name']) 349 | if not os.path.exists(fname): 350 | urllib.urlretrieve(img['coco_url'], fname) 351 | print 'downloaded %d/%d images (t=%.1fs)'%(i, N, time.time()- tic) 352 | -------------------------------------------------------------------------------- /lib/pycocotools/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /lib/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 
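
The counts convention described above, runs of alternating values that always start with the number of zeros, is easy to reproduce. A pure-Python sketch of the uncompressed form, checked against the two examples in the comment:

    def rle_counts(bits):
        counts, prev, run = [], 0, 0    # RLE counts zeros first
        for b in bits:
            if b == prev:
                run += 1
            else:
                counts.append(run)
                prev, run = b, 1
        counts.append(run)
        return counts

    assert rle_counts([0, 0, 1, 1, 1, 0, 1]) == [2, 3, 1, 1]
    assert rle_counts([1, 1, 1, 1, 1, 1, 0]) == [0, 6, 1]
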
38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | encode = _mask.encode 77 | decode = _mask.decode 78 | iou = _mask.iou 79 | merge = _mask.merge 80 | area = _mask.area 81 | toBbox = _mask.toBbox 82 | frPyObjects = _mask.frPyObjects -------------------------------------------------------------------------------- /lib/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? 
a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | if(cnts) for(siz j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(siz i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; bool va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && ad?1:c=dy && xs>xe) || (dxye); 151 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 152 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 153 | if(dx>=dy) for( int d=0; d<=dx; d++ ) { 154 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 155 | } else for( int d=0; d<=dy; d++ ) { 156 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 157 | } 158 | } 159 | // get points along y-boundary and downsample 160 | free(x); free(y); k=m; m=0; double xd, yd; 161 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 162 | for( j=1; jw-1 ) continue; 165 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 167 | x[m]=(int) xd; y[m]=(int) yd; m++; 168 | } 169 | // compute rle encoding given y-boundary points 170 | k=m; a=malloc(sizeof(uint)*(k+1)); 171 | for( j=0; j0) b[m++]=a[j++]; else { 177 | j++; if(jm, p=0; long x; bool more; 184 | char *s=malloc(sizeof(char)*m*6); 185 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 187 | while( more ) { 188 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 189 | if(more) c |= 0x20; c+=48; s[p++]=c; 190 | } 191 | } 192 | s[p]=0; return s; 193 | } 194 | 195 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 196 | siz m=0, p=0, k; long x; bool more; uint *cnts; 197 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 198 | while( s[p] ) { 199 | x=0; k=0; more=1; 200 | while( more ) { 201 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 202 | more = c & 0x20; p++; k++; 203 | if(!more && (c & 0x10)) x |= -1 << 5*k; 204 | } 205 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 206 | } 207 | rleInit(R,h,w,m,cnts); free(cnts); 208 | } 209 | -------------------------------------------------------------------------------- /lib/pycocotools/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | #include 9 | 10 | typedef unsigned int uint; 11 | typedef unsigned long siz; 12 | typedef unsigned char byte; 13 | typedef double* BB; 14 | typedef struct { siz h, w, m; uint *cnts; } RLE; 15 | 16 | // Initialize/destroy RLE. 17 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 18 | void rleFree( RLE *R ); 19 | 20 | // Initialize/destroy RLE array. 21 | void rlesInit( RLE **R, siz n ); 22 | void rlesFree( RLE **R, siz n ); 23 | 24 | // Encode binary masks using RLE. 
25 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 26 | 27 | // Decode binary masks encoded via RLE. 28 | void rleDecode( const RLE *R, byte *mask, siz n ); 29 | 30 | // Compute union or intersection of encoded masks. 31 | void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ); 32 | 33 | // Compute area of encoded masks. 34 | void rleArea( const RLE *R, siz n, uint *a ); 35 | 36 | // Compute intersection over union between masks. 37 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 38 | 39 | // Compute intersection over union between bounding boxes. 40 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 41 | 42 | // Get bounding boxes surrounding encoded masks. 43 | void rleToBbox( const RLE *R, BB bb, siz n ); 44 | 45 | // Convert bounding boxes to encoded masks. 46 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 47 | 48 | // Convert polygon to encoded mask. 49 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 50 | 51 | // Get compressed string representation of encoded mask. 52 | char* rleToString( const RLE *R ); 53 | 54 | // Convert from compressed string representation of encoded mask. 55 | void rleFrString( RLE *R, char *s, siz h, siz w ); 56 | -------------------------------------------------------------------------------- /lib/roi_data_layer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/roi_data_layer/layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """The data layer used during training to train a Fast R-CNN network. 9 | 10 | RoIDataLayer implements a Caffe Python layer. 
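It is configured through the prototxt's python_param/param_str, which setup() below parses as YAML; for example, a hypothetical VOC training net would pass param_str: "'num_classes': 21" for 20 object classes plus background.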
11 | """ 12 | 13 | import caffe 14 | from fast_rcnn.config import cfg 15 | from roi_data_layer.minibatch import get_minibatch 16 | import numpy as np 17 | import yaml 18 | from multiprocessing import Process, Queue 19 | 20 | class RoIDataLayer(caffe.Layer): 21 | """Fast R-CNN data layer used for training.""" 22 | 23 | def _shuffle_roidb_inds(self): 24 | """Randomly permute the training roidb.""" 25 | if cfg.TRAIN.ASPECT_GROUPING: 26 | widths = np.array([r['width'] for r in self._roidb]) 27 | heights = np.array([r['height'] for r in self._roidb]) 28 | horz = (widths >= heights) 29 | vert = np.logical_not(horz) 30 | horz_inds = np.where(horz)[0] 31 | vert_inds = np.where(vert)[0] 32 | inds = np.hstack(( 33 | np.random.permutation(horz_inds), 34 | np.random.permutation(vert_inds))) 35 | inds = np.reshape(inds, (-1, 2)) 36 | row_perm = np.random.permutation(np.arange(inds.shape[0])) 37 | inds = np.reshape(inds[row_perm, :], (-1,)) 38 | self._perm = inds 39 | else: 40 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 41 | self._cur = 0 42 | 43 | def _get_next_minibatch_inds(self): 44 | """Return the roidb indices for the next minibatch.""" 45 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 46 | self._shuffle_roidb_inds() 47 | 48 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 49 | self._cur += cfg.TRAIN.IMS_PER_BATCH 50 | return db_inds 51 | 52 | def _get_next_minibatch(self): 53 | """Return the blobs to be used for the next minibatch. 54 | 55 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a 56 | separate process and made available through self._blob_queue. 57 | """ 58 | if cfg.TRAIN.USE_PREFETCH: 59 | return self._blob_queue.get() 60 | else: 61 | db_inds = self._get_next_minibatch_inds() 62 | minibatch_db = [self._roidb[i] for i in db_inds] 63 | return get_minibatch(minibatch_db, self._num_classes) 64 | 65 | def set_roidb(self, roidb): 66 | """Set the roidb to be used by this layer during training.""" 67 | self._roidb = roidb 68 | self._shuffle_roidb_inds() 69 | if cfg.TRAIN.USE_PREFETCH: 70 | self._blob_queue = Queue(10) 71 | self._prefetch_process = BlobFetcher(self._blob_queue, 72 | self._roidb, 73 | self._num_classes) 74 | self._prefetch_process.start() 75 | # Terminate the child process when the parent exists 76 | def cleanup(): 77 | print 'Terminating BlobFetcher' 78 | self._prefetch_process.terminate() 79 | self._prefetch_process.join() 80 | import atexit 81 | atexit.register(cleanup) 82 | 83 | def setup(self, bottom, top): 84 | """Setup the RoIDataLayer.""" 85 | 86 | # parse the layer parameter string, which must be valid YAML 87 | layer_params = yaml.load(self.param_str_) 88 | 89 | self._num_classes = layer_params['num_classes'] 90 | 91 | self._name_to_top_map = {} 92 | 93 | # data blob: holds a batch of N images, each with 3 channels 94 | idx = 0 95 | top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3, 96 | max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE) 97 | self._name_to_top_map['data'] = idx 98 | idx += 1 99 | 100 | if cfg.TRAIN.HAS_RPN: 101 | top[idx].reshape(1, 3) 102 | self._name_to_top_map['im_info'] = idx 103 | idx += 1 104 | 105 | top[idx].reshape(1, 4) 106 | self._name_to_top_map['gt_boxes'] = idx 107 | idx += 1 108 | else: # not using RPN 109 | # rois blob: holds R regions of interest, each is a 5-tuple 110 | # (n, x1, y1, x2, y2) specifying an image batch index n and a 111 | # rectangle (x1, y1, x2, y2) 112 | top[idx].reshape(1, 5) 113 | self._name_to_top_map['rois'] = idx 114 | idx += 1 115 | 116 | # labels 
blob: R categorical labels in [0, ..., K] for K foreground 117 | # classes plus background 118 | top[idx].reshape(1) 119 | self._name_to_top_map['labels'] = idx 120 | idx += 1 121 | 122 | if cfg.TRAIN.BBOX_REG: 123 | # bbox_targets blob: R bounding-box regression targets with 4 124 | # targets per class 125 | top[idx].reshape(1, self._num_classes * 4) 126 | self._name_to_top_map['bbox_targets'] = idx 127 | idx += 1 128 | 129 | # bbox_inside_weights blob: At most 4 targets per roi are active; 130 | # this binary vector specifies the subset of active targets 131 | top[idx].reshape(1, self._num_classes * 4) 132 | self._name_to_top_map['bbox_inside_weights'] = idx 133 | idx += 1 134 | 135 | top[idx].reshape(1, self._num_classes * 4) 136 | self._name_to_top_map['bbox_outside_weights'] = idx 137 | idx += 1 138 | 139 | print 'RoiDataLayer: name_to_top:', self._name_to_top_map 140 | assert len(top) == len(self._name_to_top_map) 141 | 142 | def forward(self, bottom, top): 143 | """Get blobs and copy them into this layer's top blob vector.""" 144 | blobs = self._get_next_minibatch() 145 | 146 | for blob_name, blob in blobs.iteritems(): 147 | top_ind = self._name_to_top_map[blob_name] 148 | # Reshape net's input blobs 149 | top[top_ind].reshape(*(blob.shape)) 150 | # Copy data into net's input blobs 151 | top[top_ind].data[...] = blob.astype(np.float32, copy=False) 152 | 153 | def backward(self, top, propagate_down, bottom): 154 | """This layer does not propagate gradients.""" 155 | pass 156 | 157 | def reshape(self, bottom, top): 158 | """Reshaping happens during the call to forward.""" 159 | pass 160 | 161 | class BlobFetcher(Process): 162 | """Experimental class for prefetching blobs in a separate process.""" 163 | def __init__(self, queue, roidb, num_classes): 164 | super(BlobFetcher, self).__init__() 165 | self._queue = queue 166 | self._roidb = roidb 167 | self._num_classes = num_classes 168 | self._perm = None 169 | self._cur = 0 170 | self._shuffle_roidb_inds() 171 | # fix the random seed for reproducibility 172 | np.random.seed(cfg.RNG_SEED) 173 | 174 | def _shuffle_roidb_inds(self): 175 | """Randomly permute the training roidb.""" 176 | # TODO(rbg): remove duplicated code 177 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 178 | self._cur = 0 179 | 180 | def _get_next_minibatch_inds(self): 181 | """Return the roidb indices for the next minibatch.""" 182 | # TODO(rbg): remove duplicated code 183 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 184 | self._shuffle_roidb_inds() 185 | 186 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 187 | self._cur += cfg.TRAIN.IMS_PER_BATCH 188 | return db_inds 189 | 190 | def run(self): 191 | print 'BlobFetcher started' 192 | while True: 193 | db_inds = self._get_next_minibatch_inds() 194 | minibatch_db = [self._roidb[i] for i in db_inds] 195 | blobs = get_minibatch(minibatch_db, self._num_classes) 196 | self._queue.put(blobs) 197 | -------------------------------------------------------------------------------- /lib/roi_data_layer/minibatch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Compute minibatch blobs for training a Fast R-CNN network.""" 9 | 10 | import numpy as np 11 | import
numpy.random as npr 12 | import cv2 13 | from fast_rcnn.config import cfg 14 | from utils.blob import prep_im_for_blob, im_list_to_blob 15 | 16 | def get_minibatch(roidb, num_classes): 17 | """Given a roidb, construct a minibatch sampled from it.""" 18 | num_images = len(roidb) 19 | # Sample random scales to use for each image in this batch 20 | random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES), 21 | size=num_images) 22 | assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \ 23 | 'num_images ({}) must divide BATCH_SIZE ({})'. \ 24 | format(num_images, cfg.TRAIN.BATCH_SIZE) 25 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 26 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 27 | 28 | # Get the input image blob, formatted for caffe 29 | im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) 30 | 31 | blobs = {'data': im_blob} 32 | 33 | if cfg.TRAIN.HAS_RPN: 34 | assert len(im_scales) == 1, "Single batch only" 35 | assert len(roidb) == 1, "Single batch only" 36 | # gt boxes: (x1, y1, x2, y2, cls) 37 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 38 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) 39 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] 40 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 41 | blobs['gt_boxes'] = gt_boxes 42 | blobs['im_info'] = np.array( 43 | [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 44 | dtype=np.float32) 45 | else: # not using RPN 46 | # Now, build the region of interest and label blobs 47 | rois_blob = np.zeros((0, 5), dtype=np.float32) 48 | labels_blob = np.zeros((0), dtype=np.float32) 49 | bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32) 50 | bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32) 51 | # all_overlaps = [] 52 | for im_i in xrange(num_images): 53 | labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \ 54 | = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, 55 | num_classes) 56 | 57 | # Add to RoIs blob 58 | rois = _project_im_rois(im_rois, im_scales[im_i]) 59 | batch_ind = im_i * np.ones((rois.shape[0], 1)) 60 | rois_blob_this_image = np.hstack((batch_ind, rois)) 61 | rois_blob = np.vstack((rois_blob, rois_blob_this_image)) 62 | 63 | # Add to labels, bbox targets, and bbox loss blobs 64 | labels_blob = np.hstack((labels_blob, labels)) 65 | bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets)) 66 | bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights)) 67 | # all_overlaps = np.hstack((all_overlaps, overlaps)) 68 | 69 | # For debug visualizations 70 | # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps) 71 | 72 | blobs['rois'] = rois_blob 73 | blobs['labels'] = labels_blob 74 | 75 | if cfg.TRAIN.BBOX_REG: 76 | blobs['bbox_targets'] = bbox_targets_blob 77 | blobs['bbox_inside_weights'] = bbox_inside_blob 78 | blobs['bbox_outside_weights'] = \ 79 | np.array(bbox_inside_blob > 0).astype(np.float32) 80 | 81 | return blobs 82 | 83 | def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes): 84 | """Generate a random sample of RoIs comprising foreground and background 85 | examples. 
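For example, under the default config (TRAIN.BATCH_SIZE = 128, TRAIN.IMS_PER_BATCH = 2, TRAIN.FG_FRACTION = 0.25), each image contributes rois_per_image = 128 / 2 = 64 RoIs, of which at most round(0.25 * 64) = 16 may be foreground; the remainder is filled with background RoIs.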
86 | """ 87 | # label = class RoI has max overlap with 88 | labels = roidb['max_classes'] 89 | overlaps = roidb['max_overlaps'] 90 | rois = roidb['boxes'] 91 | 92 | # Select foreground RoIs as those with >= FG_THRESH overlap 93 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 94 | # Guard against the case when an image has fewer than fg_rois_per_image 95 | # foreground RoIs 96 | fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) 97 | # Sample foreground regions without replacement 98 | if fg_inds.size > 0: 99 | fg_inds = npr.choice( 100 | fg_inds, size=fg_rois_per_this_image, replace=False) 101 | 102 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 103 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 104 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 105 | # Compute number of background RoIs to take from this image (guarding 106 | # against there being fewer than desired) 107 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 108 | bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, 109 | bg_inds.size) 110 | # Sample foreground regions without replacement 111 | if bg_inds.size > 0: 112 | bg_inds = npr.choice( 113 | bg_inds, size=bg_rois_per_this_image, replace=False) 114 | 115 | # The indices that we're selecting (both fg and bg) 116 | keep_inds = np.append(fg_inds, bg_inds) 117 | # Select sampled values from various arrays: 118 | labels = labels[keep_inds] 119 | # Clamp labels for the background RoIs to 0 120 | labels[fg_rois_per_this_image:] = 0 121 | overlaps = overlaps[keep_inds] 122 | rois = rois[keep_inds] 123 | 124 | bbox_targets, bbox_inside_weights = _get_bbox_regression_labels( 125 | roidb['bbox_targets'][keep_inds, :], num_classes) 126 | 127 | return labels, overlaps, rois, bbox_targets, bbox_inside_weights 128 | 129 | def _get_image_blob(roidb, scale_inds): 130 | """Builds an input blob from the images in the roidb at the specified 131 | scales. 132 | """ 133 | num_images = len(roidb) 134 | processed_ims = [] 135 | im_scales = [] 136 | for i in xrange(num_images): 137 | im = cv2.imread(roidb[i]['image']) 138 | if roidb[i]['flipped']: 139 | im = im[:, ::-1, :] 140 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 141 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 142 | cfg.TRAIN.MAX_SIZE) 143 | im_scales.append(im_scale) 144 | processed_ims.append(im) 145 | 146 | # Create a blob to hold the input images 147 | blob = im_list_to_blob(processed_ims) 148 | 149 | return blob, im_scales 150 | 151 | def _project_im_rois(im_rois, im_scale_factor): 152 | """Project image RoIs into the rescaled training image.""" 153 | rois = im_rois * im_scale_factor 154 | return rois 155 | 156 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 157 | """Bounding-box regression targets are stored in a compact form in the 158 | roidb. 159 | 160 | This function expands those targets into the 4-of-4*K representation used 161 | by the network (i.e. only one class has non-zero targets). The loss weights 162 | are similarly expanded. 
163 | 164 | Returns: 165 | bbox_target_data (ndarray): N x 4K blob of regression targets 166 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 167 | """ 168 | clss = bbox_target_data[:, 0] 169 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 170 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 171 | inds = np.where(clss > 0)[0] 172 | for ind in inds: 173 | cls = clss[ind] 174 | start = 4 * cls 175 | end = start + 4 176 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 177 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 178 | return bbox_targets, bbox_inside_weights 179 | 180 | def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps): 181 | """Visualize a mini-batch for debugging.""" 182 | import matplotlib.pyplot as plt 183 | for i in xrange(rois_blob.shape[0]): 184 | rois = rois_blob[i, :] 185 | im_ind = rois[0] 186 | roi = rois[1:] 187 | im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy() 188 | im += cfg.PIXEL_MEANS 189 | im = im[:, :, (2, 1, 0)] 190 | im = im.astype(np.uint8) 191 | cls = labels_blob[i] 192 | plt.imshow(im) 193 | print 'class: ', cls, ' overlap: ', overlaps[i] 194 | plt.gca().add_patch( 195 | plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0], 196 | roi[3] - roi[1], fill=False, 197 | edgecolor='r', linewidth=3) 198 | ) 199 | plt.show() 200 | -------------------------------------------------------------------------------- /lib/roi_data_layer/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 9 | 10 | import numpy as np 11 | from fast_rcnn.config import cfg 12 | from fast_rcnn.bbox_transform import bbox_transform 13 | from utils.cython_bbox import bbox_overlaps 14 | import PIL 15 | 16 | def prepare_roidb(imdb): 17 | """Enrich the imdb's roidb by adding some derived quantities that 18 | are useful for training. This function precomputes the maximum 19 | overlap, taken over ground-truth boxes, between each ROI and 20 | each ground-truth box. The class with maximum overlap is also 21 | recorded. 
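For example, a RoI whose IoU with ground-truth boxes of classes 7 and 12 is 0.4 and 0.6 respectively gets max_overlaps = 0.6 and max_classes = 12; a RoI overlapping no ground truth gets max_overlaps = 0 and max_classes = 0, i.e. background (the sanity checks below assert exactly this).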
22 | """ 23 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 24 | for i in xrange(imdb.num_images)] 25 | roidb = imdb.roidb 26 | for i in xrange(len(imdb.image_index)): 27 | roidb[i]['image'] = imdb.image_path_at(i) 28 | roidb[i]['width'] = sizes[i][0] 29 | roidb[i]['height'] = sizes[i][1] 30 | # need gt_overlaps as a dense array for argmax 31 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 32 | # max overlap with gt over classes (columns) 33 | max_overlaps = gt_overlaps.max(axis=1) 34 | # gt class that had the max overlap 35 | max_classes = gt_overlaps.argmax(axis=1) 36 | roidb[i]['max_classes'] = max_classes 37 | roidb[i]['max_overlaps'] = max_overlaps 38 | # sanity checks 39 | # max overlap of 0 => class should be zero (background) 40 | zero_inds = np.where(max_overlaps == 0)[0] 41 | assert all(max_classes[zero_inds] == 0) 42 | # max overlap > 0 => class should not be zero (must be a fg class) 43 | nonzero_inds = np.where(max_overlaps > 0)[0] 44 | assert all(max_classes[nonzero_inds] != 0) 45 | 46 | def add_bbox_regression_targets(roidb): 47 | """Add information needed to train bounding-box regressors.""" 48 | assert len(roidb) > 0 49 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 50 | 51 | num_images = len(roidb) 52 | # Infer number of classes from the number of columns in gt_overlaps 53 | num_classes = roidb[0]['gt_overlaps'].shape[1] 54 | for im_i in xrange(num_images): 55 | rois = roidb[im_i]['boxes'] 56 | max_overlaps = roidb[im_i]['max_overlaps'] 57 | max_classes = roidb[im_i]['max_classes'] 58 | roidb[im_i]['bbox_targets'] = \ 59 | _compute_targets(rois, max_overlaps, max_classes) 60 | 61 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 62 | # Use fixed / precomputed "means" and "stds" instead of empirical values 63 | means = np.tile( 64 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 65 | stds = np.tile( 66 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 67 | else: 68 | # Compute values needed for means and stds 69 | # var(x) = E(x^2) - E(x)^2 70 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 71 | sums = np.zeros((num_classes, 4)) 72 | squared_sums = np.zeros((num_classes, 4)) 73 | for im_i in xrange(num_images): 74 | targets = roidb[im_i]['bbox_targets'] 75 | for cls in xrange(1, num_classes): 76 | cls_inds = np.where(targets[:, 0] == cls)[0] 77 | if cls_inds.size > 0: 78 | class_counts[cls] += cls_inds.size 79 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 80 | squared_sums[cls, :] += \ 81 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 82 | 83 | means = sums / class_counts 84 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 85 | 86 | print 'bbox target means:' 87 | print means 88 | print means[1:, :].mean(axis=0) # ignore bg class 89 | print 'bbox target stdevs:' 90 | print stds 91 | print stds[1:, :].mean(axis=0) # ignore bg class 92 | 93 | # Normalize targets 94 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 95 | print "Normalizing targets" 96 | for im_i in xrange(num_images): 97 | targets = roidb[im_i]['bbox_targets'] 98 | for cls in xrange(1, num_classes): 99 | cls_inds = np.where(targets[:, 0] == cls)[0] 100 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 101 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 102 | else: 103 | print "NOT normalizing targets" 104 | 105 | # These values will be needed for making predictions 106 | # (the predicts will need to be unnormalized and uncentered) 107 | return means.ravel(), stds.ravel() 108 | 109 | def _compute_targets(rois, overlaps, 
labels): 110 | """Compute bounding-box regression targets for an image.""" 111 | # Indices of ground-truth ROIs 112 | gt_inds = np.where(overlaps == 1)[0] 113 | if len(gt_inds) == 0: 114 | # Bail if the image has no ground-truth ROIs 115 | return np.zeros((rois.shape[0], 5), dtype=np.float32) 116 | # Indices of examples for which we try to make predictions 117 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 118 | 119 | # Get IoU overlap between each ex ROI and gt ROI 120 | ex_gt_overlaps = bbox_overlaps( 121 | np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), 122 | np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) 123 | 124 | # Find which gt ROI each ex ROI has max overlap with: 125 | # this will be the ex ROI's gt target 126 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 127 | gt_rois = rois[gt_inds[gt_assignment], :] 128 | ex_rois = rois[ex_inds, :] 129 | 130 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 131 | targets[ex_inds, 0] = labels[ex_inds] 132 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) 133 | return targets 134 | -------------------------------------------------------------------------------- /lib/rpn/README.md: -------------------------------------------------------------------------------- 1 | ### `rpn` module overview 2 | 3 | ##### `generate_anchors.py` 4 | 5 | Generates a regular grid of multi-scale, multi-aspect anchor boxes. 6 | 7 | ##### `proposal_layer.py` 8 | 9 | Converts RPN outputs (per-anchor scores and bbox regression estimates) into object proposals. 10 | 11 | ##### `anchor_target_layer.py` 12 | 13 | Generates training targets/labels for each anchor. Classification labels are 1 (object), 0 (not object) or -1 (ignore). 14 | Bbox regression targets are specified when the classification label is > 0. 15 | 16 | ##### `proposal_target_layer.py` 17 | 18 | Generates training targets/labels for each object proposal: classification labels 0 - K (bg or object class 1, ... , K) 19 | and bbox regression targets when the label is > 0. 20 | 21 | ##### `generate.py` 22 | 23 | Generates object detection proposals from an imdb using an RPN. 24 | -------------------------------------------------------------------------------- /lib/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/rpn/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | import caffe 10 | import yaml 11 | from fast_rcnn.config import cfg 12 | import numpy as np 13 | import numpy.random as npr 14 | from generate_anchors import generate_anchors 15 | from utils.cython_bbox import bbox_overlaps 16 | from fast_rcnn.bbox_transform import bbox_transform 17 | 18 | DEBUG = False 19 | 20 | class AnchorTargetLayer(caffe.Layer): 21 | """ 22 | Assign anchors to ground-truth targets.
Produces anchor classification 23 | labels and bounding-box regression targets. 24 | """ 25 | 26 | def setup(self, bottom, top): 27 | layer_params = yaml.load(self.param_str_) 28 | anchor_scales = layer_params.get('scales', (8, 16, 32)) 29 | self._anchors = generate_anchors(scales=np.array(anchor_scales)) 30 | self._num_anchors = self._anchors.shape[0] 31 | self._feat_stride = layer_params['feat_stride'] 32 | 33 | if DEBUG: 34 | print 'anchors:' 35 | print self._anchors 36 | print 'anchor shapes:' 37 | print np.hstack(( 38 | self._anchors[:, 2::4] - self._anchors[:, 0::4], 39 | self._anchors[:, 3::4] - self._anchors[:, 1::4], 40 | )) 41 | self._counts = cfg.EPS 42 | self._sums = np.zeros((1, 4)) 43 | self._squared_sums = np.zeros((1, 4)) 44 | self._fg_sum = 0 45 | self._bg_sum = 0 46 | self._count = 0 47 | 48 | # allow boxes to sit over the edge by a small amount 49 | self._allowed_border = layer_params.get('allowed_border', 0) 50 | 51 | height, width = bottom[0].data.shape[-2:] 52 | if DEBUG: 53 | print 'AnchorTargetLayer: height', height, 'width', width 54 | 55 | A = self._num_anchors 56 | # labels 57 | top[0].reshape(1, 1, A * height, width) 58 | # bbox_targets 59 | top[1].reshape(1, A * 4, height, width) 60 | # bbox_inside_weights 61 | top[2].reshape(1, A * 4, height, width) 62 | # bbox_outside_weights 63 | top[3].reshape(1, A * 4, height, width) 64 | 65 | def forward(self, bottom, top): 66 | # Algorithm: 67 | # 68 | # for each (H, W) location i 69 | # generate 9 anchor boxes centered on cell i 70 | # apply predicted bbox deltas at cell i to each of the 9 anchors 71 | # filter out-of-image anchors 72 | # measure GT overlap 73 | 74 | assert bottom[0].data.shape[0] == 1, \ 75 | 'Only single item batches are supported' 76 | 77 | # map of shape (..., H, W) 78 | height, width = bottom[0].data.shape[-2:] 79 | # GT boxes (x1, y1, x2, y2, label) 80 | gt_boxes = bottom[1].data 81 | # im_info 82 | im_info = bottom[2].data[0, :] 83 | 84 | if DEBUG: 85 | print '' 86 | print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) 87 | print 'scale: {}'.format(im_info[2]) 88 | print 'height, width: ({}, {})'.format(height, width) 89 | print 'rpn: gt_boxes.shape', gt_boxes.shape 90 | print 'rpn: gt_boxes', gt_boxes 91 | 92 | # 1. 
Generate proposals from bbox deltas and shifted anchors 93 | shift_x = np.arange(0, width) * self._feat_stride 94 | shift_y = np.arange(0, height) * self._feat_stride 95 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 96 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 97 | shift_x.ravel(), shift_y.ravel())).transpose() 98 | # add A anchors (1, A, 4) to 99 | # cell K shifts (K, 1, 4) to get 100 | # shift anchors (K, A, 4) 101 | # reshape to (K*A, 4) shifted anchors 102 | A = self._num_anchors 103 | K = shifts.shape[0] 104 | all_anchors = (self._anchors.reshape((1, A, 4)) + 105 | shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 106 | all_anchors = all_anchors.reshape((K * A, 4)) 107 | total_anchors = int(K * A) 108 | 109 | # only keep anchors inside the image 110 | inds_inside = np.where( 111 | (all_anchors[:, 0] >= -self._allowed_border) & 112 | (all_anchors[:, 1] >= -self._allowed_border) & 113 | (all_anchors[:, 2] < im_info[1] + self._allowed_border) & # width 114 | (all_anchors[:, 3] < im_info[0] + self._allowed_border) # height 115 | )[0] 116 | 117 | if DEBUG: 118 | print 'total_anchors', total_anchors 119 | print 'inds_inside', len(inds_inside) 120 | 121 | # keep only inside anchors 122 | anchors = all_anchors[inds_inside, :] 123 | if DEBUG: 124 | print 'anchors.shape', anchors.shape 125 | 126 | # label: 1 is positive, 0 is negative, -1 is dont care 127 | labels = np.empty((len(inds_inside), ), dtype=np.float32) 128 | labels.fill(-1) 129 | 130 | # overlaps between the anchors and the gt boxes 131 | # overlaps (ex, gt) 132 | overlaps = bbox_overlaps( 133 | np.ascontiguousarray(anchors, dtype=np.float), 134 | np.ascontiguousarray(gt_boxes, dtype=np.float)) 135 | argmax_overlaps = overlaps.argmax(axis=1) 136 | max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] 137 | gt_argmax_overlaps = overlaps.argmax(axis=0) 138 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 139 | np.arange(overlaps.shape[1])] 140 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 141 | 142 | if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: 143 | # assign bg labels first so that positive labels can clobber them 144 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 145 | 146 | # fg label: for each gt, anchor with highest overlap 147 | labels[gt_argmax_overlaps] = 1 148 | 149 | # fg label: above threshold IOU 150 | labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 151 | 152 | if cfg.TRAIN.RPN_CLOBBER_POSITIVES: 153 | # assign bg labels last so that negative labels can clobber positives 154 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 155 | 156 | # subsample positive labels if we have too many 157 | num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) 158 | fg_inds = np.where(labels == 1)[0] 159 | if len(fg_inds) > num_fg: 160 | disable_inds = npr.choice( 161 | fg_inds, size=(len(fg_inds) - num_fg), replace=False) 162 | labels[disable_inds] = -1 163 | 164 | # subsample negative labels if we have too many 165 | num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) 166 | bg_inds = np.where(labels == 0)[0] 167 | if len(bg_inds) > num_bg: 168 | disable_inds = npr.choice( 169 | bg_inds, size=(len(bg_inds) - num_bg), replace=False) 170 | labels[disable_inds] = -1 171 | #print "was %s inds, disabling %s, now %s inds" % ( 172 | #len(bg_inds), len(disable_inds), np.sum(labels == 0)) 173 | 174 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 175 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 176 | 177 | 
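# How the two weight blobs below are consumed (a sketch of the weighted
# SmoothL1 loss in this fork's Caffe; see its SmoothL1LossLayer): each
# coordinate contributes roughly
#   bbox_outside_weights * SmoothL1(bbox_inside_weights * (pred - target))
# so the inside weights gate which entries participate (positive anchors
# only), while the outside weights normalize the loss, uniformly over the
# sampled examples when RPN_POSITIVE_WEIGHT < 0 as handled below.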
bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 178 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 179 | 180 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 181 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 182 | # uniform weighting of examples (given non-uniform sampling) 183 | num_examples = np.sum(labels >= 0) 184 | positive_weights = np.ones((1, 4)) * 1.0 / num_examples 185 | negative_weights = np.ones((1, 4)) * 1.0 / num_examples 186 | else: 187 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 188 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 189 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / 190 | np.sum(labels == 1)) 191 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / 192 | np.sum(labels == 0)) 193 | bbox_outside_weights[labels == 1, :] = positive_weights 194 | bbox_outside_weights[labels == 0, :] = negative_weights 195 | 196 | if DEBUG: 197 | self._sums += bbox_targets[labels == 1, :].sum(axis=0) 198 | self._squared_sums += (bbox_targets[labels == 1, :] ** 2).sum(axis=0) 199 | self._counts += np.sum(labels == 1) 200 | means = self._sums / self._counts 201 | stds = np.sqrt(self._squared_sums / self._counts - means ** 2) 202 | print 'means:' 203 | print means 204 | print 'stdevs:' 205 | print stds 206 | 207 | # map up to original set of anchors 208 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1) 209 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) 210 | bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 211 | bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 212 | 213 | if DEBUG: 214 | print 'rpn: max max_overlap', np.max(max_overlaps) 215 | print 'rpn: num_positive', np.sum(labels == 1) 216 | print 'rpn: num_negative', np.sum(labels == 0) 217 | self._fg_sum += np.sum(labels == 1) 218 | self._bg_sum += np.sum(labels == 0) 219 | self._count += 1 220 | print 'rpn: num_positive avg', self._fg_sum / self._count 221 | print 'rpn: num_negative avg', self._bg_sum / self._count 222 | 223 | # labels 224 | labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) 225 | labels = labels.reshape((1, 1, A * height, width)) 226 | top[0].reshape(*labels.shape) 227 | top[0].data[...] = labels 228 | 229 | # bbox_targets 230 | bbox_targets = bbox_targets \ 231 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 232 | top[1].reshape(*bbox_targets.shape) 233 | top[1].data[...] = bbox_targets 234 | 235 | # bbox_inside_weights 236 | bbox_inside_weights = bbox_inside_weights \ 237 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 238 | assert bbox_inside_weights.shape[2] == height 239 | assert bbox_inside_weights.shape[3] == width 240 | top[2].reshape(*bbox_inside_weights.shape) 241 | top[2].data[...] = bbox_inside_weights 242 | 243 | # bbox_outside_weights 244 | bbox_outside_weights = bbox_outside_weights \ 245 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 246 | assert bbox_outside_weights.shape[2] == height 247 | assert bbox_outside_weights.shape[3] == width 248 | top[3].reshape(*bbox_outside_weights.shape) 249 | top[3].data[...] 
= bbox_outside_weights 250 | 251 | def backward(self, top, propagate_down, bottom): 252 | """This layer does not propagate gradients.""" 253 | pass 254 | 255 | def reshape(self, bottom, top): 256 | """Reshaping happens during the call to forward.""" 257 | pass 258 | 259 | 260 | def _unmap(data, count, inds, fill=0): 261 | """ Unmap a subset of item (data) back to the original set of items (of 262 | size count) """ 263 | if len(data.shape) == 1: 264 | ret = np.empty((count, ), dtype=np.float32) 265 | ret.fill(fill) 266 | ret[inds] = data 267 | else: 268 | ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) 269 | ret.fill(fill) 270 | ret[inds, :] = data 271 | return ret 272 | 273 | 274 | def _compute_targets(ex_rois, gt_rois): 275 | """Compute bounding-box regression targets for an image.""" 276 | 277 | assert ex_rois.shape[0] == gt_rois.shape[0] 278 | assert ex_rois.shape[1] == 4 279 | assert gt_rois.shape[1] == 5 280 | 281 | return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False) 282 | -------------------------------------------------------------------------------- /lib/rpn/generate.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from fast_rcnn.config import cfg 9 | from utils.blob import im_list_to_blob 10 | from utils.timer import Timer 11 | import numpy as np 12 | import cv2 13 | import matplotlib.pyplot as plt # needed by _vis_proposals and imdb_proposals below; missing from the original imports 14 | def _vis_proposals(im, dets, thresh=0.5): 15 | """Draw detected bounding boxes.""" 16 | inds = np.where(dets[:, -1] >= thresh)[0] 17 | if len(inds) == 0: 18 | return 19 | 20 | class_name = 'obj' 21 | im = im[:, :, (2, 1, 0)] 22 | fig, ax = plt.subplots(figsize=(12, 12)) 23 | ax.imshow(im, aspect='equal') 24 | for i in inds: 25 | bbox = dets[i, :4] 26 | score = dets[i, -1] 27 | 28 | ax.add_patch( 29 | plt.Rectangle((bbox[0], bbox[1]), 30 | bbox[2] - bbox[0], 31 | bbox[3] - bbox[1], fill=False, 32 | edgecolor='red', linewidth=3.5) 33 | ) 34 | ax.text(bbox[0], bbox[1] - 2, 35 | '{:s} {:.3f}'.format(class_name, score), 36 | bbox=dict(facecolor='blue', alpha=0.5), 37 | fontsize=14, color='white') 38 | 39 | ax.set_title(('{} detections with ' 40 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 41 | thresh), 42 | fontsize=14) 43 | plt.axis('off') 44 | plt.tight_layout() 45 | plt.draw() 46 | 47 | def _get_image_blob(im): 48 | """Converts an image into a network input.
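For example, under the default test config (cfg.TEST.SCALES = (600,), cfg.TEST.MAX_SIZE = 1000), a 375x500 image is scaled by 600 / 375 = 1.6 to 600x800, while a 500x1500 image would instead be capped at scale 1000 / 1500 ~= 0.667 so its longer side stays within MAX_SIZE.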
49 | 50 | Arguments: 51 | im (ndarray): a color image in BGR order 52 | 53 | Returns: 54 | blob (ndarray): a data blob holding an image pyramid 55 | im_scale_factors (list): list of image scales (relative to im) used 56 | in the image pyramid 57 | """ 58 | im_orig = im.astype(np.float32, copy=True) 59 | im_orig -= cfg.PIXEL_MEANS 60 | 61 | im_shape = im_orig.shape 62 | im_size_min = np.min(im_shape[0:2]) 63 | im_size_max = np.max(im_shape[0:2]) 64 | 65 | processed_ims = [] 66 | 67 | assert len(cfg.TEST.SCALES) == 1 68 | target_size = cfg.TEST.SCALES[0] 69 | 70 | im_scale = float(target_size) / float(im_size_min) 71 | # Prevent the biggest axis from being more than MAX_SIZE 72 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 73 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 74 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 75 | interpolation=cv2.INTER_LINEAR) 76 | im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :] 77 | processed_ims.append(im) 78 | 79 | # Create a blob to hold the input images 80 | blob = im_list_to_blob(processed_ims) 81 | 82 | return blob, im_info 83 | 84 | def im_proposals(net, im): 85 | """Generate RPN proposals on a single image.""" 86 | blobs = {} 87 | blobs['data'], blobs['im_info'] = _get_image_blob(im) 88 | net.blobs['data'].reshape(*(blobs['data'].shape)) 89 | net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) 90 | blobs_out = net.forward( 91 | data=blobs['data'].astype(np.float32, copy=False), 92 | im_info=blobs['im_info'].astype(np.float32, copy=False)) 93 | 94 | scale = blobs['im_info'][0, 2] 95 | boxes = blobs_out['rois'][:, 1:].copy() / scale 96 | scores = blobs_out['scores'].copy() 97 | return boxes, scores 98 | 99 | def imdb_proposals(net, imdb): 100 | """Generate RPN proposals on all images in an imdb.""" 101 | 102 | _t = Timer() 103 | imdb_boxes = [[] for _ in xrange(imdb.num_images)] 104 | for i in xrange(imdb.num_images): 105 | im = cv2.imread(imdb.image_path_at(i)) 106 | _t.tic() 107 | imdb_boxes[i], scores = im_proposals(net, im) 108 | _t.toc() 109 | print 'im_proposals: {:d}/{:d} {:.3f}s' \ 110 | .format(i + 1, imdb.num_images, _t.average_time) 111 | if 0: 112 | dets = np.hstack((imdb_boxes[i], scores)) 113 | # from IPython import embed; embed() 114 | _vis_proposals(im, dets[:3, :], thresh=0.9) 115 | plt.show() 116 | 117 | return imdb_boxes 118 | -------------------------------------------------------------------------------- /lib/rpn/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | #array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., 
-79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 38 | scales=2**np.arange(3, 6)): 39 | """ 40 | Generate anchor (reference) windows by enumerating aspect ratios X 41 | scales wrt a reference (0, 0, 15, 15) window. 42 | """ 43 | 44 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 45 | ratio_anchors = _ratio_enum(base_anchor, ratios) 46 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 47 | for i in xrange(ratio_anchors.shape[0])]) 48 | return anchors 49 | 50 | def _whctrs(anchor): 51 | """ 52 | Return width, height, x center, and y center for an anchor (window). 53 | """ 54 | 55 | w = anchor[2] - anchor[0] + 1 56 | h = anchor[3] - anchor[1] + 1 57 | x_ctr = anchor[0] + 0.5 * (w - 1) 58 | y_ctr = anchor[1] + 0.5 * (h - 1) 59 | return w, h, x_ctr, y_ctr 60 | 61 | def _mkanchors(ws, hs, x_ctr, y_ctr): 62 | """ 63 | Given a vector of widths (ws) and heights (hs) around a center 64 | (x_ctr, y_ctr), output a set of anchors (windows). 65 | """ 66 | 67 | ws = ws[:, np.newaxis] 68 | hs = hs[:, np.newaxis] 69 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 70 | y_ctr - 0.5 * (hs - 1), 71 | x_ctr + 0.5 * (ws - 1), 72 | y_ctr + 0.5 * (hs - 1))) 73 | return anchors 74 | 75 | def _ratio_enum(anchor, ratios): 76 | """ 77 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 78 | """ 79 | 80 | w, h, x_ctr, y_ctr = _whctrs(anchor) 81 | size = w * h 82 | size_ratios = size / ratios 83 | ws = np.round(np.sqrt(size_ratios)) 84 | hs = np.round(ws * ratios) 85 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 86 | return anchors 87 | 88 | def _scale_enum(anchor, scales): 89 | """ 90 | Enumerate a set of anchors for each scale wrt an anchor. 91 | """ 92 | 93 | w, h, x_ctr, y_ctr = _whctrs(anchor) 94 | ws = w * scales 95 | hs = h * scales 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | if __name__ == '__main__': 100 | import time 101 | t = time.time() 102 | a = generate_anchors() 103 | print time.time() - t 104 | print a 105 | from IPython import embed; embed() 106 | -------------------------------------------------------------------------------- /lib/rpn/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import caffe 9 | import numpy as np 10 | import yaml 11 | from fast_rcnn.config import cfg 12 | from generate_anchors import generate_anchors 13 | from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 14 | from fast_rcnn.nms_wrapper import nms 15 | 16 | DEBUG = False 17 | 18 | class ProposalLayer(caffe.Layer): 19 | """ 20 | Outputs object detection proposals by applying estimated bounding-box 21 | transformations to a set of regular boxes (called "anchors"). 
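Concretely, an anchor with center (x_a, y_a) and size (w_a, h_a), combined with predicted deltas (dx, dy, dw, dh), decodes via bbox_transform_inv to x = dx * w_a + x_a, y = dy * h_a + y_a, w = w_a * exp(dw), h = h_a * exp(dh), after which forward() clips the boxes to the image and filters them with NMS.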
22 | """ 23 | 24 | def setup(self, bottom, top): 25 | # parse the layer parameter string, which must be valid YAML 26 | layer_params = yaml.load(self.param_str_) 27 | 28 | self._feat_stride = layer_params['feat_stride'] 29 | anchor_scales = layer_params.get('scales', (8, 16, 32)) 30 | self._anchors = generate_anchors(scales=np.array(anchor_scales)) 31 | self._num_anchors = self._anchors.shape[0] 32 | 33 | if DEBUG: 34 | print 'feat_stride: {}'.format(self._feat_stride) 35 | print 'anchors:' 36 | print self._anchors 37 | 38 | # rois blob: holds R regions of interest, each is a 5-tuple 39 | # (n, x1, y1, x2, y2) specifying an image batch index n and a 40 | # rectangle (x1, y1, x2, y2) 41 | top[0].reshape(1, 5) 42 | 43 | # scores blob: holds scores for R regions of interest 44 | if len(top) > 1: 45 | top[1].reshape(1, 1, 1, 1) 46 | 47 | def forward(self, bottom, top): 48 | # Algorithm: 49 | # 50 | # for each (H, W) location i 51 | # generate A anchor boxes centered on cell i 52 | # apply predicted bbox deltas at cell i to each of the A anchors 53 | # clip predicted boxes to image 54 | # remove predicted boxes with either height or width < threshold 55 | # sort all (proposal, score) pairs by score from highest to lowest 56 | # take top pre_nms_topN proposals before NMS 57 | # apply NMS with threshold 0.7 to remaining proposals 58 | # take after_nms_topN proposals after NMS 59 | # return the top proposals (-> RoIs top, scores top) 60 | 61 | assert bottom[0].data.shape[0] == 1, \ 62 | 'Only single item batches are supported' 63 | 64 | cfg_key = str(self.phase) # either 'TRAIN' or 'TEST' 65 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N 66 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N 67 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH 68 | min_size = cfg[cfg_key].RPN_MIN_SIZE 69 | 70 | # the first set of _num_anchors channels are bg probs 71 | # the second set are the fg probs, which we want 72 | scores = bottom[0].data[:, self._num_anchors:, :, :] 73 | bbox_deltas = bottom[1].data 74 | im_info = bottom[2].data[0, :] 75 | 76 | if DEBUG: 77 | print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) 78 | print 'scale: {}'.format(im_info[2]) 79 | 80 | # 1. 
Generate proposals from bbox deltas and shifted anchors 81 | height, width = scores.shape[-2:] 82 | 83 | if DEBUG: 84 | print 'score map size: {}'.format(scores.shape) 85 | 86 | # Enumerate all shifts 87 | shift_x = np.arange(0, width) * self._feat_stride 88 | shift_y = np.arange(0, height) * self._feat_stride 89 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 90 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 91 | shift_x.ravel(), shift_y.ravel())).transpose() 92 | 93 | # Enumerate all shifted anchors: 94 | # 95 | # add A anchors (1, A, 4) to 96 | # cell K shifts (K, 1, 4) to get 97 | # shift anchors (K, A, 4) 98 | # reshape to (K*A, 4) shifted anchors 99 | A = self._num_anchors 100 | K = shifts.shape[0] 101 | anchors = self._anchors.reshape((1, A, 4)) + \ 102 | shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 103 | anchors = anchors.reshape((K * A, 4)) 104 | 105 | # Transpose and reshape predicted bbox transformations to get them 106 | # into the same order as the anchors: 107 | # 108 | # bbox deltas will be (1, 4 * A, H, W) format 109 | # transpose to (1, H, W, 4 * A) 110 | # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) 111 | # in slowest to fastest order 112 | bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) 113 | 114 | # Same story for the scores: 115 | # 116 | # scores are (1, A, H, W) format 117 | # transpose to (1, H, W, A) 118 | # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) 119 | scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) 120 | 121 | # Convert anchors into proposals via bbox transformations 122 | proposals = bbox_transform_inv(anchors, bbox_deltas) 123 | 124 | # 2. clip predicted boxes to image 125 | proposals = clip_boxes(proposals, im_info[:2]) 126 | 127 | # 3. remove predicted boxes with either height or width < threshold 128 | # (NOTE: convert min_size to input image scale stored in im_info[2]) 129 | keep = _filter_boxes(proposals, min_size * im_info[2]) 130 | proposals = proposals[keep, :] 131 | scores = scores[keep] 132 | 133 | # 4. sort all (proposal, score) pairs by score from highest to lowest 134 | # 5. take top pre_nms_topN (e.g. 6000) 135 | order = scores.ravel().argsort()[::-1] 136 | if pre_nms_topN > 0: 137 | order = order[:pre_nms_topN] 138 | proposals = proposals[order, :] 139 | scores = scores[order] 140 | 141 | # 6. apply nms (e.g. threshold = 0.7) 142 | # 7. take after_nms_topN (e.g. 300) 143 | # 8. return the top proposals (-> RoIs top) 144 | keep = nms(np.hstack((proposals, scores)), nms_thresh) 145 | if post_nms_topN > 0: 146 | keep = keep[:post_nms_topN] 147 | proposals = proposals[keep, :] 148 | scores = scores[keep] 149 | 150 | # Output rois blob 151 | # Our RPN implementation only supports a single input image, so all 152 | # batch inds are 0 153 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 154 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 155 | top[0].reshape(*(blob.shape)) 156 | top[0].data[...] = blob 157 | 158 | # [Optional] output scores blob 159 | if len(top) > 1: 160 | top[1].reshape(*(scores.shape)) 161 | top[1].data[...] 
= scores 162 | 163 | def backward(self, top, propagate_down, bottom): 164 | """This layer does not propagate gradients.""" 165 | pass 166 | 167 | def reshape(self, bottom, top): 168 | """Reshaping happens during the call to forward.""" 169 | pass 170 | 171 | def _filter_boxes(boxes, min_size): 172 | """Remove all boxes with any side smaller than min_size.""" 173 | ws = boxes[:, 2] - boxes[:, 0] + 1 174 | hs = boxes[:, 3] - boxes[:, 1] + 1 175 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 176 | return keep 177 | -------------------------------------------------------------------------------- /lib/rpn/proposal_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import caffe 9 | import yaml 10 | import numpy as np 11 | import numpy.random as npr 12 | from fast_rcnn.config import cfg 13 | from fast_rcnn.bbox_transform import bbox_transform 14 | from utils.cython_bbox import bbox_overlaps 15 | 16 | DEBUG = False 17 | 18 | class ProposalTargetLayer(caffe.Layer): 19 | """ 20 | Assign object detection proposals to ground-truth targets. Produces proposal 21 | classification labels and bounding-box regression targets. 22 | """ 23 | 24 | def setup(self, bottom, top): 25 | layer_params = yaml.load(self.param_str_) 26 | self._num_classes = layer_params['num_classes'] 27 | 28 | # sampled rois (0, x1, y1, x2, y2) 29 | top[0].reshape(1, 5) 30 | # labels 31 | top[1].reshape(1, 1) 32 | # bbox_targets 33 | top[2].reshape(1, self._num_classes * 4) 34 | # bbox_inside_weights 35 | top[3].reshape(1, self._num_classes * 4) 36 | # bbox_outside_weights 37 | top[4].reshape(1, self._num_classes * 4) 38 | 39 | def forward(self, bottom, top): 40 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 41 | # (i.e., rpn.proposal_layer.ProposalLayer), or any other source 42 | all_rois = bottom[0].data 43 | # GT boxes (x1, y1, x2, y2, label) 44 | # TODO(rbg): it's annoying that sometimes I have extra info before 45 | # and other times after box coordinates -- normalize to one format 46 | gt_boxes = bottom[1].data 47 | 48 | # Include ground-truth boxes in the set of candidate rois 49 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 50 | all_rois = np.vstack( 51 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 52 | ) 53 | 54 | # Sanity check: single batch only 55 | assert np.all(all_rois[:, 0] == 0), \ 56 | 'Only single item batches are supported' 57 | 58 | num_images = 1 59 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 60 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 61 | 62 | # Sample rois with classification labels and bounding box regression 63 | # targets 64 | labels, rois, bbox_targets, bbox_inside_weights = _sample_rois( 65 | all_rois, gt_boxes, fg_rois_per_image, 66 | rois_per_image, self._num_classes) 67 | 68 | if DEBUG: 69 | print 'num fg: {}'.format((labels > 0).sum()) 70 | print 'num bg: {}'.format((labels == 0).sum()) 71 | self._count += 1 72 | self._fg_num += (labels > 0).sum() 73 | self._bg_num += (labels == 0).sum() 74 | print 'num fg avg: {}'.format(self._fg_num / self._count) 75 | print 'num bg avg: {}'.format(self._bg_num / self._count) 76 | print 'ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num)) 
77 | 78 | # sampled rois 79 | top[0].reshape(*rois.shape) 80 | top[0].data[...] = rois 81 | 82 | # classification labels 83 | top[1].reshape(*labels.shape) 84 | top[1].data[...] = labels 85 | 86 | # bbox_targets 87 | top[2].reshape(*bbox_targets.shape) 88 | top[2].data[...] = bbox_targets 89 | 90 | # bbox_inside_weights 91 | top[3].reshape(*bbox_inside_weights.shape) 92 | top[3].data[...] = bbox_inside_weights 93 | 94 | # bbox_outside_weights 95 | top[4].reshape(*bbox_inside_weights.shape) 96 | top[4].data[...] = np.array(bbox_inside_weights > 0).astype(np.float32) 97 | 98 | def backward(self, top, propagate_down, bottom): 99 | """This layer does not propagate gradients.""" 100 | pass 101 | 102 | def reshape(self, bottom, top): 103 | """Reshaping happens during the call to forward.""" 104 | pass 105 | 106 | 107 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 108 | """Bounding-box regression targets (bbox_target_data) are stored in a 109 | compact form N x (class, tx, ty, tw, th) 110 | 111 | This function expands those targets into the 4-of-4*K representation used 112 | by the network (i.e. only one class has non-zero targets). 113 | 114 | Returns: 115 | bbox_target (ndarray): N x 4K blob of regression targets 116 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 117 | """ 118 | 119 | clss = bbox_target_data[:, 0] 120 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 121 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 122 | inds = np.where(clss > 0)[0] 123 | for ind in inds: 124 | cls = clss[ind] 125 | start = 4 * cls 126 | end = start + 4 127 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 128 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 129 | return bbox_targets, bbox_inside_weights 130 | 131 | 132 | def _compute_targets(ex_rois, gt_rois, labels): 133 | """Compute bounding-box regression targets for an image.""" 134 | 135 | assert ex_rois.shape[0] == gt_rois.shape[0] 136 | assert ex_rois.shape[1] == 4 137 | assert gt_rois.shape[1] == 4 138 | 139 | targets = bbox_transform(ex_rois, gt_rois) 140 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 141 | # Optionally normalize targets by a precomputed mean and stdev 142 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) 143 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) 144 | return np.hstack( 145 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) 146 | 147 | def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): 148 | """Generate a random sample of RoIs comprising foreground and background 149 | examples. 
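Note that forward() appended the ground-truth boxes to the candidate RoIs, so any annotated image yields at least one candidate whose max overlap is 1.0 and the foreground set is non-empty whenever ground truth exists.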
150 | """
151 | # overlaps: (rois x gt_boxes)
152 | overlaps = bbox_overlaps(
153 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
154 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
155 | gt_assignment = overlaps.argmax(axis=1)
156 | max_overlaps = overlaps.max(axis=1)
157 | labels = gt_boxes[gt_assignment, 4]
158 | 
159 | # Select foreground RoIs as those with >= FG_THRESH overlap
160 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
161 | # Guard against the case when an image has fewer than fg_rois_per_image
162 | # foreground RoIs
163 | fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size)
164 | # Sample foreground regions without replacement
165 | if fg_inds.size > 0:
166 | fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)
167 | 
168 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
169 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
170 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
171 | # Compute number of background RoIs to take from this image (guarding
172 | # against there being fewer than desired)
173 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
174 | bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
175 | # Sample background regions without replacement
176 | if bg_inds.size > 0:
177 | bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)
178 | 
179 | # The indices that we're selecting (both fg and bg)
180 | keep_inds = np.append(fg_inds, bg_inds)
181 | # Select sampled values from various arrays:
182 | labels = labels[keep_inds]
183 | # Clamp labels for the background RoIs to 0
184 | labels[fg_rois_per_this_image:] = 0
185 | rois = all_rois[keep_inds]
186 | 
187 | bbox_target_data = _compute_targets(
188 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
189 | 
190 | bbox_targets, bbox_inside_weights = \
191 | _get_bbox_regression_labels(bbox_target_data, num_classes)
192 | 
193 | return labels, rois, bbox_targets, bbox_inside_weights
194 | 
--------------------------------------------------------------------------------
/lib/setup.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | import os
10 | from os.path import join as pjoin
11 | #from distutils.core import setup
12 | from setuptools import setup
13 | from distutils.extension import Extension
14 | from Cython.Distutils import build_ext
15 | import subprocess
16 | 
17 | #change for windows, by MrX
18 | nvcc_bin = 'nvcc.exe'
19 | lib_dir = 'lib/x64'
20 | 
21 | def find_in_path(name, path):
22 | "Find a file in a search path"
23 | # Adapted from
24 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
25 | for dir in path.split(os.pathsep):
26 | binpath = pjoin(dir, name)
27 | if os.path.exists(binpath):
28 | return os.path.abspath(binpath)
29 | return None
30 | 
31 | 
32 | def locate_cuda():
33 | """Locate the CUDA environment on the system
34 | 
35 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
36 | and values giving the absolute path to each directory.
37 | 
38 | Starts by looking for the CUDA_PATH env variable. If not found, everything
39 | is based on finding 'nvcc' in the PATH.
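A typical return value on Windows looks like this (hypothetical install
location; 'lib64' holds the Windows lib\x64 directory despite its name):

{'home': 'C:\\CUDA\\v7.5',
'nvcc': 'C:\\CUDA\\v7.5\\bin\\nvcc.exe',
'include': 'C:\\CUDA\\v7.5\\include',
'lib64': 'C:\\CUDA\\v7.5\\lib/x64'}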
40 | """
41 | 
42 | # first check if the CUDA_PATH env variable is in use
43 | if 'CUDA_PATH' in os.environ:
44 | home = os.environ['CUDA_PATH']
45 | print("home = %s\n" % home)
46 | nvcc = pjoin(home, 'bin', nvcc_bin)
47 | else:
48 | # otherwise, search the PATH for NVCC
49 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin')
50 | nvcc = find_in_path(nvcc_bin, os.environ['PATH'] + os.pathsep + default_path)
51 | if nvcc is None:
52 | raise EnvironmentError('The nvcc binary could not be '
53 | 'located in your $PATH. Either add it to your path, or set $CUDA_PATH')
54 | home = os.path.dirname(os.path.dirname(nvcc))
55 | print("home = %s, nvcc = %s\n" % (home, nvcc))
56 | 
57 | 
58 | cudaconfig = {'home':home, 'nvcc':nvcc,
59 | 'include': pjoin(home, 'include'),
60 | 'lib64': pjoin(home, lib_dir)}
61 | for k, v in cudaconfig.iteritems():
62 | if not os.path.exists(v):
63 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
64 | 
65 | return cudaconfig
66 | CUDA = locate_cuda()
67 | 
68 | 
69 | # Obtain the numpy include directory. This logic works across numpy versions.
70 | try:
71 | numpy_include = np.get_include()
72 | except AttributeError:
73 | numpy_include = np.get_numpy_include()
74 | 
75 | 
76 | def customize_compiler_for_nvcc(self):
77 | """inject deep into distutils to customize how the dispatch
78 | to gcc/nvcc works.
79 | 
80 | If you subclass UnixCCompiler, it's not trivial to get your subclass
81 | injected in, and still have the right customizations (i.e.
82 | distutils.sysconfig.customize_compiler) run on it. So instead of going
83 | the OO route, I have this. Note, it's kind of like a weird functional
84 | subclassing going on."""
85 | 
86 | # tell the compiler it can process .cu
87 | #self.src_extensions.append('.cu')
88 | 
89 | 
90 | # save references to the default compiler_so and _compile methods
91 | #default_compiler_so = self.spawn
92 | #default_compiler_so = self.rc
93 | super = self.compile
94 | 
95 | # now redefine the _compile method. This gets executed for each
96 | # object but distutils doesn't have the ability to change compilers
97 | # based on source extension: we add it.
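# The redefinition below assumes extra_compile_args was passed as a dict
# keyed by compiler instead of the usual flat list, e.g. (hypothetical
# flags, mirroring the Extension entries further down in this file):
#
#   Extension('nms.gpu_nms',
#             sources=['nms\\gpu_nms.pyx', 'nms\\nms_kernel.cu'],
#             extra_compile_args={'gcc': [], 'nvcc': ['-arch=sm_35']})
#
# compile() then picks the 'nvcc' flag list for .cu sources and the 'gcc'
# list for everything else; distutils itself never inspects the dict.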
98 | def compile(sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None): 99 | postfix=os.path.splitext(sources[0])[1] 100 | 101 | if postfix == '.cu': 102 | # use the cuda for .cu files 103 | #self.set_executable('compiler_so', CUDA['nvcc']) 104 | # use only a subset of the extra_postargs, which are 1-1 translated 105 | # from the extra_compile_args in the Extension class 106 | postargs = extra_postargs['nvcc'] 107 | else: 108 | postargs = extra_postargs['gcc'] 109 | 110 | 111 | return super(sources, output_dir, macros, include_dirs, debug, extra_preargs, postargs, depends) 112 | # reset the default compiler_so, which we might have changed for cuda 113 | #self.rc = default_compiler_so 114 | 115 | # inject our redefined _compile method into the class 116 | self.compile = compile 117 | 118 | 119 | # run the customize_compiler 120 | class custom_build_ext(build_ext): 121 | def build_extensions(self): 122 | customize_compiler_for_nvcc(self.compiler) 123 | build_ext.build_extensions(self) 124 | 125 | 126 | ext_modules = [ 127 | # unix _compile: obj, src, ext, cc_args, extra_postargs, pp_opts 128 | Extension( 129 | "utils.cython_bbox", 130 | sources=["utils\\bbox.pyx"], 131 | #define_macros={'/LD'}, 132 | #extra_compile_args={'gcc': ['/link', '/DLL', '/OUT:cython_bbox.dll']}, 133 | #extra_compile_args={'gcc': ['/LD']}, 134 | extra_compile_args={'gcc': []}, 135 | include_dirs = [numpy_include] 136 | ), 137 | Extension( 138 | "nms.cpu_nms", 139 | sources=["nms\\cpu_nms.pyx"], 140 | extra_compile_args={'gcc': []}, 141 | include_dirs = [numpy_include], 142 | ), 143 | Extension( 144 | "pycocotools._mask", 145 | sources=['pycocotools\\maskApi.c', 'pycocotools\\_mask.pyx'], 146 | include_dirs = [numpy_include, 'pycocotools'], 147 | extra_compile_args={ 148 | 'gcc': ['/Qstd=c99']}, 149 | ), 150 | #Extension( # just used to get nms\gpu_nms.obj 151 | # "nms.gpu_nms", 152 | # sources=['nms\\gpu_nms.pyx'], 153 | # language='c++', 154 | # extra_compile_args={'gcc': []}, 155 | # include_dirs = [numpy_include] 156 | #), 157 | ] 158 | 159 | setup( 160 | name='fast_rcnn', 161 | ext_modules=ext_modules, 162 | # inject our custom trigger 163 | cmdclass={'build_ext': custom_build_ext}, 164 | ) 165 | -------------------------------------------------------------------------------- /lib/setup_cuda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import os 5 | # on Windows, we need the original PATH without Anaconda's compiler in it: 6 | PATH = os.environ.get('PATH') 7 | from distutils.spawn import spawn, find_executable 8 | from setuptools import setup, find_packages, Extension 9 | from setuptools.command.build_ext import build_ext 10 | import sys 11 | 12 | # CUDA specific config 13 | # nvcc is assumed to be in user's PATH 14 | nvcc_compile_args = ['-O', '--ptxas-options=-v', '-arch=sm_35', '-c', '--compiler-options=-fPIC'] 15 | nvcc_compile_args = os.environ.get('NVCCFLAGS', '').split() + nvcc_compile_args 16 | cuda_libs = ['cublas'] 17 | 18 | 19 | # Obtain the numpy include directory. This logic works across numpy versions. 
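# (np.get_numpy_include() is the long-deprecated pre-NumPy-1.0 spelling;
# modern releases only provide np.get_include(), which is tried first below.)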
20 | try: 21 | numpy_include = np.get_include() 22 | except AttributeError: 23 | numpy_include = np.get_numpy_include() 24 | 25 | 26 | cudamat_ext = Extension('nms.gpu_nms', 27 | sources=[ 28 | 'nms\\gpu_nms.cu' 29 | ], 30 | language='c++', 31 | libraries=cuda_libs, 32 | extra_compile_args=nvcc_compile_args, 33 | include_dirs = [numpy_include, 'C:\\Programming\\CUDA\\v7.5\\include']) 34 | 35 | 36 | class CUDA_build_ext(build_ext): 37 | """ 38 | Custom build_ext command that compiles CUDA files. 39 | Note that all extension source files will be processed with this compiler. 40 | """ 41 | def build_extensions(self): 42 | self.compiler.src_extensions.append('.cu') 43 | self.compiler.set_executable('compiler_so', 'nvcc') 44 | self.compiler.set_executable('linker_so', 'nvcc --shared') 45 | if hasattr(self.compiler, '_c_extensions'): 46 | self.compiler._c_extensions.append('.cu') # needed for Windows 47 | self.compiler.spawn = self.spawn 48 | build_ext.build_extensions(self) 49 | 50 | def spawn(self, cmd, search_path=1, verbose=0, dry_run=0): 51 | """ 52 | Perform any CUDA specific customizations before actually launching 53 | compile/link etc. commands. 54 | """ 55 | if (sys.platform == 'darwin' and len(cmd) >= 2 and cmd[0] == 'nvcc' and 56 | cmd[1] == '--shared' and cmd.count('-arch') > 0): 57 | # Versions of distutils on OSX earlier than 2.7.9 inject 58 | # '-arch x86_64' which we need to strip while using nvcc for 59 | # linking 60 | while True: 61 | try: 62 | index = cmd.index('-arch') 63 | del cmd[index:index+2] 64 | except ValueError: 65 | break 66 | elif self.compiler.compiler_type == 'msvc': 67 | # There are several things we need to do to change the commands 68 | # issued by MSVCCompiler into one that works with nvcc. In the end, 69 | # it might have been easier to write our own CCompiler class for 70 | # nvcc, as we're only interested in creating a shared library to 71 | # load with ctypes, not in creating an importable Python extension. 72 | # - First, we replace the cl.exe or link.exe call with an nvcc 73 | # call. In case we're running Anaconda, we search cl.exe in the 74 | # original search path we captured further above -- Anaconda 75 | # inserts a MSVC version into PATH that is too old for nvcc. 76 | cmd[:1] = ['nvcc', '--compiler-bindir', 77 | os.path.dirname(find_executable("cl.exe", PATH)) 78 | or cmd[0]] 79 | # - Secondly, we fix a bunch of command line arguments. 80 | for idx, c in enumerate(cmd): 81 | # create .dll instead of .pyd files 82 | #if '.pyd' in c: cmd[idx] = c = c.replace('.pyd', '.dll') #20160601, by MrX 83 | # replace /c by -c 84 | if c == '/c': cmd[idx] = '-c' 85 | # replace /DLL by --shared 86 | elif c == '/DLL': cmd[idx] = '--shared' 87 | # remove --compiler-options=-fPIC 88 | elif '-fPIC' in c: del cmd[idx] 89 | # replace /Tc... by ... 90 | elif c.startswith('/Tc'): cmd[idx] = c[3:] 91 | # replace /Fo... by -o ... 92 | elif c.startswith('/Fo'): cmd[idx:idx+1] = ['-o', c[3:]] 93 | # replace /LIBPATH:... by -L... 94 | elif c.startswith('/LIBPATH:'): cmd[idx] = '-L' + c[9:] 95 | # replace /OUT:... by -o ... 
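# As an end-to-end illustration of the rewrites in this loop (hypothetical
# file names), an MSVC link line such as
#   link.exe /DLL /OUT:nms\gpu_nms.pyd gpu_nms.obj cublas.lib
# ends up re-issued as the equivalent nvcc invocation
#   nvcc --shared -o nms\gpu_nms.pyd gpu_nms.obj -lcublas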
96 | elif c.startswith('/OUT:'): cmd[idx:idx+1] = ['-o', c[5:]]
97 | # remove /EXPORT:initlibcudamat or /EXPORT:initlibcudalearn
98 | elif c.startswith('/EXPORT:'): del cmd[idx]
99 | # replace cublas.lib by -lcublas
100 | elif c == 'cublas.lib': cmd[idx] = '-lcublas'
101 | # - Finally, we pass on all arguments starting with a '/' to the
102 | # compiler or linker, and have nvcc handle all other arguments
103 | if '--shared' in cmd:
104 | pass_on = '--linker-options='
105 | # we only need MSVCRT for a .dll, remove CMT if it sneaks in:
106 | cmd.append('/NODEFAULTLIB:libcmt.lib')
107 | else:
108 | pass_on = '--compiler-options='
109 | cmd = ([c for c in cmd if c[0] != '/'] +
110 | [pass_on + ','.join(c for c in cmd if c[0] == '/')])
111 | # For the future: Apart from the wrongly set PATH by Anaconda, it
112 | # would suffice to run the following for compilation on Windows:
113 | # nvcc -c -O -o <file>.obj <file>.cu
114 | # And the following for linking:
115 | # nvcc --shared -o <file>.dll <file>.obj <file>.obj -lcublas
116 | # This could be done by a NVCCCompiler class for all platforms.
117 | spawn(cmd, search_path, verbose, dry_run)
118 | 
119 | setup(name="py_fast_rcnn_gpu",
120 | description="Builds the CUDA GPU NMS extension (nms.gpu_nms) for Faster R-CNN",
121 | ext_modules=[cudamat_ext],
122 | cmdclass={'build_ext': CUDA_build_ext},
123 | )
124 | 
--------------------------------------------------------------------------------
/lib/transform/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MrGF/py-faster-rcnn-windows/12e11924217e79fd7124d05a18baa49b9908340d/lib/transform/__init__.py
--------------------------------------------------------------------------------
/lib/transform/torch_image_transform_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast/er R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # --------------------------------------------------------
5 | 
6 | """ Transform images for compatibility with models trained with
7 | https://github.com/facebook/fb.resnet.torch.
8 | 
9 | Usage in model prototxt:
10 | 
11 | layer {
12 | name: 'data_xform'
13 | type: 'Python'
14 | bottom: 'data_caffe'
15 | top: 'data'
16 | python_param {
17 | module: 'transform.torch_image_transform_layer'
18 | layer: 'TorchImageTransformLayer'
19 | }
20 | }
21 | """
22 | 
23 | import caffe
24 | from fast_rcnn.config import cfg
25 | import numpy as np
26 | 
27 | class TorchImageTransformLayer(caffe.Layer):
28 | def setup(self, bottom, top):
29 | # (1, 3, 1, 1) shaped arrays
30 | self.PIXEL_MEANS = \
31 | np.array([[[[0.48462227599918]],
32 | [[0.45624044862054]],
33 | [[0.40588363755159]]]])
34 | self.PIXEL_STDS = \
35 | np.array([[[[0.22889466674951]],
36 | [[0.22446679341259]],
37 | [[0.22495548344775]]]])
38 | # The default ("old") pixel means that were already subtracted
39 | channel_swap = (0, 3, 1, 2)
40 | self.OLD_PIXEL_MEANS = \
41 | cfg.PIXEL_MEANS[np.newaxis, :, :, :].transpose(channel_swap)
42 | 
43 | top[0].reshape(*(bottom[0].shape))
44 | 
45 | def forward(self, bottom, top):
46 | ims = bottom[0].data
47 | # Invert the channel means that were already subtracted
48 | ims += self.OLD_PIXEL_MEANS
49 | # 1. Permute BGR to RGB and normalize to [0, 1]
50 | ims = ims[:, [2, 1, 0], :, :] / 255.0
51 | # 2. Remove channel means
52 | ims -= self.PIXEL_MEANS
53 | # 3.
Standardize channels 54 | ims /= self.PIXEL_STDS 55 | top[0].reshape(*(ims.shape)) 56 | top[0].data[...] = ims 57 | 58 | def backward(self, top, propagate_down, bottom): 59 | """This layer does not propagate gradients.""" 60 | pass 61 | 62 | def reshape(self, bottom, top): 63 | """Reshaping happens during the call to forward.""" 64 | pass 65 | -------------------------------------------------------------------------------- /lib/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.so 3 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | import cv2 12 | 13 | def im_list_to_blob(ims): 14 | """Convert a list of images into a network input. 15 | 16 | Assumes images are already prepared (means subtracted, BGR order, ...). 
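For example (illustrative shapes): two prepared images of shapes
(480, 640, 3) and (600, 500, 3) yield a (2, 3, 600, 640) blob; each image
is copied into the top-left corner of its slice and the unused border
stays zero.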
17 | """
18 | max_shape = np.array([im.shape for im in ims]).max(axis=0)
19 | num_images = len(ims)
20 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
21 | dtype=np.float32)
22 | for i in xrange(num_images):
23 | im = ims[i]
24 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
25 | # Move channels (axis 3) to axis 1
26 | # Axis order will become: (batch elem, channel, height, width)
27 | channel_swap = (0, 3, 1, 2)
28 | blob = blob.transpose(channel_swap)
29 | return blob
30 | 
31 | def prep_im_for_blob(im, pixel_means, target_size, max_size):
32 | """Mean subtract and scale an image for use in a blob."""
33 | im = im.astype(np.float32, copy=False)
34 | im -= pixel_means
35 | im_shape = im.shape
36 | im_size_min = np.min(im_shape[0:2])
37 | im_size_max = np.max(im_shape[0:2])
38 | im_scale = float(target_size) / float(im_size_min)
39 | # Prevent the biggest axis from being more than MAX_SIZE
40 | if np.round(im_scale * im_size_max) > max_size:
41 | im_scale = float(max_size) / float(im_size_max)
42 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
43 | interpolation=cv2.INTER_LINEAR)
44 | 
45 | return im, im_scale
46 | 
--------------------------------------------------------------------------------
/lib/utils/timer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | 
8 | import time
9 | 
10 | class Timer(object):
11 | """A simple timer."""
12 | def __init__(self):
13 | self.total_time = 0.
14 | self.calls = 0
15 | self.start_time = 0.
16 | self.diff = 0.
17 | self.average_time = 0.
18 | 
19 | def tic(self):
20 | # using time.time instead of time.clock because time.clock
21 | # does not normalize for multithreading
22 | self.start_time = time.time()
23 | 
24 | def toc(self, average=True):
25 | self.diff = time.time() - self.start_time
26 | self.total_time += self.diff
27 | self.calls += 1
28 | self.average_time = self.total_time / self.calls
29 | if average:
30 | return self.average_time
31 | else:
32 | return self.diff
33 | 
--------------------------------------------------------------------------------
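For reference, a minimal sketch of how the Timer above is typically driven
from a training or detection loop (the loop body is a hypothetical stand-in;
Timer itself is exactly the class defined in lib/utils/timer.py):

import time
from utils.timer import Timer

t = Timer()
for _ in range(5):
    t.tic()
    time.sleep(0.01)  # stand-in for one forward pass
    avg = t.toc()     # toc() returns the running average by default
print 'average over {} calls: {:.4f}s'.format(t.calls, avg)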