├── LICENSE ├── README.md ├── __init__.py ├── demo.py ├── demo ├── 004545.jpg └── out.jpg ├── experiments └── cfgs │ └── faster_rcnn_end2end.yml ├── faster_rcnn ├── __init__.py ├── datasets │ ├── __init__.py │ ├── coco.py │ ├── ds_utils.py │ ├── factory.py │ ├── imagenet3d.py │ ├── imdb.py │ ├── imdb2.py │ ├── kitti.py │ ├── kitti_tracking.py │ ├── kittivoc.py │ ├── nissan.py │ ├── nthu.py │ ├── pascal3d.py │ ├── pascal_voc.py │ ├── pascal_voc2.py │ └── voc_eval.py ├── fast_rcnn │ ├── __init__.py │ ├── bbox_transform.py │ ├── config.py │ ├── config2.py │ └── nms_wrapper.py ├── faster_rcnn.py ├── make.sh ├── network.py ├── nms │ ├── .gitignore │ ├── __init__.py │ ├── cpu_nms.pyx │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── py_cpu_nms.py ├── psroi_pooling │ ├── __init__.py │ ├── __init__.pyc │ ├── _ext │ │ ├── __init__.py │ │ └── psroi_pooling │ │ │ └── __init__.py │ ├── build.py │ ├── functions │ │ ├── __init__.py │ │ └── psroi_pooling.py │ ├── modules │ │ ├── __init__.py │ │ └── psroi_pool.py │ └── src │ │ ├── cuda │ │ ├── psroi_pooling_kernel.cu │ │ └── psroi_pooling_kernel.h │ │ ├── psroi_pooling_cuda.c │ │ └── psroi_pooling_cuda.h ├── psroippooling_layer.py ├── pycocotools │ ├── UPSTREAM_REV │ ├── __init__.py │ ├── __init__.pyc │ ├── _mask.c │ ├── _mask.pyx │ ├── _mask.so │ ├── coco.py │ ├── coco.pyc │ ├── cocoeval.py │ ├── cocoeval.pyc │ ├── license.txt │ ├── mask.py │ ├── mask.pyc │ ├── maskApi.c │ └── maskApi.h ├── resnet.py ├── roi_data_layer │ ├── __init__.py │ ├── layer.py │ ├── minibatch.py │ ├── minibatch2.py │ ├── roidb.py │ └── roidb2.py ├── roi_pooling │ ├── __init__.py │ ├── _ext │ │ ├── __init__.py │ │ └── roi_pooling │ │ │ └── __init__.py │ ├── build.py │ ├── functions │ │ ├── __init__.py │ │ └── roi_pool.py │ ├── modules │ │ ├── __init__.py │ │ ├── roi_pool.py │ │ └── roi_pool_py.py │ └── src │ │ ├── cuda │ │ ├── roi_pooling_kernel.cu │ │ └── roi_pooling_kernel.h │ │ ├── roi_pooling.c │ │ ├── roi_pooling.h │ │ ├── 
roi_pooling_cuda.c │ │ └── roi_pooling_cuda.h ├── rpn_msr │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── generate.py │ ├── generate_anchors.py │ ├── proposal_layer.py │ └── proposal_target_layer.py ├── setup.py └── utils │ ├── .gitignore │ ├── __init__.py │ ├── bbox.pyx │ ├── blob.py │ ├── boxes_grid.py │ ├── nms.py │ ├── nms.pyx │ └── timer.py ├── test.py ├── train.py └── train_log_20180215_20.out /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RFCN with PyTorch 2 | **Note:** This project is pytorch implementation of [RFCN](https://arxiv.org/abs/1605.06409), Resnet101_without_dilation. 
3 | This project is mainly based on [faster_rcnn_pytorch](https://github.com/longcw/faster_rcnn_pytorch), while the psroi_pooling module 4 | is copied from another PyTorch version of RFCN, [pytorch_RFCN](https://github.com/PureDiors/pytorch_RFCN) 5 | 6 | **Difference** Since dilation isn't used in ResNet, the space_scale is 1/32.0 in psroi_pooling, 7 | not 1/16.0 as in the original paper. As a result, I set SCALES=800 and MAX_SIZE=1200. 8 | 9 | ### Installation and demo 10 | 0. Install the requirements (you can use pip or [Anaconda](https://www.continuum.io/downloads)): 11 | 12 | ``` 13 | conda install pip pyyaml sympy h5py cython numpy scipy 14 | conda install -c menpo opencv3 15 | pip install easydict 16 | ``` 17 | 18 | 19 | 1. Clone the Faster RFCN repository 20 | ```bash 21 | git clone https://github.com/xingmimfl/pytorch_RFCN.git 22 | ``` 23 | 24 | 2. Build the Cython modules for nms and the roi_pooling layer 25 | ```bash 26 | cd pytorch_RFCN/faster_rcnn 27 | ./make.sh 28 | ``` 29 | 30 | ### Training on Pascal VOC 2007 31 | 32 | This project uses a ResNet-101 model converted from Caffe; you can get it by following [RuotianLuo-pytorch-ResNet](https://github.com/ruotianluo/pytorch-resnet). 33 | 34 | Since the program loads the data from `pytorch_RFCN/data` by default, 35 | you can set the data path as follows. 36 | ```bash 37 | cd pytorch_RFCN 38 | mkdir data 39 | cd data 40 | ln -s $VOCdevkit VOCdevkit2007 41 | ``` 42 | Then you can set some hyper-parameters in `train.py` and training parameters in the `.yml` file. 43 | 44 | ### Evaluation 45 | Set the path of the trained model in `test.py`. 
def test():
    """Run the detector on the bundled demo image, then save and show the result.

    Reads 'demo/004545.jpg', restores the weights stored in
    'models/saved_model3/faster_rcnn_200000.pth', moves the model to the GPU,
    runs ``detector.detect(image, 0.7)`` (0.7 is presumably the score
    threshold -- confirm against ``FasterRCNN.detect``), draws every returned
    box with its class and score, writes 'demo/out.jpg' and finally opens a
    preview window that blocks until a key is pressed.

    Raises:
        IOError: if the demo image cannot be read.
    """
    import os

    im_file = 'demo/004545.jpg'
    image = cv2.imread(im_file)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with a clear message rather than failing later inside the detector.
    if image is None:
        raise IOError('Unable to read image: {}'.format(im_file))

    model_file = 'models/saved_model3/faster_rcnn_200000.pth'
    detector = FasterRCNN()
    detector.load_state_dict(torch.load(model_file))
    detector.cuda()
    detector.eval()
    print('load model successfully!')

    t = Timer()
    t.tic()
    dets, scores, classes = detector.detect(image, 0.7)
    runtime = t.toc()
    print('total spend: {}s'.format(runtime))

    # Draw on a copy so the input image stays untouched.
    im2show = np.copy(image)
    for i, det in enumerate(dets):
        # OpenCV drawing calls need plain integer pixel coordinates.
        det = tuple(int(x) for x in det)
        cv2.rectangle(im2show, det[0:2], det[2:4], (255, 205, 51), 2)
        cv2.putText(im2show, '%s: %.3f' % (classes[i], scores[i]),
                    (det[0], det[1] + 15), cv2.FONT_HERSHEY_PLAIN,
                    1.0, (0, 0, 255), thickness=1)
    cv2.imwrite(os.path.join('demo', 'out.jpg'), im2show)
    cv2.imshow('demo', im2show)
    cv2.waitKey(0)
# http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
def _which(program):
    """Locate an executable, returning its path or None when it is not found.

    If ``program`` contains a directory component it is checked as given and
    PATH is not consulted. Otherwise each entry of the PATH environment
    variable (with surrounding double quotes stripped) is tried in order and
    the first executable regular file wins.
    """
    import os

    def _runnable(candidate):
        # A hit must be a regular file AND carry the execute permission.
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    directory, _ = os.path.split(program)
    if directory:
        return program if _runnable(program) else None

    for entry in os.environ["PATH"].split(os.pathsep):
        candidate = os.path.join(entry.strip('"'), program)
        if _runnable(candidate):
            return candidate

    return None
" 46 | "Please add '{}' to your PATH.").format(MATLAB, MATLAB) 47 | raise EnvironmentError(msg) 48 | """ 49 | -------------------------------------------------------------------------------- /faster_rcnn/datasets/ds_utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick 5 | # -------------------------------------------------------- 6 | 7 | import numpy as np 8 | 9 | def unique_boxes(boxes, scale=1.0): 10 | """Return indices of unique boxes.""" 11 | v = np.array([1, 1e3, 1e6, 1e9]) 12 | hashes = np.round(boxes * scale).dot(v) 13 | _, index = np.unique(hashes, return_index=True) 14 | return np.sort(index) 15 | 16 | def xywh_to_xyxy(boxes): 17 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 18 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 19 | 20 | def xyxy_to_xywh(boxes): 21 | """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" 22 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) 23 | 24 | def validate_boxes(boxes, width=0, height=0): 25 | """Check that a set of boxes are valid.""" 26 | x1 = boxes[:, 0] 27 | y1 = boxes[:, 1] 28 | x2 = boxes[:, 2] 29 | y2 = boxes[:, 3] 30 | assert (x1 >= 0).all() 31 | assert (y1 >= 0).all() 32 | assert (x2 >= x1).all() 33 | assert (y2 >= y1).all() 34 | assert (x2 < width).all() 35 | assert (y2 < height).all() 36 | 37 | def filter_small_boxes(boxes, min_size): 38 | w = boxes[:, 2] - boxes[:, 0] 39 | h = boxes[:, 3] - boxes[:, 1] 40 | keep = np.where((w >= min_size) & (h > min_size))[0] 41 | return keep 42 | -------------------------------------------------------------------------------- /faster_rcnn/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting imdbs by name.""" 9 | 10 | __sets = {} 11 | 12 | import numpy as np 13 | 14 | from .pascal_voc import pascal_voc 15 | from .imagenet3d import imagenet3d 16 | from .kitti import kitti 17 | from .kitti_tracking import kitti_tracking 18 | from .nthu import nthu 19 | from .coco import coco 20 | from .kittivoc import kittivoc 21 | 22 | 23 | def _selective_search_IJCV_top_k(split, year, top_k): 24 | """Return an imdb that uses the top k proposals from the selective search 25 | IJCV code. 26 | """ 27 | imdb = pascal_voc(split, year) 28 | imdb.roidb_handler = imdb.selective_search_IJCV_roidb 29 | imdb.config['top_k'] = top_k 30 | return imdb 31 | 32 | 33 | # Set up voc__ using selective search "fast" mode 34 | for year in ['2007', '2012', '0712']: 35 | for split in ['train', 'val', 'trainval', 'test']: 36 | name = 'voc_{}_{}'.format(year, split) 37 | __sets[name] = (lambda split=split, year=year: 38 | pascal_voc(split, year)) 39 | 40 | 41 | # Set up kittivoc 42 | for split in ['train', 'val', 'trainval', 'test']: 43 | name = 'kittivoc_{}'.format(split) 44 | # print name 45 | __sets[name] = (lambda split=split: kittivoc(split)) 46 | 47 | # # KITTI dataset 48 | for split in ['train', 'val', 'trainval', 'test']: 49 | name = 'kitti_{}'.format(split) 50 | # print name 51 | __sets[name] = (lambda split=split: kitti(split)) 52 | 53 | # Set up coco_2014_ 54 | for year in ['2014']: 55 | for split in ['train', 'val', 'minival', 'valminusminival']: 56 | name = 'coco_{}_{}'.format(year, split) 57 | __sets[name] = (lambda split=split, year=year: coco(split, year)) 58 | 59 | # Set up coco_2015_ 60 | for year in ['2015']: 61 | for split in ['test', 'test-dev']: 62 | name = 
def get_imdb(name):
    """Get an imdb (image database) by name.

    Looks ``name`` up in the module-level ``__sets`` registry and calls the
    stored factory to build a fresh dataset object.

    Raises:
        KeyError: if no dataset was registered under ``name``.
    """
    # `dict.has_key` no longer exists in Python 3; the `in` operator behaves
    # the same on Python 2 and 3.
    if name not in __sets:
        raise KeyError('Unknown dataset: {}'.format(name))
    return __sets[name]()


def list_imdbs():
    """List all registered imdbs (returns a list of names)."""
    # Materialise the keys so callers get a real list on Python 3 as well,
    # matching what `dict.keys()` returned on Python 2.
    return list(__sets)
mapping for subcalss to class 39 | filename = os.path.join(self._nissan_path, 'mapping.txt') 40 | assert os.path.exists(filename), 'Path does not exist: {}'.format(filename) 41 | 42 | mapping = np.zeros(self._num_subclasses, dtype=np.int) 43 | with open(filename) as f: 44 | for line in f: 45 | words = line.split() 46 | subcls = int(words[0]) 47 | mapping[subcls] = self._class_to_ind[words[1]] 48 | self._subclass_mapping = mapping 49 | 50 | self.config = {'top_k': 100000} 51 | 52 | # statistics for computing recall 53 | self._num_boxes_all = np.zeros(self.num_classes, dtype=np.int) 54 | self._num_boxes_covered = np.zeros(self.num_classes, dtype=np.int) 55 | self._num_boxes_proposal = 0 56 | 57 | assert os.path.exists(self._nissan_path), \ 58 | 'Nissan path does not exist: {}'.format(self._nissan_path) 59 | assert os.path.exists(self._data_path), \ 60 | 'Path does not exist: {}'.format(self._data_path) 61 | 62 | def image_path_at(self, i): 63 | """ 64 | Return the absolute path to image i in the image sequence. 65 | """ 66 | return self.image_path_from_index(self.image_index[i]) 67 | 68 | def image_path_from_index(self, index): 69 | """ 70 | Construct an image path from the image's "index" identifier. 71 | """ 72 | # set the prefix 73 | prefix = self._image_set 74 | 75 | image_path = os.path.join(self._data_path, prefix, index + self._image_ext) 76 | assert os.path.exists(image_path), \ 77 | 'Path does not exist: {}'.format(image_path) 78 | return image_path 79 | 80 | def _load_image_set_index(self): 81 | """ 82 | Load the indexes listed in this dataset's image set file. 
83 | """ 84 | image_set_file = os.path.join(self._data_path, self._image_set + '.txt') 85 | assert os.path.exists(image_set_file), \ 86 | 'Path does not exist: {}'.format(image_set_file) 87 | 88 | with open(image_set_file) as f: 89 | image_index = [x.rstrip('\n') for x in f.readlines()] 90 | return image_index 91 | 92 | def _get_default_path(self): 93 | """ 94 | Return the default path where NISSAN is expected to be installed. 95 | """ 96 | return os.path.join(ROOT_DIR, 'data', 'NISSAN') 97 | 98 | 99 | def gt_roidb(self): 100 | """ 101 | Return the database of ground-truth regions of interest. 102 | No implementation. 103 | """ 104 | 105 | gt_roidb = [] 106 | return gt_roidb 107 | 108 | def region_proposal_roidb(self): 109 | """ 110 | Return the database of regions of interest. 111 | Ground-truth ROIs are also included. 112 | 113 | This function loads/saves from/to a cache file to speed up future calls. 114 | """ 115 | cache_file = os.path.join(self.cache_path, 116 | self.name + '_' + cfg.REGION_PROPOSAL + '_region_proposal_roidb.pkl') 117 | 118 | if os.path.exists(cache_file): 119 | with open(cache_file, 'rb') as fid: 120 | roidb = cPickle.load(fid) 121 | print '{} roidb loaded from {}'.format(self.name, cache_file) 122 | return roidb 123 | 124 | print 'Loading region proposal network boxes...' 
def _load_rpn_roidb(self, gt_roidb, model):
    """Read the per-image proposal files of ``model`` and build a roidb.

    For each entry of ``self.image_index`` the file
    ``<nissan_path>/region_proposals/<model>/<image_set>/<index>.txt`` is
    parsed as rows of ``x1 y1 x2 y2 score``. Rows with ``x2 <= x1`` or
    ``y2 <= y1`` are discarded, the four coordinates are kept, and
    ``self._num_boxes_proposal`` grows by the number of kept rows. The
    collected boxes are handed to ``self.create_roidb_from_box_list``
    together with ``gt_roidb``.
    """
    proposal_dir = os.path.join(self._nissan_path, 'region_proposals', model, self._image_set)

    per_image_boxes = []
    for image_id in self.image_index:
        proposal_file = os.path.join(proposal_dir, image_id + '.txt')
        assert os.path.exists(proposal_file), \
                'RPN data not found at: {}'.format(proposal_file)

        table = np.loadtxt(proposal_file, dtype=float)
        if table.ndim == 1:
            # loadtxt flattens empty and single-row files; restore 2-D shape.
            row_count = 0 if table.size == 0 else 1
            table = table.reshape((row_count, 5))

        # The score column is still read so that files lacking it fail here,
        # but only the coordinates are used.
        left, top, right, bottom, _score = (table[:, col] for col in range(5))
        non_degenerate = (right > left) & (bottom > top)
        kept = table[non_degenerate, :4]

        self._num_boxes_proposal += kept.shape[0]
        per_image_boxes.append(kept)

    return self.create_roidb_from_box_list(per_image_boxes, gt_roidb)
# write detection results into one file
def evaluate_detections_one_file(self, all_boxes, output_dir):
    """Write every detection of every image to ``<output_dir>/detections.txt``.

    ``all_boxes[cls_ind][im_ind]`` is read as a sequence of rows
    ``x1 y1 x2 y2 score subclass``; an empty entry means "no detections".
    The background class is skipped. Each output line is
    ``<index> <class> <x1> <y1> <x2> <y2> <subclass> <score>``.

    Raises:
        AssertionError: if ``self.subclass_mapping`` maps a detection's
            subclass to a class other than the one it is stored under.
    """
    # open results file
    filename = os.path.join(output_dir, 'detections.txt')
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print('Writing all NISSAN results to file ' + filename)
    with open(filename, 'wt') as f:
        # for each image
        for im_ind, index in enumerate(self.image_index):
            # for each class
            for cls_ind, cls in enumerate(self.classes):
                if cls == '__background__':
                    continue
                dets = all_boxes[cls_ind][im_ind]
                # `dets == []` is not a reliable emptiness test for numpy
                # arrays (ambiguous or an error depending on the NumPy
                # version); a length check covers lists and arrays.
                if len(dets) == 0:
                    continue
                for det in dets:
                    subcls = int(det[5])
                    cls_name = self.classes[self.subclass_mapping[subcls]]
                    assert (cls_name == cls), 'subclass not in class'
                    f.write('{:s} {:s} {:f} {:f} {:f} {:f} {:d} {:f}\n'.format(
                        index, cls, det[0], det[1], det[2], det[3], subcls, det[4]))
def evaluate_proposals_msr(self, all_boxes, output_dir):
    """Write the proposals of each image to ``<output_dir>/<index>.txt``.

    ``all_boxes[im_ind]`` is read as a sequence of rows ``x1 y1 x2 y2 score``.
    One line per row is written, with the score printed to 32 decimals. The
    output file is created even when an image has no proposals, in which case
    it stays empty.
    """
    # for each image
    for im_ind, index in enumerate(self.image_index):
        filename = os.path.join(output_dir, index + '.txt')
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print('Writing NISSAN results to file ' + filename)
        with open(filename, 'wt') as f:
            dets = all_boxes[im_ind]
            # `dets == []` is not a reliable emptiness test for numpy arrays
            # (ambiguous or an error depending on the NumPy version); a
            # length check covers lists and arrays.
            if len(dets) == 0:
                continue
            for det in dets:
                f.write('{:f} {:f} {:f} {:f} {:.32f}\n'.format(
                    det[0], det[1], det[2], det[3], det[4]))
self._image_ext = '.jpg' 28 | self._image_index = self._load_image_set_index() 29 | # Default to roidb handler 30 | if cfg.IS_RPN: 31 | self._roidb_handler = self.gt_roidb 32 | else: 33 | self._roidb_handler = self.region_proposal_roidb 34 | 35 | # num of subclasses 36 | self._num_subclasses = 227 + 36 + 36 + 1 37 | 38 | # load the mapping for subcalss to class 39 | filename = os.path.join(self._nthu_path, 'mapping.txt') 40 | assert os.path.exists(filename), 'Path does not exist: {}'.format(filename) 41 | 42 | mapping = np.zeros(self._num_subclasses, dtype=np.int) 43 | with open(filename) as f: 44 | for line in f: 45 | words = line.split() 46 | subcls = int(words[0]) 47 | mapping[subcls] = self._class_to_ind[words[1]] 48 | self._subclass_mapping = mapping 49 | 50 | self.config = {'top_k': 100000} 51 | 52 | # statistics for computing recall 53 | self._num_boxes_all = np.zeros(self.num_classes, dtype=np.int) 54 | self._num_boxes_covered = np.zeros(self.num_classes, dtype=np.int) 55 | self._num_boxes_proposal = 0 56 | 57 | assert os.path.exists(self._nthu_path), \ 58 | 'NTHU path does not exist: {}'.format(self._nthu_path) 59 | assert os.path.exists(self._data_path), \ 60 | 'Path does not exist: {}'.format(self._data_path) 61 | 62 | def image_path_at(self, i): 63 | """ 64 | Return the absolute path to image i in the image sequence. 65 | """ 66 | return self.image_path_from_index(self.image_index[i]) 67 | 68 | def image_path_from_index(self, index): 69 | """ 70 | Construct an image path from the image's "index" identifier. 71 | """ 72 | # set the prefix 73 | prefix = self._image_set 74 | 75 | image_path = os.path.join(self._data_path, prefix, index + self._image_ext) 76 | assert os.path.exists(image_path), \ 77 | 'Path does not exist: {}'.format(image_path) 78 | return image_path 79 | 80 | def _load_image_set_index(self): 81 | """ 82 | Load the indexes listed in this dataset's image set file. 
83 | """ 84 | image_set_file = os.path.join(self._data_path, self._image_set + '.txt') 85 | assert os.path.exists(image_set_file), \ 86 | 'Path does not exist: {}'.format(image_set_file) 87 | 88 | with open(image_set_file) as f: 89 | image_index = [x.rstrip('\n') for x in f.readlines()] 90 | return image_index 91 | 92 | def _get_default_path(self): 93 | """ 94 | Return the default path where nthu is expected to be installed. 95 | """ 96 | return os.path.join(ROOT_DIR, 'data', 'NTHU') 97 | 98 | 99 | def gt_roidb(self): 100 | """ 101 | Return the database of ground-truth regions of interest. 102 | No implementation. 103 | """ 104 | 105 | gt_roidb = [] 106 | return gt_roidb 107 | 108 | def region_proposal_roidb(self): 109 | """ 110 | Return the database of regions of interest. 111 | Ground-truth ROIs are also included. 112 | 113 | This function loads/saves from/to a cache file to speed up future calls. 114 | """ 115 | cache_file = os.path.join(self.cache_path, 116 | self.name + '_' + cfg.REGION_PROPOSAL + '_region_proposal_roidb.pkl') 117 | 118 | if os.path.exists(cache_file): 119 | with open(cache_file, 'rb') as fid: 120 | roidb = cPickle.load(fid) 121 | print '{} roidb loaded from {}'.format(self.name, cache_file) 122 | return roidb 123 | 124 | print 'Loading region proposal network boxes...' 
def _load_rpn_roidb(self, gt_roidb, model):
    """Read the per-image proposal files of ``model`` and build a roidb.

    For each entry of ``self.image_index`` the file
    ``<nthu_path>/region_proposals/<model>/<image_set>/<index>.txt`` is
    parsed as rows of ``x1 y1 x2 y2 score``. Rows with ``x2 <= x1`` or
    ``y2 <= y1`` are discarded, the four coordinates are kept, and
    ``self._num_boxes_proposal`` grows by the number of kept rows. The
    collected boxes are handed to ``self.create_roidb_from_box_list``
    together with ``gt_roidb``.
    """
    proposal_dir = os.path.join(self._nthu_path, 'region_proposals', model, self._image_set)

    per_image_boxes = []
    for image_id in self.image_index:
        proposal_file = os.path.join(proposal_dir, image_id + '.txt')
        assert os.path.exists(proposal_file), \
                'RPN data not found at: {}'.format(proposal_file)

        table = np.loadtxt(proposal_file, dtype=float)
        if table.ndim == 1:
            # loadtxt flattens empty and single-row files; restore 2-D shape.
            row_count = 0 if table.size == 0 else 1
            table = table.reshape((row_count, 5))

        # The score column is still read so that files lacking it fail here,
        # but only the coordinates are used.
        left, top, right, bottom, _score = (table[:, col] for col in range(5))
        non_degenerate = (right > left) & (bottom > top)
        kept = table[non_degenerate, :4]

        self._num_boxes_proposal += kept.shape[0]
        per_image_boxes.append(kept)

    return self.create_roidb_from_box_list(per_image_boxes, gt_roidb)
# write detection results into one file
def evaluate_detections_one_file(self, all_boxes, output_dir):
    """Write every detection of every image to ``<output_dir>/detections.txt``.

    ``all_boxes[cls_ind][im_ind]`` is read as a sequence of rows
    ``x1 y1 x2 y2 score subclass``; an empty entry means "no detections".
    The background class is skipped. Each output line is
    ``<index> <class> <x1> <y1> <x2> <y2> <subclass> <score>``.

    Raises:
        AssertionError: if ``self.subclass_mapping`` maps a detection's
            subclass to a class other than the one it is stored under.
    """
    # open results file
    filename = os.path.join(output_dir, 'detections.txt')
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print('Writing all nthu results to file ' + filename)
    with open(filename, 'wt') as f:
        # for each image
        for im_ind, index in enumerate(self.image_index):
            # for each class
            for cls_ind, cls in enumerate(self.classes):
                if cls == '__background__':
                    continue
                dets = all_boxes[cls_ind][im_ind]
                # `dets == []` is not a reliable emptiness test for numpy
                # arrays (ambiguous or an error depending on the NumPy
                # version); a length check covers lists and arrays.
                if len(dets) == 0:
                    continue
                for det in dets:
                    subcls = int(det[5])
                    cls_name = self.classes[self.subclass_mapping[subcls]]
                    assert (cls_name == cls), 'subclass not in class'
                    f.write('{:s} {:s} {:f} {:f} {:f} {:f} {:d} {:f}\n'.format(
                        index, cls, det[0], det[1], det[2], det[3], subcls, det[4]))
def evaluate_proposals_msr(self, all_boxes, output_dir):
    """Write class-agnostic proposals to one ``<index>.txt`` file per image.

    Each line is ``<x1> <y1> <x2> <y2> <score>``.  A file is created for
    every image; it stays empty when the image has no proposals.

    :param all_boxes: ``all_boxes[im_ind]`` is either an empty sequence or
        an array whose columns 0-3 are the box and column 4 the score
    :param output_dir: existing directory that receives the files
    """
    for im_ind, index in enumerate(self.image_index):
        filename = os.path.join(output_dir, index + '.txt')
        print('Writing nthu results to file ' + filename)
        with open(filename, 'wt') as f:
            dets = all_boxes[im_ind]
            # len() handles the [] placeholder and empty ndarrays alike;
            # `dets == []` is not a reliable emptiness test for ndarrays.
            if len(dets) == 0:
                continue
            for k in range(dets.shape[0]):
                f.write('{:f} {:f} {:f} {:f} {:.32f}\n'.format(
                    dets[k, 0], dets[k, 1], dets[k, 2], dets[k, 3],
                    dets[k, 4]))
def voc_ap(rec, prec, use_07_metric=False):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.

    rec, prec: equally long sequences or arrays (lists are accepted and
        converted to float arrays).
    use_07_metric: if true, use the VOC 07 11-point method
        (default: False); otherwise integrate the precision envelope over
        recall.
    Returns the average precision as a float.
    """
    # Coerce once so that plain Python lists work with the boolean masks
    # and array arithmetic below.
    rec = np.asarray(rec, dtype=float)
    prec = np.asarray(prec, dtype=float)

    if use_07_metric:
        # 11 point metric: mean of the best precision at recall >= t
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            reached = rec >= t
            p = np.max(prec[reached]) if np.any(reached) else 0
            ap = ap + p / 11.
        return ap

    # correct AP calculation
    # first append sentinel values at both ends
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # compute the precision envelope (monotonically non-increasing)
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    return np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                cachedir,
                                [ovthresh],
                                [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
        Each line is "<image id> <confidence> <x1> <y1> <x2> <y2>".
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name
    cachedir: Directory for caching the annotations (annots.pkl)
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)

    Returns (rec, prec, ap).  When the detection file is empty all three
    are -1.
    """
    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # read list of images
    with open(imagesetfile, 'r') as f:
        imagenames = [x.strip() for x in f.readlines()]

    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                print('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
        # save -- pickles are binary data, so the file is opened in 'wb'
        print('Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            cPickle.dump(recs, f)
    else:
        # load
        with open(cachefile, 'rb') as f:
            recs = cPickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        # difficult objects do not count as positives
        npos = npos + int(np.sum(~difficult))
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()
    if not lines:
        # no detections for this class
        return -1, -1, -1

    splitlines = [x.strip().split(' ') for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by decreasing confidence
    sorted_ind = np.argsort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih

            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            # a match with a difficult object is neither TP nor FP
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    tp[d] = 1.
                    R['det'][jmax] = True
                else:
                    # the ground truth was already claimed by a
                    # higher-scoring detection
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap
def bbox_transform_inv(boxes, deltas):
    """
    applies regression deltas to boxes and returns the predicted boxes
    :param boxes: n * 4 numpy array, (x1, y1, x2, y2) per row
    :param deltas: n * (4 * k) numpy array, (dx, dy, dw, dh) repeated k times
    :return: pred_boxes: n * (4 * k) numpy array with the dtype of deltas;
             for n == 0 an empty array of shape (0, 4 * k)
    """
    if boxes.shape[0] == 0:
        # keep the result 2-D so callers can still slice columns
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    boxes = boxes.astype(deltas.dtype, copy=False)

    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    # every 4th column belongs to the same delta component
    dx = deltas[:, 0::4]
    dy = deltas[:, 1::4]
    dw = deltas[:, 2::4]
    dh = deltas[:, 3::4]

    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h

    return pred_boxes
def nms(dets, thresh, force_cpu=False):
    """Run non-maximum suppression on the GPU or CPU implementation.

    Returns an empty list when ``dets`` has no rows.  The GPU kernel is
    used when ``cfg.USE_GPU_NMS`` is set and ``force_cpu`` is false;
    otherwise the CPU implementation is used.
    """
    if dets.shape[0] == 0:
        return []

    use_gpu = cfg.USE_GPU_NMS and not force_cpu
    if not use_gpu:
        return cpu_nms(dets, thresh)
    return gpu_nms(dets, thresh, device_id=cfg.GPU_ID)
class Conv2d(nn.Module):
    """Convolution optionally followed by batch norm and an in-place ReLU.

    With ``same_padding`` the input is padded by ``(kernel_size - 1) / 2``
    (truncated); otherwise no padding is applied.  Sub-modules are stored
    as ``conv``, ``bn`` and ``relu``; ``bn``/``relu`` are ``None`` when
    disabled.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, relu=True, same_padding=False, bn=False):
        super(Conv2d, self).__init__()
        if same_padding:
            padding = int((kernel_size - 1) / 2)
        else:
            padding = 0
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=padding)
        self.bn = None
        if bn:
            self.bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0, affine=True)
        self.relu = None
        if relu:
            self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.conv(x)
        # apply the optional stages in order: batch norm, then ReLU
        for stage in (self.bn, self.relu):
            if stage is not None:
                out = stage(out)
        return out


class FC(nn.Module):
    """Linear layer (stored as ``fc``) optionally followed by an in-place ReLU."""

    def __init__(self, in_features, out_features, relu=True):
        super(FC, self).__init__()
        self.fc = nn.Linear(in_features, out_features)
        self.relu = None
        if relu:
            self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.fc(x)
        if self.relu is None:
            return out
        return self.relu(out)
def load_net(fname, net):
    """Copy every entry of ``net.state_dict()`` from the HDF5 file ``fname``.

    The file must contain one dataset per state-dict key.  The file is
    closed again once all tensors have been copied.
    """
    import h5py
    with h5py.File(fname, mode='r') as h5f:
        for k, v in net.state_dict().items():
            param = torch.from_numpy(np.asarray(h5f[k]))
            v.copy_(param)


def load_pretrained_npy(faster_rcnn_model, fname):
    """Initialise the conv features and fc6/fc7 of a model from a ``.npy`` dict.

    ``fname`` holds a pickled dict of the form
    ``{layer_name: {'weights': ..., 'biases': ...}}``.
    """
    # The file stores a pickled Python dict, which numpy only loads when
    # allow_pickle is enabled.
    # NOTE(review): only load weight files from a trusted source -- this
    # unpickles arbitrary objects.
    params = np.load(fname, allow_pickle=True).item()

    # vgg16
    vgg16_dict = faster_rcnn_model.rpn.features.state_dict()
    for name, val in vgg16_dict.items():
        if name.find('bn.') >= 0:
            continue
        # presumably names look like 'convI.J.conv.weight', so characters
        # 4 and 6 are the block and layer indices -- TODO confirm
        i, j = int(name[4]), int(name[6]) + 1
        ptype = 'weights' if name[-1] == 't' else 'biases'
        key = 'conv{}_{}'.format(i, j)
        param = torch.from_numpy(params[key][ptype])

        if ptype == 'weights':
            # reorder the stored kernel axes for torch
            param = param.permute(3, 2, 0, 1)

        val.copy_(param)

    # fc6 fc7
    frcnn_dict = faster_rcnn_model.state_dict()
    pairs = {'fc6.fc': 'fc6', 'fc7.fc': 'fc7'}
    for k, v in pairs.items():
        key = '{}.weight'.format(k)
        param = torch.from_numpy(params[v]['weights']).permute(1, 0)
        frcnn_dict[key].copy_(param)

        key = '{}.bias'.format(k)
        param = torch.from_numpy(params[v]['biases'])
        frcnn_dict[key].copy_(param)


def np_to_variable(x, is_cuda=True, dtype=torch.FloatTensor):
    """Wrap the numpy array ``x`` in a ``Variable`` of tensor type ``dtype``.

    The result is moved to the GPU when ``is_cuda`` is true.
    """
    v = Variable(torch.from_numpy(x).type(dtype))
    if is_cuda:
        v = v.cuda()
    return v


def set_trainable(model, requires_grad):
    """Set ``requires_grad`` on every parameter of ``model``."""
    for param in model.parameters():
        param.requires_grad = requires_grad


def weights_normal_init(model, dev=0.01):
    """Draw conv and linear weights from N(0, dev**2).

    ``model`` is a module or a (possibly nested) list of modules.  Biases
    are left untouched.
    """
    if isinstance(model, list):
        for m in model:
            weights_normal_init(m, dev)
    else:
        for m in model.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                m.weight.data.normal_(0.0, dev)


def clip_gradient(model, clip_norm):
    """Rescale all gradients in place so their global L2 norm is at most clip_norm.

    Parameters that are frozen or have not received a gradient are
    skipped.  Nothing happens when no parameter has a gradient.
    """
    grads = [p.grad for p in model.parameters()
             if p.requires_grad and p.grad is not None]
    if not grads:
        return

    # Work with Python floats so the computation is independent of the
    # device the gradients live on.
    totalnorm = sum(float(g.data.norm()) ** 2 for g in grads) ** 0.5

    norm = clip_norm / max(totalnorm, clip_norm)
    for g in grads:
        g.data.mul_(norm)
def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
    """Greedy non-maximum suppression.

    dets: float32 array with one row per box, columns x1, y1, x2, y2, score.
    thresh: a box is suppressed when its IoU with an already kept,
        higher-scoring box is >= thresh.
    Returns the list of kept row indices, ordered by decreasing score.
    """
    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]

    # box extents are inclusive pixel coordinates, hence the + 1
    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]

    cdef int ndets = dets.shape[0]
    # np.int_ is used because the bare `np.int` alias no longer exists in
    # current NumPy releases
    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
            np.zeros((ndets), dtype=np.int_)

    # nominal indices
    cdef int _i, _j
    # sorted indices
    cdef int i, j
    # temp variables for box i's (the box currently under consideration)
    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
    # variables for computing overlap with box j (lower scoring box)
    cdef np.float32_t xx1, yy1, xx2, yy2
    cdef np.float32_t w, h
    cdef np.float32_t inter, ovr

    keep = []
    for _i in range(ndets):
        i = order[_i]
        if suppressed[i] == 1:
            continue
        keep.append(i)
        ix1 = x1[i]
        iy1 = y1[i]
        ix2 = x2[i]
        iy2 = y2[i]
        iarea = areas[i]
        # only lower-scoring boxes can be suppressed by box i
        for _j in range(_i + 1, ndets):
            j = order[_j]
            if suppressed[j] == 1:
                continue
            xx1 = max(ix1, x1[j])
            yy1 = max(iy1, y1[j])
            xx2 = min(ix2, x2[j])
            yy2 = min(iy2, y2[j])
            w = max(0.0, xx2 - xx1 + 1)
            h = max(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (iarea + areas[j] - inter)
            if ovr >= thresh:
                suppressed[j] = 1

    return keep
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
            np.int32_t device_id=0):
    """Run non-maximum suppression through the external ``_nms`` routine.

    dets: float32 array with one row per box; column 4 is used as the score.
    thresh: overlap threshold handed to ``_nms``.
    device_id: device index handed to ``_nms`` (default 0).
    Returns a list of row indices into ``dets`` for the boxes ``_nms`` kept.
    """
    cdef int boxes_num = dets.shape[0]
    cdef int boxes_dim = dets.shape[1]
    cdef int num_out
    # output buffer: _nms writes the kept indices into its first num_out slots
    cdef np.ndarray[np.int32_t, ndim=1] \
        keep = np.zeros(boxes_num, dtype=np.int32)
    cdef np.ndarray[np.float32_t, ndim=1] \
        scores = dets[:, 4]
    # boxes are handed over sorted by decreasing score
    cdef np.ndarray[np.int_t, ndim=1] \
        order = scores.argsort()[::-1]
    cdef np.ndarray[np.float32_t, ndim=2] \
        sorted_dets = dets[order, :]
    _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
    keep = keep[:num_out]
    # keep indexes the sorted boxes; map back to rows of the original dets
    return list(order[keep])
// Intersection-over-union of two boxes stored as (x1, y1, x2, y2, ...).
// Widths and heights are computed as (max - min + 1); a negative overlap
// extent is clamped to 0 so disjoint boxes give an IoU of 0.
__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  // union = Sa + Sb - intersection
  return interS / (Sa + Sb - interS);
}
// Make device_id the current CUDA device; does nothing when it already is.
void _set_device(int device_id) {
  int current_device;
  // `&current_device` had been corrupted into a currency-sign sequence by
  // an HTML-entity decoding step; the address-of expression is restored.
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}
def py_cpu_nms(dets, thresh):
    """Pure Python NMS baseline.

    dets: array with one row per box, columns x1, y1, x2, y2, score.
    thresh: boxes whose IoU with a kept box exceeds thresh are dropped.
    Returns the list of kept row indices in order of decreasing score.
    """
    left, top, right, bottom, scores = (dets[:, col] for col in range(5))

    # inclusive pixel coordinates, hence the + 1
    areas = (right - left + 1) * (bottom - top + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        best, rest = order[0], order[1:]
        keep.append(best)

        # intersection of the best box with every remaining box
        inter_w = np.maximum(
            0.0,
            np.minimum(right[best], right[rest]) -
            np.maximum(left[best], left[rest]) + 1)
        inter_h = np.maximum(
            0.0,
            np.minimum(bottom[best], bottom[rest]) -
            np.maximum(top[best], top[rest]) + 1)
        inter = inter_w * inter_h
        iou = inter / (areas[best] + areas[rest] - inter)

        # continue with the boxes that do not overlap too much
        order = rest[iou <= thresh]

    return keep
def _import_symbols(locals):
    """Publish every attribute of ``_lib`` in the given namespace.

    Each attribute is passed through ``_wrap_function`` together with
    ``_ffi``, stored in ``locals`` under its own name, and the name is
    appended to the module's ``__all__``.
    """
    for name in dir(_lib):
        locals[name] = _wrap_function(getattr(_lib, name), _ffi)
        __all__.append(name)
class PSRoIPoolingFunction(Function):
    """Autograd function wrapping the ``psroi_pooling`` CUDA extension.

    ``forward`` stores the output, the channel mapping, the rois and the
    feature size on the instance; ``backward`` reads them again.
    """

    def __init__(self, pooled_height, pooled_width, spatial_scale, group_size, output_dim):
        # pooling configuration, normalised to plain Python numbers
        self.pooled_height = int(pooled_height)
        self.pooled_width = int(pooled_width)
        self.spatial_scale = float(spatial_scale)
        self.group_size = int(group_size)
        self.output_dim = int(output_dim)
        # state filled in by forward()
        self.output = None
        self.mappingchannel = None
        self.rois = None
        self.feature_size = None

    def forward(self, features, rois):
        """Pool ``features`` over ``rois``; returns a CUDA tensor of shape
        (num_rois, output_dim, pooled_height, pooled_width)."""
        # features must be 4-D; the unpacking fails otherwise
        batch_size, num_channels, data_height, data_width = features.size()
        out_shape = (rois.size()[0], self.output_dim,
                     self.pooled_height, self.pooled_width)

        output = torch.zeros(*out_shape).cuda()
        mappingchannel = torch.IntTensor(*out_shape).zero_().cuda()

        psroi_pooling.psroi_pooling_forward_cuda(
            self.pooled_height, self.pooled_width, self.spatial_scale,
            self.group_size, self.output_dim,
            features, rois, output, mappingchannel)

        # remember what backward() needs
        self.output = output
        self.mappingchannel = mappingchannel
        self.rois = rois
        self.feature_size = features.size()

        return output

    def backward(self, grad_output):
        """Return the gradient w.r.t. the features and ``None`` for the rois."""
        assert(self.feature_size is not None and grad_output.is_cuda)

        batch_size, num_channels, data_height, data_width = self.feature_size
        grad_input = torch.zeros(batch_size, num_channels,
                                 data_height, data_width).cuda()

        psroi_pooling.psroi_pooling_backward_cuda(
            self.pooled_height, self.pooled_width, self.spatial_scale,
            self.output_dim,
            grad_output, self.rois, grad_input, self.mappingchannel)
        return grad_input, None
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "psroi_pooling_kernel.h"

#ifdef __cplusplus
extern "C" {
#endif

// Grid-stride loop: every thread handles index, index + stride, ... so the
// kernels stay correct for any grid size.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

// Half-open [start, end) extent of one pooling bin on the feature map.
typedef struct {
  int hstart, hend, wstart, wend;
} PSROIBin;

// Computes the feature-map window covered by bin (ph, pw) of one RoI.
// `roi` points at the 5 floats of that RoI; entries 1..4 are the box
// corners, which are rounded and scaled by `spatial_scale`.
__device__ static PSROIBin PSROIBinBounds(
    const float* roi, const float spatial_scale, const int ph, const int pw,
    const int pooled_height, const int pooled_width,
    const int height, const int width)
{
  const float roi_start_w = static_cast<float>(round(roi[1])) * spatial_scale;
  const float roi_start_h = static_cast<float>(round(roi[2])) * spatial_scale;
  const float roi_end_w = static_cast<float>(round(roi[3]) + 1.) * spatial_scale;
  const float roi_end_h = static_cast<float>(round(roi[4]) + 1.) * spatial_scale;

  // Degenerate RoIs are given a minimum extent of 0.1 to avoid a zero bin size.
  const float roi_width = fmaxf(roi_end_w - roi_start_w, 0.1f);
  const float roi_height = fmaxf(roi_end_h - roi_start_h, 0.1f);

  const float bin_size_h = roi_height / static_cast<float>(pooled_height);
  const float bin_size_w = roi_width / static_cast<float>(pooled_width);

  PSROIBin bin;
  bin.hstart = static_cast<int>(floorf(static_cast<float>(ph) * bin_size_h + roi_start_h));
  bin.wstart = static_cast<int>(floorf(static_cast<float>(pw) * bin_size_w + roi_start_w));
  bin.hend = static_cast<int>(ceilf(static_cast<float>(ph + 1) * bin_size_h + roi_start_h));
  bin.wend = static_cast<int>(ceilf(static_cast<float>(pw + 1) * bin_size_w + roi_start_w));

  // Clip to the feature-map boundaries.
  bin.hstart = min(max(bin.hstart, 0), height);
  bin.hend = min(max(bin.hend, 0), height);
  bin.wstart = min(max(bin.wstart, 0), width);
  bin.wend = min(max(bin.wend, 0), width);
  return bin;
}

// Reports the last CUDA error, if any, and terminates the process exactly as
// the launchers have always done. Returns 1 when no error is pending.
static int PSROICheckLastError(void)
{
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err)
  {
    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
  return 1;
}

// Forward pass: one output element (n, ctop, ph, pw) per loop iteration.
// Writes the bin average to top_data and the source channel to mapping_channel.
__global__ void PSROIPoolForward(const int nthreads, const float* bottom_data,
    const float spatial_scale, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const int group_size, const int output_dim,
    const float* bottom_rois, float* top_data, int* mapping_channel)
{
  CUDA_1D_KERNEL_LOOP(index, nthreads)
  {
    // (n, ctop, ph, pw) is an element in the pooled output
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int ctop = (index / pooled_width / pooled_height) % output_dim;
    const int n = index / pooled_width / pooled_height / output_dim;

    // Use per-iteration offsets. Advancing the kernel arguments themselves
    // would accumulate across grid-stride iterations and read the wrong RoI.
    const float* roi = bottom_rois + n * 5;
    const int roi_batch_ind = static_cast<int>(roi[0]);
    const PSROIBin bin = PSROIBinBounds(roi, spatial_scale, ph, pw,
                                        pooled_height, pooled_width, height, width);
    const bool is_empty = (bin.hend <= bin.hstart) || (bin.wend <= bin.wstart);

    // Position-sensitive channel: bin (ph, pw) of output channel ctop.
    const int c = (ctop * group_size + ph) * group_size + pw;
    const float* plane = bottom_data + (roi_batch_ind * channels + c) * height * width;

    float out_sum = 0;
    for (int h = bin.hstart; h < bin.hend; ++h) {
      for (int w = bin.wstart; w < bin.wend; ++w) {
        out_sum += plane[h * width + w];
      }
    }
    const float bin_area = (bin.hend - bin.hstart) * (bin.wend - bin.wstart);
    top_data[index] = is_empty ? 0. : out_sum / bin_area;
    mapping_channel[index] = c;
  }
}


// Launches PSROIPoolForward on `stream`. Returns 1 on success; a CUDA launch
// error terminates the process.
int PSROIPoolForwardLauncher(
    const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    const int group_size, const int output_dim,
    float* top_data, int* mapping_channel, cudaStream_t stream)
{
  const int kThreadsPerBlock = 1024;
  const int output_size = output_dim * pooled_height * pooled_width * num_rois;

  // A zero-block launch is an invalid configuration; nothing to do anyway.
  if (output_size <= 0)
  {
    return 1;
  }

  PSROIPoolForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, bottom_data, spatial_scale, height, width, channels, pooled_height,
      pooled_width, group_size, output_dim, bottom_rois, top_data, mapping_channel);

  return PSROICheckLastError();
}


// Backward pass: spreads each output gradient uniformly over the bin it was
// averaged from, in the channel recorded by the forward pass.
__global__ void PSROIPoolBackward(const int nthreads, const float* top_diff,
    const int* mapping_channel, const int num_rois, const float spatial_scale,
    const int height, const int width, const int channels,
    const int pooled_height, const int pooled_width, const int output_dim, float* bottom_diff,
    const float* bottom_rois)
{
  CUDA_1D_KERNEL_LOOP(index, nthreads)
  {
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int n = index / pooled_width / pooled_height / output_dim;

    // Per-iteration offset; see the note in PSROIPoolForward.
    const float* roi = bottom_rois + n * 5;
    const int roi_batch_ind = static_cast<int>(roi[0]);
    const PSROIBin bin = PSROIBinBounds(roi, spatial_scale, ph, pw,
                                        pooled_height, pooled_width, height, width);
    const bool is_empty = (bin.hend <= bin.hstart) || (bin.wend <= bin.wstart);

    const int c = mapping_channel[index];
    float* offset_bottom_diff = bottom_diff +
        (roi_batch_ind * channels + c) * height * width;
    const float bin_area = (bin.hend - bin.hstart) * (bin.wend - bin.wstart);
    const float diff_val = is_empty ? 0. : top_diff[index] / bin_area;
    for (int h = bin.hstart; h < bin.hend; ++h) {
      for (int w = bin.wstart; w < bin.wend; ++w) {
        // Bins of different RoIs may overlap, hence the atomic accumulation.
        atomicAdd(offset_bottom_diff + h * width + w, diff_val);
      }
    }
  }
}

// Launches PSROIPoolBackward on `stream`. `batch_size` is accepted for
// interface compatibility and is not used. Returns 1 on success; a CUDA
// launch error terminates the process.
int PSROIPoolBackwardLauncher(const float* top_diff, const int* mapping_channel, const int batch_size, const int num_rois, const float spatial_scale, const int channels,
    const int height, const int width, const int pooled_width,
    const int pooled_height, const int output_dim,
    float* bottom_diff, const float* bottom_rois, cudaStream_t stream)
{
  const int kThreadsPerBlock = 1024;
  const int output_size = output_dim * pooled_height * pooled_width * num_rois;

  (void)batch_size;
  if (output_size <= 0)
  {
    return 1;
  }

  PSROIPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, top_diff, mapping_channel, num_rois, spatial_scale, height, width, channels, pooled_height,
      pooled_width, output_dim, bottom_diff, bottom_rois);

  return PSROICheckLastError();
}


#ifdef __cplusplus
}
#endif
const int channels, const int pooled_height, const int pooled_width, 11 | const float* bottom_rois, const int group_size, const int output_dim, float* top_data, int* mapping_channel, cudaStream_t stream); 12 | 13 | 14 | int PSROIPoolBackwardLauncher(const float* top_diff, const int* mapping_channel, const int batch_size, const int num_rois, const float spatial_scale, const int channels, const int height, const int width, const int pooled_width, const int pooled_height, const int output_dim, float* bottom_diff, const float* bottom_rois, cudaStream_t stream); 15 | 16 | #ifdef __cplusplus 17 | } 18 | 19 | #endif 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /faster_rcnn/psroi_pooling/src/psroi_pooling_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda/psroi_pooling_kernel.h" 4 | 5 | 6 | 7 | extern THCState* state; 8 | 9 | int psroi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, int group_size, int output_dim,THCudaTensor *features, THCudaTensor* rois, THCudaTensor* output, THCudaIntTensor* mappingchannel){ 10 | float* data_in = THCudaTensor_data(state, features); 11 | float* rois_in = THCudaTensor_data(state, rois); 12 | float* output_out = THCudaTensor_data(state, output); 13 | int* mappingchannel_out = THCudaIntTensor_data(state, mappingchannel); 14 | //Get # of Rois 15 | int num_rois = THCudaTensor_size(state, rois, 0); 16 | int size_rois = THCudaTensor_size(state, rois, 1); 17 | if (size_rois!=5) 18 | { 19 | return 0; 20 | } 21 | 22 | //Get # of batch_size 23 | int batch_size = THCudaTensor_size(state, features, 0); 24 | if (batch_size!=1) 25 | { 26 | return 0; 27 | } 28 | 29 | int data_height = THCudaTensor_size(state, features, 2); 30 | int data_width = THCudaTensor_size(state, features, 3); 31 | int num_channels = THCudaTensor_size(state, features, 1); 32 | 33 | cudaStream_t stream = 
THCState_getCurrentStream(state); 34 | 35 | // call the gpu kernel for psroi_pooling 36 | PSROIPoolForwardLauncher(data_in, spatial_scale, num_rois, data_height, data_width, num_channels, pooled_height, pooled_width,rois_in, group_size, 37 | output_dim, output_out, mappingchannel_out,stream); 38 | return 1; 39 | } 40 | 41 | 42 | int psroi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, int output_dim, 43 | THCudaTensor* top_grad, THCudaTensor* rois, THCudaTensor* bottom_grad, THCudaIntTensor* mappingchannel) 44 | { 45 | float *top_grad_flat = THCudaTensor_data(state, top_grad); 46 | float *rois_flat = THCudaTensor_data(state, rois); 47 | 48 | float *bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 49 | int *mappingchannel_flat = THCudaIntTensor_data(state, mappingchannel); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois = THCudaTensor_size(state, rois, 1); 54 | if (size_rois != 5) 55 | { 56 | return 0; 57 | } 58 | // batch size 59 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 60 | if (batch_size != 1) 61 | { 62 | return 0; 63 | } 64 | // data height 65 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 66 | // data width 67 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 68 | // Number of channels 69 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 70 | 71 | cudaStream_t stream = THCState_getCurrentStream(state); 72 | 73 | PSROIPoolBackwardLauncher(top_grad_flat, mappingchannel_flat, batch_size, num_rois, spatial_scale, num_channels, data_height, data_width, pooled_width, pooled_height, output_dim, bottom_grad_flat, rois_flat, stream); 74 | return 1; 75 | } 76 | -------------------------------------------------------------------------------- /faster_rcnn/psroi_pooling/src/psroi_pooling_cuda.h: -------------------------------------------------------------------------------- 1 | int psroi_pooling_forward_cuda( int 
import os
import sys
import torch
from torch.autograd import Function


class RSPooling(Function):
    """Pure-PyTorch position-sensitive RoI average pooling (legacy Function).

    The bin geometry mirrors the CUDA kernel shipped in
    ``psroi_pooling/src/cuda/psroi_pooling_kernel.cu``: RoI corners are
    rounded, the end corner is extended by one pixel, coordinates are scaled
    by ``spatial_scale``, RoI extents are at least 0.1, and each bin spans
    ``floor``/``ceil`` of its fractional bounds, clipped to the feature map.
    Bin ``(ph, pw)`` of output channel ``k`` averages input channel
    ``(k * group_size + ph) * group_size + pw``.

    Bin bounds are computed in Python double precision, so they may differ
    from the float32 kernel in rare boundary cases.

    Args:
        spatial_scale: factor mapping RoI coordinates onto the feature map.
        output_dim: number of output channels per RoI.
        group_size: number of bins per side; also the pooled height/width.
    """

    def __init__(self, spatial_scale=0.0625, output_dim=21, group_size=7):
        super(RSPooling, self).__init__()
        self.spatial_scale = spatial_scale
        self.output_dim = output_dim
        self.group_size = group_size
        self.pooled_height_ = group_size
        self.pooled_width_ = group_size
        # State captured by forward() and consumed by backward().
        self.output = None
        self.rois = None
        self.feature_size = None

    def _bins(self, roi, height, width):
        """Yield ``(ph, pw, hstart, hend, wstart, wend)`` for each bin of ``roi``.

        ``roi`` is a sequence of five Python floats; bounds are half-open and
        already clipped to ``[0, height]`` / ``[0, width]``.
        """
        import math

        def c_round(value):
            # C's round(): halves go away from zero (Python's round() does not).
            return math.floor(value + 0.5) if value >= 0 else math.ceil(value - 0.5)

        scale = self.spatial_scale
        start_w = c_round(roi[1]) * scale
        start_h = c_round(roi[2]) * scale
        end_w = (c_round(roi[3]) + 1.0) * scale
        end_h = (c_round(roi[4]) + 1.0) * scale

        # Minimum extent of 0.1 keeps the bin size non-zero for degenerate RoIs.
        bin_h = max(end_h - start_h, 0.1) / float(self.pooled_height_)
        bin_w = max(end_w - start_w, 0.1) / float(self.pooled_width_)

        for ph in range(self.pooled_height_):
            hstart = min(max(int(math.floor(ph * bin_h + start_h)), 0), height)
            hend = min(max(int(math.ceil((ph + 1) * bin_h + start_h)), 0), height)
            for pw in range(self.pooled_width_):
                wstart = min(max(int(math.floor(pw * bin_w + start_w)), 0), width)
                wend = min(max(int(math.ceil((pw + 1) * bin_w + start_w)), 0), width)
                yield ph, pw, hstart, hend, wstart, wend

    def forward(self, features, rois):
        """Pool ``features`` over ``rois``.

        Args:
            features: tensor of shape (B, output_dim * group_size**2, H, W).
            rois: tensor of shape (num_rois, 5); column 0 is the batch index,
                columns 1-4 the box corners. It is not modified.

        Returns:
            Tensor of shape (num_rois, output_dim, group_size, group_size)
            with the same type and device as ``features``. Empty bins are 0.

        Raises:
            ValueError: if ``rois`` is not (N, 5) or the channel count does
                not match ``output_dim * group_size**2``.
        """
        if rois.dim() != 2 or rois.size(1) != 5:
            raise ValueError("rois must have shape (num_rois, 5)")
        batch_size, num_channels, height, width = features.size()
        group = self.group_size
        if num_channels != self.output_dim * group * group:
            raise ValueError("features must have output_dim * group_size**2 channels")

        num_rois = rois.size(0)
        output = features.new(num_rois, self.output_dim,
                              self.pooled_height_, self.pooled_width_).zero_()
        # View the channel axis as (output_dim, gh, gw) so that selecting
        # [:, ph, pw] picks the position-sensitive channel of every output.
        grouped = features.contiguous().view(
            batch_size, self.output_dim, group, group, height, width)

        roi_rows = rois.cpu().tolist()
        for n, roi in enumerate(roi_rows):
            plane = grouped[int(roi[0])]
            for ph, pw, hstart, hend, wstart, wend in self._bins(roi, height, width):
                if hend <= hstart or wend <= wstart:
                    continue  # empty bin stays at zero
                region = plane[:, ph, pw, hstart:hend, wstart:wend]
                output[n, :, ph, pw] = region.contiguous().view(self.output_dim, -1).mean(1)

        self.output = output
        self.rois = roi_rows
        self.feature_size = (batch_size, num_channels, height, width)
        return output

    def backward(self, grad_output):
        """Distribute ``grad_output`` uniformly over the bins it came from.

        Returns:
            ``(grad_features, None)``; RoIs receive no gradient.

        Raises:
            RuntimeError: if called before ``forward``.
        """
        if self.feature_size is None:
            raise RuntimeError("backward() called before forward()")

        batch_size, num_channels, height, width = self.feature_size
        group = self.group_size
        grad_input = grad_output.new(batch_size, num_channels, height, width).zero_()
        grouped = grad_input.view(batch_size, self.output_dim, group, group, height, width)

        for n, roi in enumerate(self.rois):
            plane = grouped[int(roi[0])]
            for ph, pw, hstart, hend, wstart, wend in self._bins(roi, height, width):
                if hend <= hstart or wend <= wstart:
                    continue
                area = float((hend - hstart) * (wend - wstart))
                share = (grad_output[n, :, ph, pw] / area).contiguous().view(self.output_dim, 1, 1)
                # In-place add on a view: overlapping RoIs accumulate.
                plane[:, ph, pw, hstart:hend, wstart:wend] += share
        return grad_input, None
-------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingmimfl/pytorch_RFCN/3baed2001fe2ed47cb80ea4193ec945bbc162c06/faster_rcnn/pycocotools/__init__.pyc -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/_mask.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingmimfl/pytorch_RFCN/3baed2001fe2ed47cb80ea4193ec945bbc162c06/faster_rcnn/pycocotools/_mask.so -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/coco.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingmimfl/pytorch_RFCN/3baed2001fe2ed47cb80ea4193ec945bbc162c06/faster_rcnn/pycocotools/coco.pyc -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/cocoeval.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingmimfl/pytorch_RFCN/3baed2001fe2ed47cb80ea4193ec945bbc162c06/faster_rcnn/pycocotools/cocoeval.pyc -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | from . import _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. 
For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 
38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | encode = _mask.encode 77 | decode = _mask.decode 78 | iou = _mask.iou 79 | merge = _mask.merge 80 | area = _mask.area 81 | toBbox = _mask.toBbox 82 | frPyObjects = _mask.frPyObjects -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/mask.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingmimfl/pytorch_RFCN/3baed2001fe2ed47cb80ea4193ec945bbc162c06/faster_rcnn/pycocotools/mask.pyc -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? 
a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | if(cnts) for(siz j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(siz i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; bool va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && ad?1:c=dy && xs>xe) || (dxye); 151 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 152 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 153 | if(dx>=dy) for( int d=0; d<=dx; d++ ) { 154 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 155 | } else for( int d=0; d<=dy; d++ ) { 156 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 157 | } 158 | } 159 | // get points along y-boundary and downsample 160 | free(x); free(y); k=m; m=0; double xd, yd; 161 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 162 | for( j=1; jw-1 ) continue; 165 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 167 | x[m]=(int) xd; y[m]=(int) yd; m++; 168 | } 169 | // compute rle encoding given y-boundary points 170 | k=m; a=malloc(sizeof(uint)*(k+1)); 171 | for( j=0; j0) b[m++]=a[j++]; else { 177 | j++; if(jm, p=0; long x; bool more; 184 | char *s=malloc(sizeof(char)*m*6); 185 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 187 | while( more ) { 188 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? 
x!=-1 : x!=0; 189 | if(more) c |= 0x20; c+=48; s[p++]=c; 190 | } 191 | } 192 | s[p]=0; return s; 193 | } 194 | 195 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 196 | siz m=0, p=0, k; long x; bool more; uint *cnts; 197 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 198 | while( s[p] ) { 199 | x=0; k=0; more=1; 200 | while( more ) { 201 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 202 | more = c & 0x20; p++; k++; 203 | if(!more && (c & 0x10)) x |= -1 << 5*k; 204 | } 205 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 206 | } 207 | rleInit(R,h,w,m,cnts); free(cnts); 208 | } 209 | -------------------------------------------------------------------------------- /faster_rcnn/pycocotools/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | #include 9 | 10 | typedef unsigned int uint; 11 | typedef unsigned long siz; 12 | typedef unsigned char byte; 13 | typedef double* BB; 14 | typedef struct { siz h, w, m; uint *cnts; } RLE; 15 | 16 | // Initialize/destroy RLE. 17 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 18 | void rleFree( RLE *R ); 19 | 20 | // Initialize/destroy RLE array. 21 | void rlesInit( RLE **R, siz n ); 22 | void rlesFree( RLE **R, siz n ); 23 | 24 | // Encode binary masks using RLE. 25 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 26 | 27 | // Decode binary masks encoded via RLE. 28 | void rleDecode( const RLE *R, byte *mask, siz n ); 29 | 30 | // Compute union or intersection of encoded masks. 
import torch
import torch.nn as nn
import math


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding 1 and no bias."""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class Bottleneck(nn.Module):
    """ResNet bottleneck block: 1x1 reduce, 3x3, 1x1 expand, plus a shortcut.

    The block outputs ``planes * expansion`` channels. ``downsample``, when
    given, is applied to the input so it matches the main branch before the
    residual addition.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        # Honour the class attribute so subclasses may change the expansion.
        out_planes = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += residual
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet backbone without the classification head.

    ``forward`` returns the outputs of the third and fourth residual stages
    (named C4 and C5 in the code).

    Args:
        block: residual block class exposing an ``expansion`` attribute.
        layers: four integers, the number of blocks in each stage.
    """

    def __init__(self, block, layers):
        super(ResNet, self).__init__()
        # Running channel count, updated by _make_layer as stages are built.
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Normal init with variance 2 / (kernel area * out_channels).
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block uses ``stride`` and, when the shape changes, a
        1x1 conv + batch-norm projection on the shortcut.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        C4 = x
        C5 = self.layer4(x)
        return C4, C5


def resnet101():
    """Construct a ResNet-101 backbone (stages of 3, 4, 23 and 3 bottlenecks).

    The model is returned with the random initialisation done in
    ``ResNet.__init__``; no pre-trained weights are loaded here.
    """
    return ResNet(Bottleneck, [3, 4, 23, 3])
class RoIDataLayer(object):
    """Serve training minibatches drawn from a roidb in shuffled order."""

    def __init__(self, roidb, num_classes):
        """Store the roidb and class count, then draw the first permutation."""
        self._roidb = roidb
        self._num_classes = num_classes
        self._shuffle_roidb_inds()

    def _shuffle_roidb_inds(self):
        """Draw a new random permutation of the roidb and rewind the cursor."""
        self._perm = np.random.permutation(np.arange(len(self._roidb)))
        self._cur = 0

    def _get_next_minibatch_inds(self):
        """Return the roidb indices for the next minibatch.

        Raises:
            ValueError: in the non-RPN branch, if no roidb entry has boxes
                (the sampling loop could otherwise never terminate).
        """
        batch_size = cfg.TRAIN.IMS_PER_BATCH
        num_entries = len(self._roidb)

        if cfg.TRAIN.HAS_RPN:
            # Reshuffle only when the next slice would run past the end.
            # '>' rather than '>=': a batch ending exactly at the last entry
            # is still complete, and skipping it would drop the tail of
            # every permutation.
            if self._cur + batch_size > num_entries:
                self._shuffle_roidb_inds()

            db_inds = self._perm[self._cur:self._cur + batch_size]
            self._cur += batch_size
            return db_inds

        # Not using RPN: walk the permutation, skipping entries without boxes.
        db_inds = np.zeros((batch_size), dtype=np.int32)
        filled = 0
        empty_streak = 0
        while filled < batch_size:
            ind = self._perm[self._cur]
            if self._roidb[ind]['boxes'].shape[0] != 0:
                db_inds[filled] = ind
                filled += 1
                empty_streak = 0
            else:
                empty_streak += 1
                # 2 * N consecutive positions always contain one complete
                # permutation, so this many misses means every entry is empty.
                if empty_streak >= 2 * num_entries:
                    raise ValueError('roidb contains no entry with boxes')

            self._cur += 1
            if self._cur >= num_entries:
                self._shuffle_roidb_inds()

        return db_inds

    def _get_next_minibatch(self):
        """Return the blobs for the next minibatch.

        The selected roidb entries are passed to ``get_minibatch`` together
        with the number of classes.
        """
        db_inds = self._get_next_minibatch_inds()
        minibatch_db = [self._roidb[i] for i in db_inds]
        return get_minibatch(minibatch_db, self._num_classes)

    def forward(self):
        """Return the blobs of the next minibatch."""
        return self._get_next_minibatch()
\ 29 | format(num_images, cfg.TRAIN.BATCH_SIZE) 30 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 31 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 32 | 33 | # Get the input image blob, formatted for caffe 34 | im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) 35 | 36 | blobs = {'data': im_blob} 37 | 38 | if cfg.TRAIN.HAS_RPN: 39 | assert len(im_scales) == 1, "Single batch only" 40 | assert len(roidb) == 1, "Single batch only" 41 | # gt boxes: (x1, y1, x2, y2, cls) 42 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 43 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) 44 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] 45 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 46 | blobs['gt_boxes'] = gt_boxes 47 | blobs['gt_ishard'] = roidb[0]['gt_ishard'][gt_inds] \ 48 | if 'gt_ishard' in roidb[0] else np.zeros(gt_inds.size, dtype=int) 49 | # blobs['gt_ishard'] = roidb[0]['gt_ishard'][gt_inds] 50 | blobs['dontcare_areas'] = roidb[0]['dontcare_areas'] * im_scales[0] \ 51 | if 'dontcare_areas' in roidb[0] else np.zeros([0, 4], dtype=float) 52 | blobs['im_info'] = np.array( 53 | [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], 54 | dtype=np.float32) 55 | blobs['im_name'] = os.path.basename(roidb[0]['image']) 56 | 57 | else: # not using RPN 58 | # Now, build the region of interest and label blobs 59 | rois_blob = np.zeros((0, 5), dtype=np.float32) 60 | labels_blob = np.zeros((0), dtype=np.float32) 61 | bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32) 62 | bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32) 63 | # all_overlaps = [] 64 | for im_i in xrange(num_images): 65 | labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \ 66 | = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, 67 | num_classes) 68 | 69 | # Add to RoIs blob 70 | rois = _project_im_rois(im_rois, im_scales[im_i]) 71 | batch_ind = im_i * np.ones((rois.shape[0], 1)) 72 | 
rois_blob_this_image = np.hstack((batch_ind, rois)) 73 | rois_blob = np.vstack((rois_blob, rois_blob_this_image)) 74 | 75 | # Add to labels, bbox targets, and bbox loss blobs 76 | labels_blob = np.hstack((labels_blob, labels)) 77 | bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets)) 78 | bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights)) 79 | # all_overlaps = np.hstack((all_overlaps, overlaps)) 80 | 81 | # For debug visualizations 82 | # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps) 83 | 84 | blobs['rois'] = rois_blob 85 | blobs['labels'] = labels_blob 86 | 87 | if cfg.TRAIN.BBOX_REG: 88 | blobs['bbox_targets'] = bbox_targets_blob 89 | blobs['bbox_inside_weights'] = bbox_inside_blob 90 | blobs['bbox_outside_weights'] = \ 91 | np.array(bbox_inside_blob > 0).astype(np.float32) 92 | 93 | return blobs 94 | 95 | def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes): 96 | """Generate a random sample of RoIs comprising foreground and background 97 | examples. 
98 | """ 99 | # label = class RoI has max overlap with 100 | labels = roidb['max_classes'] 101 | overlaps = roidb['max_overlaps'] 102 | rois = roidb['boxes'] 103 | 104 | # Select foreground RoIs as those with >= FG_THRESH overlap 105 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 106 | # Guard against the case when an image has fewer than fg_rois_per_image 107 | # foreground RoIs 108 | fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) 109 | # Sample foreground regions without replacement 110 | if fg_inds.size > 0: 111 | fg_inds = npr.choice( 112 | fg_inds, size=fg_rois_per_this_image, replace=False) 113 | 114 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 115 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 116 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 117 | # Compute number of background RoIs to take from this image (guarding 118 | # against there being fewer than desired) 119 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 120 | bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, 121 | bg_inds.size) 122 | # Sample foreground regions without replacement 123 | if bg_inds.size > 0: 124 | bg_inds = npr.choice( 125 | bg_inds, size=bg_rois_per_this_image, replace=False) 126 | 127 | # The indices that we're selecting (both fg and bg) 128 | keep_inds = np.append(fg_inds, bg_inds) 129 | # Select sampled values from various arrays: 130 | labels = labels[keep_inds] 131 | # Clamp labels for the background RoIs to 0 132 | labels[fg_rois_per_this_image:] = 0 133 | overlaps = overlaps[keep_inds] 134 | rois = rois[keep_inds] 135 | 136 | bbox_targets, bbox_inside_weights = _get_bbox_regression_labels( 137 | roidb['bbox_targets'][keep_inds, :], num_classes) 138 | 139 | return labels, overlaps, rois, bbox_targets, bbox_inside_weights 140 | 141 | def _get_image_blob(roidb, scale_inds): 142 | """Builds an input blob from the images in the roidb at the specified 143 | scales. 
144 | """ 145 | num_images = len(roidb) 146 | processed_ims = [] 147 | im_scales = [] 148 | for i in xrange(num_images): 149 | im = cv2.imread(roidb[i]['image']) 150 | if roidb[i]['flipped']: 151 | im = im[:, ::-1, :] 152 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 153 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 154 | cfg.TRAIN.MAX_SIZE) 155 | im_scales.append(im_scale) 156 | processed_ims.append(im) 157 | 158 | # Create a blob to hold the input images 159 | blob = im_list_to_blob(processed_ims) 160 | 161 | return blob, im_scales 162 | 163 | def _project_im_rois(im_rois, im_scale_factor): 164 | """Project image RoIs into the rescaled training image.""" 165 | rois = im_rois * im_scale_factor 166 | return rois 167 | 168 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 169 | """Bounding-box regression targets are stored in a compact form in the 170 | roidb. 171 | 172 | This function expands those targets into the 4-of-4*K representation used 173 | by the network (i.e. only one class has non-zero targets). The loss weights 174 | are similarly expanded. 
175 | 176 | Returns: 177 | bbox_target_data (ndarray): N x 4K blob of regression targets 178 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 179 | """ 180 | clss = bbox_target_data[:, 0] 181 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 182 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 183 | inds = np.where(clss > 0)[0] 184 | for ind in inds: 185 | cls = clss[ind] 186 | start = 4 * cls 187 | end = start + 4 188 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 189 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 190 | return bbox_targets, bbox_inside_weights 191 | 192 | def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps): 193 | """Visualize a mini-batch for debugging.""" 194 | import matplotlib.pyplot as plt 195 | for i in xrange(rois_blob.shape[0]): 196 | rois = rois_blob[i, :] 197 | im_ind = rois[0] 198 | roi = rois[1:] 199 | im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy() 200 | im += cfg.PIXEL_MEANS 201 | im = im[:, :, (2, 1, 0)] 202 | im = im.astype(np.uint8) 203 | cls = labels_blob[i] 204 | plt.imshow(im) 205 | print 'class: ', cls, ' overlap: ', overlaps[i] 206 | plt.gca().add_patch( 207 | plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0], 208 | roi[3] - roi[1], fill=False, 209 | edgecolor='r', linewidth=3) 210 | ) 211 | plt.show() 212 | -------------------------------------------------------------------------------- /faster_rcnn/roi_data_layer/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 9 | 10 | import numpy as np 11 | 12 | import PIL 13 | 14 | # 
TODO: make fast_rcnn irrelevant 15 | # >>>> obsolete, because it depends on sth outside of this project 16 | from ..fast_rcnn.config import cfg 17 | from ..fast_rcnn.bbox_transform import bbox_transform 18 | # <<<< obsolete 19 | from ..utils.cython_bbox import bbox_overlaps 20 | 21 | def prepare_roidb(imdb): 22 | """Enrich the imdb's roidb by adding some derived quantities that 23 | are useful for training. This function precomputes the maximum 24 | overlap, taken over ground-truth boxes, between each ROI and 25 | each ground-truth box. The class with maximum overlap is also 26 | recorded. 27 | """ 28 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 29 | for i in xrange(imdb.num_images)] 30 | roidb = imdb.roidb 31 | for i in xrange(len(imdb.image_index)): 32 | roidb[i]['image'] = imdb.image_path_at(i) 33 | roidb[i]['width'] = sizes[i][0] 34 | roidb[i]['height'] = sizes[i][1] 35 | # need gt_overlaps as a dense array for argmax 36 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 37 | # max overlap with gt over classes (columns) 38 | max_overlaps = gt_overlaps.max(axis=1) 39 | # gt class that had the max overlap 40 | max_classes = gt_overlaps.argmax(axis=1) 41 | roidb[i]['max_classes'] = max_classes 42 | roidb[i]['max_overlaps'] = max_overlaps 43 | # sanity checks 44 | # max overlap of 0 => class should be zero (background) 45 | zero_inds = np.where(max_overlaps == 0)[0] 46 | assert all(max_classes[zero_inds] == 0) 47 | # max overlap > 0 => class should not be zero (must be a fg class) 48 | nonzero_inds = np.where(max_overlaps > 0)[0] 49 | assert all(max_classes[nonzero_inds] != 0) 50 | 51 | def add_bbox_regression_targets(roidb): 52 | """ 53 | Add information needed to train bounding-box regressors. 54 | For each roi find the corresponding gt box, and compute the distance. 55 | then normalize the distance into Gaussian by minus mean and divided by std 56 | """ 57 | assert len(roidb) > 0 58 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 
59 | 60 | num_images = len(roidb) 61 | # Infer number of classes from the number of columns in gt_overlaps 62 | num_classes = roidb[0]['gt_overlaps'].shape[1] 63 | for im_i in xrange(num_images): 64 | rois = roidb[im_i]['boxes'] 65 | max_overlaps = roidb[im_i]['max_overlaps'] 66 | max_classes = roidb[im_i]['max_classes'] 67 | roidb[im_i]['bbox_targets'] = \ 68 | _compute_targets(rois, max_overlaps, max_classes) 69 | 70 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 71 | # Use fixed / precomputed "means" and "stds" instead of empirical values 72 | means = np.tile( 73 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 74 | stds = np.tile( 75 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 76 | else: 77 | # Compute values needed for means and stds 78 | # var(x) = E(x^2) - E(x)^2 79 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 80 | sums = np.zeros((num_classes, 4)) 81 | squared_sums = np.zeros((num_classes, 4)) 82 | for im_i in xrange(num_images): 83 | targets = roidb[im_i]['bbox_targets'] 84 | for cls in xrange(1, num_classes): 85 | cls_inds = np.where(targets[:, 0] == cls)[0] 86 | if cls_inds.size > 0: 87 | class_counts[cls] += cls_inds.size 88 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 89 | squared_sums[cls, :] += \ 90 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 91 | 92 | means = sums / class_counts 93 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 94 | # too small number will cause nan error 95 | assert np.min(stds) < 0.01, \ 96 | 'Boxes std is too small, std:{}'.format(stds) 97 | 98 | print 'bbox target means:' 99 | print means 100 | print means[1:, :].mean(axis=0) # ignore bg class 101 | print 'bbox target stdevs:' 102 | print stds 103 | print stds[1:, :].mean(axis=0) # ignore bg class 104 | 105 | # Normalize targets 106 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 107 | print "Normalizing targets" 108 | for im_i in xrange(num_images): 109 | targets = roidb[im_i]['bbox_targets'] 110 | for cls in xrange(1, 
num_classes): 111 | cls_inds = np.where(targets[:, 0] == cls)[0] 112 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 113 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 114 | else: 115 | print "NOT normalizing targets" 116 | 117 | # These values will be needed for making predictions 118 | # (the predicts will need to be unnormalized and uncentered) 119 | return means.ravel(), stds.ravel() 120 | 121 | def _compute_targets(rois, overlaps, labels): 122 | """ 123 | Compute bounding-box regression targets for an image. 124 | for each roi find the corresponding gt_box, then compute the distance. 125 | """ 126 | # Indices of ground-truth ROIs 127 | gt_inds = np.where(overlaps == 1)[0] 128 | if len(gt_inds) == 0: 129 | # Bail if the image has no ground-truth ROIs 130 | return np.zeros((rois.shape[0], 5), dtype=np.float32) 131 | # Indices of examples for which we try to make predictions 132 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 133 | 134 | # Get IoU overlap between each ex ROI and gt ROI 135 | ex_gt_overlaps = bbox_overlaps( 136 | np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), 137 | np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) 138 | 139 | # Find which gt ROI each ex ROI has max overlap with: 140 | # this will be the ex ROI's gt target 141 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 142 | gt_rois = rois[gt_inds[gt_assignment], :] 143 | ex_rois = rois[ex_inds, :] 144 | 145 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 146 | targets[ex_inds, 0] = labels[ex_inds] 147 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) 148 | return targets 149 | -------------------------------------------------------------------------------- /faster_rcnn/roi_data_layer/roidb2.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License 
[see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 9 | 10 | import numpy as np 11 | 12 | # TODO: make fast_rcnn irrelevant 13 | # >>>> obsolete, because it depends on sth outside of this project 14 | from ..fast_rcnn.config import cfg 15 | from ..fast_rcnn.bbox_transform import bbox_transform 16 | # <<<< obsolete 17 | from ..utils.cython_bbox import bbox_overlaps 18 | 19 | def prepare_roidb(imdb): 20 | """Enrich the imdb's roidb by adding some derived quantities that 21 | are useful for training. This function precomputes the maximum 22 | overlap, taken over ground-truth boxes, between each ROI and 23 | each ground-truth box. The class with maximum overlap is also 24 | recorded. 25 | """ 26 | roidb = imdb.roidb 27 | for i in xrange(len(imdb.image_index)): 28 | roidb[i]['image'] = imdb.image_path_at(i) 29 | # need gt_overlaps as a dense array for argmax 30 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 31 | # max overlap with gt over classes (columns) 32 | max_overlaps = gt_overlaps.max(axis=1) 33 | # gt class that had the max overlap 34 | max_classes = gt_overlaps.argmax(axis=1) 35 | 36 | roidb[i]['max_classes'] = max_classes 37 | roidb[i]['max_overlaps'] = max_overlaps 38 | 39 | # sanity checks 40 | # max overlap of 0 => class should be zero (background) 41 | zero_inds = np.where(max_overlaps == 0)[0] 42 | assert all(max_classes[zero_inds] == 0) 43 | # max overlap > 0 => class should not be zero (must be a fg class) 44 | nonzero_inds = np.where(max_overlaps > 0)[0] 45 | assert all(max_classes[nonzero_inds] != 0) 46 | 47 | def add_bbox_regression_targets(roidb): 48 | """Add information needed to train bounding-box regressors.""" 49 | assert len(roidb) > 0 50 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 
51 | 52 | num_images = len(roidb) 53 | # Infer number of classes from the number of columns in gt_overlaps 54 | num_classes = roidb[0]['gt_overlaps'].shape[1] 55 | for im_i in xrange(num_images): 56 | rois = roidb[im_i]['boxes'] 57 | max_overlaps = roidb[im_i]['max_overlaps'] 58 | max_classes = roidb[im_i]['max_classes'] 59 | roidb[im_i]['bbox_targets'] = \ 60 | _compute_targets(rois, max_overlaps, max_classes, num_classes) 61 | 62 | # Compute values needed for means and stds 63 | # var(x) = E(x^2) - E(x)^2 64 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 65 | sums = np.zeros((num_classes, 4)) 66 | squared_sums = np.zeros((num_classes, 4)) 67 | for im_i in xrange(num_images): 68 | targets = roidb[im_i]['bbox_targets'] 69 | for cls in xrange(1, num_classes): 70 | cls_inds = np.where(targets[:, 0] == cls)[0] 71 | if cls_inds.size > 0: 72 | class_counts[cls] += cls_inds.size 73 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 74 | squared_sums[cls, :] += (targets[cls_inds, 1:] ** 2).sum(axis=0) 75 | 76 | means = sums / class_counts 77 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 78 | 79 | # Normalize targets 80 | for im_i in xrange(num_images): 81 | targets = roidb[im_i]['bbox_targets'] 82 | for cls in xrange(1, num_classes): 83 | cls_inds = np.where(targets[:, 0] == cls)[0] 84 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 85 | if stds[cls, 0] != 0: 86 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 87 | 88 | # These values will be needed for making predictions 89 | # (the predicts will need to be unnormalized and uncentered) 90 | return means.ravel(), stds.ravel() 91 | 92 | def _compute_targets(rois, overlaps, labels, num_classes): 93 | """Compute bounding-box regression targets for an image.""" 94 | # Ensure ROIs are floats 95 | rois = rois.astype(np.float, copy=False) 96 | 97 | # Indices of ground-truth ROIs 98 | gt_inds = np.where(overlaps == 1)[0] 99 | # Indices of examples for which we try to make predictions 
100 | ex_inds = [] 101 | for i in xrange(1, num_classes): 102 | ex_inds.extend( np.where((labels == i) & (overlaps >= cfg.TRAIN.BBOX_THRESH))[0] ) 103 | 104 | # Get IoU overlap between each ex ROI and gt ROI 105 | ex_gt_overlaps = utils.cython_bbox.bbox_overlaps(rois[ex_inds, :], 106 | rois[gt_inds, :]) 107 | 108 | # Find which gt ROI each ex ROI has max overlap with: 109 | # this will be the ex ROI's gt target 110 | if ex_gt_overlaps.shape[0] != 0: 111 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 112 | else: 113 | gt_assignment = [] 114 | gt_rois = rois[gt_inds[gt_assignment], :] 115 | ex_rois = rois[ex_inds, :] 116 | 117 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + cfg.EPS 118 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + cfg.EPS 119 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 120 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 121 | 122 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + cfg.EPS 123 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + cfg.EPS 124 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 125 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 126 | 127 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 128 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 129 | targets_dw = np.log(gt_widths / ex_widths) 130 | targets_dh = np.log(gt_heights / ex_heights) 131 | 132 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 133 | targets[ex_inds, 0] = labels[ex_inds] 134 | targets[ex_inds, 1] = targets_dx 135 | targets[ex_inds, 2] = targets_dy 136 | targets[ex_inds, 3] = targets_dw 137 | targets[ex_inds, 4] = targets_dh 138 | return targets 139 | -------------------------------------------------------------------------------- /faster_rcnn/roi_pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingmimfl/pytorch_RFCN/3baed2001fe2ed47cb80ea4193ec945bbc162c06/faster_rcnn/roi_pooling/__init__.py -------------------------------------------------------------------------------- 
# Build script for the roi_pooling FFI extension.
import os
import torch
from torch.utils.ffi import create_extension


sources = ['src/roi_pooling.c']
headers = ['src/roi_pooling.h']
defines = []
extra_objects = []
with_cuda = False

this_file = os.path.dirname(os.path.realpath(__file__))
print(this_file)

if torch.cuda.is_available():
    print('Including CUDA code.')
    sources += ['src/roi_pooling_cuda.c']
    headers += ['src/roi_pooling_cuda.h']
    defines += [('WITH_CUDA', None)]
    with_cuda = True
    # The compiled kernel object is only needed by the CUDA sources; listing
    # it unconditionally makes a CPU-only build depend on an object file
    # that is never produced there.
    extra_objects = [os.path.join(this_file, 'src/cuda/roi_pooling.cu.o')]

ffi = create_extension(
    '_ext.roi_pooling',
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    extra_objects=extra_objects
)

if __name__ == '__main__':
    ffi.build()
-------------------------------------------------------------------------------- /faster_rcnn/roi_pooling/functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingmimfl/pytorch_RFCN/3baed2001fe2ed47cb80ea4193ec945bbc162c06/faster_rcnn/roi_pooling/functions/__init__.py -------------------------------------------------------------------------------- /faster_rcnn/roi_pooling/functions/roi_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from .._ext import roi_pooling 4 | 5 | 6 | class RoIPoolFunction(Function): 7 | def __init__(self, pooled_height, pooled_width, spatial_scale): 8 | self.pooled_width = int(pooled_width) 9 | self.pooled_height = int(pooled_height) 10 | self.spatial_scale = float(spatial_scale) 11 | self.output = None 12 | self.argmax = None 13 | self.rois = None 14 | self.feature_size = None 15 | 16 | def forward(self, features, rois): 17 | batch_size, num_channels, data_height, data_width = features.size() 18 | num_rois = rois.size()[0] 19 | output = torch.zeros(num_rois, num_channels, self.pooled_height, self.pooled_width) 20 | argmax = torch.IntTensor(num_rois, num_channels, self.pooled_height, self.pooled_width).zero_() 21 | 22 | if not features.is_cuda: 23 | _features = features.permute(0, 2, 3, 1) 24 | roi_pooling.roi_pooling_forward(self.pooled_height, self.pooled_width, self.spatial_scale, 25 | _features, rois, output) 26 | # output = output.cuda() 27 | else: 28 | output = output.cuda() 29 | argmax = argmax.cuda() 30 | roi_pooling.roi_pooling_forward_cuda(self.pooled_height, self.pooled_width, self.spatial_scale, 31 | features, rois, output, argmax) 32 | self.output = output 33 | self.argmax = argmax 34 | self.rois = rois 35 | self.feature_size = features.size() 36 | 37 | return output 38 | 39 | def backward(self, grad_output): 40 | assert(self.feature_size is 
class RoIPool(nn.Module):
    """Pure-Python RoI max pooling.

    Args:
        pooled_height: number of output bins along the height.
        pooled_width: number of output bins along the width.
        spatial_scale: factor that maps RoI coordinates onto the feature map.
    """

    def __init__(self, pooled_height, pooled_width, spatial_scale):
        super(RoIPool, self).__init__()
        self.pooled_width = int(pooled_width)
        self.pooled_height = int(pooled_height)
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois):
        """Max-pool every RoI into a pooled_height x pooled_width grid.

        Args:
            features: feature map of size (batch, channels, height, width).
            rois: one row per RoI, laid out as
                (batch index, start_w, start_h, end_w, end_h).

        Returns:
            Tensor of size (num_rois, channels, pooled_height, pooled_width)
            with the type of `features`.  Bins that fall completely outside
            the feature map are zero.
        """
        batch_size, num_channels, data_height, data_width = features.size()
        num_rois = rois.size()[0]
        # Follow the type (and CPU/GPU placement) of the input instead of
        # forcing CUDA, so the module also runs on CPU tensors.
        outputs = Variable(torch.zeros(
            num_rois, num_channels, self.pooled_height,
            self.pooled_width)).type_as(features)

        # One host copy of all RoIs; avoids element-wise tensor indexing,
        # whose semantics differ between PyTorch versions.
        rois_np = rois.data.cpu().numpy()

        for roi_ind in range(num_rois):
            roi = rois_np[roi_ind]
            batch_ind = int(roi[0])
            roi_start_w, roi_start_h, roi_end_w, roi_end_h = np.round(
                roi[1:] * self.spatial_scale).astype(int)
            # Malformed RoIs are forced to be at least 1x1.
            roi_width = max(roi_end_w - roi_start_w + 1, 1)
            roi_height = max(roi_end_h - roi_start_h + 1, 1)
            bin_size_w = float(roi_width) / float(self.pooled_width)
            bin_size_h = float(roi_height) / float(self.pooled_height)
            data = features[batch_ind]

            for ph in range(self.pooled_height):
                hstart = int(np.floor(ph * bin_size_h))
                hend = int(np.ceil((ph + 1) * bin_size_h))
                # Shift by the RoI origin and clip to the feature map.
                hstart = min(data_height, max(0, hstart + roi_start_h))
                hend = min(data_height, max(0, hend + roi_start_h))
                for pw in range(self.pooled_width):
                    wstart = int(np.floor(pw * bin_size_w))
                    wend = int(np.ceil((pw + 1) * bin_size_w))
                    wstart = min(data_width, max(0, wstart + roi_start_w))
                    wend = min(data_width, max(0, wend + roi_start_w))

                    if hend <= hstart or wend <= wstart:
                        outputs[roi_ind, :, ph, pw] = 0
                        continue

                    # Flatten the window to (channels, pixels) and reduce
                    # once; this does not depend on whether max() keeps the
                    # reduced dimension.
                    window = data[:, hstart:hend, wstart:wend].contiguous()
                    outputs[roi_ind, :, ph, pw] = window.view(
                        num_channels, -1).max(1)[0].view(-1)

        return outputs
// Grid-stride loop: thread t handles elements t, t + T, t + 2T, ... where
// T = blockDim.x * gridDim.x is the total number of launched threads.
#define CUDA_1D_KERNEL_LOOP(i, n)                                  \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
         i += blockDim.x * gridDim.x)


// Forward ROI max pooling. One loop iteration produces one output element
// (n, c, ph, pw): the maximum of channel c over the bin (ph, pw) of ROI n.
//   bottom_data : input features, indexed as (batch, channel, height, width)
//   bottom_rois : 5 floats per ROI: batch index, x1, y1, x2, y2
//   top_data    : pooled output, indexed as (roi, channel, ph, pw)
//   argmax_data : optional (may be NULL); index of the winning input element
//                 relative to the start of its image, or -1 for an empty bin
__global__ void ROIPoolForward(const int nthreads, const float* bottom_data,
    const float spatial_scale, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const float* bottom_rois, float* top_data, int* argmax_data)
{
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {
        // (n, c, ph, pw) is an element in the pooled output
        int n = index;
        int pw = n % pooled_width;
        n /= pooled_width;
        int ph = n % pooled_height;
        n /= pooled_height;
        int c = n % channels;
        n /= channels;

        // Use local offset pointers: advancing the kernel arguments in place
        // would compound the offsets if this loop ran more than once.
        const float* offset_bottom_rois = bottom_rois + n * 5;
        int roi_batch_ind = offset_bottom_rois[0];
        int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
        int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
        int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
        int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

        // Force malformed ROIs to be 1x1
        int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
        int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
        float bin_size_h = (float)(roi_height) / (float)(pooled_height);
        float bin_size_w = (float)(roi_width) / (float)(pooled_width);

        int hstart = (int)(floor((float)(ph) * bin_size_h));
        int wstart = (int)(floor((float)(pw) * bin_size_w));
        int hend = (int)(ceil((float)(ph + 1) * bin_size_h));
        int wend = (int)(ceil((float)(pw + 1) * bin_size_w));

        // Add roi offsets and clip to input boundaries
        hstart = fminf(fmaxf(hstart + roi_start_h, 0), height);
        hend = fminf(fmaxf(hend + roi_start_h, 0), height);
        wstart = fminf(fmaxf(wstart + roi_start_w, 0), width);
        wend = fminf(fmaxf(wend + roi_start_w, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Define an empty pooling region to be zero
        float maxval = is_empty ? 0 : -FLT_MAX;
        // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
        int maxidx = -1;
        const float* offset_bottom_data =
            bottom_data + roi_batch_ind * channels * height * width;
        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                int bottom_index = (c * height + h) * width + w;
                if (offset_bottom_data[bottom_index] > maxval) {
                    maxval = offset_bottom_data[bottom_index];
                    maxidx = bottom_index;
                }
            }
        }
        top_data[index] = maxval;
        if (argmax_data != NULL)
            argmax_data[index] = maxidx;
    }
}


// Launches ROIPoolForward on `stream` with one thread per output element.
// Prints the CUDA error and terminates the process if the launch fails;
// otherwise returns 1.
int ROIPoolForwardLaucher(
    const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* top_data, int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    const int output_size = num_rois * pooled_height * pooled_width * channels;
    cudaError_t err;

    ROIPoolForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
        output_size, bottom_data, spatial_scale, height, width, channels, pooled_height,
        pooled_width, bottom_rois, top_data, argmax_data);

    err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    return 1;
}


// Backward ROI max pooling. One loop iteration produces the gradient of one
// input element (n, c, h, w) by summing top_diff over every pooled unit whose
// recorded argmax is this element.
__global__ void ROIPoolBackward(const int nthreads, const float* top_diff,
    const int* argmax_data, const int num_rois, const float spatial_scale,
    const int height, const int width, const int channels,
    const int pooled_height, const int pooled_width, float* bottom_diff,
    const float* bottom_rois)
{
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {
        // (n, c, h, w) is an element in the input gradient
        int n = index;
        int w = n % width;
        n /= width;
        int h = n % height;
        n /= height;
        int c = n % channels;
        n /= channels;

        // The forward pass stores argmax relative to the start of the image,
        // so compare against the per-image index rather than the global
        // `index` (the two only coincide for batch index 0).
        const int image_index = (c * height + h) * width + w;

        float gradient = 0;
        // Accumulate gradient over all ROIs that pooled this element
        for (int roi_n = 0; roi_n < num_rois; ++roi_n)
        {
            const float* offset_bottom_rois = bottom_rois + roi_n * 5;
            int roi_batch_ind = offset_bottom_rois[0];
            // Skip if ROI's batch index doesn't match n
            if (n != roi_batch_ind) {
                continue;
            }

            int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
            int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
            int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
            int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

            // Skip if ROI doesn't include (h, w)
            const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                                 h >= roi_start_h && h <= roi_end_h);
            if (!in_roi) {
                continue;
            }

            int offset = roi_n * pooled_height * pooled_width * channels;
            const float* offset_top_diff = top_diff + offset;
            const int* offset_argmax_data = argmax_data + offset;

            // Compute feasible set of pooled units that could have pooled
            // this bottom unit

            // Force malformed ROIs to be 1x1
            int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
            int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);

            float bin_size_h = (float)(roi_height) / (float)(pooled_height);
            float bin_size_w = (float)(roi_width) / (float)(pooled_width);

            int phstart = floor((float)(h - roi_start_h) / bin_size_h);
            int phend = ceil((float)(h - roi_start_h + 1) / bin_size_h);
            int pwstart = floor((float)(w - roi_start_w) / bin_size_w);
            int pwend = ceil((float)(w - roi_start_w + 1) / bin_size_w);

            phstart = fminf(fmaxf(phstart, 0), pooled_height);
            phend = fminf(fmaxf(phend, 0), pooled_height);
            pwstart = fminf(fmaxf(pwstart, 0), pooled_width);
            pwend = fminf(fmaxf(pwend, 0), pooled_width);

            for (int ph = phstart; ph < phend; ++ph) {
                for (int pw = pwstart; pw < pwend; ++pw) {
                    const int pooled_index = (c * pooled_height + ph) * pooled_width + pw;
                    if (offset_argmax_data[pooled_index] == image_index)
                    {
                        gradient += offset_top_diff[pooled_index];
                    }
                }
            }
        }
        bottom_diff[index] = gradient;
    }
}


// Launches ROIPoolBackward on `stream` with one thread per input element.
// Prints the CUDA error and terminates the process if the launch fails;
// otherwise returns 1.
int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* bottom_diff, const int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    const int output_size = batch_size * height * width * channels;
    cudaError_t err;

    ROIPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
        output_size, top_diff, argmax_data, num_rois, spatial_scale, height, width, channels, pooled_height,
        pooled_width, bottom_diff, bottom_rois);

    err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    return 1;
}


#ifdef __cplusplus
}
#endif
const float spatial_scale, const int num_rois, const int height, 10 | const int width, const int channels, const int pooled_height, 11 | const int pooled_width, const float* bottom_rois, 12 | float* top_data, int* argmax_data, cudaStream_t stream); 13 | 14 | 15 | int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 16 | const int height, const int width, const int channels, const int pooled_height, 17 | const int pooled_width, const float* bottom_rois, 18 | float* bottom_diff, const int* argmax_data, cudaStream_t stream); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | 26 | -------------------------------------------------------------------------------- /faster_rcnn/roi_pooling/src/roi_pooling.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 5 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output) 6 | { 7 | // Grab the input tensor 8 | float * data_flat = THFloatTensor_data(features); 9 | float * rois_flat = THFloatTensor_data(rois); 10 | 11 | float * output_flat = THFloatTensor_data(output); 12 | 13 | // Number of ROIs 14 | int num_rois = THFloatTensor_size(rois, 0); 15 | int size_rois = THFloatTensor_size(rois, 1); 16 | // batch size 17 | int batch_size = THFloatTensor_size(features, 0); 18 | if(batch_size != 1) 19 | { 20 | return 0; 21 | } 22 | // data height 23 | int data_height = THFloatTensor_size(features, 1); 24 | // data width 25 | int data_width = THFloatTensor_size(features, 2); 26 | // Number of channels 27 | int num_channels = THFloatTensor_size(features, 3); 28 | 29 | // Set all element of the output tensor to -inf. 
30 | THFloatStorage_fill(THFloatTensor_storage(output), -1); 31 | 32 | // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R 33 | int index_roi = 0; 34 | int index_output = 0; 35 | int n; 36 | for (n = 0; n < num_rois; ++n) 37 | { 38 | int roi_batch_ind = rois_flat[index_roi + 0]; 39 | int roi_start_w = round(rois_flat[index_roi + 1] * spatial_scale); 40 | int roi_start_h = round(rois_flat[index_roi + 2] * spatial_scale); 41 | int roi_end_w = round(rois_flat[index_roi + 3] * spatial_scale); 42 | int roi_end_h = round(rois_flat[index_roi + 4] * spatial_scale); 43 | // CHECK_GE(roi_batch_ind, 0); 44 | // CHECK_LT(roi_batch_ind, batch_size); 45 | 46 | int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1); 47 | int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1); 48 | float bin_size_h = (float)(roi_height) / (float)(pooled_height); 49 | float bin_size_w = (float)(roi_width) / (float)(pooled_width); 50 | 51 | int index_data = roi_batch_ind * data_height * data_width * num_channels; 52 | const int output_area = pooled_width * pooled_height; 53 | 54 | int c, ph, pw; 55 | for (ph = 0; ph < pooled_height; ++ph) 56 | { 57 | for (pw = 0; pw < pooled_width; ++pw) 58 | { 59 | int hstart = (floor((float)(ph) * bin_size_h)); 60 | int wstart = (floor((float)(pw) * bin_size_w)); 61 | int hend = (ceil((float)(ph + 1) * bin_size_h)); 62 | int wend = (ceil((float)(pw + 1) * bin_size_w)); 63 | 64 | hstart = fminf(fmaxf(hstart + roi_start_h, 0), data_height); 65 | hend = fminf(fmaxf(hend + roi_start_h, 0), data_height); 66 | wstart = fminf(fmaxf(wstart + roi_start_w, 0), data_width); 67 | wend = fminf(fmaxf(wend + roi_start_w, 0), data_width); 68 | 69 | const int pool_index = index_output + (ph * pooled_width + pw); 70 | int is_empty = (hend <= hstart) || (wend <= wstart); 71 | if (is_empty) 72 | { 73 | for (c = 0; c < num_channels * output_area; c += output_area) 74 | { 75 | output_flat[pool_index + c] = 0; 76 | } 77 | } 78 | else 79 | { 80 | int h, w, c; 81 | for (h = 
hstart; h < hend; ++h) 82 | { 83 | for (w = wstart; w < wend; ++w) 84 | { 85 | for (c = 0; c < num_channels; ++c) 86 | { 87 | const int index = (h * data_width + w) * num_channels + c; 88 | if (data_flat[index_data + index] > output_flat[pool_index + c * output_area]) 89 | { 90 | output_flat[pool_index + c * output_area] = data_flat[index_data + index]; 91 | } 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | 99 | // Increment ROI index 100 | index_roi += size_rois; 101 | index_output += pooled_height * pooled_width * num_channels; 102 | } 103 | return 1; 104 | } -------------------------------------------------------------------------------- /faster_rcnn/roi_pooling/src/roi_pooling.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); -------------------------------------------------------------------------------- /faster_rcnn/roi_pooling/src/roi_pooling_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda/roi_pooling_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | int * argmax_flat = THCudaIntTensor_data(state, argmax); 16 | 17 | // Number of ROIs 18 | int num_rois = THCudaTensor_size(state, rois, 0); 19 | int size_rois = THCudaTensor_size(state, rois, 1); 20 | if (size_rois != 5) 21 | { 22 | return 0; 23 | } 24 | 25 | // batch size 26 | int batch_size = THCudaTensor_size(state, features, 0); 
27 | if (batch_size != 1) 28 | { 29 | return 0; 30 | } 31 | // data height 32 | int data_height = THCudaTensor_size(state, features, 2); 33 | // data width 34 | int data_width = THCudaTensor_size(state, features, 3); 35 | // Number of channels 36 | int num_channels = THCudaTensor_size(state, features, 1); 37 | 38 | cudaStream_t stream = THCState_getCurrentStream(state); 39 | 40 | ROIPoolForwardLaucher( 41 | data_flat, spatial_scale, num_rois, data_height, 42 | data_width, num_channels, pooled_height, 43 | pooled_width, rois_flat, 44 | output_flat, argmax_flat, stream); 45 | 46 | return 1; 47 | } 48 | 49 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 50 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax) 51 | { 52 | // Grab the input tensor 53 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 54 | float * rois_flat = THCudaTensor_data(state, rois); 55 | 56 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 57 | int * argmax_flat = THCudaIntTensor_data(state, argmax); 58 | 59 | // Number of ROIs 60 | int num_rois = THCudaTensor_size(state, rois, 0); 61 | int size_rois = THCudaTensor_size(state, rois, 1); 62 | if (size_rois != 5) 63 | { 64 | return 0; 65 | } 66 | 67 | // batch size 68 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 69 | if (batch_size != 1) 70 | { 71 | return 0; 72 | } 73 | // data height 74 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 75 | // data width 76 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 77 | // Number of channels 78 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 79 | 80 | cudaStream_t stream = THCState_getCurrentStream(state); 81 | ROIPoolBackwardLaucher( 82 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 83 | data_width, num_channels, pooled_height, 84 | pooled_width, rois_flat, 85 | bottom_grad_flat, argmax_flat, stream); 86 | 87 | 
return 1; 88 | } -------------------------------------------------------------------------------- /faster_rcnn/roi_pooling/src/roi_pooling_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax); 3 | 4 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax); -------------------------------------------------------------------------------- /faster_rcnn/rpn_msr/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /faster_rcnn/rpn_msr/generate.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import cv2 10 | import matplotlib.pyplot as plt 11 | 12 | from ..utils.blob import im_list_to_blob 13 | from ..utils.timer import Timer 14 | 15 | # TODO: make fast_rcnn irrelevant 16 | # >>>> obsolete, because it depends on sth outside of this project 17 | from ..fast_rcnn.config import cfg 18 | # <<<< obsolete 19 | 20 | 21 | def _vis_proposals(im, dets, thresh=0.5): 22 | """Draw detected 
bounding boxes.""" 23 | inds = np.where(dets[:, -1] >= thresh)[0] 24 | if len(inds) == 0: 25 | return 26 | 27 | class_name = 'obj' 28 | im = im[:, :, (2, 1, 0)] 29 | fig, ax = plt.subplots(figsize=(12, 12)) 30 | ax.imshow(im, aspect='equal') 31 | for i in inds: 32 | bbox = dets[i, :4] 33 | score = dets[i, -1] 34 | 35 | ax.add_patch( 36 | plt.Rectangle((bbox[0], bbox[1]), 37 | bbox[2] - bbox[0], 38 | bbox[3] - bbox[1], fill=False, 39 | edgecolor='red', linewidth=3.5) 40 | ) 41 | ax.text(bbox[0], bbox[1] - 2, 42 | '{:s} {:.3f}'.format(class_name, score), 43 | bbox=dict(facecolor='blue', alpha=0.5), 44 | fontsize=14, color='white') 45 | 46 | ax.set_title(('{} detections with ' 47 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 48 | thresh), 49 | fontsize=14) 50 | plt.axis('off') 51 | plt.tight_layout() 52 | plt.draw() 53 | 54 | def _get_image_blob(im): 55 | """Converts an image into a network input. 56 | 57 | Arguments: 58 | im (ndarray): a color image in BGR order 59 | 60 | Returns: 61 | blob (ndarray): a data blob holding an image pyramid 62 | im_scale_factors (list): list of image scales (relative to im) used 63 | in the image pyramid 64 | """ 65 | im_orig = im.astype(np.float32, copy=True) 66 | im_orig -= cfg.PIXEL_MEANS 67 | 68 | processed_ims = [] 69 | 70 | assert len(cfg.TEST.SCALES_BASE) == 1 71 | im_scale = cfg.TRAIN.SCALES_BASE[0] 72 | 73 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 74 | interpolation=cv2.INTER_LINEAR) 75 | im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :] 76 | processed_ims.append(im) 77 | 78 | # Create a blob to hold the input images 79 | blob = im_list_to_blob(processed_ims) 80 | 81 | return blob, im_info 82 | 83 | def im_proposals(net, im): 84 | """Generate RPN proposals on a single image.""" 85 | blobs = {} 86 | blobs['data'], blobs['im_info'] = _get_image_blob(im) 87 | net.blobs['data'].reshape(*(blobs['data'].shape)) 88 | net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) 89 | blobs_out = 
net.forward( 90 | data=blobs['data'].astype(np.float32, copy=False), 91 | im_info=blobs['im_info'].astype(np.float32, copy=False)) 92 | 93 | scale = blobs['im_info'][0, 2] 94 | boxes = blobs_out['rois'][:, 1:].copy() / scale 95 | scores = blobs_out['scores'].copy() 96 | return boxes, scores 97 | 98 | def imdb_proposals(net, imdb): 99 | """Generate RPN proposals on all images in an imdb.""" 100 | 101 | _t = Timer() 102 | imdb_boxes = [[] for _ in xrange(imdb.num_images)] 103 | for i in xrange(imdb.num_images): 104 | im = cv2.imread(imdb.image_path_at(i)) 105 | _t.tic() 106 | imdb_boxes[i], scores = im_proposals(net, im) 107 | _t.toc() 108 | print 'im_proposals: {:d}/{:d} {:.3f}s' \ 109 | .format(i + 1, imdb.num_images, _t.average_time) 110 | if 0: 111 | dets = np.hstack((imdb_boxes[i], scores)) 112 | # from IPython import embed; embed() 113 | _vis_proposals(im, dets[:3, :], thresh=0.9) 114 | plt.show() 115 | 116 | return imdb_boxes 117 | 118 | def imdb_proposals_det(net, imdb): 119 | """Generate RPN proposals on all images in an imdb.""" 120 | 121 | _t = Timer() 122 | imdb_boxes = [[] for _ in xrange(imdb.num_images)] 123 | for i in xrange(imdb.num_images): 124 | im = cv2.imread(imdb.image_path_at(i)) 125 | _t.tic() 126 | boxes, scores = im_proposals(net, im) 127 | _t.toc() 128 | print 'im_proposals: {:d}/{:d} {:.3f}s' \ 129 | .format(i + 1, imdb.num_images, _t.average_time) 130 | dets = np.hstack((boxes, scores)) 131 | imdb_boxes[i] = dets 132 | 133 | if 0: 134 | # from IPython import embed; embed() 135 | _vis_proposals(im, dets[:3, :], thresh=0.9) 136 | plt.show() 137 | 138 | return imdb_boxes 139 | -------------------------------------------------------------------------------- /faster_rcnn/rpn_msr/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE 
for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | #array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., -79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 38 | scales=2**np.arange(3, 6)): 39 | """ 40 | Generate anchor (reference) windows by enumerating aspect ratios X 41 | scales wrt a reference (0, 0, 15, 15) window. 42 | """ 43 | 44 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 45 | ratio_anchors = _ratio_enum(base_anchor, ratios) 46 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 47 | for i in xrange(ratio_anchors.shape[0])]) 48 | return anchors 49 | 50 | def _whctrs(anchor): 51 | """ 52 | Return width, height, x center, and y center for an anchor (window). 53 | """ 54 | 55 | w = anchor[2] - anchor[0] + 1 56 | h = anchor[3] - anchor[1] + 1 57 | x_ctr = anchor[0] + 0.5 * (w - 1) 58 | y_ctr = anchor[1] + 0.5 * (h - 1) 59 | return w, h, x_ctr, y_ctr 60 | 61 | def _mkanchors(ws, hs, x_ctr, y_ctr): 62 | """ 63 | Given a vector of widths (ws) and heights (hs) around a center 64 | (x_ctr, y_ctr), output a set of anchors (windows). 
65 | """ 66 | 67 | ws = ws[:, np.newaxis] 68 | hs = hs[:, np.newaxis] 69 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 70 | y_ctr - 0.5 * (hs - 1), 71 | x_ctr + 0.5 * (ws - 1), 72 | y_ctr + 0.5 * (hs - 1))) 73 | return anchors 74 | 75 | def _ratio_enum(anchor, ratios): 76 | """ 77 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 78 | """ 79 | 80 | w, h, x_ctr, y_ctr = _whctrs(anchor) 81 | size = w * h 82 | size_ratios = size / ratios 83 | ws = np.round(np.sqrt(size_ratios)) 84 | hs = np.round(ws * ratios) 85 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 86 | return anchors 87 | 88 | def _scale_enum(anchor, scales): 89 | """ 90 | Enumerate a set of anchors for each scale wrt an anchor. 91 | """ 92 | 93 | w, h, x_ctr, y_ctr = _whctrs(anchor) 94 | ws = w * scales 95 | hs = h * scales 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | if __name__ == '__main__': 100 | import time 101 | t = time.time() 102 | a = generate_anchors() 103 | print time.time() - t 104 | print a 105 | from IPython import embed; embed() 106 | -------------------------------------------------------------------------------- /faster_rcnn/rpn_msr/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import yaml 10 | 11 | from .generate_anchors import generate_anchors 12 | 13 | # TODO: make fast_rcnn irrelevant 14 | # >>>> obsolete, because it depends on sth outside of this project 15 | from ..fast_rcnn.config import cfg 16 | from ..fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 17 | from ..fast_rcnn.nms_wrapper import nms 18 | 19 | # <<<< obsolete 20 | 21 | 22 | DEBUG = False 23 | 
def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key, _feat_stride=[16, ],
                   anchor_scales=[8, 16, 32]):
    """
    Output object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Parameters
    ----------
    rpn_cls_prob_reshape: (1, 2xA, H, W) RPN class probabilities; the first A
                          channels are bg probs, the last A channels fg probs
    rpn_bbox_pred: (1, Ax4, H, W) box regression output of the RPN
    im_info: indexable whose first row is [image_height, image_width, scale]
    cfg_key: key into cfg selecting the RPN settings, e.g. 'TRAIN' or 'TEST'
    _feat_stride: the downsampling ratio of feature map to the original input image
    anchor_scales: the scales applied to the base anchor
    ----------
    Returns
    ----------
    rpn_rois : (N, 5) float32 array, rows are [0, x1, y1, x2, y2]

    # Algorithm:
    #
    # for each (H, W) location i
    #   generate A anchor boxes centered on cell i
    #   apply predicted bbox deltas at cell i to each of the A anchors
    # clip predicted boxes to image
    # remove predicted boxes with either height or width < threshold
    # sort all (proposal, score) pairs by score from highest to lowest
    # take top pre_nms_topN proposals before NMS
    # apply NMS with the configured threshold to remaining proposals
    # take post_nms_topN proposals after NMS
    # return the top proposals
    """
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]
    im_info = im_info[0]

    assert rpn_cls_prob_reshape.shape[0] == 1, \
        'Only single item batches are supported'
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    min_size = cfg[cfg_key].RPN_MIN_SIZE

    # the first set of _num_anchors channels are bg probs
    # the second set are the fg probs, which we want
    scores = rpn_cls_prob_reshape[:, _num_anchors:, :, :]
    bbox_deltas = rpn_bbox_pred

    if DEBUG:
        # Single-argument call form prints identically on Python 2 and 3.
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))

    # 1. Generate proposals from bbox deltas and shifted anchors
    height, width = scores.shape[-2:]

    if DEBUG:
        print('score map size: {}'.format(scores.shape))

    # Enumerate all shifts
    shift_x = np.arange(0, width) * _feat_stride
    shift_y = np.arange(0, height) * _feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = _num_anchors
    K = shifts.shape[0]
    anchors = _anchors.reshape((1, A, 4)) + \
        shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))

    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors:
    #
    # bbox deltas will be (1, 4 * A, H, W) format
    # transpose to (1, H, W, 4 * A)
    # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
    # in slowest to fastest order
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

    # Same story for the scores:
    #
    # scores are (1, A, H, W) format
    # transpose to (1, H, W, A)
    # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    # Convert anchors into proposals via bbox transformations
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # 6. apply nms
    # 7. take post_nms_topN
    # 8. return the top proposals (-> RoIs top)
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Output rois blob
    # Our RPN implementation only supports a single input image, so all
    # batch inds are 0
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    return blob
= scores 176 | 177 | 178 | def _filter_boxes(boxes, min_size): 179 | """Remove all boxes with any side smaller than min_size.""" 180 | ws = boxes[:, 2] - boxes[:, 0] + 1 181 | hs = boxes[:, 3] - boxes[:, 1] + 1 182 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 183 | return keep 184 | 185 | 186 | def _filter_irregular_boxes(boxes, min_ratio=0.2, max_ratio=5): 187 | """Remove all boxes with any side smaller than min_size.""" 188 | ws = boxes[:, 2] - boxes[:, 0] + 1 189 | hs = boxes[:, 3] - boxes[:, 1] + 1 190 | rs = ws / hs 191 | keep = np.where((rs <= max_ratio) & (rs >= min_ratio))[0] 192 | return keep 193 | -------------------------------------------------------------------------------- /faster_rcnn/rpn_msr/proposal_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import yaml 9 | import numpy as np 10 | import numpy.random as npr 11 | import pdb 12 | 13 | from ..utils.cython_bbox import bbox_overlaps, bbox_intersections 14 | 15 | # TODO: make fast_rcnn irrelevant 16 | # >>>> obsolete, because it depends on sth outside of this project 17 | from ..fast_rcnn.config import cfg 18 | from ..fast_rcnn.bbox_transform import bbox_transform 19 | 20 | # <<<< obsolete 21 | 22 | DEBUG = False 23 | 24 | 25 | def proposal_target_layer(rpn_rois, gt_boxes, gt_ishard, dontcare_areas, _num_classes): 26 | """ 27 | Assign object detection proposals to ground-truth targets. Produces proposal 28 | classification labels and bounding-box regression targets. 
29 | Parameters 30 | ---------- 31 | rpn_rois: (1 x H x W x A, 5) [0, x1, y1, x2, y2] 32 | gt_boxes: (G, 5) [x1 ,y1 ,x2, y2, class] int 33 | gt_ishard: (G, 1) {0 | 1} 1 indicates hard 34 | dontcare_areas: (D, 4) [ x1, y1, x2, y2] 35 | _num_classes 36 | ---------- 37 | Returns 38 | ---------- 39 | rois: (1 x H x W x A, 5) [0, x1, y1, x2, y2] 40 | labels: (1 x H x W x A, 1) {0,1,...,_num_classes-1} 41 | bbox_targets: (1 x H x W x A, K x4) [dx1, dy1, dx2, dy2] 42 | bbox_inside_weights: (1 x H x W x A, Kx4) 0, 1 masks for the computing loss 43 | bbox_outside_weights: (1 x H x W x A, Kx4) 0, 1 masks for the computing loss 44 | """ 45 | 46 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 47 | # (i.e., rpn.proposal_layer.ProposalLayer), or any other source 48 | all_rois = rpn_rois 49 | # TODO(rbg): it's annoying that sometimes I have extra info before 50 | # and other times after box coordinates -- normalize to one format 51 | 52 | # Include ground-truth boxes in the set of candidate rois 53 | if cfg.TRAIN.PRECLUDE_HARD_SAMPLES and gt_ishard is not None and gt_ishard.shape[0] > 0: 54 | assert gt_ishard.shape[0] == gt_boxes.shape[0] 55 | gt_ishard = gt_ishard.astype(int) 56 | gt_easyboxes = gt_boxes[gt_ishard != 1, :] 57 | else: 58 | gt_easyboxes = gt_boxes 59 | 60 | """ 61 | add the ground-truth to rois will cause zero loss! 
def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Expand compact regression targets into the N x 4K layout.

    Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th).

    Every row whose class is > 0 gets its four targets written to columns
    4:8, whatever the class id is; rows with class <= 0 stay all-zero.
    NOTE(review): writing to a fixed slot looks like class-agnostic
    (background / foreground) regression -- confirm against the network head.

    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """

    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)

    # Rows that carry a foreground class.
    inds = np.where(clss > 0)[0]
    if inds.size > 0:
        # The original per-row loop computed `4 * (1 if cls > 0 else 0)`;
        # since inds only holds cls > 0 that is always 4, so all foreground
        # rows can be filled with one vectorised assignment.
        start = 4
        end = start + 4
        bbox_targets[inds, start:end] = bbox_target_data[inds, 1:]
        bbox_inside_weights[inds, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
    return bbox_targets, bbox_inside_weights
149 | """ 150 | # overlaps: (rois x gt_boxes) 151 | overlaps = bbox_overlaps( 152 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 153 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 154 | gt_assignment = overlaps.argmax(axis=1) 155 | max_overlaps = overlaps.max(axis=1) 156 | labels = gt_boxes[gt_assignment, 4] 157 | 158 | # Select foreground RoIs as those with >= FG_THRESH overlap 159 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] 160 | # Guard against the case when an image has fewer than fg_rois_per_image 161 | # foreground RoIs 162 | fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) 163 | # Sample foreground regions without replacement 164 | if fg_inds.size > 0: 165 | fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) 166 | 167 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 168 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & 169 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 170 | # Compute number of background RoIs to take from this image (guarding 171 | # against there being fewer than desired) 172 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 173 | bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) 174 | # Sample background regions without replacement 175 | if bg_inds.size > 0: 176 | bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) 177 | 178 | # The indices that we're selecting (both fg and bg) 179 | keep_inds = np.append(fg_inds, bg_inds) 180 | # print 'proposal_target_layer:', keep_inds 181 | 182 | # Select sampled values from various arrays: 183 | labels = labels[keep_inds] 184 | # Clamp labels for the background RoIs to 0 185 | labels[fg_rois_per_this_image:] = 0 186 | rois = all_rois[keep_inds] 187 | 188 | # print 'proposal_target_layer:', rois 189 | bbox_target_data = _compute_targets( 190 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) 191 | 192 | # print 
'proposal_target_layer:', bbox_target_data 193 | bbox_targets, bbox_inside_weights = \ 194 | _get_bbox_regression_labels(bbox_target_data, num_classes) 195 | 196 | return labels, rois, bbox_targets, bbox_inside_weights 197 | 198 | 199 | def _jitter_gt_boxes(gt_boxes, jitter=0.05): 200 | """ jitter the gtboxes, before adding them into rois, to be more robust for cls and rgs 201 | gt_boxes: (G, 5) [x1 ,y1 ,x2, y2, class] int 202 | """ 203 | jittered_boxes = gt_boxes.copy() 204 | ws = jittered_boxes[:, 2] - jittered_boxes[:, 0] + 1.0 205 | hs = jittered_boxes[:, 3] - jittered_boxes[:, 1] + 1.0 206 | width_offset = (np.random.rand(jittered_boxes.shape[0]) - 0.5) * jitter * ws 207 | height_offset = (np.random.rand(jittered_boxes.shape[0]) - 0.5) * jitter * hs 208 | jittered_boxes[:, 0] += width_offset 209 | jittered_boxes[:, 2] += width_offset 210 | jittered_boxes[:, 1] += height_offset 211 | jittered_boxes[:, 3] += height_offset 212 | 213 | return jittered_boxes 214 | 215 | -------------------------------------------------------------------------------- /faster_rcnn/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | 
def locate_cuda():
    """Locate the CUDA environment on the system

    Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
    and values giving the absolute path to each directory.

    Starts by looking for the CUDAHOME env variable. If not found, everything
    is based on finding 'nvcc' in the PATH (with /usr/local/cuda/bin appended
    as a last-resort search location).

    Raises EnvironmentError when nvcc cannot be found or when any of the
    resulting paths does not exist on disk.
    """

    # first check if the CUDAHOME env variable is in use
    if 'CUDAHOME' in os.environ:
        home = os.environ['CUDAHOME']
        nvcc = pjoin(home, 'bin', 'nvcc')
    else:
        # otherwise, search the PATH for NVCC
        default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin')
        nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path)
        if nvcc is None:
            raise EnvironmentError('The nvcc binary could not be '
                                   'located in your $PATH. Either add it to your path, or set $CUDAHOME')
        # nvcc lives in <home>/bin/nvcc, so go two levels up
        home = os.path.dirname(os.path.dirname(nvcc))

    cudaconfig = {'home': home, 'nvcc': nvcc,
                  'include': pjoin(home, 'include'),
                  'lib64': pjoin(home, 'lib64')}
    # .items() instead of the Python-2-only .iteritems(): same behaviour on
    # Python 2, and the script no longer crashes on Python 3.
    for k, v in cudaconfig.items():
        if not os.path.exists(v):
            raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))

    return cudaconfig
# run the customize_compiler
class custom_build_ext(build_ext):
    """build_ext command that patches the compiler object before building."""

    def build_extensions(self):
        """Apply customize_compiler_for_nvcc to self.compiler, then run the stock build."""
        # The patch has to happen here because self.compiler only exists once
        # the command is running; afterwards the base implementation is used
        # unchanged.
        customize_compiler_for_nvcc(self.compiler)
        build_ext.build_extensions(self)
include_dirs=[numpy_include] 132 | ), 133 | Extension('nms.gpu_nms', 134 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 135 | library_dirs=[CUDA['lib64']], 136 | libraries=['cudart'], 137 | language='c++', 138 | runtime_library_dirs=[CUDA['lib64']], 139 | # this syntax is specific to this build system 140 | # we're only going to use certain compiler args with nvcc and not with gcc 141 | # the implementation of this trick is in customize_compiler() below 142 | extra_compile_args={'gcc': ["-Wno-unused-function"], 143 | 'nvcc': ['-arch=sm_35', 144 | '--ptxas-options=-v', 145 | '-c', 146 | '--compiler-options', 147 | "'-fPIC'"]}, 148 | include_dirs=[numpy_include, CUDA['include']] 149 | ), 150 | Extension( 151 | 'pycocotools._mask', 152 | sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], 153 | include_dirs=[numpy_include, 'pycocotools'], 154 | extra_compile_args={ 155 | 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, 156 | ), 157 | ] 158 | 159 | setup( 160 | name='fast_rcnn', 161 | ext_modules=ext_modules, 162 | # inject our custom trigger 163 | cmdclass={'build_ext': custom_build_ext}, 164 | ) 165 | -------------------------------------------------------------------------------- /faster_rcnn/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /faster_rcnn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from . import cython_nms 8 | from . 
def bbox_overlaps(np.ndarray[DTYPE_t, ndim=2] boxes,
                  np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """Python-callable entry point; forwards both arrays to bbox_overlaps_c.

    Both arguments must be 2-D arrays of DTYPE_t; the result of
    bbox_overlaps_c is returned unchanged.
    """
    return bbox_overlaps_c(boxes, query_boxes)
cdef np.ndarray[DTYPE_t, ndim=2] bbox_intersections_c(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    For each (box, query box) pair compute the fraction of the query box's
    area that is covered by the box (intersection area / query box area).

    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    intersec: (N, K) ndarray, intersec[n, k] is the covered fraction of
        query_boxes[k] by boxes[n]; 0 where the two do not intersect
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] intersec = np.zeros((N, K), dtype=DTYPE)
    # (the unused `ua` local of the original was dropped: no union area is
    # needed here, only the query box area)
    cdef DTYPE_t iw, ih, box_area
    cdef unsigned int k, n
    for k in range(K):
        # area of the query box, inclusive-pixel (+1) convention
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            # width of the intersection; <= 0 means no horizontal overlap
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                # height of the intersection; <= 0 means no vertical overlap
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    intersec[n, k] = iw * ih / box_area
    return intersec
def get_boxes_grid(image_height, image_width):
    """
    Return the boxes on image grid.
    calling this function when cfg.IS_MULTISCALE is True, otherwise, calling rdl_roidb.prepare_roidb(imdb) instead.

    One box per (feature-map cell, aspect ratio) pair is generated, each with
    area cfg.TRAIN.KERNEL_SIZE ** 2 on the feature map, then divided by
    cfg.TRAIN.SPATIAL_SCALE.

    Returns
    -------
    boxes_grid: (num_cells * num_aspects, 4) array of [x1, y1, x2, y2]
    centers_x, centers_y: 1-D arrays with the cell centre of every box

    Raises
    ------
    ValueError: if cfg.NET_NAME is neither 'CaffeNet' nor 'VGGnet'
    """

    # fixed a bug, change cfg.TRAIN.SCALES to cfg.TRAIN.SCALES_BASE
    # because a ratio around 1.0 is needed here, not the actual size.
    scale = max(cfg.TRAIN.SCALES_BASE)

    # height and width of the feature map
    if cfg.NET_NAME == 'CaffeNet':
        height = np.floor((image_height * scale - 1) / 4.0 + 1)
        height = np.floor((height - 1) / 2.0 + 1 + 0.5)
        height = np.floor((height - 1) / 2.0 + 1 + 0.5)

        width = np.floor((image_width * scale - 1) / 4.0 + 1)
        width = np.floor((width - 1) / 2.0 + 1 + 0.5)
        width = np.floor((width - 1) / 2.0 + 1 + 0.5)
    elif cfg.NET_NAME == 'VGGnet':
        height = np.floor(image_height * scale / 2.0 + 0.5)
        height = np.floor(height / 2.0 + 0.5)
        height = np.floor(height / 2.0 + 0.5)
        height = np.floor(height / 2.0 + 0.5)

        width = np.floor(image_width * scale / 2.0 + 0.5)
        width = np.floor(width / 2.0 + 0.5)
        width = np.floor(width / 2.0 + 0.5)
        width = np.floor(width / 2.0 + 0.5)
    else:
        # The original `assert (1), ...` could never fire, so an unsupported
        # network fell through to an unbound `height` below.
        raise ValueError('The network architecture is not supported in utils.get_boxes_grid!')

    # compute the grid box centers
    h = np.arange(height)
    w = np.arange(width)
    y, x = np.meshgrid(h, w, indexing='ij')
    centers = np.dstack((x, y))
    centers = np.reshape(centers, (-1, 2))
    num = centers.shape[0]

    # compute width and height of grid box
    area = cfg.TRAIN.KERNEL_SIZE * cfg.TRAIN.KERNEL_SIZE
    aspect = cfg.TRAIN.ASPECTS  # height / width
    num_aspect = len(aspect)
    widths = np.zeros((1, num_aspect), dtype=np.float32)
    heights = np.zeros((1, num_aspect), dtype=np.float32)
    for i in range(num_aspect):
        # float() guards against Python 2 floor division when both the kernel
        # size and the aspect are integers.
        widths[0, i] = math.sqrt(float(area) / aspect[i])
        heights[0, i] = widths[0, i] * aspect[i]

    # construct grid boxes: every centre is repeated once per aspect, and the
    # per-aspect sizes are tiled once per centre so rows line up
    centers = np.repeat(centers, num_aspect, axis=0)
    widths = np.tile(widths, num).transpose()
    heights = np.tile(heights, num).transpose()

    x1 = np.reshape(centers[:, 0], (-1, 1)) - widths * 0.5
    x2 = np.reshape(centers[:, 0], (-1, 1)) + widths * 0.5
    y1 = np.reshape(centers[:, 1], (-1, 1)) - heights * 0.5
    y2 = np.reshape(centers[:, 1], (-1, 1)) + heights * 0.5

    boxes_grid = np.hstack((x1, y1, x2, y2)) / cfg.TRAIN.SPATIAL_SCALE

    return boxes_grid, centers[:, 0], centers[:, 1]
def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
    """Greedy non-maximum suppression.

    dets: (N, 5) float32 array of [x1, y1, x2, y2, score]
    thresh: a lower-scoring box is suppressed when its IoU with an already
        kept box is >= thresh

    Returns the list of kept row indices into dets, highest score first.
    """
    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]

    # box areas, inclusive-pixel (+1) convention
    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    # indices of dets sorted by descending score
    cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]

    cdef int ndets = dets.shape[0]
    # np.int_ replaces the np.int alias, which was removed in NumPy 1.24;
    # it is the same default integer dtype the alias resolved to.
    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
            np.zeros((ndets), dtype=np.int_)

    # nominal indices
    cdef int _i, _j
    # sorted indices
    cdef int i, j
    # temp variables for box i's (the box currently under consideration)
    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
    # variables for computing overlap with box j (lower scoring box)
    cdef np.float32_t xx1, yy1, xx2, yy2
    cdef np.float32_t w, h
    cdef np.float32_t inter, ovr

    keep = []
    for _i in range(ndets):
        i = order[_i]
        if suppressed[i] == 1:
            continue
        keep.append(i)
        ix1 = x1[i]
        iy1 = y1[i]
        ix2 = x2[i]
        iy2 = y2[i]
        iarea = areas[i]
        # compare against every remaining lower-scoring box
        for _j in range(_i + 1, ndets):
            j = order[_j]
            if suppressed[j] == 1:
                continue
            xx1 = max(ix1, x1[j])
            yy1 = max(iy1, y1[j])
            xx2 = min(ix2, x2[j])
            yy2 = min(iy2, y2[j])
            w = max(0.0, xx2 - xx1 + 1)
            h = max(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (iarea + areas[j] - inter)
            if ovr >= thresh:
                suppressed[j] = 1

    return keep
class Timer(object):
    """Accumulating stopwatch driven by tic() / toc() pairs."""

    def __init__(self):
        # running totals over all toc() calls
        self.total_time = 0.
        self.calls = 0
        # state of the interval currently / most recently measured
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.

    def tic(self):
        """Mark the start of an interval."""
        # using time.time instead of time.clock because time time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        """Close the interval opened by tic() and update the statistics.

        Returns the mean interval length so far when ``average`` is true,
        otherwise the length of the interval just closed.
        """
        elapsed = time.time() - self.start_time
        self.diff = elapsed
        self.total_time += elapsed
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff
def im_detect(net, image):
    """Detect object classes in an image given object proposals.

    Arguments:
        net: callable network that also provides get_image_blob(image)
        image: input image array (its .shape is used to clip the boxes)
    Returns:
        scores (ndarray): R x K array of object class scores (K includes
            background as object category 0)
        boxes (ndarray): R x (4*K) array of predicted bounding boxes
    """

    # Let the network build its own input blob; the first returned scale is
    # the factor used to map boxes back to the original image below.
    im_data, im_scales = net.get_image_blob(image)
    # presumably (height, width, scale) of the blob -- confirm blob layout
    im_info = np.array(
        [[im_data.shape[1], im_data.shape[2], im_scales[0]]],
        dtype=np.float32)

    cls_prob, bbox_pred, rois = net(im_data, im_info)
    scores = cls_prob.data.cpu().numpy()
    # Column 0 of rois is dropped (columns 1:5 are the box coordinates) and
    # the coordinates are divided by the scale to undo the resize.
    boxes = rois.data.cpu().numpy()[:, 1:5] / im_info[0][2]

    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas
        box_deltas = bbox_pred.data.cpu().numpy()
        pred_boxes = bbox_transform_inv(boxes, box_deltas)
        pred_boxes = clip_boxes(pred_boxes, image.shape)
    else:
        # Simply repeat the boxes, once for each class
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))

    return scores, pred_boxes
def test_net(name, net, imdb, max_per_image=300, thresh=0.05, vis=False):
    """Test a Fast R-CNN network on an image database.

    Runs ``im_detect`` on every image of ``imdb``, applies per-class score
    thresholding and NMS, optionally caps the number of detections per image,
    pickles the result to ``detections.pkl`` in the output directory and
    finally calls ``imdb.evaluate_detections``.

    Arguments:
        name: passed to ``get_output_dir`` to pick the output directory.
        net: detection network forwarded to ``im_detect``.
        imdb: image database providing ``image_index``, ``num_classes``,
            ``classes``, ``image_path_at`` and ``evaluate_detections``.
        max_per_image: if > 0, keep at most this many detections per image
            *over all classes* (highest scores win).
        thresh: minimum class score for a detection to be kept.
        vis: when true, draw the detections and show them with ``cv2.imshow``.

    Raises:
        IOError: if an image listed in ``imdb`` cannot be read.
    """
    num_images = len(imdb.image_index)
    num_classes = imdb.num_classes
    # all detections are collected into:
    #    all_boxes[cls][image] = N x 5 array of detections in
    #    (x1, y1, x2, y2, score)
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(num_classes)]

    output_dir = get_output_dir(imdb, name)

    # timers
    _t = {'im_detect': Timer(), 'misc': Timer()}
    det_file = os.path.join(output_dir, 'detections.pkl')

    for i in range(num_images):
        image_path = imdb.image_path_at(i)
        im = cv2.imread(image_path)
        if im is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than deep in the net.
            raise IOError('cannot read image: {}'.format(image_path))

        _t['im_detect'].tic()
        scores, boxes = im_detect(net, im)
        detect_time = _t['im_detect'].toc(average=False)

        _t['misc'].tic()
        if vis:
            im2show = np.copy(im)

        # skip j = 0, because it's the background class
        for j in range(1, num_classes):
            inds = np.where(scores[:, j] > thresh)[0]
            cls_scores = scores[inds, j]
            cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
            cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
                .astype(np.float32, copy=False)
            keep = nms(cls_dets, cfg.TEST.NMS)
            cls_dets = cls_dets[keep, :]
            if vis:
                im2show = vis_detections(im2show, imdb.classes[j], cls_dets)
            all_boxes[j][i] = cls_dets

        # Limit to max_per_image detections *over all classes*
        if max_per_image > 0:
            image_scores = np.hstack([all_boxes[j][i][:, -1]
                                      for j in range(1, num_classes)])
            if len(image_scores) > max_per_image:
                # score of the max_per_image-th best detection in this image
                image_thresh = np.sort(image_scores)[-max_per_image]
                for j in range(1, num_classes):
                    keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
                    all_boxes[j][i] = all_boxes[j][i][keep, :]
        nms_time = _t['misc'].toc(average=False)

        # Single-argument print(...) behaves the same as the print statement
        # and matches the call form used elsewhere in this file.
        print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s'
              .format(i + 1, num_images, detect_time, nms_time))

        if vis:
            cv2.imshow('test', im2show)
            cv2.waitKey(1)

    with open(det_file, 'wb') as f:
        cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL)

    print('Evaluating detections')
    imdb.evaluate_detections(all_boxes, output_dir)


if __name__ == '__main__':
    # load data
    imdb = get_imdb(imdb_name)
    imdb.competition_mode(on=True)

    # load net
    net = FasterRCNN(classes=imdb.classes, debug=False)
    network.load_net(trained_model, net)
    print('load model successfully!')

    net.cuda()
    net.eval()

    # evaluation
    test_net(save_name, net, imdb, max_per_image, thresh=thresh, vis=vis)
import os
import torch
import numpy as np
import cv2
from datetime import datetime

from faster_rcnn import network
from faster_rcnn.faster_rcnn import FasterRCNN, RPN
from faster_rcnn.utils.timer import Timer

import faster_rcnn.roi_data_layer.roidb as rdl_roidb
from faster_rcnn.roi_data_layer.layer import RoIDataLayer
from faster_rcnn.datasets.factory import get_imdb
from faster_rcnn.fast_rcnn.config import cfg, cfg_from_file
import torchvision
import torch.nn as nn
import math

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def log_print(text):
    """Print one log line to stdout."""
    print(text)


def weight_init(m):
    """Initialise a single module; intended for use with ``net.apply``.

    Conv2d and Linear weights are drawn from a normal distribution with
    mean 0 and std 0.01 and their biases are zeroed; BatchNorm2d is reset to
    weight 1 / bias 0. Any other module type is left untouched.
    """
    if isinstance(m, nn.Conv2d):
        m.weight.data.normal_(0, 0.01)
        if m.bias is not None:
            m.bias.data.zero_()
    elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()
    elif isinstance(m, nn.Linear):
        m.weight.data.normal_(0, 0.01)
        m.bias.data.zero_()


# hyper-parameters
# ------------
imdb_name = 'voc_2007_trainval'
cfg_file = 'experiments/cfgs/faster_rcnn_end2end.yml'
pretrained_model = 'models/resnet101-caffe.pth'
output_dir = 'models/saved_model3'

start_step = 0
end_step = 200000
lr_decay_steps = {80000, 120000, 160000}
lr_decay = 1./10

rand_seed = 1024
_DEBUG = True
remove_all_log = False  # remove all historical experiments in TensorBoard
exp_name = None  # the previous experiment name in TensorBoard

# ------------

if rand_seed is not None:
    np.random.seed(rand_seed)

# load config
cfg_from_file(cfg_file)
lr = cfg.TRAIN.LEARNING_RATE
momentum = cfg.TRAIN.MOMENTUM
weight_decay = cfg.TRAIN.WEIGHT_DECAY
disp_interval = cfg.TRAIN.DISPLAY

# load data
imdb = get_imdb(imdb_name)
rdl_roidb.prepare_roidb(imdb)
roidb = imdb.roidb
data_layer = RoIDataLayer(roidb, imdb.num_classes)

# load net
net = FasterRCNN(classes=imdb.classes, debug=_DEBUG, training=True)

net.cuda()
net.train()
net.apply(weight_init)  # initialise every layer; the backbone is overwritten below

# ---- load the pretrained resnet101 weights into the backbone ----
pretrained_state = torch.load(pretrained_model)
# Build the backbone's state dict once instead of once per pretrained key.
resnet_keys = net.resnet.state_dict()
net.resnet.load_state_dict(
    {k: v for k, v in pretrained_state.items() if k in resnet_keys})

# Freeze the first backbone stages: their parameters get no gradient.
for frozen in (net.resnet.conv1, net.resnet.bn1,
               net.resnet.layer1, net.resnet.layer2):
    for p in frozen.parameters():
        p.requires_grad = False

# Only hand trainable parameters to the optimizer.
params = [p for p in net.parameters() if p.requires_grad]
# optimizer = torch.optim.Adam(params[-8:], lr=lr)
optimizer = torch.optim.SGD(params, lr=lr, momentum=momentum,
                            weight_decay=weight_decay)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# training
train_loss = 0
tp, tf, fg, bg = 0., 0., 0, 0
step_cnt = 0
re_cnt = False
time = Timer()
time.tic()
for step in range(start_step, end_step + 1):

    # get one batch
    blobs = data_layer.forward()
    im_data = blobs['data']
    im_info = blobs['im_info']  # ---[height, width, scale]
    gt_boxes = blobs['gt_boxes']  # ---[[x1, y1, x2, y2, label]]
    gt_ishard = blobs['gt_ishard']
    dontcare_areas = blobs['dontcare_areas']

    # forward; the losses are read back from attributes on the network
    net(im_data, im_info, gt_boxes, gt_ishard, dontcare_areas)
    loss = net.loss + net.rpn.loss
    if _DEBUG:
        tp += float(net.tp)
        tf += float(net.tf)
        fg += net.fg_cnt
        bg += net.bg_cnt

    train_loss += loss.data[0]
    step_cnt += 1

    # backward
    optimizer.zero_grad()
    loss.backward()
    network.clip_gradient(net, 10.)
    optimizer.step()

    if step % disp_interval == 0:
        duration = time.toc(average=False)
        fps = step_cnt / duration

        log_text = 'step %d, image: %s, loss: %.4f, fps: %.2f (%.2fs per batch)' % (
            step, blobs['im_name'], train_loss / step_cnt, fps, 1./fps)
        log_print(log_text)

        if _DEBUG:
            # A display window may contain no fg (or bg) samples at all;
            # report 0% instead of raising ZeroDivisionError.
            tp_pct = tp / fg * 100. if fg > 0 else 0.
            tf_pct = tf / bg * 100. if bg > 0 else 0.
            log_print('\tTP: %.2f%%, TF: %.2f%%, fg/bg=(%d/%d)' % (
                tp_pct, tf_pct, fg/step_cnt, bg/step_cnt))
            log_print('\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box: %.4f' % (
                net.rpn.cross_entropy.data.cpu().numpy()[0], net.rpn.loss_box.data.cpu().numpy()[0],
                net.cross_entropy.data.cpu().numpy()[0], net.loss_box.data.cpu().numpy()[0])
            )
        print(" ")
        re_cnt = True

    if (step % 40000 == 0) and step > 0:
        save_name = os.path.join(output_dir, 'faster_rcnn_{}.pth'.format(step))
        torch.save(net.state_dict(), save_name)
        print('save model: {}'.format(save_name))

    if step in lr_decay_steps:
        lr *= lr_decay
        # Update the learning rate in place: rebuilding the optimizer would
        # throw away the SGD momentum buffers accumulated so far.
        for group in optimizer.param_groups:
            group['lr'] = lr

    if re_cnt:
        # start a fresh statistics window after each display
        tp, tf, fg, bg = 0., 0., 0, 0
        train_loss = 0
        step_cnt = 0
        time.tic()
        re_cnt = False