├── lib ├── model │ ├── __init__.py │ ├── fpn │ │ └── __init__.py │ ├── nms │ │ ├── __init__.py │ │ ├── _ext │ │ │ ├── __init__.py │ │ │ └── nms │ │ │ │ └── __init__.py │ │ ├── .gitignore │ │ ├── make.sh │ │ ├── src │ │ │ ├── nms_cuda_kernel.h │ │ │ ├── nms_cuda.h │ │ │ ├── nms_cuda.c │ │ │ └── nms_cuda_kernel.cu │ │ ├── nms_gpu.py │ │ ├── nms_wrapper.py │ │ ├── build.py │ │ └── nms_kernel.cu │ ├── rpn │ │ ├── __init__.py │ │ ├── rpn_fpn.py │ │ ├── proposal_layer_fpn.py │ │ ├── generate_anchors.py │ │ ├── anchor_target_layer_fpn.py │ │ ├── proposal_target_layer.py │ │ └── bbox_transform.py │ ├── utils │ │ ├── __init__.py │ │ ├── .gitignore │ │ ├── blob.py │ │ ├── logger.py │ │ └── bbox.pyx │ ├── roi_align │ │ ├── __init__.py │ │ ├── _ext │ │ │ ├── __init__.py │ │ │ └── roi_align │ │ │ │ └── __init__.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── roi_align.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── roi_align.py │ │ ├── make.sh │ │ ├── src │ │ │ ├── roi_align_cuda.h │ │ │ ├── roi_align_kernel.h │ │ │ ├── roi_align_cuda.c │ │ │ └── roi_align_kernel.cu │ │ └── build.py │ ├── roi_crop │ │ ├── __init__.py │ │ ├── _ext │ │ │ ├── __init__.py │ │ │ ├── roi_crop │ │ │ │ └── __init__.py │ │ │ └── crop_resize │ │ │ │ └── __init__.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── roi_crop.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ ├── roi_crop.py │ │ │ ├── crop_resize.py │ │ │ └── gridgen.py │ │ ├── make.sh │ │ ├── src │ │ │ ├── roi_crop_cuda.h │ │ │ ├── roi_crop.h │ │ │ ├── roi_crop_cuda_kernel.h │ │ │ └── roi_crop_cuda.c │ │ └── build.py │ └── roi_pooling │ │ ├── __init__.py │ │ ├── _ext │ │ ├── __init__.py │ │ └── roi_pooling │ │ │ └── __init__.py │ │ ├── functions │ │ ├── __init__.py │ │ └── roi_pool.py │ │ ├── modules │ │ ├── __init__.py │ │ ├── roi_pool.py │ │ └── roi_pool_py.py │ │ ├── src │ │ ├── roi_pooling.h │ │ ├── roi_pooling_cuda.h │ │ ├── roi_pooling_kernel.h │ │ ├── roi_pooling_cuda.c │ │ └── roi_pooling.c │ │ └── build.py ├── pycocotools │ 
├── __init__.py │ ├── maskApi.h │ ├── mask.py │ └── maskApi.c ├── datasets │ ├── __init__.py │ ├── VOCdevkit-matlab-wrapper │ │ ├── get_voc_opts.m │ │ ├── xVOCap.m │ │ └── voc_eval.m │ ├── ds_utils.py │ ├── tools │ │ └── mcg_munge.py │ ├── factory.py │ ├── vg_eval.py │ ├── voc_eval.py │ ├── imagenet.py │ └── imdb.py ├── roi_data_layer │ ├── __init__.py │ ├── minibatch.py │ ├── roidb.py │ └── roibatchLoader.py ├── make.sh └── setup.py ├── .gitignore ├── cfgs ├── vgg16.yml ├── res50.yml ├── res101.yml └── res101_ls.yml ├── _init_paths.py ├── LICENSE └── README.md /lib/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/fpn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/nms/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_align/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.a 4 | -------------------------------------------------------------------------------- /lib/model/roi_align/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_align/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_align/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/functions/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /lib/model/roi_pooling/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/nms/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /lib/model/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /lib/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/src/roi_pooling.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); -------------------------------------------------------------------------------- /lib/model/nms/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling stnm kernels by nvcc..." 
7 | nvcc -c -o nms_cuda_kernel.cu.o nms_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /lib/model/nms/src/nms_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | void nms_cuda_compute(int* keep_out, int *num_out, float* boxes_host, int boxes_num, 6 | int boxes_dim, float nms_overlap_thresh); 7 | 8 | #ifdef __cplusplus 9 | } 10 | #endif 11 | -------------------------------------------------------------------------------- /lib/model/roi_align/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling my_lib kernels by nvcc..." 7 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/model/roi_crop/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling my_lib kernels by nvcc..." 
7 | nvcc -c -o roi_crop_cuda_kernel.cu.o roi_crop_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /lib/roi_data_layer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/model/nms/src/nms_cuda.h: -------------------------------------------------------------------------------- 1 | // int nms_cuda(THCudaTensor *keep_out, THCudaTensor *num_out, 2 | // THCudaTensor *boxes_host, THCudaTensor *nms_overlap_thresh); 3 | 4 | int nms_cuda(THCudaIntTensor *keep_out, THCudaTensor *boxes_host, 5 | THCudaIntTensor *num_out, float nms_overlap_thresh); 6 | -------------------------------------------------------------------------------- /lib/model/nms/nms_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from _ext import nms 4 | import pdb 5 | 6 | def nms_gpu(dets, thresh): 7 | keep = dets.new(dets.size(0), 1).zero_().int() 8 | num_out = dets.new(1).zero_().int() 9 | nms.nms_cuda(keep, dets, num_out, thresh) 10 | keep = keep[:num_out[0]] 11 | return keep 12 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m: -------------------------------------------------------------------------------- 1 | function VOCopts = get_voc_opts(path) 2 | 3 | tmp = pwd; 4 | cd(path); 5 | try 6 | addpath('VOCcode'); 7 | VOCinit; 8 | catch 9 | rmpath('VOCcode'); 10 | cd(tmp); 11 | error(sprintf('VOCcode directory not found 
under %s', path)); 12 | end 13 | rmpath('VOCcode'); 14 | cd(tmp); 15 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m: -------------------------------------------------------------------------------- 1 | function ap = xVOCap(rec,prec) 2 | % From the PASCAL VOC 2011 devkit 3 | 4 | mrec=[0 ; rec ; 1]; 5 | mpre=[0 ; prec ; 0]; 6 | for i=numel(mpre)-1:-1:1 7 | mpre(i)=max(mpre(i),mpre(i+1)); 8 | end 9 | i=find(mrec(2:end)~=mrec(1:end-1))+1; 10 | ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); 11 | -------------------------------------------------------------------------------- /lib/model/roi_crop/modules/roi_crop.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from ..functions.roi_crop import RoICropFunction 3 | 4 | class _RoICrop(Module): 5 | def __init__(self, layout = 'BHWD'): 6 | super(_RoICrop, self).__init__() 7 | def forward(self, input1, input2): 8 | return RoICropFunction()(input1, input2) 9 | -------------------------------------------------------------------------------- /cfgs/vgg16.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: vgg16 2 | TRAIN: 3 | HAS_RPN: True 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | BATCH_SIZE: 256 10 | LEARNING_RATE: 0.01 11 | TEST: 12 | HAS_RPN: True 13 | POOLING_MODE: align 14 | CROP_RESIZE_WITH_MAX_POOL: False 15 | -------------------------------------------------------------------------------- /lib/model/nms/_ext/nms/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._nms import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 
| locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /_init_paths.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | 4 | def add_path(path): 5 | if path not in sys.path: 6 | sys.path.insert(0, path) 7 | 8 | this_dir = osp.dirname(__file__) 9 | 10 | # Add lib to PYTHONPATH 11 | lib_path = osp.join(this_dir, 'lib') 12 | add_path(lib_path) 13 | 14 | coco_path = osp.join(this_dir, 'data', 'coco', 'PythonAPI') 15 | add_path(coco_path) 16 | -------------------------------------------------------------------------------- /lib/model/roi_crop/_ext/roi_crop/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_crop import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /lib/model/roi_align/_ext/roi_align/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_align import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /lib/model/roi_crop/_ext/crop_resize/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import 
_wrap_function 3 | from ._crop_resize import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/_ext/roi_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_pooling import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /lib/model/roi_align/src/roi_align_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /cfgs/res50.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res50 2 | TRAIN: 3 | HAS_RPN: True 4 | # IMS_PER_BATCH: 1 5 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 6 | RPN_POSITIVE_OVERLAP: 0.7 7 | RPN_BATCHSIZE: 256 8 | PROPOSAL_METHOD: gt 9 | BG_THRESH_LO: 0.0 10 | DISPLAY: 20 11 | BATCH_SIZE: 256 12 | WEIGHT_DECAY: 0.0001 13 | DOUBLE_BIAS: False 14 | SNAPSHOT_PREFIX: res50_faster_rcnn 15 | TEST: 16 | HAS_RPN: True 17 | POOLING_MODE: crop 18 | 
-------------------------------------------------------------------------------- /cfgs/res101.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res101 2 | TRAIN: 3 | HAS_RPN: True 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | BATCH_SIZE: 128 11 | WEIGHT_DECAY: 0.0001 12 | DOUBLE_BIAS: False 13 | LEARNING_RATE: 0.001 14 | TEST: 15 | HAS_RPN: True 16 | POOLING_SIZE: 7 17 | POOLING_MODE: align 18 | CROP_RESIZE_WITH_MAX_POOL: False 19 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/src/roi_pooling_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax); 3 | 4 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax); -------------------------------------------------------------------------------- /lib/model/roi_crop/src/roi_crop_cuda.h: -------------------------------------------------------------------------------- 1 | // Bilinear sampling is done in BHWD (coalescing is not obvious in BDHW) 2 | // we assume BHWD format in inputImages 3 | // we assume BHW(YX) format on grids 4 | 5 | int BilinearSamplerBHWD_updateOutput_cuda(THCudaTensor *inputImages, THCudaTensor *grids, THCudaTensor *output); 6 | 7 | int BilinearSamplerBHWD_updateGradInput_cuda(THCudaTensor *inputImages, THCudaTensor *grids, THCudaTensor *gradInputImages, 8 | THCudaTensor *gradGrids, THCudaTensor *gradOutput); 9 | -------------------------------------------------------------------------------- /cfgs/res101_ls.yml: 
-------------------------------------------------------------------------------- 1 | EXP_DIR: res101 2 | TRAIN: 3 | HAS_RPN: True 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | BATCH_SIZE: 128 11 | WEIGHT_DECAY: 0.0001 12 | SCALES: [800] 13 | DOUBLE_BIAS: False 14 | LEARNING_RATE: 0.001 15 | TEST: 16 | HAS_RPN: True 17 | SCALES: [800] 18 | MAX_SIZE: 1200 19 | RPN_POST_NMS_TOP_N: 1000 20 | POOLING_SIZE: 7 21 | POOLING_MODE: align 22 | CROP_RESIZE_WITH_MAX_POOL: False 23 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/modules/roi_pool.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from ..functions.roi_pool import RoIPoolFunction 3 | 4 | 5 | class _RoIPooling(Module): 6 | def __init__(self, pooled_height, pooled_width, spatial_scale): 7 | super(_RoIPooling, self).__init__() 8 | 9 | self.pooled_width = int(pooled_width) 10 | self.pooled_height = int(pooled_height) 11 | self.spatial_scale = float(spatial_scale) 12 | 13 | def forward(self, features, rois, scale): 14 | return RoIPoolFunction(self.pooled_height, self.pooled_width, scale)(features, rois) 15 | -------------------------------------------------------------------------------- /lib/model/nms/src/nms_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "nms_cuda_kernel.h" 4 | 5 | // this symbol will be resolved automatically from PyTorch libs 6 | extern THCState *state; 7 | 8 | int nms_cuda(THCudaIntTensor *keep_out, THCudaTensor *boxes_host, 9 | THCudaIntTensor *num_out, float nms_overlap_thresh) { 10 | 11 | nms_cuda_compute(THCudaIntTensor_data(state, keep_out), 12 | THCudaIntTensor_data(state, num_out), 13 | THCudaTensor_data(state, boxes_host), 14 | boxes_host->size[0], 15 | 
boxes_host->size[1], 16 | nms_overlap_thresh); 17 | 18 | return 1; 19 | } 20 | -------------------------------------------------------------------------------- /lib/model/roi_crop/src/roi_crop.h: -------------------------------------------------------------------------------- 1 | int BilinearSamplerBHWD_updateOutput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *output); 2 | 3 | int BilinearSamplerBHWD_updateGradInput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *gradInputImages, 4 | THFloatTensor *gradGrids, THFloatTensor *gradOutput); 5 | 6 | 7 | 8 | int BilinearSamplerBCHW_updateOutput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *output); 9 | 10 | int BilinearSamplerBCHW_updateGradInput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *gradInputImages, 11 | THFloatTensor *gradGrids, THFloatTensor *gradOutput); 12 | -------------------------------------------------------------------------------- /lib/model/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | import torch 8 | from model.utils.config import cfg 9 | from model.nms.nms_gpu import nms_gpu 10 | 11 | def nms(dets, thresh, force_cpu=False): 12 | """Dispatch to either CPU or GPU NMS implementations.""" 13 | if dets.shape[0] == 0: 14 | return [] 15 | # ---numpy version--- 16 | # original: return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 17 | # ---pytorch version--- 18 | return nms_gpu(dets, thresh) 19 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/src/roi_pooling_kernel.h: -------------------------------------------------------------------------------- 1 | 
#ifndef _ROI_POOLING_KERNEL 2 | #define _ROI_POOLING_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | int ROIPoolForwardLaucher( 9 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 10 | const int width, const int channels, const int pooled_height, 11 | const int pooled_width, const float* bottom_rois, 12 | float* top_data, int* argmax_data, cudaStream_t stream); 13 | 14 | 15 | int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 16 | const int height, const int width, const int channels, const int pooled_height, 17 | const int pooled_width, const float* bottom_rois, 18 | float* bottom_diff, const int* argmax_data, cudaStream_t stream); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | 26 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | 6 | sources = ['src/roi_pooling.c'] 7 | headers = ['src/roi_pooling.h'] 8 | defines = [] 9 | with_cuda = False 10 | 11 | if torch.cuda.is_available(): 12 | print('Including CUDA code.') 13 | sources += ['src/roi_pooling_cuda.c'] 14 | headers += ['src/roi_pooling_cuda.h'] 15 | defines += [('WITH_CUDA', None)] 16 | with_cuda = True 17 | 18 | this_file = os.path.dirname(os.path.realpath(__file__)) 19 | print(this_file) 20 | extra_objects = ['src/roi_pooling.cu.o'] 21 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 22 | 23 | ffi = create_extension( 24 | '_ext.roi_pooling', 25 | headers=headers, 26 | sources=sources, 27 | define_macros=defines, 28 | relative_to=__file__, 29 | with_cuda=with_cuda, 30 | extra_objects=extra_objects 31 | ) 32 | 33 | if __name__ == '__main__': 34 | ffi.build() 35 | 
-------------------------------------------------------------------------------- /lib/model/nms/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | #this_file = os.path.dirname(__file__) 6 | 7 | sources = [] 8 | headers = [] 9 | defines = [] 10 | with_cuda = False 11 | 12 | if torch.cuda.is_available(): 13 | print('Including CUDA code.') 14 | sources += ['src/nms_cuda.c'] 15 | headers += ['src/nms_cuda.h'] 16 | defines += [('WITH_CUDA', None)] 17 | with_cuda = True 18 | 19 | this_file = os.path.dirname(os.path.realpath(__file__)) 20 | print(this_file) 21 | extra_objects = ['src/nms_cuda_kernel.cu.o'] 22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 23 | print(extra_objects) 24 | 25 | ffi = create_extension( 26 | '_ext.nms', 27 | headers=headers, 28 | sources=sources, 29 | define_macros=defines, 30 | relative_to=__file__, 31 | with_cuda=with_cuda, 32 | extra_objects=extra_objects 33 | ) 34 | 35 | if __name__ == '__main__': 36 | ffi.build() 37 | -------------------------------------------------------------------------------- /lib/model/roi_align/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | # sources = ['src/roi_align.c'] 6 | # headers = ['src/roi_align.h'] 7 | sources = [] 8 | headers = [] 9 | defines = [] 10 | with_cuda = False 11 | 12 | if torch.cuda.is_available(): 13 | print('Including CUDA code.') 14 | sources += ['src/roi_align_cuda.c'] 15 | headers += ['src/roi_align_cuda.h'] 16 | defines += [('WITH_CUDA', None)] 17 | with_cuda = True 18 | 19 | this_file = os.path.dirname(os.path.realpath(__file__)) 20 | print(this_file) 21 | extra_objects = ['src/roi_align_kernel.cu.o'] 22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 23 | 24 | ffi = 
create_extension( 25 | '_ext.roi_align', 26 | headers=headers, 27 | sources=sources, 28 | define_macros=defines, 29 | relative_to=__file__, 30 | with_cuda=with_cuda, 31 | extra_objects=extra_objects 32 | ) 33 | 34 | if __name__ == '__main__': 35 | ffi.build() 36 | -------------------------------------------------------------------------------- /lib/model/roi_crop/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | #this_file = os.path.dirname(__file__) 6 | 7 | sources = ['src/roi_crop.c'] 8 | headers = ['src/roi_crop.h'] 9 | defines = [] 10 | with_cuda = False 11 | 12 | if torch.cuda.is_available(): 13 | print('Including CUDA code.') 14 | sources += ['src/roi_crop_cuda.c'] 15 | headers += ['src/roi_crop_cuda.h'] 16 | defines += [('WITH_CUDA', None)] 17 | with_cuda = True 18 | 19 | this_file = os.path.dirname(os.path.realpath(__file__)) 20 | print(this_file) 21 | extra_objects = ['src/roi_crop_cuda_kernel.cu.o'] 22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 23 | 24 | ffi = create_extension( 25 | '_ext.roi_crop', 26 | headers=headers, 27 | sources=sources, 28 | define_macros=defines, 29 | relative_to=__file__, 30 | with_cuda=with_cuda, 31 | extra_objects=extra_objects 32 | ) 33 | 34 | if __name__ == '__main__': 35 | ffi.build() 36 | -------------------------------------------------------------------------------- /lib/model/roi_crop/functions/roi_crop.py: -------------------------------------------------------------------------------- 1 | # functions/add.py 2 | import torch 3 | from torch.autograd import Function 4 | from .._ext import roi_crop 5 | import pdb 6 | 7 | class RoICropFunction(Function): 8 | def forward(self, input1, input2): 9 | self.input1 = input1.clone() 10 | self.input2 = input2.clone() 11 | output = input2.new(input2.size()[0], input1.size()[1], input2.size()[1], input2.size()[2]).zero_() 12 
| assert output.get_device() == input1.get_device(), "output and input1 must on the same device" 13 | assert output.get_device() == input2.get_device(), "output and input2 must on the same device" 14 | roi_crop.BilinearSamplerBHWD_updateOutput_cuda(input1, input2, output) 15 | return output 16 | 17 | def backward(self, grad_output): 18 | grad_input1 = self.input1.new(self.input1.size()).zero_() 19 | grad_input2 = self.input2.new(self.input2.size()).zero_() 20 | roi_crop.BilinearSamplerBHWD_updateGradInput_cuda(self.input1, self.input2, grad_input1, grad_input2, grad_output) 21 | return grad_input1, grad_input2 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Jianwei Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | python setup.py build_ext --inplace 6 | rm -rf build 7 | 8 | # compile NMS 9 | cd model/nms/src 10 | echo "Compiling nms kernels by nvcc..." 11 | nvcc -c -o nms_cuda_kernel.cu.o nms_cuda_kernel.cu \ 12 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -arch=sm_52 13 | cd ../ 14 | python build.py 15 | 16 | # compile roi_pooling 17 | cd ../../ 18 | cd model/roi_pooling/src 19 | echo "Compiling roi pooling kernels by nvcc..." 20 | nvcc -c -o roi_pooling.cu.o roi_pooling_kernel.cu \ 21 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -arch=sm_52 22 | cd ../ 23 | python build.py 24 | 25 | # compile roi_align 26 | cd ../../ 27 | cd model/roi_align/src 28 | echo "Compiling roi align kernels by nvcc..." 29 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu \ 30 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -arch=sm_52 31 | cd ../ 32 | python build.py 33 | 34 | # compile roi_crop 35 | cd ../../ 36 | cd model/roi_crop/src 37 | echo "Compiling roi crop kernels by nvcc..." 
38 | nvcc -c -o roi_crop_cuda_kernel.cu.o roi_crop_cuda_kernel.cu \ 39 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -arch=sm_52 40 | cd ../ 41 | python build.py 42 | -------------------------------------------------------------------------------- /lib/model/roi_align/src/roi_align_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _ROI_ALIGN_KERNEL 2 | #define _ROI_ALIGN_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, 9 | const float spatial_scale, const int height, const int width, 10 | const int channels, const int aligned_height, const int aligned_width, 11 | const float* bottom_rois, float* top_data); 12 | 13 | int ROIAlignForwardLaucher( 14 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 15 | const int width, const int channels, const int aligned_height, 16 | const int aligned_width, const float* bottom_rois, 17 | float* top_data, cudaStream_t stream); 18 | 19 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, 20 | const float spatial_scale, const int height, const int width, 21 | const int channels, const int aligned_height, const int aligned_width, 22 | float* bottom_diff, const float* bottom_rois); 23 | 24 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 25 | const int height, const int width, const int channels, const int aligned_height, 26 | const int aligned_width, const float* bottom_rois, 27 | float* bottom_diff, cudaStream_t stream); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | fpn.pytorch 2 | Pytorch implementation of Feature Pyramid Network (FPN) for Object 
Detection 3 | 4 | ## Introduction 5 | 6 | This project inherits the properties of our [pytorch implementation of faster r-cnn](https://github.com/jwyang/faster-rcnn.pytorch). Hence, it also has the following unique features: 7 | 8 | * **It is pure Pytorch code**. We convert all the numpy implementations to pytorch. 9 | 10 | * **It supports training batchsize > 1**. We revise all the layers, including dataloader, rpn, roi-pooling, etc., to train with multiple images at each iteration. 11 | 12 | * **It supports multiple GPUs**. We use a multiple GPU wrapper (nn.DataParallel here) to make it flexible to use one or more GPUs, as a merit of the above two features. 13 | 14 | * **It supports three pooling methods**. We integrate three pooling methods: roi pooling, roi align and roi crop. Besides, we convert them to support multi-image batch training. 15 | 16 | ## Benchmarking 17 | 18 | We benchmark our code thoroughly on two datasets: pascal voc and coco. Below are the results: 19 | 20 | 1). PASCAL VOC 2007 (Train/Test: 07trainval/07test, scale=600, ROI Align) 21 | 22 | model | GPUs | Batch Size | lr | lr_decay | max_epoch | Speed/epoch | Memory/GPU | mAP 23 | ---------|-----------|----|-----------|-----|-----|-------|--------|-------- 24 | Res-101   | 8 TitanX | 24| 1e-2 | 10 | 12 | 0.22 hr | 9688MB | 74.2 25 | 26 | **Results on coco are on the way**.
27 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m: -------------------------------------------------------------------------------- 1 | function res = voc_eval(path, comp_id, test_set, output_dir) 2 | 3 | VOCopts = get_voc_opts(path); 4 | VOCopts.testset = test_set; 5 | 6 | for i = 1:length(VOCopts.classes) 7 | cls = VOCopts.classes{i}; 8 | res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir); 9 | end 10 | 11 | fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); 12 | fprintf('Results:\n'); 13 | aps = [res(:).ap]'; 14 | fprintf('%.1f\n', aps * 100); 15 | fprintf('%.1f\n', mean(aps) * 100); 16 | fprintf('~~~~~~~~~~~~~~~~~~~~\n'); 17 | 18 | function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir) 19 | 20 | test_set = VOCopts.testset; 21 | year = VOCopts.dataset(4:end); 22 | 23 | addpath(fullfile(VOCopts.datadir, 'VOCcode')); 24 | 25 | res_fn = sprintf(VOCopts.detrespath, comp_id, cls); 26 | 27 | recall = []; 28 | prec = []; 29 | ap = 0; 30 | ap_auc = 0; 31 | 32 | do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); 33 | if do_eval 34 | % Bug in VOCevaldet requires that tic has been called first 35 | tic; 36 | [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); 37 | ap_auc = xVOCap(recall, prec); 38 | 39 | % force plot limits 40 | ylim([0 1]); 41 | xlim([0 1]); 42 | 43 | print(gcf, '-djpeg', '-r0', ... 44 | [output_dir '/' cls '_pr.jpg']); 45 | end 46 | fprintf('!!! %s : %.4f %.4f\n', cls, ap, ap_auc); 47 | 48 | res.recall = recall; 49 | res.prec = prec; 50 | res.ap = ap; 51 | res.ap_auc = ap_auc; 52 | 53 | save([output_dir '/' cls '_pr.mat'], ... 
def unique_boxes(boxes, scale=1.0):
    """Return the sorted indices of the first occurrence of each unique box.

    Each row of ``boxes`` is scaled, rounded and collapsed to a single number
    by a dot product with ``[1, 1e3, 1e6, 1e9]``; rows with equal hashes are
    treated as duplicates.
    """
    v = np.array([1, 1e3, 1e6, 1e9])
    hashes = np.round(boxes * scale).dot(v)
    _, index = np.unique(hashes, return_index=True)
    return np.sort(index)


def xywh_to_xyxy(boxes):
    """Convert [x y w h] box format to [x1 y1 x2 y2] format.

    The far corner is computed as ``x + w - 1`` / ``y + h - 1``.
    """
    return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1))


def xyxy_to_xywh(boxes):
    """Convert [x1 y1 x2 y2] box format to [x y w h] format.

    Width and height are computed as ``x2 - x1 + 1`` / ``y2 - y1 + 1``, the
    inverse of :func:`xywh_to_xyxy`.
    """
    return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1))


def validate_boxes(boxes, width=0, height=0):
    """Check that a set of [x1 y1 x2 y2] boxes is valid.

    Raises ``AssertionError`` when any box has a negative corner, an inverted
    extent, or a far corner at or beyond ``width`` / ``height``.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    assert (x1 >= 0).all()
    assert (y1 >= 0).all()
    assert (x2 >= x1).all()
    assert (y2 >= y1).all()
    assert (x2 < width).all()
    assert (y2 < height).all()


def filter_small_boxes(boxes, min_size):
    """Return the indices of boxes whose extents are both at least ``min_size``.

    Extents are measured as ``x2 - x1`` and ``y2 - y1`` (no ``+ 1``). Both
    sides use the same inclusive comparison; the previous code used ``>=`` for
    the width but ``>`` for the height, so a box exactly ``min_size`` tall was
    dropped while one exactly ``min_size`` wide was kept.
    """
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    keep = np.where((w >= min_size) & (h >= min_size))[0]
    return keep
def munge(src_dir):
    """Move every file in ``src_dir`` into the nested ``MCG/mat`` layout.

    A file ``<base><ext>`` is moved to
    ``MCG/mat/<base[:14]>/<base[:22]>/<base><ext>`` relative to the current
    working directory, e.g.
    ``./MCG-COCO-val2014-boxes/COCO_val2014_000000193401.mat`` becomes
    ``./MCG/mat/COCO_val2014_0/COCO_val2014_000000193/COCO_val2014_000000193401.mat``.
    Each move is reported on stdout.
    """
    for fn in os.listdir(src_dir):
        base, _ = os.path.splitext(fn)
        # first 14 chars / first 22 chars / all chars + extension
        dst_dir = os.path.join('MCG', 'mat', base[:14], base[:22])
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        src = os.path.join(src_dir, fn)
        dst = os.path.join(dst_dir, fn)
        # Parenthesised single-argument form is valid on Python 2 and 3;
        # the former ``print '...'`` statement is a SyntaxError on Python 3.
        print('MV: {} -> {}'.format(src, dst))
        os.rename(src, dst)


if __name__ == '__main__':
    # src_dir should look something like:
    #  src_dir = 'MCG-COCO-val2014-boxes'
    src_dir = sys.argv[1]
    munge(src_dir)
def im_list_to_blob(ims):
    """Convert a list of images into a network input.

    Assumes images are already prepared (means subtracted, BGR order, ...).

    Returns a float32 array of shape ``(len(ims), max_h, max_w, 3)`` where
    ``max_h`` / ``max_w`` are the largest height and width in ``ims``. Each
    image is copied into the top-left corner of its slot; the remainder of
    the slot stays zero.
    """
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    num_images = len(ims)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    # enumerate instead of the Python-2-only ``xrange`` index loop.
    for i, im in enumerate(ims):
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im

    return blob


def prep_im_for_blob(im, pixel_means, target_size, max_size):
    """Mean subtract and scale an image for use in a blob.

    The image is converted to float32, ``pixel_means`` is subtracted, and the
    result is resized so that its shorter side equals ``target_size``.
    ``max_size`` is accepted but currently unused: the cap on the longer side
    is commented out below.

    Returns ``(resized_image, im_scale)``.
    """
    # NOTE: ``copy=False`` means an input that is already float32 is not
    # copied, so the in-place subtraction below modifies the caller's array.
    im = im.astype(np.float32, copy=False)
    im -= pixel_means
    # im = im[:, :, ::-1]
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    # Prevent the biggest axis from being more than MAX_SIZE
    # if np.round(im_scale * im_size_max) > max_size:
    #     im_scale = float(max_size) / float(im_size_max)
    # im = imresize(im, im_scale)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)

    return im, im_scale
class RoIAlignMax(Module):
    """RoI-align followed by a 2x2, stride-1 max pool.

    The align step produces a map one cell larger than requested in each
    spatial dimension; the max pool then brings it back to
    ``aligned_height`` x ``aligned_width``.
    """

    def __init__(self, aligned_height, aligned_width, spatial_scale):
        super(RoIAlignMax, self).__init__()

        self.aligned_height = int(aligned_height)
        self.aligned_width = int(aligned_width)
        # Stored for reference only; forward() uses the ``scale`` it is given.
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois, scale):
        """Align ``rois`` on ``features`` at ``scale`` and max-pool the result."""
        oversized_h = self.aligned_height + 1
        oversized_w = self.aligned_width + 1
        align = RoIAlignFunction(oversized_h, oversized_w, scale)
        aligned = align(features, rois)
        return max_pool2d(aligned, kernel_size=2, stride=1)
# TODO use save_for_backward instead
class RoIAlignFunction(Function):
    """Legacy-style autograd function around the compiled RoI-align CUDA ops.

    The instance keeps the RoIs and the feature-map size seen in ``forward``
    so that ``backward`` can allocate and fill the gradient w.r.t. the
    features. Only CUDA tensors are supported.
    """

    def __init__(self, aligned_height, aligned_width, spatial_scale):
        self.aligned_width = int(aligned_width)
        self.aligned_height = int(aligned_height)
        self.spatial_scale = float(spatial_scale)
        self.rois = None
        self.feature_size = None

    def forward(self, features, rois):
        """Return a ``(num_rois, C, aligned_height, aligned_width)`` tensor.

        Raises ``NotImplementedError`` when ``features`` is not a CUDA tensor.
        """
        self.rois = rois
        self.feature_size = features.size()

        # Unpacking also enforces that ``features`` is 4-D.
        _, num_channels, _, _ = features.size()
        num_rois = rois.size(0)

        output = features.new(num_rois, num_channels,
                              self.aligned_height, self.aligned_width).zero_()
        if not features.is_cuda:
            raise NotImplementedError(
                'RoIAlignFunction only supports CUDA tensors')

        roi_align.roi_align_forward_cuda(self.aligned_height,
                                         self.aligned_width,
                                         self.spatial_scale, features,
                                         rois, output)
        return output

    def backward(self, grad_output):
        """Return ``(grad_features, None)``; RoIs receive no gradient."""
        assert(self.feature_size is not None and grad_output.is_cuda)

        batch_size, num_channels, data_height, data_width = self.feature_size

        # Allocate from grad_output (not the RoIs) so the buffer matches the
        # incoming gradient's type and device, as RoIPoolFunction does.
        grad_input = grad_output.new(batch_size, num_channels, data_height,
                                     data_width).zero_()
        roi_align.roi_align_backward_cuda(self.aligned_height,
                                          self.aligned_width,
                                          self.spatial_scale, grad_output,
                                          self.rois, grad_input)

        return grad_input, None
class AffineGridGenFunction(Function):
    """Legacy-style autograd function mapping affine parameters to a grid.

    A fixed ``(height, width, 3)`` base grid of homogeneous coordinates
    ``(y, x, 1)`` is built once; ``y`` and ``x`` start at -1 and advance in
    steps of ``2 / height`` and ``2 / width`` respectively. ``forward``
    multiplies that grid by the transpose of each ``(2, 3)`` matrix in the
    input batch.
    """

    def __init__(self, height, width, lr=1):
        # Function.__init__ is deliberately not called, matching the other
        # legacy Function subclasses in this repository.
        self.lr = lr
        self.height, self.width = height, width
        grid = np.ones([self.height, self.width, 3], dtype=np.float32)
        # linspace(endpoint=False) yields exactly `height` / `width` samples
        # with the same values as arange(-1, 1, 2.0 / n), without arange's
        # length ambiguity for fractional steps.
        grid[:, :, 0] = np.linspace(-1, 1, self.height, endpoint=False)[:, np.newaxis]
        grid[:, :, 1] = np.linspace(-1, 1, self.width, endpoint=False)[np.newaxis, :]
        self.grid = torch.from_numpy(grid)

    def forward(self, input1):
        """Return a ``(B, height, width, 2)`` grid for a ``(B, 2, 3)`` input."""
        self.input1 = input1
        # The base grid is a torch tensor here, so it is cast with type_as;
        # the previous ``.astype`` call does not exist on torch tensors.
        base = self.grid.type_as(input1)
        self.batchgrid = input1.new(torch.Size([input1.size(0)]) + self.grid.size()).zero_()
        self.batchgrid.copy_(base.unsqueeze(0).expand_as(self.batchgrid))

        # One batched matmul covers every sample; no per-sample loop needed.
        output = torch.bmm(self.batchgrid.view(-1, self.height * self.width, 3),
                           torch.transpose(input1, 1, 2))
        return output.view(-1, self.height, self.width, 2)

    def backward(self, grad_output):
        """Return the ``(B, 2, 3)`` gradient w.r.t. the affine parameters."""
        grad_input1 = self.input1.new(self.input1.size()).zero_()
        flat_grad = grad_output.contiguous().view(-1, self.height * self.width, 2)
        grad_input1 = torch.baddbmm(grad_input1,
                                    torch.transpose(flat_grad, 1, 2),
                                    self.batchgrid.view(-1, self.height * self.width, 3))
        return grad_input1
*/ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. */ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. 
*/ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /lib/model/roi_align/src/roi_align_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "roi_align_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 16 | // Number of ROIs 17 | int num_rois = THCudaTensor_size(state, rois, 0); 18 | int size_rois = THCudaTensor_size(state, rois, 1); 19 | if (size_rois != 5) 20 | { 21 | return 0; 22 | } 23 | 24 | // data height 25 | int data_height = THCudaTensor_size(state, features, 2); 26 | // data width 27 | int data_width = THCudaTensor_size(state, features, 3); 28 | // Number of channels 29 | int num_channels = THCudaTensor_size(state, features, 1); 30 | 31 | cudaStream_t stream = THCState_getCurrentStream(state); 32 | 33 | ROIAlignForwardLaucher( 34 | data_flat, spatial_scale, num_rois, data_height, 35 | data_width, num_channels, aligned_height, 36 | aligned_width, rois_flat, 37 | output_flat, stream); 38 | 39 | return 1; 40 | } 41 | 42 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad) 44 | { 45 | // Grab the input tensor 46 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 47 | float * rois_flat = THCudaTensor_data(state, rois); 48 | 49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois 
class Logger(object):
    """Thin wrapper that writes scalar, image and histogram summaries
    through a ``tf.summary.FileWriter``."""

    def __init__(self, log_dir):
        """Create a summary writer logging to log_dir."""
        self.writer = tf.summary.FileWriter(log_dir)

    def scalar_summary(self, tag, value, step):
        """Log a scalar variable."""
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def image_summary(self, tag, images, step):
        """Log a list of images.

        Each image is encoded as PNG and stored under the tag ``<tag>/<i>``.
        """
        # PNG data is binary, so an in-memory bytes buffer is used on every
        # Python version. This replaces the former bare ``except:`` that
        # depended on catching a NameError to choose the buffer type.
        from io import BytesIO

        img_summaries = []
        for i, img in enumerate(images):
            # Write the image to an in-memory buffer
            s = BytesIO()
            scipy.misc.toimage(img).save(s, format="png")

            # Create an Image object
            img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
                                       height=img.shape[0],
                                       width=img.shape[1])
            # Create a Summary value
            img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum))

        # Create and write Summary
        summary = tf.Summary(value=img_summaries)
        self.writer.add_summary(summary, step)

    def histo_summary(self, tag, values, step, bins=1000):
        """Log a histogram of the tensor of values.

        ``values`` may be any array-like; it is converted with ``np.asarray``
        before the statistics are computed. The writer is flushed afterwards.
        """
        values = np.asarray(values)

        # Create a histogram using numpy
        counts, bin_edges = np.histogram(values, bins=bins)

        # Fill the fields of the histogram proto
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        # Drop the start of the first bin
        bin_edges = bin_edges[1:]

        # Add bin edges and counts
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        # Create and write Summary
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.writer.add_summary(summary, step)
        self.writer.flush()
# Registry of known datasets: maps a dataset name (e.g. 'voc_2007_trainval')
# to a zero-argument callable that constructs the corresponding imdb.
# It is filled in by the loops below when this module is imported.
__sets = {}
from datasets.pascal_voc import pascal_voc
from datasets.coco import coco
from datasets.imagenet import imagenet
from datasets.vg import vg

import numpy as np

# Set up voc_<year>_<split>
# NOTE: every lambda binds the loop variables as default arguments so that
# each registered factory keeps its own split/year instead of seeing the
# values left over from the last loop iteration.
for year in ['2007', '2012']:
  for split in ['train', 'val', 'trainval', 'test']:
    name = 'voc_{}_{}'.format(year, split)
    __sets[name] = (lambda split=split, year=year: pascal_voc(split, year))

# Set up coco_2014_<split>
for year in ['2014']:
  for split in ['train', 'val', 'minival', 'valminusminival', 'trainval']:
    name = 'coco_{}_{}'.format(year, split)
    __sets[name] = (lambda split=split, year=year: coco(split, year))

# Set up the caption splits of coco_2014.
# The names use the same 'coco_<year>_<split>' pattern as above, so
# 'train', 'val' and 'trainval' are simply registered again and only
# 'capval' and 'valminuscapval' are new keys.
for year in ['2014']:
  for split in ['train', 'val', 'capval', 'valminuscapval', 'trainval']:
    name = 'coco_{}_{}'.format(year, split)
    __sets[name] = (lambda split=split, year=year: coco(split, year))

# Set up coco_2015_<split>
for year in ['2015']:
  for split in ['test', 'test-dev']:
    name = 'coco_{}_{}'.format(year, split)
    __sets[name] = (lambda split=split, year=year: coco(split, year))

# Set up vg_<version>_<split>
# for version in ['1600-400-20']:
#     for split in ['minitrain', 'train', 'minival', 'val', 'test']:
#         name = 'vg_{}_{}'.format(version,split)
#         __sets[name] = (lambda split=split, version=version: vg(version, split))
for version in ['150-50-20', '150-50-50', '500-150-80', '750-250-150', '1750-700-450', '1600-400-20']:
    for split in ['minitrain', 'smalltrain', 'train', 'minival', 'smallval', 'val', 'test']:
        name = 'vg_{}_{}'.format(version,split)
        __sets[name] = (lambda split=split, version=version: vg(version, split))

# Set up imagenet_<split>.
# The devkit and data paths are fixed, relative paths; they are captured as
# lambda defaults in the same way as the split.
for split in ['train', 'val', 'val1', 'val2', 'test']:
    name = 'imagenet_{}'.format(split)
    devkit_path = 'data/imagenet/ILSVRC/devkit'
    data_path = 'data/imagenet/ILSVRC'
    __sets[name] = (lambda split=split, devkit_path=devkit_path, data_path=data_path: imagenet(split,devkit_path,data_path))

def get_imdb(name):
  """Get an imdb (image database) by name.

  Looks ``name`` up in the module registry and calls the registered factory,
  so a new dataset object is constructed on every call.

  Raises:
    KeyError: if ``name`` was not registered above.
  """
  if name not in __sets:
    raise KeyError('Unknown dataset: {}'.format(name))
  return __sets[name]()


def list_imdbs():
  """List all registered imdbs.

  Returns a new list holding the registered dataset names.
  """
  return list(__sets.keys())
/*
 * Host-side entry point for the backward pass of ROI pooling on the GPU.
 *
 * Extracts raw device pointers and sizes from the tensors and hands them to
 * ROIPoolBackwardLaucher on the current THC stream.  The launcher is declared
 * in roi_pooling_kernel.h and is not visible here; presumably it routes
 * top_grad into bottom_grad using the indices stored in argmax.
 *
 * bottom_grad is read as (batch, channels, height, width): dimension 0 gives
 * batch_size, 1 num_channels, 2 data_height and 3 data_width.
 *
 * Returns 0 without launching anything when rois does not have exactly
 * 5 columns, and 1 otherwise.
 */
int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale,
                        THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax)
{
    // Grab the input tensor
    float * top_grad_flat = THCudaTensor_data(state, top_grad);
    float * rois_flat = THCudaTensor_data(state, rois);

    float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad);
    int * argmax_flat = THCudaIntTensor_data(state, argmax);

    // Number of ROIs
    int num_rois = THCudaTensor_size(state, rois, 0);
    // Number of values per ROI; anything other than 5 is rejected below
    int size_rois = THCudaTensor_size(state, rois, 1);
    if (size_rois != 5)
    {
        return 0;
    }

    // batch size
    int batch_size = THCudaTensor_size(state, bottom_grad, 0);
    // The single-image restriction is disabled on purpose; batch_size is
    // forwarded to the launcher instead.
    // if (batch_size != 1)
    // {
    //     return 0;
    // }
    // data height
    int data_height = THCudaTensor_size(state, bottom_grad, 2);
    // data width
    int data_width = THCudaTensor_size(state, bottom_grad, 3);
    // Number of channels
    int num_channels = THCudaTensor_size(state, bottom_grad, 1);

    // Launch on the stream THC currently considers active
    cudaStream_t stream = THCState_getCurrentStream(state);
    ROIPoolBackwardLaucher(
        top_grad_flat, spatial_scale, batch_size, num_rois, data_height,
        data_width, num_channels, pooled_height,
        pooled_width, rois_flat,
        bottom_grad_flat, argmax_flat, stream);

    return 1;
}
def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it.

    Only single-image minibatches are supported (asserted below).

    Args:
        roidb: List with exactly one roidb entry (a dict).  The keys read
            here are 'gt_classes', 'gt_overlaps', 'boxes' and 'img_id';
            'image' and 'flipped' are read by ``_get_image_blob``.
        num_classes: Unused by this function; kept for interface
            compatibility.

    Returns:
        dict with keys 'data' (image blob), 'gt_boxes' (float32 array of
        rows ``x1, y1, x2, y2, cls`` scaled to the resized image),
        'im_info' (1x3 float32 array: two blob dimensions and the image
        scale) and 'img_id'.
    """
    num_images = len(roidb)
    # Sample random scales to use for each image in this batch
    random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                                    size=num_images)
    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
        'num_images ({}) must divide BATCH_SIZE ({})'. \
        format(num_images, cfg.TRAIN.BATCH_SIZE)

    # Get the input image blob, formatted for caffe
    im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)

    blobs = {'data': im_blob}

    assert len(im_scales) == 1, "Single batch only"
    assert len(roidb) == 1, "Single batch only"

    entry = roidb[0]
    gt_classes = entry['gt_classes']

    # gt boxes: (x1, y1, x2, y2, cls)
    if cfg.TRAIN.USE_ALL_GT:
        # Include all ground truth boxes
        gt_inds = np.where(gt_classes != 0)[0]
    else:
        # For the COCO ground truth boxes, exclude the ones that are ''iscrowd''
        # NOTE: the comparison must be parenthesised.  ``&`` binds tighter
        # than ``!=``, so ``gt_classes != 0 & mask`` is evaluated as
        # ``gt_classes != (0 & mask)`` and the crowd filter is lost.
        not_crowd = np.all(entry['gt_overlaps'].toarray() > -1.0, axis=1)
        gt_inds = np.where((gt_classes != 0) & not_crowd)[0]
    gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
    gt_boxes[:, 0:4] = entry['boxes'][gt_inds, :] * im_scales[0]
    gt_boxes[:, 4] = gt_classes[gt_inds]
    blobs['gt_boxes'] = gt_boxes
    # NOTE(review): presumably shape[1]/shape[2] are the blob height/width;
    # the layout is decided by im_list_to_blob -- confirm there.
    blobs['im_info'] = np.array(
        [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
        dtype=np.float32)

    blobs['img_id'] = entry['img_id']

    return blobs


def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.

    Args:
        roidb: List of roidb entries; each provides 'image' (path) and
            'flipped' (bool).
        scale_inds: One index into ``cfg.TRAIN.SCALES`` per roidb entry.

    Returns:
        (blob, im_scales): the result of ``im_list_to_blob`` on the
        preprocessed images, and the list of per-image scale factors
        returned by ``prep_im_for_blob``.
    """
    processed_ims = []
    im_scales = []
    for entry, scale_ind in zip(roidb, scale_inds):
        #im = cv2.imread(entry['image'])
        im = imread(entry['image'])

        if len(im.shape) == 2:
            # Grayscale image: replicate the single channel three times
            im = im[:, :, np.newaxis]
            im = np.concatenate((im, im, im), axis=2)
        # flip the channel, since the original one using cv2
        # rgb -> bgr
        im = im[:, :, ::-1]

        if entry['flipped']:
            # Horizontal flip (reverse the second axis)
            im = im[:, ::-1, :]
        target_size = cfg.TRAIN.SCALES[scale_ind]
        im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                        cfg.TRAIN.MAX_SIZE)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims)

    return blob, im_scales
def bbox_intersections(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """Python-callable wrapper around the cdef ``bbox_intersections_c``.

    Both arguments must be 2-D arrays of ``DTYPE_t``; they are forwarded
    unchanged and the 2-D array returned by ``bbox_intersections_c`` is
    returned as is.
    """
    return bbox_intersections_c(boxes, query_boxes)
| min(boxes[n, 3], query_boxes[k, 3]) - 101 | max(boxes[n, 1], query_boxes[k, 1]) + 1 102 | ) 103 | if ih > 0: 104 | intersec[n, k] = iw * ih / box_area 105 | return intersec -------------------------------------------------------------------------------- /lib/model/roi_pooling/src/roi_pooling.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 5 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output) 6 | { 7 | // Grab the input tensor 8 | float * data_flat = THFloatTensor_data(features); 9 | float * rois_flat = THFloatTensor_data(rois); 10 | 11 | float * output_flat = THFloatTensor_data(output); 12 | 13 | // Number of ROIs 14 | int num_rois = THFloatTensor_size(rois, 0); 15 | int size_rois = THFloatTensor_size(rois, 1); 16 | // batch size 17 | int batch_size = THFloatTensor_size(features, 0); 18 | if(batch_size != 1) 19 | { 20 | return 0; 21 | } 22 | // data height 23 | int data_height = THFloatTensor_size(features, 1); 24 | // data width 25 | int data_width = THFloatTensor_size(features, 2); 26 | // Number of channels 27 | int num_channels = THFloatTensor_size(features, 3); 28 | 29 | // Set all element of the output tensor to -inf. 
30 | THFloatStorage_fill(THFloatTensor_storage(output), -1); 31 | 32 | // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R 33 | int index_roi = 0; 34 | int index_output = 0; 35 | int n; 36 | for (n = 0; n < num_rois; ++n) 37 | { 38 | int roi_batch_ind = rois_flat[index_roi + 0]; 39 | int roi_start_w = round(rois_flat[index_roi + 1] * spatial_scale); 40 | int roi_start_h = round(rois_flat[index_roi + 2] * spatial_scale); 41 | int roi_end_w = round(rois_flat[index_roi + 3] * spatial_scale); 42 | int roi_end_h = round(rois_flat[index_roi + 4] * spatial_scale); 43 | // CHECK_GE(roi_batch_ind, 0); 44 | // CHECK_LT(roi_batch_ind, batch_size); 45 | 46 | int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1); 47 | int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1); 48 | float bin_size_h = (float)(roi_height) / (float)(pooled_height); 49 | float bin_size_w = (float)(roi_width) / (float)(pooled_width); 50 | 51 | int index_data = roi_batch_ind * data_height * data_width * num_channels; 52 | const int output_area = pooled_width * pooled_height; 53 | 54 | int c, ph, pw; 55 | for (ph = 0; ph < pooled_height; ++ph) 56 | { 57 | for (pw = 0; pw < pooled_width; ++pw) 58 | { 59 | int hstart = (floor((float)(ph) * bin_size_h)); 60 | int wstart = (floor((float)(pw) * bin_size_w)); 61 | int hend = (ceil((float)(ph + 1) * bin_size_h)); 62 | int wend = (ceil((float)(pw + 1) * bin_size_w)); 63 | 64 | hstart = fminf(fmaxf(hstart + roi_start_h, 0), data_height); 65 | hend = fminf(fmaxf(hend + roi_start_h, 0), data_height); 66 | wstart = fminf(fmaxf(wstart + roi_start_w, 0), data_width); 67 | wend = fminf(fmaxf(wend + roi_start_w, 0), data_width); 68 | 69 | const int pool_index = index_output + (ph * pooled_width + pw); 70 | int is_empty = (hend <= hstart) || (wend <= wstart); 71 | if (is_empty) 72 | { 73 | for (c = 0; c < num_channels * output_area; c += output_area) 74 | { 75 | output_flat[pool_index + c] = 0; 76 | } 77 | } 78 | else 79 | { 80 | int h, w, c; 81 | for (h = 
hstart; h < hend; ++h) 82 | { 83 | for (w = wstart; w < wend; ++w) 84 | { 85 | for (c = 0; c < num_channels; ++c) 86 | { 87 | const int index = (h * data_width + w) * num_channels + c; 88 | if (data_flat[index_data + index] > output_flat[pool_index + c * output_area]) 89 | { 90 | output_flat[pool_index + c * output_area] = data_flat[index_data + index]; 91 | } 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | 99 | // Increment ROI index 100 | index_roi += size_rois; 101 | index_output += pooled_height * pooled_width * num_channels; 102 | } 103 | return 1; 104 | } -------------------------------------------------------------------------------- /lib/roi_data_layer/roidb.py: -------------------------------------------------------------------------------- 1 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import datasets 7 | import numpy as np 8 | from model.utils.config import cfg 9 | from datasets.factory import get_imdb 10 | import PIL 11 | import pdb 12 | 13 | def prepare_roidb(imdb): 14 | """Enrich the imdb's roidb by adding some derived quantities that 15 | are useful for training. This function precomputes the maximum 16 | overlap, taken over ground-truth boxes, between each ROI and 17 | each ground-truth box. The class with maximum overlap is also 18 | recorded. 
def rank_roidb_ratio(roidb):
    """Rank roidb entries by their clamped width/height aspect ratio.

    Every entry gets a 'need_crop' key: 1 when its ratio lies outside
    [0.5, 2] (the ratio is then clamped to the nearest bound), 0 otherwise.

    Args:
        roidb: List of dicts providing 'width' and 'height'.  The entries
            are modified in place.

    Returns:
        (ratio_list, ratio_index): the clamped ratios in ascending order
        and the indices into ``roidb`` that produce that order.
    """
    ratio_large = 2  # largest ratio to preserve.
    ratio_small = 0.5  # smallest ratio to preserve.

    ratio_list = []
    for entry in roidb:
        ratio = entry['width'] / float(entry['height'])

        if ratio > ratio_large:
            entry['need_crop'] = 1
            ratio = ratio_large
        elif ratio < ratio_small:
            entry['need_crop'] = 1
            ratio = ratio_small
        else:
            entry['need_crop'] = 0

        ratio_list.append(ratio)

    ratio_list = np.array(ratio_list)
    ratio_index = np.argsort(ratio_list)
    return ratio_list[ratio_index], ratio_index
def combined_roidb(imdb_names, training=True):
    """
    Combine multiple roidbs

    ``imdb_names`` is a '+'-separated list of dataset names.  Each dataset is
    loaded and prepared for training, and the roidbs are concatenated into
    the roidb of the first dataset.

    Returns ``(imdb, roidb, ratio_list, ratio_index)`` where the last two
    come from ``rank_roidb_ratio``.  When ``training`` is true the roidb is
    first passed through ``filter_roidb``.
    """

    def load_training_roidb(dataset_name):
        """Load one dataset and return its roidb prepared for training."""
        dataset = get_imdb(dataset_name)
        print('Loaded dataset `{:s}` for training'.format(dataset.name))
        dataset.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
        print('Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD))

        if cfg.TRAIN.USE_FLIPPED:
            print('Appending horizontally-flipped training examples...')
            dataset.append_flipped_images()
            print('done')

        print('Preparing training data...')
        prepare_roidb(dataset)
        print('done')

        return dataset.roidb

    names = imdb_names.split('+')
    per_dataset = [load_training_roidb(dataset_name) for dataset_name in names]

    # The first roidb doubles as the accumulator for all the others
    roidb = per_dataset[0]

    if len(per_dataset) > 1:
        for extra in per_dataset[1:]:
            roidb.extend(extra)
        # The combined imdb takes its class list from the second dataset
        template = get_imdb(names[1])
        imdb = datasets.imdb.imdb(imdb_names, template.classes)
    else:
        imdb = get_imdb(imdb_names)

    if training:
        roidb = filter_roidb(roidb)

    ratio_list, ratio_index = rank_roidb_ratio(roidb)

    return imdb, roidb, ratio_list, ratio_index
def vg_eval( detpath,
             gt_roidb,
             image_index,
             classindex,
             ovthresh=0.5,
             use_07_metric=False,
             eval_attributes=False):
    """rec, prec, ap, sorted_scores, npos = vg_eval(
                                detpath,
                                gt_roidb,
                                image_index,
                                classindex,
                                [ovthresh],
                                [use_07_metric],
                                [eval_attributes])

    Top level function that does the Visual Genome evaluation.

    detpath: Path to detections; one detection per line, formatted as
        "image_id score x1 y1 x2 y2" separated by single spaces
    gt_roidb: List of ground truth structs.
    image_index: List of image ids.
    classindex: Category index
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    [eval_attributes]: If True, a ground-truth box counts as positive when
        any of its 'gt_attributes' equals classindex; otherwise
        'gt_classes' is compared (default False)

    Returns (0, 0, 0, 0, npos) when there are no ground-truth boxes for the
    class or the detection file is empty.
    """
    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for item, imagename in zip(gt_roidb, image_index):
        if eval_attributes:
            rows = np.where(np.any(item['gt_attributes'].toarray() == classindex, axis=1))[0]
        else:
            rows = np.where(item['gt_classes'] == classindex)[0]
        bbox = item['boxes'][rows, :]
        # No box is flagged as difficult.  Use the builtin ``bool``: the
        # ``np.bool`` alias was removed from NumPy and raises AttributeError.
        difficult = np.zeros((bbox.shape[0],), dtype=bool)
        det = [False] * bbox.shape[0]
        npos += int((~difficult).sum())
        class_recs[str(imagename)] = {'bbox': bbox,
                                      'difficult': difficult,
                                      'det': det}
    if npos == 0:
        # No ground truth examples
        return 0, 0, 0, 0, npos

    # read dets
    with open(detpath, 'r') as f:
        lines = f.readlines()
    if len(lines) == 0:
        # No detection examples
        return 0, 0, 0, 0, npos

    splitlines = [x.strip().split(' ') for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence
    sorted_ind = np.argsort(-confidence)
    sorted_scores = -np.sort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih

            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    # first detection that claims this ground-truth box
                    tp[d] = 1.
                    R['det'][jmax] = 1
                else:
                    # box already claimed by a higher-scoring detection
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap, sorted_scores, npos
6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 
37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
def find_in_path(name, path):
    """Find a file in a search path.

    Args:
        name: File name to look for.
        path: Directories to search, joined with ``os.pathsep`` (the same
            format as the PATH environment variable).

    Returns:
        The absolute path of the first match, or None if no directory of
        ``path`` contains ``name``.
    """
    # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
    for directory in path.split(os.pathsep):
        binpath = pjoin(directory, name)
        if os.path.exists(binpath):
            return os.path.abspath(binpath)
    return None
def customize_compiler_for_nvcc(self):
    """Inject deep into distutils to customize how the dispatch
    to gcc/nvcc works.

    If you subclass UnixCCompiler, it's not trivial to get your subclass
    injected in, and still have the right customizations (i.e.
    distutils.sysconfig.customize_compiler) run on it. So instead of going
    the OO route, the compiler object is patched in place: its ``_compile``
    is replaced by a wrapper that selects nvcc for ``.cu`` sources.

    ``self`` is the compiler instance to patch.  After patching,
    ``extra_postargs`` must be a dict with a 'gcc' entry (and an 'nvcc'
    entry when ``.cu`` sources are compiled).
    """

    # tell the compiler it can process .cu
    self.src_extensions.append('.cu')

    # save references to the default compiler_so and _compile methods
    default_compiler_so = self.compiler_so
    default_compile = self._compile

    # now redefine the _compile method. This gets executed for each
    # object but distutils doesn't have the ability to change compilers
    # based on source extension: we add it.
    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
        # Call form of print: valid on Python 3 and prints the same text
        # for a single argument on Python 2.
        print(extra_postargs)
        if os.path.splitext(src)[1] == '.cu':
            # use the cuda for .cu files
            self.set_executable('compiler_so', CUDA['nvcc'])
            # use only a subset of the extra_postargs, which are 1-1 translated
            # from the extra_compile_args in the Extension class
            postargs = extra_postargs['nvcc']
        else:
            postargs = extra_postargs['gcc']

        default_compile(obj, src, ext, cc_args, postargs, pp_opts)
        # reset the default compiler_so, which we might have changed for cuda
        self.compiler_so = default_compiler_so

    # inject our redefined _compile method into the class
    self._compile = _compile
// Computes one tile of the pairwise suppression mask.
//
// The grid is indexed by (column block, row block). Each thread of a block
// owns one "row" box and compares it against every box of the column block,
// writing a 64-bit word whose bit j is set when the IoU with column box j
// exceeds nms_overlap_thresh. Boxes are read with a stride of 5 floats.
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_block = blockIdx.y;
  const int col_block = blockIdx.x;
  const int tid = threadIdx.x;

  // Number of valid boxes in each tile (the last tile may be partial).
  const int rows_here =
      min(n_boxes - row_block * threadsPerBlock, threadsPerBlock);
  const int cols_here =
      min(n_boxes - col_block * threadsPerBlock, threadsPerBlock);

  // Stage the column tile in shared memory, one box (5 floats) per thread.
  __shared__ float block_boxes[threadsPerBlock * 5];
  if (tid < cols_here) {
    const float *src = dev_boxes + (threadsPerBlock * col_block + tid) * 5;
    float *dst = block_boxes + tid * 5;
    for (int k = 0; k < 5; ++k) {
      dst[k] = src[k];
    }
  }
  __syncthreads();

  // Threads past the end of the row tile have nothing to write. Returning
  // here is safe because no barrier follows.
  if (tid >= rows_here) {
    return;
  }

  const int box_idx = threadsPerBlock * row_block + tid;
  const float *box = dev_boxes + box_idx * 5;

  // On the diagonal tile only later boxes are compared, so a box never
  // flags itself or an earlier box of the same tile.
  const int first = (row_block == col_block) ? tid + 1 : 0;

  unsigned long long bits = 0;
  for (int j = first; j < cols_here; ++j) {
    if (devIoU(box, block_boxes + j * 5) > nms_overlap_thresh) {
      bits |= 1ULL << j;
    }
  }

  const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
  dev_mask[box_idx * col_blocks + col_block] = bits;
}
// Runs non-maximum suppression on the GPU and returns the kept indices.
//
//   keep_out           host array with room for boxes_num ints; receives the
//                      indices of the kept boxes, in input order
//   num_out            host int; receives the number of kept boxes
//   boxes_host         host array of boxes_num * boxes_dim floats
//   boxes_num          number of boxes
//   boxes_dim          floats per box (nms_kernel reads boxes with a fixed
//                      stride of 5, so this is presumably always 5 -- verify
//                      against callers)
//   nms_overlap_thresh IoU above which a later box is suppressed
//   device_id          CUDA device to run on
//
// Boxes are visited in input order, so callers are expected to pass them
// sorted by descending score (assumption -- not checked here).
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id) {
  _set_device(device_id);

  // Nothing to do for an empty input; this also avoids a zero-sized grid
  // launch and taking the address of element 0 of an empty vector.
  if (boxes_num <= 0) {
    *num_out = 0;
    return;
  }

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  CUDA_CHECK(cudaMalloc(&boxes_dev,
                        boxes_num * boxes_dim * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(boxes_dev,
                        boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

  // One thread block per (column block, row block) tile of the mask.
  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  // remv holds one bit per box: set once the box has been suppressed.
  // std::vector value-initialises its elements, so it starts all-zero.
  std::vector<unsigned long long> remv(col_blocks);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      // Merge this box's suppression row into remv. Blocks before nblock
      // only describe boxes that were already visited.
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
}
class _RPN_FPN(nn.Module):
    """Region proposal network applied to a list of feature maps.

    The same conv / classification / regression heads are run on every
    feature map passed to ``forward``; the per-map predictions are
    concatenated along dimension 1 before proposals are generated and,
    in training mode, the RPN losses are computed.
    """
    def __init__(self, din):
        """Build the shared RPN heads.

        din: number of input channels of every feature map (used as the
            ``in_channels`` of ``RPN_Conv``).
        """
        super(_RPN_FPN, self).__init__()

        self.din = din  # get depth of input feature map, e.g., 512
        self.anchor_ratios = cfg.ANCHOR_RATIOS
        self.anchor_scales = cfg.ANCHOR_SCALES
        self.feat_stride = cfg.FEAT_STRIDE[0]

        # define the convrelu layers processing input feature map
        self.RPN_Conv = nn.Conv2d(self.din, 512, 3, 1, 1, bias=True)

        # define bg/fg classification score layer
        # self.nc_score_out = len(self.anchor_scales) * len(self.anchor_ratios) * 2 # 2(bg/fg) * 9 (anchors)
        self.nc_score_out = 1 * len(self.anchor_ratios) * 2 # 2(bg/fg) * 3 (anchor ratios) * 1 (anchor scale)
        self.RPN_cls_score = nn.Conv2d(512, self.nc_score_out, 1, 1, 0)

        # define anchor box offset prediction layer
        # self.nc_bbox_out = len(self.anchor_scales) * len(self.anchor_ratios) * 4 # 4(coords) * 9 (anchors)
        self.nc_bbox_out = 1 * len(self.anchor_ratios) * 4 # 4(coords) * 3 (anchors) * 1 (anchor scale)
        self.RPN_bbox_pred = nn.Conv2d(512, self.nc_bbox_out, 1, 1, 0)

        # define proposal layer
        self.RPN_proposal = _ProposalLayer_FPN(self.feat_stride, self.anchor_scales, self.anchor_ratios)

        # define anchor target layer
        self.RPN_anchor_target = _AnchorTargetLayer_FPN(self.feat_stride, self.anchor_scales, self.anchor_ratios)

        # losses from the most recent forward pass (0 outside training)
        self.rpn_loss_cls = 0
        self.rpn_loss_box = 0

    @staticmethod
    def reshape(x, d):
        """View a 4-D tensor so that its second dimension has size ``d``.

        For an input of size (s0, s1, s2, s3) the result has size
        (s0, d, s1 * s2 / d, s3); the first and last dimensions are kept.
        """
        input_shape = x.size()
        x = x.view(
            input_shape[0],
            int(d),
            int(float(input_shape[1] * input_shape[2]) / float(d)),
            input_shape[3]
        )
        return x

    def forward(self, rpn_feature_maps, im_info, gt_boxes, num_boxes):
        """Run the RPN heads on every feature map and build proposals.

        rpn_feature_maps: sequence of 4-D feature tensors; dimension 0 is
            used as the batch size.
        im_info, gt_boxes, num_boxes: forwarded unchanged to the proposal
            and anchor-target layers; ``gt_boxes`` must not be None in
            training mode.

        Returns ``(rois, rpn_loss_cls, rpn_loss_box)``. Both losses are the
        integer 0 when the module is not in training mode.
        """


        n_feat_maps = len(rpn_feature_maps)

        rpn_cls_scores = []
        rpn_cls_probs = []
        rpn_bbox_preds = []
        rpn_shapes = []

        for i in range(n_feat_maps):
            feat_map = rpn_feature_maps[i]
            # NOTE(review): batch_size is also read after this loop, so an
            # empty rpn_feature_maps would leave it undefined.
            batch_size = feat_map.size(0)

            # return feature map after convrelu layer
            rpn_conv1 = F.relu(self.RPN_Conv(feat_map), inplace=True)
            # get rpn classification score
            rpn_cls_score = self.RPN_cls_score(rpn_conv1)

            # fold the score channels into pairs so softmax normalises over
            # a dimension of size 2, then restore the nc_score_out layout
            rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
            # NOTE(review): softmax is called without an explicit dim; the
            # dimension actually used depends on the installed torch
            # version -- confirm it is dimension 1 (the size-2 one).
            rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape)
            rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

            # get rpn offsets to the anchor boxes
            rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)

            # remember the spatial size (dims 2 and 3) of this level's output
            rpn_shapes.append([rpn_cls_score.size()[2], rpn_cls_score.size()[3]])
            # move channels last and flatten to (batch_size, -1, 2) / (batch_size, -1, 4)
            rpn_cls_scores.append(rpn_cls_score.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2))
            rpn_cls_probs.append(rpn_cls_prob.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2))
            rpn_bbox_preds.append(rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4))

        # concatenate the predictions of all feature maps along dimension 1
        rpn_cls_score_alls = torch.cat(rpn_cls_scores, 1)
        rpn_cls_prob_alls = torch.cat(rpn_cls_probs, 1)
        rpn_bbox_pred_alls = torch.cat(rpn_bbox_preds, 1)

        # NOTE(review): n_rpn_pred is not used below.
        n_rpn_pred = rpn_cls_score_alls.size(1)

        # proposal layer
        cfg_key = 'TRAIN' if self.training else 'TEST'

        # .data is passed, so the proposal layer works on raw tensors
        rois = self.RPN_proposal((rpn_cls_prob_alls.data, rpn_bbox_pred_alls.data,
                                 im_info, cfg_key, rpn_shapes))

        self.rpn_loss_cls = 0
        self.rpn_loss_box = 0

        # generating training labels and build the rpn loss
        if self.training:
            assert gt_boxes is not None

            rpn_data = self.RPN_anchor_target((rpn_cls_score_alls.data, gt_boxes, im_info, num_boxes, rpn_shapes))

            # compute classification loss
            rpn_label = rpn_data[0].view(batch_size, -1)
            # keep only entries whose label is not -1
            rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
            rpn_cls_score = torch.index_select(rpn_cls_score_alls.view(-1,2), 0, rpn_keep)
            rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
            rpn_label = Variable(rpn_label.long())
            self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
            # NOTE(review): fg_cnt is computed but not used below.
            fg_cnt = torch.sum(rpn_label.data.ne(0))

            rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]

            # compute bbox regression loss
            # the weights get a trailing dimension expanded to size 4
            rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights.unsqueeze(2) \
                    .expand(batch_size, rpn_bbox_inside_weights.size(1), 4))
            rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights.unsqueeze(2) \
                    .expand(batch_size, rpn_bbox_outside_weights.size(1), 4))
            rpn_bbox_targets = Variable(rpn_bbox_targets)

            self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred_alls, rpn_bbox_targets, rpn_bbox_inside_weights,
                            rpn_bbox_outside_weights, sigma=3)

        return rois, self.rpn_loss_cls, self.rpn_loss_box
class _ProposalLayer_FPN(nn.Module):
    """
    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").
    """

    def __init__(self, feat_stride, scales, ratios):
        """Store the anchor configuration.

        feat_stride and ratios are stored on the instance. ``scales`` is
        accepted but not used here: the anchor scales, feature strides and
        anchor stride are read from ``cfg.FPN_*``.
        """
        super(_ProposalLayer_FPN, self).__init__()
        self._anchor_ratios = ratios
        self._feat_stride = feat_stride
        self._fpn_scales = np.array(cfg.FPN_ANCHOR_SCALES)
        self._fpn_feature_strides = np.array(cfg.FPN_FEAT_STRIDES)
        self._fpn_anchor_stride = cfg.FPN_ANCHOR_STRIDE
        # Anchors are not precomputed here; forward() generates them from
        # the feature-map shapes it is given.
        # self._anchors = torch.from_numpy(generate_anchors_all_pyramids(self._fpn_scales, ratios, self._fpn_feature_strides, fpn_anchor_stride))
        # self._num_anchors = self._anchors.size(0)

    def forward(self, input):
        """Turn RPN outputs into a fixed-size tensor of proposals.

        input: indexable with five entries --
            input[0] class probabilities (index 1 of the last dimension is
                     taken as the score),
            input[1] box deltas,
            input[2] im_info,
            input[3] cfg key used to look up ``cfg[cfg_key].RPN_*``,
            input[4] feature-map shapes used to generate the anchors.

        Returns a tensor of size (batch_size, post_nms_topN, 5): column 0
        holds the batch index, columns 1: hold the kept proposals, and rows
        beyond the number of kept proposals stay zero.
        """

        # Algorithm:
        #
        # for each (H, W) location i
        #   generate A anchor boxes centered on cell i
        #   apply predicted bbox deltas at cell i to each of the A anchors
        # clip predicted boxes to image
        # remove predicted boxes with either height or width < threshold
        # sort all (proposal, score) pairs by score from highest to lowest
        # take top pre_nms_topN proposals before NMS
        # apply NMS with threshold 0.7 to remaining proposals
        # take after_nms_topN proposals after NMS
        # return the top proposals (-> RoIs top, scores top)
        #
        # NOTE: the "remove small boxes" step is currently disabled below
        # (the _filter_boxes call is commented out).


        # index 1 of the last dimension is used as the score (presumably
        # the fg probability -- verify against the caller); indexing drops
        # that dimension
        scores = input[0][:, :, 1]  # batch_size x num_rois
        bbox_deltas = input[1]      # batch_size x num_rois x 4
        im_info = input[2]
        cfg_key = input[3]
        feat_shapes = input[4]

        pre_nms_topN  = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        nms_thresh    = cfg[cfg_key].RPN_NMS_THRESH
        # NOTE(review): min_size is read but unused while filtering is disabled.
        min_size      = cfg[cfg_key].RPN_MIN_SIZE

        batch_size = bbox_deltas.size(0)

        # anchors are regenerated on every call from the given feature shapes
        anchors = torch.from_numpy(generate_anchors_all_pyramids(self._fpn_scales, self._anchor_ratios,
                feat_shapes, self._fpn_feature_strides, self._fpn_anchor_stride)).type_as(scores)
        num_anchors = anchors.size(0)

        # share the same anchors across the batch
        anchors = anchors.view(1, num_anchors, 4).expand(batch_size, num_anchors, 4)

        # Convert anchors into proposals via bbox transformations
        proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)

        # 2. clip predicted boxes to image
        proposals = clip_boxes(proposals, im_info, batch_size)
        # keep_idx = self._filter_boxes(proposals, min_size).squeeze().long().nonzero().squeeze()

        scores_keep = scores
        proposals_keep = proposals

        # descending sort along dimension 1 (per image)
        _, order = torch.sort(scores_keep, 1, True)

        output = scores.new(batch_size, post_nms_topN, 5).zero_()
        for i in range(batch_size):
            # # 3. remove predicted boxes with either height or width < threshold
            # # (NOTE: convert min_size to input image scale stored in im_info[2])
            proposals_single = proposals_keep[i]
            scores_single = scores_keep[i]

            # # 4. sort all (proposal, score) pairs by score from highest to lowest
            # # 5. take top pre_nms_topN (e.g. 6000)
            order_single = order[i]

            # NOTE(review): numel() is the element count of the whole
            # batch's score tensor, not of this image's row.
            if pre_nms_topN > 0 and pre_nms_topN < scores_keep.numel():
                order_single = order_single[:pre_nms_topN]

            proposals_single = proposals_single[order_single, :]
            scores_single = scores_single[order_single].view(-1,1)

            # 6. apply nms (e.g. threshold = 0.7)
            # 7. take after_nms_topN (e.g. 300)
            # 8. return the top proposals (-> RoIs top)

            # nms receives the boxes with the score appended as last column
            keep_idx_i = nms(torch.cat((proposals_single, scores_single), 1), nms_thresh)
            keep_idx_i = keep_idx_i.long().view(-1)

            if post_nms_topN > 0:
                keep_idx_i = keep_idx_i[:post_nms_topN]
            proposals_single = proposals_single[keep_idx_i, :]
            scores_single = scores_single[keep_idx_i, :]

            # padding 0 at the end.
            num_proposal = proposals_single.size(0)
            output[i,:,0] = i
            output[i,:num_proposal,1:] = proposals_single

        return output

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass

    def _filter_boxes(self, boxes, min_size):
        """Return a mask of boxes whose width and height are both >= min_size.

        Width and height are computed as (index 2 - index 0 + 1) and
        (index 3 - index 1 + 1) of the last dimension. The boxes themselves
        are not removed; the caller applies the mask.
        """
        ws = boxes[:, :, 2] - boxes[:, :, 0] + 1
        hs = boxes[:, :, 3] - boxes[:, :, 1] + 1
        keep = ((ws >= min_size) & (hs >= min_size))
        return keep
// Intersection-over-union of two boxes stored as {left, top, right, bottom}.
// Widths and heights use the "+ 1" convention, i.e. coordinates are treated
// as inclusive.
__device__ inline float devIoU(float const * const a, float const * const b) {
  // Corners of the intersection rectangle.
  const float ix1 = max(a[0], b[0]);
  const float ix2 = min(a[2], b[2]);
  const float iy1 = max(a[1], b[1]);
  const float iy2 = min(a[3], b[3]);

  // Clamp at zero so disjoint boxes give an empty intersection.
  const float iw = max(ix2 - ix1 + 1, 0.f);
  const float ih = max(iy2 - iy1 + 1, 0.f);
  const float inter = iw * ih;

  const float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  const float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);

  return inter / (area_a + area_b - inter);
}
// Runs non-maximum suppression and writes the result into device memory.
//
//   keep_out           destination for the kept box indices; filled with a
//                      cudaMemcpy of boxes_num ints, so it must have room
//                      for boxes_num ints. Entries past *num_out are zero.
//   num_out            destination for the number of kept boxes (one int,
//                      also written with cudaMemcpy)
//   boxes_host         source of boxes_num * boxes_dim floats
//   boxes_num          number of boxes
//   boxes_dim          floats per box (nms_kernel reads boxes with a fixed
//                      stride of 5, so this is presumably always 5 -- verify
//                      against callers)
//   nms_overlap_thresh IoU above which a later box is suppressed
//
// Boxes are visited in input order, so callers are expected to pass them
// sorted by descending score (assumption -- not checked here).
void nms_cuda_compute(int* keep_out, int *num_out, float* boxes_host, int boxes_num,
                      int boxes_dim, float nms_overlap_thresh) {

  // Nothing to suppress: report zero kept boxes. This also avoids a
  // zero-sized grid launch and indexing element 0 of empty vectors.
  if (boxes_num <= 0) {
    int none_kept = 0;
    CUDA_WARN(cudaMemcpy(num_out, &none_kept, 1 * sizeof(int), cudaMemcpyHostToDevice));
    return;
  }

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  CUDA_CHECK(cudaMalloc(&boxes_dev,
                        boxes_num * boxes_dim * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(boxes_dev,
                        boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

  // One thread block per (column block, row block) tile of the mask.
  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);

  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  // remv holds one bit per box: set once the box has been suppressed.
  // std::vector value-initialises its elements, so it starts all-zero.
  std::vector<unsigned long long> remv(col_blocks);

  // keep_out is written with cudaMemcpy below, so the indices are staged in
  // host memory first. A zero-initialised vector replaces new[]/delete[]:
  // the tail beyond num_to_keep is no longer uninitialised when copied, and
  // the buffer is released on every exit path.
  std::vector<int> keep_out_cpu(boxes_num);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out_cpu[num_to_keep++] = i;
      // Merge this box's suppression row into remv. Blocks before nblock
      // only describe boxes that were already visited.
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  // copy the staged indices to keep_out
  CUDA_WARN(cudaMemcpy(keep_out, &keep_out_cpu[0], boxes_num * sizeof(int), cudaMemcpyHostToDevice));

  // copy num_to_keep to num_out
  CUDA_WARN(cudaMemcpy(num_out, &num_to_keep, 1 * sizeof(int), cudaMemcpyHostToDevice));

  // release cuda memory
  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
}
// Backward pass of the bilinear sampler: forwards the sizes, data pointers
// and strides of the five tensors to the CUDA kernel launcher.
//
//   inputImages      images that were sampled in the forward pass
//   grids            sampling grids used in the forward pass
//   gradInputImages  passed to the launcher after grids (presumably receives
//                    the gradient w.r.t. inputImages -- verify in the kernel)
//   gradGrids        passed next (presumably receives the gradient w.r.t.
//                    grids -- verify in the kernel)
//   gradOutput       gradient w.r.t. the sampler output
//
// Always returns 1. A zero result from the launcher is reported through
// THError("aborting").
//
// NOTE(review): the positional argument order below (sizes 1,3,2,0 of
// gradOutput; sizes 1,2,3,0 of inputImages; strides 0,3,1,2 for the grid
// tensors) must match the prototype in roi_crop_cuda_kernel.h, which is not
// visible here -- confirm before reordering anything.
int BilinearSamplerBHWD_updateGradInput_cuda(THCudaTensor *inputImages, THCudaTensor *grids, THCudaTensor *gradInputImages,
                                        THCudaTensor *gradGrids, THCudaTensor *gradOutput)
{
//  THCState *state = getCutorchState(L);
//  THCudaTensor *inputImages = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
//  THCudaTensor *grids = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
//  THCudaTensor *gradInputImages = (THCudaTensor *)luaT_checkudata(L, 4, "torch.CudaTensor");
//  THCudaTensor *gradGrids = (THCudaTensor *)luaT_checkudata(L, 5, "torch.CudaTensor");
//  THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 6, "torch.CudaTensor");

  int success = 0;
  // sizes of gradOutput, in the order 1, 3, 2, 0
  success = BilinearSamplerBHWD_updateGradInput_cuda_kernel(gradOutput->size[1],
                                                gradOutput->size[3],
                                                gradOutput->size[2],
                                                gradOutput->size[0],
                                                // sizes of inputImages, in the order 1, 2, 3, 0
                                                THCudaTensor_size(state, inputImages, 1),
                                                THCudaTensor_size(state, inputImages, 2),
                                                THCudaTensor_size(state, inputImages, 3),
                                                THCudaTensor_size(state, inputImages, 0),
                                                // inputImages: data pointer and strides 0..3
                                                THCudaTensor_data(state, inputImages),
                                                THCudaTensor_stride(state, inputImages, 0),
                                                THCudaTensor_stride(state, inputImages, 1),
                                                THCudaTensor_stride(state, inputImages, 2),
                                                THCudaTensor_stride(state, inputImages, 3),
                                                // grids: data pointer and strides in the order 0, 3, 1, 2
                                                THCudaTensor_data(state, grids),
                                                THCudaTensor_stride(state, grids, 0),
                                                THCudaTensor_stride(state, grids, 3),
                                                THCudaTensor_stride(state, grids, 1),
                                                THCudaTensor_stride(state, grids, 2),
                                                // gradInputImages: data pointer and strides 0..3
                                                THCudaTensor_data(state, gradInputImages),
                                                THCudaTensor_stride(state, gradInputImages, 0),
                                                THCudaTensor_stride(state, gradInputImages, 1),
                                                THCudaTensor_stride(state, gradInputImages, 2),
                                                THCudaTensor_stride(state, gradInputImages, 3),
                                                // gradGrids: data pointer and strides in the order 0, 3, 1, 2
                                                THCudaTensor_data(state, gradGrids),
                                                THCudaTensor_stride(state, gradGrids, 0),
                                                THCudaTensor_stride(state, gradGrids, 3),
                                                THCudaTensor_stride(state, gradGrids, 1),
                                                THCudaTensor_stride(state, gradGrids, 2),
                                                // gradOutput: data pointer and strides 0..3
                                                THCudaTensor_data(state, gradOutput),
                                                THCudaTensor_stride(state, gradOutput, 0),
                                                THCudaTensor_stride(state, gradOutput, 1),
                                                THCudaTensor_stride(state, gradOutput, 2),
                                                THCudaTensor_stride(state, gradOutput, 3),
                                                // launch on the current THC stream
                                                THCState_getCurrentStream(state));

  //check for errors
  if (!success) {
    THError("aborting");
  }
  return 1;
}
def voc_ap(rec, prec, use_07_metric=False):
    """Compute VOC average precision from a precision/recall curve.

    ap = voc_ap(rec, prec, [use_07_metric])

    rec, prec: 1-D array-likes of equal length holding recall and precision
        at each detection rank. Lists and tuples are accepted as well as
        numpy arrays.
    use_07_metric: if true, use the VOC07 11-point method, i.e. the mean of
        the best precision reached at recall >= 0.0, 0.1, ..., 1.0
        (default: False). Otherwise return the area under the
        monotonically non-increasing precision envelope.

    Returns the average precision as a float (0 for empty input).
    """
    # Accept any array-like: the comparisons and fancy indexing below need
    # real ndarrays.
    rec = np.asarray(rec, dtype=float)
    prec = np.asarray(prec, dtype=float)

    if use_07_metric:
        # 11 point metric
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            reached = rec >= t
            # precision counts as 0 when this recall level is never reached
            p = np.max(prec[reached]) if reached.any() else 0
            ap = ap + p / 11.
        return ap

    # correct AP calculation
    # first append sentinel values at both ends
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # compute the precision envelope: each entry becomes the maximum of
    # itself and everything to its right (running max from the right)
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    return np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
90 | classname: Category name (duh) 91 | cachedir: Directory for caching the annotations 92 | [ovthresh]: Overlap threshold (default = 0.5) 93 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 94 | (default False) 95 | """ 96 | # assumes detections are in detpath.format(classname) 97 | # assumes annotations are in annopath.format(imagename) 98 | # assumes imagesetfile is a text file with each line an image name 99 | # cachedir caches the annotations in a pickle file 100 | 101 | # first load gt 102 | if not os.path.isdir(cachedir): 103 | os.mkdir(cachedir) 104 | cachefile = os.path.join(cachedir, '%s_annots.pkl' % imagesetfile) 105 | # read list of images 106 | with open(imagesetfile, 'r') as f: 107 | lines = f.readlines() 108 | imagenames = [x.strip() for x in lines] 109 | 110 | if not os.path.isfile(cachefile): 111 | # load annotations 112 | recs = {} 113 | for i, imagename in enumerate(imagenames): 114 | recs[imagename] = parse_rec(annopath.format(imagename)) 115 | if i % 100 == 0: 116 | print('Reading annotation for {:d}/{:d}'.format( 117 | i + 1, len(imagenames))) 118 | # save 119 | print('Saving cached annotations to {:s}'.format(cachefile)) 120 | with open(cachefile, 'w') as f: 121 | pickle.dump(recs, f) 122 | else: 123 | # load 124 | with open(cachefile, 'rb') as f: 125 | try: 126 | recs = pickle.load(f) 127 | except: 128 | recs = pickle.load(f, encoding='bytes') 129 | 130 | # extract gt objects for this class 131 | class_recs = {} 132 | npos = 0 133 | for imagename in imagenames: 134 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 135 | bbox = np.array([x['bbox'] for x in R]) 136 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 137 | det = [False] * len(R) 138 | npos = npos + sum(~difficult) 139 | class_recs[imagename] = {'bbox': bbox, 140 | 'difficult': difficult, 141 | 'det': det} 142 | 143 | # read dets 144 | detfile = detpath.format(classname) 145 | with open(detfile, 'r') as f: 146 | lines = 
f.readlines() 147 | 148 | splitlines = [x.strip().split(' ') for x in lines] 149 | image_ids = [x[0] for x in splitlines] 150 | confidence = np.array([float(x[1]) for x in splitlines]) 151 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 152 | 153 | nd = len(image_ids) 154 | tp = np.zeros(nd) 155 | fp = np.zeros(nd) 156 | 157 | if BB.shape[0] > 0: 158 | # sort by confidence 159 | sorted_ind = np.argsort(-confidence) 160 | sorted_scores = np.sort(-confidence) 161 | BB = BB[sorted_ind, :] 162 | image_ids = [image_ids[x] for x in sorted_ind] 163 | 164 | # go down dets and mark TPs and FPs 165 | for d in range(nd): 166 | R = class_recs[image_ids[d]] 167 | bb = BB[d, :].astype(float) 168 | ovmax = -np.inf 169 | BBGT = R['bbox'].astype(float) 170 | 171 | if BBGT.size > 0: 172 | # compute overlaps 173 | # intersection 174 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 175 | iymin = np.maximum(BBGT[:, 1], bb[1]) 176 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 177 | iymax = np.minimum(BBGT[:, 3], bb[3]) 178 | iw = np.maximum(ixmax - ixmin + 1., 0.) 179 | ih = np.maximum(iymax - iymin + 1., 0.) 180 | inters = iw * ih 181 | 182 | # union 183 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 184 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 185 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 186 | 187 | overlaps = inters / uni 188 | ovmax = np.max(overlaps) 189 | jmax = np.argmax(overlaps) 190 | 191 | if ovmax > ovthresh: 192 | if not R['difficult'][jmax]: 193 | if not R['det'][jmax]: 194 | tp[d] = 1. 195 | R['det'][jmax] = 1 196 | else: 197 | fp[d] = 1. 198 | else: 199 | fp[d] = 1. 
200 | 201 | # compute precision recall 202 | fp = np.cumsum(fp) 203 | tp = np.cumsum(tp) 204 | rec = tp / float(npos) 205 | # avoid divide by zero in case the first detection matches a difficult 206 | # ground truth 207 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 208 | ap = voc_ap(rec, prec, use_07_metric) 209 | 210 | return rec, prec, ap 211 | -------------------------------------------------------------------------------- /lib/model/roi_align/src/roi_align_kernel.cu: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include "roi_align_kernel.h" 9 | 10 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 12 | i += blockDim.x * gridDim.x) 13 | 14 | 15 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width, 16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) { 17 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 18 | // (n, c, ph, pw) is an element in the aligned output 19 | // int n = index; 20 | // int pw = n % aligned_width; 21 | // n /= aligned_width; 22 | // int ph = n % aligned_height; 23 | // n /= aligned_height; 24 | // int c = n % channels; 25 | // n /= channels; 26 | 27 | int pw = index % aligned_width; 28 | int ph = (index / aligned_width) % aligned_height; 29 | int c = (index / aligned_width / aligned_height) % channels; 30 | int n = index / aligned_width / aligned_height / channels; 31 | 32 | // bottom_rois += n * 5; 33 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 34 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 35 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 36 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 37 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 
#ifdef __cplusplus
extern "C" {
#endif

// NOTE(review): the bracketed include targets were missing from this copy of
// the file; stdio/math/float are restored as the presumed originals and
// stdlib.h is added for exit(). Confirm against the repository.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include "roi_align_kernel.h"

// Loop over work items i in [0, n): each thread starts at its global thread
// index and advances by the total number of launched threads.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)


// Forward pass: one work item per output element. For output element
// (n, c, ph, pw) a single sample point inside ROI n is bilinearly
// interpolated from channel c of the image given by the ROI's batch index.
//   bottom_data: input features, indexed as (batch, channel, height, width)
//   bottom_rois: 5 floats per ROI: batch index, x1, y1, x2, y2
//   top_data:    output, indexed as (roi, channel, aligned_h, aligned_w)
__global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width,
                                const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) {
    CUDA_1D_KERNEL_LOOP(index, nthreads) {
        // (n, c, ph, pw) is an element in the aligned output
        int pw = index % aligned_width;
        int ph = (index / aligned_width) % aligned_height;
        int c  = (index / aligned_width / aligned_height) % channels;
        int n  = index / aligned_width / aligned_height / channels;

        // ROI corners scaled from image coordinates to feature-map coordinates
        float roi_batch_ind = bottom_rois[n * 5 + 0];
        float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
        float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
        float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
        float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

        // Clamp the extent of malformed (inverted) ROIs to zero
        float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
        float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
        // NOTE(review): divides by zero when aligned_height or aligned_width
        // is 1; callers are presumably expected to pass values >= 2.
        float bin_size_h = roi_height / (aligned_height - 1.);
        float bin_size_w = roi_width / (aligned_width - 1.);

        // Sample position in feature-map coordinates
        float h = (float)(ph) * bin_size_h + roi_start_h;
        float w = (float)(pw) * bin_size_w + roi_start_w;

        // Top-left neighbour, clamped so the +1 row/column stays in range
        int hstart = fminf(floor(h), height - 2);
        int wstart = fminf(floor(w), width - 2);

        int img_start = roi_batch_ind * channels * height * width;

        // bilinear interpolation; samples outside the feature map yield 0
        if (h < 0 || h >= height || w < 0 || w >= width) {
            top_data[index] = 0.;
        } else {
            float h_ratio = h - (float)(hstart);
            float w_ratio = w - (float)(wstart);
            int upleft = img_start + (c * height + hstart) * width + wstart;
            int upright = upleft + 1;
            int downleft = upleft + width;
            int downright = downleft + 1;

            top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)
                + bottom_data[upright] * (1. - h_ratio) * w_ratio
                + bottom_data[downleft] * h_ratio * (1. - w_ratio)
                + bottom_data[downright] * h_ratio * w_ratio;
        }
    }
}


// Launch ROIAlignForward on `stream` with one work item per output element.
// Returns 1; terminates the process if the launch reports an error.
int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width,
                           const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) {
    const int kThreadsPerBlock = 1024;
    const int output_size = num_rois * aligned_height * aligned_width * channels;
    cudaError_t err;

    // Nothing to compute. Launching a grid of zero blocks is an invalid
    // configuration and would otherwise end in exit(-1) below.
    if (output_size <= 0) {
        return 1;
    }

    ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, bottom_data, spatial_scale, height, width, channels,
      aligned_height, aligned_width, bottom_rois, top_data);

    err = cudaGetLastError();
    if(cudaSuccess != err) {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}


// Backward pass: one work item per output-gradient element. The gradient of
// output element (n, c, ph, pw) is scattered to the four input cells used by
// the forward interpolation, weighted by the same bilinear coefficients.
// atomicAdd is used because several work items can hit the same input cell.
__global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width,
                                 const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) {
    CUDA_1D_KERNEL_LOOP(index, nthreads) {

        // (n, c, ph, pw) is an element in the aligned output
        int pw = index % aligned_width;
        int ph = (index / aligned_width) % aligned_height;
        int c  = (index / aligned_width / aligned_height) % channels;
        int n  = index / aligned_width / aligned_height / channels;

        float roi_batch_ind = bottom_rois[n * 5 + 0];
        float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
        float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
        float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
        float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

        // Clamp the extent of malformed (inverted) ROIs to zero
        float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
        float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
        float bin_size_h = roi_height / (aligned_height - 1.);
        float bin_size_w = roi_width / (aligned_width - 1.);

        float h = (float)(ph) * bin_size_h + roi_start_h;
        float w = (float)(pw) * bin_size_w + roi_start_w;

        int hstart = fminf(floor(h), height - 2);
        int wstart = fminf(floor(w), width - 2);

        int img_start = roi_batch_ind * channels * height * width;

        // bilinear interpolation; samples outside the map contribute nothing
        if (!(h < 0 || h >= height || w < 0 || w >= width)) {
            float h_ratio = h - (float)(hstart);
            float w_ratio = w - (float)(wstart);
            int upleft = img_start + (c * height + hstart) * width + wstart;
            int upright = upleft + 1;
            int downleft = upleft + width;
            int downright = downleft + 1;

            atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio));
            atomicAdd(bottom_diff + upright, top_diff[index] * (1. - h_ratio) * w_ratio);
            atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio));
            atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio);
        }
    }
}

// Launch ROIAlignBackward on `stream`. `batch_size` is accepted but not used.
// Returns 1; terminates the process if the launch reports an error.
int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width,
                            const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) {
    const int kThreadsPerBlock = 1024;
    const int output_size = num_rois * aligned_height * aligned_width * channels;
    cudaError_t err;

    // Nothing to propagate; avoid an invalid zero-block launch.
    if (output_size <= 0) {
        return 1;
    }

    ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, top_diff, spatial_scale, height, width, channels,
      aligned_height, aligned_width, bottom_diff, bottom_rois);

    err = cudaGetLastError();
    if(cudaSuccess != err) {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}


#ifdef __cplusplus
}
#endif
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import numpy as np
import pdb

# Reference anchors from Shaoqing's matlab implementation:
#
#    >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
#    >> anchors
#
#    anchors =
#
#       -83   -39   100    56
#      -175   -87   192   104
#      -359  -183   376   200
#       -55   -55    72    72
#      -119  -119   136   136
#      -247  -247   264   264
#       -35   -79    52    96
#       -79  -167    96   184
#      -167  -343   184   360
#
# generate_anchors() starts from the window (0, 0, 15, 15), so with the
# default arguments it returns the values above shifted by -1:
#
# array([[ -84.,  -40.,   99.,   55.],
#        [-176.,  -88.,  191.,  103.],
#        [-360., -184.,  375.,  199.],
#        [ -56.,  -56.,   71.,   71.],
#        [-120., -120.,  135.,  135.],
#        [-248., -248.,  263.,  263.],
#        [ -36.,  -80.,   51.,   95.],
#        [ -80., -168.,   95.,  183.],
#        [-168., -344.,  183.,  359.]])


def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.

    Returns an array of shape (len(ratios) * len(scales), 4) holding
    (x1, y1, x2, y2) rows, ordered ratio-major. The default arguments are
    only read, never modified.
    """

    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
                         for i in range(ratio_anchors.shape[0])])
    return anchors


def _whctrs(anchor):
    """
    Return width, height, x center, and y center for an anchor (window)
    given as (x1, y1, x2, y2) with inclusive corner coordinates.
    """

    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows), one
    (x1, y1, x2, y2) row per width/height pair.
    """

    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors


def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    The area of the input anchor is approximately preserved (widths and
    heights are rounded to whole numbers).
    """

    ratios = np.asarray(ratios)
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.
    """

    scales = np.asarray(scales)
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


############################################################
#  Anchors
############################################################

def generate_anchors_single_pyramid(scales, ratios, shape, feature_stride, anchor_stride):
    """
    Generate anchors for one level of a feature pyramid.

    scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
    ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
    shape: [height, width] spatial shape of the feature map over which
            to generate anchors.
    feature_stride: Stride of the feature map relative to the image in pixels.
    anchor_stride: Stride of anchors on the feature map. For example, if the
        value is 2 then generate anchors for every other feature map pixel.

    Returns an array of (x1, y1, x2, y2) rows, one per combination of
    feature-map position and scale/ratio pair.
    """
    # Get all combinations of scales and ratios
    scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
    scales = scales.flatten()
    ratios = ratios.flatten()

    # Enumerate heights and widths from scales and ratios
    heights = scales / np.sqrt(ratios)
    widths = scales * np.sqrt(ratios)

    # Enumerate shifts in feature space
    shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
    shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
    shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)

    # Enumerate combinations of shifts, widths, and heights
    box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
    box_heights, box_centers_y = np.meshgrid(heights, shifts_y)

    # NOTE: the original order is (y, x), we changed it to (x, y) for our code
    # Reshape to get a list of (x, y) and a list of (w, h)
    box_centers = np.stack(
        [box_centers_x, box_centers_y], axis=2).reshape([-1, 2])
    box_sizes = np.stack([box_widths, box_heights], axis=2).reshape([-1, 2])

    # Convert to corner coordinates (x1, y1, x2, y2)
    boxes = np.concatenate([box_centers - 0.5 * box_sizes,
                            box_centers + 0.5 * box_sizes], axis=1)
    return boxes


def generate_anchors_all_pyramids(scales, ratios, feature_shapes, feature_strides,
                                  anchor_stride):
    """Generate anchors at different levels of a feature pyramid. Each scale
    is associated with a level of the pyramid, but each ratio is used in
    all levels of the pyramid.

    scales, feature_shapes and feature_strides are indexed per level;
    ratios and anchor_stride are shared by all levels.

    Returns:
    anchors: [N, (x1, y1, x2, y2)]. All generated anchors in one array. Sorted
        with the same order of the given scales. So, anchors of scale[0] come
        first, then anchors of scale[1], and so on.
    """
    # Anchors
    # [anchor_count, (x1, y1, x2, y2)]
    anchors = []
    for i in range(len(scales)):
        anchors.append(generate_anchors_single_pyramid(scales[i], ratios, feature_shapes[i],
                                                       feature_strides[i], anchor_stride))
    return np.concatenate(anchors, axis=0)


if __name__ == '__main__':
    import time
    t = time.time()
    a = generate_anchors()
    print(time.time() - t)
    print(a)
    from IPython import embed; embed()
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
# --------------------------------------------------------
# Reorganized and modified by Jianwei Yang and Jiasen Lu
# --------------------------------------------------------

import torch
import torch.nn as nn
import numpy as np
import numpy.random as npr

from model.utils.config import cfg
# Explicit relative imports: the implicit form only resolves on Python 2.
from .generate_anchors import generate_anchors, generate_anchors_all_pyramids
from .bbox_transform import clip_boxes, bbox_overlaps_batch, bbox_transform_batch

import pdb

DEBUG = False


class _AnchorTargetLayer_FPN(nn.Module):
    """
        Assign anchors to ground-truth targets. Produces anchor classification
        labels and bounding-box regression targets.
    """
    def __init__(self, feat_stride, scales, ratios):
        """Store the anchor configuration.

        feat_stride and ratios are kept as given. ``scales`` is accepted but
        not used: the per-level scales, feature strides and anchor stride
        are read from cfg.FPN_ANCHOR_SCALES, cfg.FPN_FEAT_STRIDES and
        cfg.FPN_ANCHOR_STRIDE.
        """
        super(_AnchorTargetLayer_FPN, self).__init__()
        self._anchor_ratios = ratios
        self._feat_stride = feat_stride
        self._fpn_scales = np.array(cfg.FPN_ANCHOR_SCALES)
        self._fpn_feature_strides = np.array(cfg.FPN_FEAT_STRIDES)
        self._fpn_anchor_stride = cfg.FPN_ANCHOR_STRIDE

        # allow boxes to sit over the edge by a small amount
        self._allowed_border = 0  # default is 0

    def forward(self, input):
        """Compute RPN training targets for every pyramid anchor.

        input is a sequence:
            input[0] scores      -- only used to pick the tensor type of the anchors
            input[1] gt_boxes    -- viewed as (batch_size, num_gt, 5) below
            input[2] im_info     -- im_info[0][0] / im_info[0][1] are used as the
                                    image height / width bounds for all images
            input[3] num_boxes   -- not used by this layer
            input[4] feat_shapes -- per-level feature map shapes

        Algorithm:
            generate the anchors of all pyramid levels
            drop anchors that cross the image boundary
            label anchors from their overlap with the ground-truth boxes
            subsample foreground/background labels per image
            compute regression targets and loss weights
            scatter everything back to the full anchor set

        Returns [labels, bbox_targets, bbox_inside_weights,
        bbox_outside_weights], each indexed (batch, anchor[, ...]) over the
        full anchor set. Labels: 1 positive, 0 negative, -1 don't care.
        """
        scores = input[0]
        gt_boxes = input[1]
        im_info = input[2]
        feat_shapes = input[4]

        batch_size = gt_boxes.size(0)

        anchors = torch.from_numpy(generate_anchors_all_pyramids(self._fpn_scales, self._anchor_ratios,
                feat_shapes, self._fpn_feature_strides, self._fpn_anchor_stride)).type_as(scores)
        total_anchors = anchors.size(0)

        # int() instead of the Python 2 only long()
        keep = ((anchors[:, 0] >= -self._allowed_border) &
                (anchors[:, 1] >= -self._allowed_border) &
                (anchors[:, 2] < int(im_info[0][1]) + self._allowed_border) &
                (anchors[:, 3] < int(im_info[0][0]) + self._allowed_border))

        inds_inside = torch.nonzero(keep).view(-1)

        # keep only inside anchors
        anchors = anchors[inds_inside, :]

        # label: 1 is positive, 0 is negative, -1 is dont care
        labels = gt_boxes.new(batch_size, inds_inside.size(0)).fill_(-1)
        bbox_inside_weights = gt_boxes.new(batch_size, inds_inside.size(0)).zero_()
        bbox_outside_weights = gt_boxes.new(batch_size, inds_inside.size(0)).zero_()

        overlaps = bbox_overlaps_batch(anchors, gt_boxes)

        # best ground truth per anchor, and best overlap per ground truth
        max_overlaps, argmax_overlaps = torch.max(overlaps, 2)
        gt_max_overlaps, _ = torch.max(overlaps, 1)

        if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
            # assign bg labels first so that positive labels can clobber them
            labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

        # fg label: for each gt, the anchor(s) with the highest overlap.
        # Zero maxima are replaced so that zero-overlap anchors never match.
        gt_max_overlaps[gt_max_overlaps==0] = 1e-5
        keep = torch.sum(overlaps.eq(gt_max_overlaps.view(batch_size,1,-1).expand_as(overlaps)), 2)

        if torch.sum(keep) > 0:
            labels[keep>0] = 1

        # fg label: above threshold IOU
        labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

        if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
            # assign bg labels last so that negative labels can clobber positives
            labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

        num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)

        sum_fg = torch.sum((labels == 1).int(), 1)
        sum_bg = torch.sum((labels == 0).int(), 1)

        for i in range(batch_size):
            # subsample positive labels if we have too many
            if sum_fg[i] > num_fg:
                fg_inds = torch.nonzero(labels[i] == 1).view(-1)
                # torch.randperm seems has a bug on multi-gpu setting that cause the segfault.
                # See https://github.com/pytorch/pytorch/issues/1868 for more details.
                # use numpy instead.
                rand_num = torch.from_numpy(np.random.permutation(fg_inds.size(0))).type_as(gt_boxes).long()
                disable_inds = fg_inds[rand_num[:fg_inds.size(0)-num_fg]]
                labels[i][disable_inds] = -1

            # Count foreground labels *after* subsampling. The count taken
            # before subsampling (sum_fg) can exceed RPN_BATCHSIZE, which
            # made the background budget too small or even negative.
            num_bg = cfg.TRAIN.RPN_BATCHSIZE - int(torch.sum(labels[i] == 1))

            # subsample negative labels if we have too many
            if sum_bg[i] > num_bg:
                bg_inds = torch.nonzero(labels[i] == 0).view(-1)
                rand_num = torch.from_numpy(np.random.permutation(bg_inds.size(0))).type_as(gt_boxes).long()
                disable_inds = bg_inds[rand_num[:bg_inds.size(0)-num_bg]]
                labels[i][disable_inds] = -1

        # Turn per-image gt indices into indices into the flattened
        # (batch_size * num_gt, 5) view of gt_boxes.
        offset = torch.arange(0, batch_size)*gt_boxes.size(1)

        argmax_overlaps = argmax_overlaps + offset.view(batch_size, 1).type_as(argmax_overlaps)
        bbox_targets = _compute_targets_batch(anchors, gt_boxes.view(-1,5)[argmax_overlaps.view(-1), :].view(batch_size, -1, 5))

        # use a single value instead of 4 values for easy index.
        bbox_inside_weights[labels==1] = cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS[0]

        if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
            # Uniform weighting of examples.
            # NOTE(review): the original code indexed labels with the loop
            # variable left over from the loop above, i.e. it counts the
            # labelled anchors of the LAST image only. Kept as is; confirm
            # whether a per-image or whole-batch count was intended.
            num_examples = torch.sum(labels[batch_size - 1] >= 0)
            positive_weights = 1.0 / num_examples
            negative_weights = 1.0 / num_examples
        else:
            assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                    (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
            # This branch previously defined no weights and failed with a
            # NameError below. Split the total weight between positives and
            # negatives; max(..., 1) guards against an empty class.
            num_pos = max(float(torch.sum(labels == 1)), 1.0)
            num_neg = max(float(torch.sum(labels == 0)), 1.0)
            positive_weights = cfg.TRAIN.RPN_POSITIVE_WEIGHT / num_pos
            negative_weights = (1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / num_neg

        bbox_outside_weights[labels == 1] = positive_weights
        bbox_outside_weights[labels == 0] = negative_weights

        # Scatter the results for inside anchors back to the full anchor set.
        labels = _unmap(labels, total_anchors, inds_inside, batch_size, fill=-1)
        bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, batch_size, fill=0)
        bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, batch_size, fill=0)
        bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, batch_size, fill=0)

        # The outputs stay flat per anchor; they are not reshaped to a
        # feature-map layout and the weights hold one value per anchor.
        outputs = [labels, bbox_targets, bbox_inside_weights, bbox_outside_weights]

        return outputs

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass


def _unmap(data, count, inds, batch_size, fill=0):
    """ Unmap a subset of item (data) back to the original set of items (of
    size count).

    Returns a tensor of the same type as ``data`` with ``count`` entries
    along dimension 1: positions ``inds`` hold ``data``, all others hold
    ``fill``. ``data`` may be 2-D (batch, n) or 3-D (batch, n, k).
    """

    if data.dim() == 2:
        ret = torch.Tensor(batch_size, count).fill_(fill).type_as(data)
        ret[:, inds] = data
    else:
        ret = torch.Tensor(batch_size, count, data.size(2)).fill_(fill).type_as(data)
        ret[:, inds,:] = data
    return ret


def _compute_targets_batch(ex_rois, gt_rois):
    """Compute bounding-box regression targets for a batch of images.

    Only the first four columns of gt_rois (the box coordinates) are passed
    on to bbox_transform_batch.
    """

    return bbox_transform_batch(ex_rois, gt_rois[:, :, :4])
('__background__',) 29 | self._wnid_image = (0,) 30 | 31 | self._classes = ('__background__',) 32 | self._wnid = (0,) 33 | 34 | for i in xrange(200): 35 | self._classes_image = self._classes_image + (synsets_image['synsets'][0][i][2][0],) 36 | self._wnid_image = self._wnid_image + (synsets_image['synsets'][0][i][1][0],) 37 | 38 | for i in xrange(30): 39 | self._classes = self._classes + (synsets_video['synsets'][0][i][2][0],) 40 | self._wnid = self._wnid + (synsets_video['synsets'][0][i][1][0],) 41 | 42 | self._wnid_to_ind_image = dict(zip(self._wnid_image, xrange(201))) 43 | self._class_to_ind_image = dict(zip(self._classes_image, xrange(201))) 44 | 45 | self._wnid_to_ind = dict(zip(self._wnid, xrange(31))) 46 | self._class_to_ind = dict(zip(self._classes, xrange(31))) 47 | 48 | #check for valid intersection between video and image classes 49 | self._valid_image_flag = [0]*201 50 | 51 | for i in range(1,201): 52 | if self._wnid_image[i] in self._wnid_to_ind: 53 | self._valid_image_flag[i] = 1 54 | 55 | self._image_ext = ['.JPEG'] 56 | 57 | self._image_index = self._load_image_set_index() 58 | # Default to roidb handler 59 | self._roidb_handler = self.gt_roidb 60 | 61 | # Specific config options 62 | self.config = {'cleanup' : True, 63 | 'use_salt' : True, 64 | 'top_k' : 2000} 65 | 66 | assert os.path.exists(self._devkit_path), 'Devkit path does not exist: {}'.format(self._devkit_path) 67 | assert os.path.exists(self._data_path), 'Path does not exist: {}'.format(self._data_path) 68 | 69 | def image_path_at(self, i): 70 | """ 71 | Return the absolute path to image i in the image sequence. 72 | """ 73 | return self.image_path_from_index(self._image_index[i]) 74 | 75 | def image_path_from_index(self, index): 76 | """ 77 | Construct an image path from the image's "index" identifier. 
def _load_image_set_index(self):
    """
    Load the indexes listed in this dataset's image set file.

    For any image set other than 'train' the stripped lines of
    ``<data_path>/ImageSets/val.txt`` are returned.

    For 'train' the whitespace-separated entries of
    ``<data_path>/ImageSets/trainr.txt`` are returned when that file exists.
    Otherwise the index is built by sampling 2000 entries per class (with
    wrap-around when a class has fewer candidates), written to ``trainr.txt``
    and returned.

    Returns:
        list of str: the image index identifiers.
    """
    # Local import: glob replaces the former `ls` shell call, so directory
    # names containing spaces or shell metacharacters are handled safely.
    import glob

    samples_per_class = 2000
    image_sets_dir = os.path.join(self._data_path, 'ImageSets')

    def read_lines(path):
        # Stripped lines of a text file; the handle is always closed.
        with open(path) as f:
            return [x.strip() for x in f.readlines()]

    def sample_with_wraparound(pool):
        # Draw `samples_per_class` entries following one random permutation,
        # cycling through it when the pool is smaller than the quota.
        # An empty pool contributes nothing (previously: ZeroDivisionError).
        num_lines = len(pool)
        if num_lines == 0:
            return []
        ids = np.random.permutation(num_lines)
        return [pool[ids[k % num_lines]] for k in range(samples_per_class)]

    if self._image_set != 'train':
        return read_lines(os.path.join(image_sets_dir, 'val.txt'))

    cache_file = os.path.join(image_sets_dir, 'trainr.txt')
    if os.path.exists(cache_file):
        with open(cache_file, 'r') as f:
            return f.read().split()

    image_index = []

    # NOTE(review): this loop stops at class 199 while the loop below covers
    # 1..200 -- looks like an off-by-one; left unchanged because fixing it
    # alters the sampled training set. Confirm against the devkit layout.
    for i in range(1, 200):
        set_file = os.path.join(image_sets_dir, 'DET', 'train_' + str(i) + '.txt')
        candidates = []
        for line in read_lines(set_file):
            subdir = line.split(' ')[0]
            # Same path text the old `ls` call produced; sorted to mimic its
            # alphabetical ordering.
            pattern = self._data_path + '/Data/DET/train/' + subdir + '/*.JPEG'
            # Drop the 5-character '.JPEG' suffix from every match.
            candidates.extend(p[:-5] for p in sorted(glob.glob(pattern)))
        image_index.extend(sample_with_wraparound(candidates))

    for i in range(1, 201):
        if self._valid_image_flag[i] == 1:
            set_file = os.path.join(image_sets_dir, 'train_pos_' + str(i) + '.txt')
            image_index.extend(sample_with_wraparound(read_lines(set_file)))

    # Persist the sampled index so later runs reuse the same training set.
    with open(cache_file, 'w') as f:
        f.writelines(entry + '\n' for entry in image_index)

    return image_index
169 | """ 170 | filename = os.path.join(self._data_path, 'Annotations', self._image_set, index + '.xml') 171 | 172 | # print 'Loading: {}'.format(filename) 173 | def get_data_from_tag(node, tag): 174 | return node.getElementsByTagName(tag)[0].childNodes[0].data 175 | 176 | with open(filename) as f: 177 | data = minidom.parseString(f.read()) 178 | 179 | objs = data.getElementsByTagName('object') 180 | num_objs = len(objs) 181 | 182 | boxes = np.zeros((num_objs, 4), dtype=np.uint16) 183 | gt_classes = np.zeros((num_objs), dtype=np.int32) 184 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) 185 | 186 | # Load object bounding boxes into a data frame. 187 | for ix, obj in enumerate(objs): 188 | x1 = float(get_data_from_tag(obj, 'xmin')) 189 | y1 = float(get_data_from_tag(obj, 'ymin')) 190 | x2 = float(get_data_from_tag(obj, 'xmax')) 191 | y2 = float(get_data_from_tag(obj, 'ymax')) 192 | cls = self._wnid_to_ind[ 193 | str(get_data_from_tag(obj, "name")).lower().strip()] 194 | boxes[ix, :] = [x1, y1, x2, y2] 195 | gt_classes[ix] = cls 196 | overlaps[ix, cls] = 1.0 197 | 198 | overlaps = scipy.sparse.csr_matrix(overlaps) 199 | 200 | return {'boxes' : boxes, 201 | 'gt_classes': gt_classes, 202 | 'gt_overlaps' : overlaps, 203 | 'flipped' : False} 204 | 205 | if __name__ == '__main__': 206 | d = datasets.imagenet('val', '') 207 | res = d.roidb 208 | from IPython import embed; embed() 209 | -------------------------------------------------------------------------------- /lib/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && athr) keep[j]=0; 105 | } 106 | } 107 | } 108 | 109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { 110 | double h, w, i, u, ga, da; siz g, d; int crowd; 111 | for( g=0; gthr) keep[j]=0; 129 | } 130 | } 131 | } 132 | 133 | void rleToBbox( const RLE *R, BB bb, siz n ) { 134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); 173 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 174 | s = dx>=dy ? 
(double)(ye-ys)/dx : (double)(xe-xs)/dy; 175 | if(dx>=dy) for( d=0; d<=dx; d++ ) { 176 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 177 | } else for( d=0; d<=dy; d++ ) { 178 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 179 | } 180 | } 181 | /* get points along y-boundary and downsample */ 182 | free(x); free(y); k=m; m=0; double xd, yd; 183 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 184 | for( j=1; jw-1 ) continue; 187 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 189 | x[m]=(int) xd; y[m]=(int) yd; m++; 190 | } 191 | /* compute rle encoding given y-boundary points */ 192 | k=m; a=malloc(sizeof(uint)*(k+1)); 193 | for( j=0; j0) b[m++]=a[j++]; else { 199 | j++; if(jm, p=0; long x; int more; 206 | char *s=malloc(sizeof(char)*m*6); 207 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 209 | while( more ) { 210 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 211 | if(more) c |= 0x20; c+=48; s[p++]=c; 212 | } 213 | } 214 | s[p]=0; return s; 215 | } 216 | 217 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 218 | siz m=0, p=0, k; long x; int more; uint *cnts; 219 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 220 | while( s[p] ) { 221 | x=0; k=0; more=1; 222 | while( more ) { 223 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 224 | more = c & 0x20; p++; k++; 225 | if(!more && (c & 0x10)) x |= -1 << 5*k; 226 | } 227 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 228 | } 229 | rleInit(R,h,w,m,cnts); free(cnts); 230 | } 231 | -------------------------------------------------------------------------------- /lib/roi_data_layer/roibatchLoader.py: -------------------------------------------------------------------------------- 1 | 2 | """The data layer used during training to train a Fast R-CNN network. 
def __init__(self, roidb, ratio_list, ratio_index, batch_size, num_classes, training=True, normalize=None):
    """
    Store the roidb and sampling metadata and precompute one target aspect
    ratio per sample so that all samples of a batch share the same ratio.

    ``ratio_list`` is walked in consecutive windows of ``batch_size``
    entries; the ratio assigned to each window is written into
    ``self.ratio_list_batch`` at the window's positions.
    """
    self._roidb = roidb
    self._num_classes = num_classes
    self.training = training
    self.normalize = normalize
    self.ratio_list = ratio_list
    self.ratio_index = ratio_index
    self.batch_size = batch_size
    self.data_size = len(self.ratio_list)

    # trimming sizes and the gt-box cap come from the global config
    self.trim_height = cfg.TRAIN.TRIM_HEIGHT
    self.trim_width = cfg.TRAIN.TRIM_WIDTH
    self.max_num_box = cfg.MAX_NUM_GT_BOXES

    # one shared ratio per batch window
    self.ratio_list_batch = torch.Tensor(self.data_size).zero_()
    last_valid = self.data_size - 1
    batch_count = int(np.ceil(len(ratio_index) / batch_size))
    for batch_no in range(batch_count):
        start = batch_no * batch_size
        stop = min(start + batch_size - 1, last_valid)

        if ratio_list[stop] < 1:
            # window's last ratio is below 1: use the window's first ratio
            shared_ratio = ratio_list[start]
        elif ratio_list[start] > 1:
            # window's first ratio is above 1: use the window's last ratio
            shared_ratio = ratio_list[stop]
        else:
            # window straddles 1: force a square ratio
            shared_ratio = 1

        self.ratio_list_batch[start:(stop + 1)] = shared_ratio
86 | ratio = self.ratio_list_batch[index] 87 | 88 | if self._roidb[index_ratio]['need_crop']: 89 | if ratio < 1: 90 | # this means that data_width << data_height, we need to crop the 91 | # data_height 92 | min_y = int(torch.min(gt_boxes[:,1])) 93 | max_y = int(torch.max(gt_boxes[:,3])) 94 | trim_size = int(np.floor(data_width / ratio)) 95 | box_region = max_y - min_y + 1 96 | if min_y == 0: 97 | y_s = 0 98 | else: 99 | if (box_region-trim_size) < 0: 100 | y_s_min = max(max_y-trim_size, 0) 101 | y_s_max = min(min_y, data_height-trim_size) 102 | if y_s_min == y_s_max: 103 | y_s = y_s_min 104 | else: 105 | y_s = np.random.choice(range(y_s_min, y_s_max)) 106 | else: 107 | y_s_add = int((box_region-trim_size)/2) 108 | if y_s_add == 0: 109 | y_s = min_y 110 | else: 111 | y_s = np.random.choice(range(min_y, min_y+y_s_add)) 112 | # crop the image 113 | data = data[:, y_s:(y_s + trim_size), :, :] 114 | 115 | # shift y coordiante of gt_boxes 116 | gt_boxes[:, 1] = gt_boxes[:, 1] - y_s 117 | gt_boxes[:, 3] = gt_boxes[:, 3] - y_s 118 | 119 | # update gt bounding box according the trip 120 | gt_boxes[:, 1].clamp_(0, trim_size - 1) 121 | gt_boxes[:, 3].clamp_(0, trim_size - 1) 122 | 123 | else: 124 | # this means that data_width >> data_height, we need to crop the 125 | # data_width 126 | min_x = int(torch.min(gt_boxes[:,0])) 127 | max_x = int(torch.max(gt_boxes[:,2])) 128 | trim_size = int(np.ceil(data_height * ratio)) 129 | box_region = max_x - min_x + 1 130 | if min_x == 0: 131 | x_s = 0 132 | else: 133 | if (box_region-trim_size) < 0: 134 | x_s_min = max(max_x-trim_size, 0) 135 | x_s_max = min(min_x, data_width-trim_size) 136 | if x_s_min == x_s_max: 137 | x_s = x_s_min 138 | else: 139 | x_s = np.random.choice(range(x_s_min, x_s_max)) 140 | else: 141 | x_s_add = int((box_region-trim_size)/2) 142 | if x_s_add == 0: 143 | x_s = min_x 144 | else: 145 | x_s = np.random.choice(range(min_x, min_x+x_s_add)) 146 | # crop the image 147 | data = data[:, :, x_s:(x_s + trim_size), :] 
148 | 149 | # shift x coordiante of gt_boxes 150 | gt_boxes[:, 0] = gt_boxes[:, 0] - x_s 151 | gt_boxes[:, 2] = gt_boxes[:, 2] - x_s 152 | # update gt bounding box according the trip 153 | gt_boxes[:, 0].clamp_(0, trim_size - 1) 154 | gt_boxes[:, 2].clamp_(0, trim_size - 1) 155 | 156 | # based on the ratio, padding the image. 157 | if ratio < 1: 158 | # this means that data_width < data_height 159 | trim_size = int(np.floor(data_width / ratio)) 160 | 161 | padding_data = torch.FloatTensor(int(np.ceil(data_width / ratio)), \ 162 | data_width, 3).zero_() 163 | 164 | padding_data[:data_height, :, :] = data[0] 165 | # update im_info 166 | im_info[0, 0] = padding_data.size(0) 167 | # print("height %d %d \n" %(index, anchor_idx)) 168 | elif ratio > 1: 169 | # this means that data_width > data_height 170 | # if the image need to crop. 171 | padding_data = torch.FloatTensor(data_height, \ 172 | int(np.ceil(data_height * ratio)), 3).zero_() 173 | padding_data[:, :data_width, :] = data[0] 174 | im_info[0, 1] = padding_data.size(1) 175 | else: 176 | trim_size = min(data_height, data_width) 177 | padding_data = torch.FloatTensor(trim_size, trim_size, 3).zero_() 178 | padding_data = data[0][:trim_size, :trim_size, :] 179 | gt_boxes.clamp_(0, trim_size) 180 | im_info[0, 0] = trim_size 181 | im_info[0, 1] = trim_size 182 | 183 | 184 | # check the bounding box: 185 | not_keep = (gt_boxes[:,0] == gt_boxes[:,2]) | (gt_boxes[:,1] == gt_boxes[:,3]) 186 | keep = torch.nonzero(not_keep == 0).view(-1) 187 | 188 | gt_boxes_padding = torch.FloatTensor(self.max_num_box, gt_boxes.size(1)).zero_() 189 | if keep.numel() != 0: 190 | gt_boxes = gt_boxes[keep] 191 | num_boxes = min(gt_boxes.size(0), self.max_num_box) 192 | gt_boxes_padding[:num_boxes,:] = gt_boxes[:num_boxes] 193 | else: 194 | num_boxes = 0 195 | 196 | # permute trim_data to adapt to downstream processing 197 | padding_data = padding_data.permute(2, 0, 1).contiguous() 198 | im_info = im_info.view(3) 199 | 200 | return 
class imdb(object):
    """Image database.

    Base class holding a dataset name, its class list, an image index and a
    lazily built roidb. Dataset subclasses supply ``image_path_at``,
    ``default_roidb`` / ``<method>_roidb`` handlers and evaluation.
    """

    def __init__(self, name, classes=None):
        self._name = name
        self._num_classes = 0
        if not classes:
            self._classes = []
        else:
            self._classes = classes
        self._image_index = []
        self._obj_proposer = 'gt'
        self._roidb = None
        self._roidb_handler = self.default_roidb
        # Use this dict for storing dataset specific config options
        self.config = {}

    @property
    def name(self):
        return self._name

    @property
    def num_classes(self):
        return len(self._classes)

    @property
    def classes(self):
        return self._classes

    @property
    def image_index(self):
        return self._image_index

    @property
    def roidb_handler(self):
        return self._roidb_handler

    @roidb_handler.setter
    def roidb_handler(self, val):
        self._roidb_handler = val

    def set_proposal_method(self, method):
        """Select the roidb handler named ``<method>_roidb`` on this object.

        Raises:
            AttributeError: if no such attribute exists.
        """
        # Plain attribute lookup; the previous eval() on a built string would
        # execute arbitrary expressions embedded in `method`.
        self.roidb_handler = getattr(self, method + '_roidb')

    @property
    def roidb(self):
        # A roidb is a list of dictionaries, each with the following keys:
        #   boxes
        #   gt_overlaps
        #   gt_classes
        #   flipped
        # It is built on first access by the current handler and then reused.
        if self._roidb is not None:
            return self._roidb
        self._roidb = self.roidb_handler()
        return self._roidb

    @property
    def cache_path(self):
        """Absolute path of ``<cfg.DATA_DIR>/cache``, created if missing."""
        cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache'))
        if not os.path.exists(cache_path):
            os.makedirs(cache_path)
        return cache_path

    @property
    def num_images(self):
        return len(self.image_index)

    def image_path_at(self, i):
        raise NotImplementedError

    def image_id_at(self, i):
        raise NotImplementedError

    def default_roidb(self):
        raise NotImplementedError

    def evaluate_detections(self, all_boxes, output_dir=None):
        """
        all_boxes is a list of length number-of-classes.
        Each list element is a list of length number-of-images.
        Each of those list elements is either an empty list []
        or a numpy array of detection.

        all_boxes[class][image] = [] or np.array of shape #dets x 5
        """
        raise NotImplementedError

    def _get_widths(self):
        """Return the pixel width of every image, read via PIL."""
        return [PIL.Image.open(self.image_path_at(i)).size[0]
                for i in range(self.num_images)]

    def append_flipped_images(self):
        """Append a horizontally flipped copy of every roidb entry.

        Only the x coordinates of ``boxes`` are mirrored; ``gt_overlaps`` and
        ``gt_classes`` are shared with the source entry. The image index is
        doubled so flipped entries map back to the same images.
        """
        num_images = self.num_images
        widths = self._get_widths()
        for i in range(num_images):
            boxes = self.roidb[i]['boxes'].copy()
            oldx1 = boxes[:, 0].copy()
            oldx2 = boxes[:, 2].copy()
            # mirror around the vertical axis; x1/x2 swap roles
            boxes[:, 0] = widths[i] - oldx2 - 1
            boxes[:, 2] = widths[i] - oldx1 - 1
            assert (boxes[:, 2] >= boxes[:, 0]).all()
            entry = {'boxes': boxes,
                     'gt_overlaps': self.roidb[i]['gt_overlaps'],
                     'gt_classes': self.roidb[i]['gt_classes'],
                     'flipped': True}
            self.roidb.append(entry)
        self._image_index = self._image_index * 2

    def evaluate_recall(self, candidate_boxes=None, thresholds=None,
                        area='all', limit=None):
        """Evaluate detection proposal recall metrics.

        Args:
            candidate_boxes: optional per-image list of box arrays; when
                omitted, the roidb boxes whose gt class is 0 are used.
            thresholds: optional IoU thresholds; defaults to 0.5:0.05:0.95.
            area: key selecting the ground-truth area range.
            limit: optional cap on the number of candidate boxes per image.

        Returns:
            results: dictionary of results with keys
                'ar': average recall
                'recalls': vector recalls at each IoU overlap threshold
                'thresholds': vector of IoU overlap thresholds
                'gt_overlaps': vector of all ground-truth overlaps
            When no ground-truth box falls in the selected area range all
            recalls (and 'ar') are 0 rather than NaN.
        """
        # Record max overlap value for each gt box
        # Return vector of overlap values
        areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3,
                 '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
        area_ranges = [[0 ** 2, 1e5 ** 2],    # all
                       [0 ** 2, 32 ** 2],     # small
                       [32 ** 2, 96 ** 2],    # medium
                       [96 ** 2, 1e5 ** 2],   # large
                       [96 ** 2, 128 ** 2],   # 96-128
                       [128 ** 2, 256 ** 2],  # 128-256
                       [256 ** 2, 512 ** 2],  # 256-512
                       [512 ** 2, 1e5 ** 2],  # 512-inf
                       ]
        assert area in areas, 'unknown area range: {}'.format(area)
        area_range = area_ranges[areas[area]]
        gt_overlaps = np.zeros(0)
        num_pos = 0
        for i in range(self.num_images):
            # Checking for max_overlaps == 1 avoids including crowd annotations
            # (...pretty hacking :/)
            max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1)
            gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) &
                               (max_gt_overlaps == 1))[0]
            gt_boxes = self.roidb[i]['boxes'][gt_inds, :]
            gt_areas = self.roidb[i]['seg_areas'][gt_inds]
            valid_gt_inds = np.where((gt_areas >= area_range[0]) &
                                     (gt_areas <= area_range[1]))[0]
            gt_boxes = gt_boxes[valid_gt_inds, :]
            num_pos += len(valid_gt_inds)

            if candidate_boxes is None:
                # If candidate_boxes is not supplied, the default is to use the
                # non-ground-truth boxes from this roidb
                non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0]
                boxes = self.roidb[i]['boxes'][non_gt_inds, :]
            else:
                boxes = candidate_boxes[i]
            if boxes.shape[0] == 0:
                continue
            if limit is not None and boxes.shape[0] > limit:
                boxes = boxes[:limit, :]

            # np.float was removed from NumPy; float64 is the same dtype.
            overlaps = bbox_overlaps(boxes.astype(np.float64),
                                     gt_boxes.astype(np.float64))

            _gt_overlaps = np.zeros((gt_boxes.shape[0]))
            # Greedy one-to-one matching: each round takes the best remaining
            # (proposal, gt) pair and retires both.
            for j in range(gt_boxes.shape[0]):
                # find which proposal box maximally covers each gt box
                argmax_overlaps = overlaps.argmax(axis=0)
                # and get the iou amount of coverage for each gt box
                max_overlaps = overlaps.max(axis=0)
                # find which gt box is 'best' covered (i.e. 'best' = most iou)
                gt_ind = max_overlaps.argmax()
                gt_ovr = max_overlaps.max()
                assert (gt_ovr >= 0)
                # find the proposal box that covers the best covered gt box
                box_ind = argmax_overlaps[gt_ind]
                # record the iou coverage of this gt box
                _gt_overlaps[j] = overlaps[box_ind, gt_ind]
                assert (_gt_overlaps[j] == gt_ovr)
                # mark the proposal box and the gt box as used
                overlaps[box_ind, :] = -1
                overlaps[:, gt_ind] = -1
            # append recorded iou coverage level
            gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))

        gt_overlaps = np.sort(gt_overlaps)
        if thresholds is None:
            step = 0.05
            thresholds = np.arange(0.5, 0.95 + 1e-5, step)
        recalls = np.zeros_like(thresholds)
        # compute recall for each iou threshold; with no positives the
        # recalls stay at 0 instead of becoming 0/0 = NaN
        if num_pos > 0:
            for i, t in enumerate(thresholds):
                recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
        # ar = 2 * np.trapz(recalls, thresholds)
        ar = recalls.mean()
        return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds,
                'gt_overlaps': gt_overlaps}

    def create_roidb_from_box_list(self, box_list, gt_roidb):
        """Build a roidb from per-image proposal boxes.

        Args:
            box_list: one box array per image (length must equal num_images).
            gt_roidb: optional ground-truth roidb; when given, each proposal's
                best IoU is stored in the column of the matched gt class.

        Returns:
            list of roidb entries with zeroed ``gt_classes``/``seg_areas``.
        """
        assert len(box_list) == self.num_images, \
            'Number of boxes must match number of ground-truth images'
        roidb = []
        for i in range(self.num_images):
            boxes = box_list[i]
            num_boxes = boxes.shape[0]
            overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)

            if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
                gt_boxes = gt_roidb[i]['boxes']
                gt_classes = gt_roidb[i]['gt_classes']
                # np.float was removed from NumPy; float64 is the same dtype.
                gt_overlaps = bbox_overlaps(boxes.astype(np.float64),
                                            gt_boxes.astype(np.float64))
                argmaxes = gt_overlaps.argmax(axis=1)
                maxes = gt_overlaps.max(axis=1)
                I = np.where(maxes > 0)[0]
                overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]

            overlaps = scipy.sparse.csr_matrix(overlaps)
            roidb.append({
                'boxes': boxes,
                'gt_classes': np.zeros((num_boxes,), dtype=np.int32),
                'gt_overlaps': overlaps,
                'flipped': False,
                'seg_areas': np.zeros((num_boxes,), dtype=np.float32),
            })
        return roidb

    @staticmethod
    def merge_roidbs(a, b):
        """Concatenate the entries of roidb ``b`` onto roidb ``a`` in place.

        Both lists must have the same length; ``a`` is modified and returned.
        """
        assert len(a) == len(b)
        for i in range(len(a)):
            a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes']))
            a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'],
                                            b[i]['gt_classes']))
            a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'],
                                                       b[i]['gt_overlaps']])
            a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'],
                                           b[i]['seg_areas']))
        return a

    def competition_mode(self, on):
        """Turn competition mode on or off."""
        pass
class _ProposalTargetLayer(nn.Module):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.
    """

    def __init__(self, nclasses):
        super(_ProposalTargetLayer, self).__init__()
        self._num_classes = nclasses
        self.BBOX_NORMALIZE_MEANS = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
        self.BBOX_NORMALIZE_STDS = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS)
        self.BBOX_INSIDE_WEIGHTS = torch.FloatTensor(cfg.TRAIN.BBOX_INSIDE_WEIGHTS)

    def forward(self, all_rois, gt_boxes, num_boxes):
        """Sample RoIs and build their classification/regression targets.

        Args:
            all_rois: b x R x 5 proposals; columns 1:5 hold the box.
            gt_boxes: b x G x 5 ground truth; columns 0:4 hold the box and
                column 4 the class label.
            num_boxes: accepted for interface compatibility; not read here.

        Returns:
            tuple: (rois, labels, gt_assign, bbox_targets,
            bbox_inside_weights, bbox_outside_weights).
        """
        # keep the normalisation constants on the same type as the input
        self.BBOX_NORMALIZE_MEANS = self.BBOX_NORMALIZE_MEANS.type_as(gt_boxes)
        self.BBOX_NORMALIZE_STDS = self.BBOX_NORMALIZE_STDS.type_as(gt_boxes)
        self.BBOX_INSIDE_WEIGHTS = self.BBOX_INSIDE_WEIGHTS.type_as(gt_boxes)

        # re-layout gt boxes as RoIs: column 0 left at zero, box in 1:5
        gt_boxes_append = gt_boxes.new(gt_boxes.size()).zero_()
        gt_boxes_append[:, :, 1:5] = gt_boxes[:, :, :4]

        # Include ground-truth boxes in the set of candidate rois
        all_rois = torch.cat([all_rois, gt_boxes_append], 1)

        num_images = 1
        rois_per_image = int(cfg.TRAIN.BATCH_SIZE / num_images)
        fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))

        labels, rois, gt_assign, bbox_targets, bbox_inside_weights = self._sample_rois_pytorch(
            all_rois, gt_boxes, fg_rois_per_image,
            rois_per_image, self._num_classes)

        bbox_outside_weights = (bbox_inside_weights > 0).float()

        return rois, labels, gt_assign, bbox_targets, bbox_inside_weights, bbox_outside_weights

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass

    def _get_bbox_regression_labels_pytorch(self, bbox_target_data, labels_batch, num_classes):
        """Keep regression targets for foreground RoIs and zero the rest.

        Args:
            bbox_target_data: b x N x 4 regression targets.
            labels_batch: b x N class labels; entries > 0 are foreground.
            num_classes: accepted for interface compatibility; not read here.

        Returns:
            bbox_targets: b x N x 4 targets, zero for non-foreground RoIs.
            bbox_inside_weights: b x N x 4 loss weights, set to
                ``self.BBOX_INSIDE_WEIGHTS`` for foreground RoIs, else zero.
        """
        batch_size = labels_batch.size(0)
        rois_per_image = labels_batch.size(1)
        clss = labels_batch
        bbox_targets = bbox_target_data.new(batch_size, rois_per_image, 4).zero_()
        bbox_inside_weights = bbox_target_data.new(bbox_targets.size()).zero_()

        for b in range(batch_size):
            # assert clss[b].sum() > 0
            if clss[b].sum() == 0:
                continue
            inds = torch.nonzero(clss[b] > 0).view(-1)
            # copy all foreground rows at once instead of one RoI per
            # Python-level iteration
            bbox_targets[b, inds] = bbox_target_data[b, inds]
            bbox_inside_weights[b, inds] = self.BBOX_INSIDE_WEIGHTS

        return bbox_targets, bbox_inside_weights

    def _compute_targets_pytorch(self, ex_rois, gt_rois):
        """Compute bounding-box regression targets for an image.

        Both inputs are b x N x 4; the result is optionally normalised by the
        configured means/stds.
        """
        assert ex_rois.size(1) == gt_rois.size(1)
        assert ex_rois.size(2) == 4
        assert gt_rois.size(2) == 4

        targets = bbox_transform_batch(ex_rois, gt_rois)

        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally normalize targets by a precomputed mean and stdev
            targets = ((targets - self.BBOX_NORMALIZE_MEANS.expand_as(targets))
                       / self.BBOX_NORMALIZE_STDS.expand_as(targets))

        return targets

    def _sample_rois_pytorch(self, all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
        """Generate a random sample of RoIs comprising foreground and background
        examples.

        Returns:
            tuple: (labels_batch, rois_batch, gt_assign_batch, bbox_targets,
            bbox_inside_weights), each with ``rois_per_image`` rows per image.

        Raises:
            ValueError: if an image has neither foreground nor background RoIs.
        """
        # overlaps: (rois x gt_boxes)
        overlaps = bbox_overlaps_batch(all_rois, gt_boxes)

        max_overlaps, gt_assignment = torch.max(overlaps, 2)

        batch_size = overlaps.size(0)

        # flat index of each RoI's assigned gt box inside the b*G label vector
        offset = torch.arange(0, batch_size) * gt_boxes.size(1)
        offset = offset.view(-1, 1).type_as(gt_assignment) + gt_assignment

        # index_select is the documented gather-by-index call (replaces the
        # legacy Tensor.index(...))
        labels = gt_boxes[:, :, 4].contiguous().view(-1) \
            .index_select(0, offset.view(-1)).view(batch_size, -1)

        labels_batch = labels.new(batch_size, rois_per_image).zero_()
        rois_batch = all_rois.new(batch_size, rois_per_image, 5).zero_()
        gt_assign_batch = all_rois.new(batch_size, rois_per_image).zero_()
        gt_rois_batch = all_rois.new(batch_size, rois_per_image, 5).zero_()
        # Guard against the case when an image has fewer than max_fg_rois_per_image
        # foreground RoIs
        for i in range(batch_size):

            fg_inds = torch.nonzero(max_overlaps[i] >= cfg.TRAIN.FG_THRESH).view(-1)
            fg_num_rois = fg_inds.numel()

            # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
            bg_inds = torch.nonzero((max_overlaps[i] < cfg.TRAIN.BG_THRESH_HI) &
                                    (max_overlaps[i] >= cfg.TRAIN.BG_THRESH_LO)).view(-1)
            bg_num_rois = bg_inds.numel()

            if fg_num_rois > 0 and bg_num_rois > 0:
                # sampling fg (without replacement)
                fg_rois_per_this_image = min(fg_rois_per_image, fg_num_rois)

                # torch.randperm seems has a bug on multi-gpu setting that cause the segfault.
                # See https://github.com/pytorch/pytorch/issues/1868 for more details.
                # use numpy instead.
                rand_num = torch.from_numpy(np.random.permutation(fg_num_rois)).type_as(gt_boxes).long()
                fg_inds = fg_inds[rand_num[:fg_rois_per_this_image]]

                # sampling bg (with replacement)
                bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image

                # Seems torch.rand has a bug, it will generate very large number and make an error.
                # We use numpy rand instead.
                rand_num = np.floor(np.random.rand(bg_rois_per_this_image) * bg_num_rois)
                rand_num = torch.from_numpy(rand_num).type_as(gt_boxes).long()
                bg_inds = bg_inds[rand_num]

            elif fg_num_rois > 0 and bg_num_rois == 0:
                # only foreground available: fill the whole quota with fg
                rand_num = np.floor(np.random.rand(rois_per_image) * fg_num_rois)
                rand_num = torch.from_numpy(rand_num).type_as(gt_boxes).long()
                fg_inds = fg_inds[rand_num]
                fg_rois_per_this_image = rois_per_image
                bg_rois_per_this_image = 0
            elif bg_num_rois > 0 and fg_num_rois == 0:
                # only background available: fill the whole quota with bg
                rand_num = np.floor(np.random.rand(rois_per_image) * bg_num_rois)
                rand_num = torch.from_numpy(rand_num).type_as(gt_boxes).long()

                bg_inds = bg_inds[rand_num]
                bg_rois_per_this_image = rois_per_image
                fg_rois_per_this_image = 0
            else:
                raise ValueError("bg_num_rois = 0 and fg_num_rois = 0, this should not happen!")

            # The indices that we're selecting (both fg and bg)
            keep_inds = torch.cat([fg_inds, bg_inds], 0)

            # Select sampled values from various arrays:
            labels_batch[i].copy_(labels[i][keep_inds])

            # Clamp labels for the background RoIs to 0
            labels_batch[i][fg_rois_per_this_image:] = 0

            rois_batch[i] = all_rois[i][keep_inds]
            rois_batch[i, :, 0] = i

            # TODO: check the below line when batch_size > 1, no need to add offset here
            gt_assign_batch[i] = gt_assignment[i][keep_inds]

            gt_rois_batch[i] = gt_boxes[i][gt_assignment[i][keep_inds]]

        bbox_target_data = self._compute_targets_pytorch(
            rois_batch[:, :, 1:5], gt_rois_batch[:, :, :4])

        bbox_targets, bbox_inside_weights = \
            self._get_bbox_regression_labels_pytorch(bbox_target_data, labels_batch, num_classes)

        return labels_batch, rois_batch, gt_assign_batch, bbox_targets, bbox_inside_weights
torch.log(gt_heights / ex_heights) 30 | 31 | targets = torch.stack( 32 | (targets_dx, targets_dy, targets_dw, targets_dh),1) 33 | 34 | return targets 35 | 36 | def bbox_transform_batch(ex_rois, gt_rois): 37 | 38 | if ex_rois.dim() == 2: 39 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 40 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 41 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 42 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 43 | 44 | gt_widths = gt_rois[:, :, 2] - gt_rois[:, :, 0] + 1.0 45 | gt_heights = gt_rois[:, :, 3] - gt_rois[:, :, 1] + 1.0 46 | gt_ctr_x = gt_rois[:, :, 0] + 0.5 * gt_widths 47 | gt_ctr_y = gt_rois[:, :, 1] + 0.5 * gt_heights 48 | 49 | targets_dx = (gt_ctr_x - ex_ctr_x.view(1,-1).expand_as(gt_ctr_x)) / ex_widths 50 | targets_dy = (gt_ctr_y - ex_ctr_y.view(1,-1).expand_as(gt_ctr_y)) / ex_heights 51 | targets_dw = torch.log(gt_widths / ex_widths.view(1,-1).expand_as(gt_widths)) 52 | targets_dh = torch.log(gt_heights / ex_heights.view(1,-1).expand_as(gt_heights)) 53 | 54 | elif ex_rois.dim() == 3: 55 | ex_widths = ex_rois[:, :, 2] - ex_rois[:, :, 0] + 1.0 56 | ex_heights = ex_rois[:,:, 3] - ex_rois[:,:, 1] + 1.0 57 | ex_ctr_x = ex_rois[:, :, 0] + 0.5 * ex_widths 58 | ex_ctr_y = ex_rois[:, :, 1] + 0.5 * ex_heights 59 | 60 | gt_widths = gt_rois[:, :, 2] - gt_rois[:, :, 0] + 1.0 61 | gt_heights = gt_rois[:, :, 3] - gt_rois[:, :, 1] + 1.0 62 | gt_ctr_x = gt_rois[:, :, 0] + 0.5 * gt_widths 63 | gt_ctr_y = gt_rois[:, :, 1] + 0.5 * gt_heights 64 | 65 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 66 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 67 | targets_dw = torch.log(gt_widths / ex_widths) 68 | targets_dh = torch.log(gt_heights / ex_heights) 69 | else: 70 | raise ValueError('ex_roi input dimension is not correct.') 71 | 72 | targets = torch.stack( 73 | (targets_dx, targets_dy, targets_dw, targets_dh),2) 74 | 75 | return targets 76 | 77 | def bbox_transform_inv(boxes, deltas, batch_size): 78 | widths = boxes[:, :, 2] - boxes[:, :, 
0] + 1.0 79 | heights = boxes[:, :, 3] - boxes[:, :, 1] + 1.0 80 | ctr_x = boxes[:, :, 0] + 0.5 * widths 81 | ctr_y = boxes[:, :, 1] + 0.5 * heights 82 | 83 | dx = deltas[:, :, 0::4] 84 | dy = deltas[:, :, 1::4] 85 | dw = deltas[:, :, 2::4] 86 | dh = deltas[:, :, 3::4] 87 | 88 | pred_ctr_x = dx * widths.unsqueeze(2) + ctr_x.unsqueeze(2) 89 | pred_ctr_y = dy * heights.unsqueeze(2) + ctr_y.unsqueeze(2) 90 | pred_w = torch.exp(dw) * widths.unsqueeze(2) 91 | pred_h = torch.exp(dh) * heights.unsqueeze(2) 92 | 93 | pred_boxes = deltas.clone() 94 | # x1 95 | pred_boxes[:, :, 0::4] = pred_ctr_x - 0.5 * pred_w 96 | # y1 97 | pred_boxes[:, :, 1::4] = pred_ctr_y - 0.5 * pred_h 98 | # x2 99 | pred_boxes[:, :, 2::4] = pred_ctr_x + 0.5 * pred_w 100 | # y2 101 | pred_boxes[:, :, 3::4] = pred_ctr_y + 0.5 * pred_h 102 | 103 | return pred_boxes 104 | 105 | def clip_boxes_batch(boxes, im_shape, batch_size): 106 | """ 107 | Clip boxes to image boundaries. 108 | """ 109 | num_rois = boxes.size(1) 110 | 111 | boxes[boxes < 0] = 0 112 | # batch_x = (im_shape[:,0]-1).view(batch_size, 1).expand(batch_size, num_rois) 113 | # batch_y = (im_shape[:,1]-1).view(batch_size, 1).expand(batch_size, num_rois) 114 | 115 | batch_x = im_shape[:, 1] - 1 116 | batch_y = im_shape[:, 0] - 1 117 | 118 | boxes[:,:,0][boxes[:,:,0] > batch_x] = batch_x 119 | boxes[:,:,1][boxes[:,:,1] > batch_y] = batch_y 120 | boxes[:,:,2][boxes[:,:,2] > batch_x] = batch_x 121 | boxes[:,:,3][boxes[:,:,3] > batch_y] = batch_y 122 | 123 | return boxes 124 | 125 | def clip_boxes(boxes, im_shape, batch_size): 126 | 127 | for i in range(batch_size): 128 | boxes[i,:,0::4].clamp_(0, im_shape[i, 1]-1) 129 | boxes[i,:,1::4].clamp_(0, im_shape[i, 0]-1) 130 | boxes[i,:,2::4].clamp_(0, im_shape[i, 1]-1) 131 | boxes[i,:,3::4].clamp_(0, im_shape[i, 0]-1) 132 | 133 | return boxes 134 | 135 | 136 | def bbox_overlaps(anchors, gt_boxes): 137 | """ 138 | anchors: (N, 4) ndarray of float 139 | gt_boxes: (K, 4) ndarray of float 140 | 141 | 
overlaps: (N, K) ndarray of overlap between boxes and query_boxes 142 | """ 143 | N = anchors.size(0) 144 | K = gt_boxes.size(0) 145 | 146 | gt_boxes_area = ((gt_boxes[:,2] - gt_boxes[:,0] + 1) * 147 | (gt_boxes[:,3] - gt_boxes[:,1] + 1)).view(1, K) 148 | 149 | anchors_area = ((anchors[:,2] - anchors[:,0] + 1) * 150 | (anchors[:,3] - anchors[:,1] + 1)).view(N, 1) 151 | 152 | boxes = anchors.view(N, 1, 4).expand(N, K, 4) 153 | query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4) 154 | 155 | iw = (torch.min(boxes[:,:,2], query_boxes[:,:,2]) - 156 | torch.max(boxes[:,:,0], query_boxes[:,:,0]) + 1) 157 | iw[iw < 0] = 0 158 | 159 | ih = (torch.min(boxes[:,:,3], query_boxes[:,:,3]) - 160 | torch.max(boxes[:,:,1], query_boxes[:,:,1]) + 1) 161 | ih[ih < 0] = 0 162 | 163 | ua = anchors_area + gt_boxes_area - (iw * ih) 164 | overlaps = iw * ih / ua 165 | 166 | return overlaps 167 | 168 | def bbox_overlaps_batch(anchors, gt_boxes): 169 | """ 170 | anchors: (N, 4) ndarray of float 171 | gt_boxes: (b, K, 5) ndarray of float 172 | 173 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 174 | """ 175 | batch_size = gt_boxes.size(0) 176 | 177 | 178 | if anchors.dim() == 2: 179 | 180 | N = anchors.size(0) 181 | K = gt_boxes.size(1) 182 | 183 | anchors = anchors.view(1, N, 4).expand(batch_size, N, 4).contiguous() 184 | gt_boxes = gt_boxes[:,:,:4].contiguous() 185 | 186 | 187 | gt_boxes_x = (gt_boxes[:,:,2] - gt_boxes[:,:,0] + 1) 188 | gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1) 189 | gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) 190 | 191 | anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1) 192 | anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1) 193 | anchors_area = (anchors_boxes_x * anchors_boxes_y).view(batch_size, N, 1) 194 | 195 | gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1) 196 | anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1) 197 | 198 | boxes = anchors.view(batch_size, N, 1, 
4).expand(batch_size, N, K, 4) 199 | query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4) 200 | 201 | iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - 202 | torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1) 203 | iw[iw < 0] = 0 204 | 205 | ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - 206 | torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1) 207 | ih[ih < 0] = 0 208 | ua = anchors_area + gt_boxes_area - (iw * ih) 209 | overlaps = iw * ih / ua 210 | 211 | # mask the overlap here. 212 | overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), 0) 213 | overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), -1) 214 | 215 | elif anchors.dim() == 3: 216 | N = anchors.size(1) 217 | K = gt_boxes.size(1) 218 | 219 | if anchors.size(2) == 4: 220 | anchors = anchors[:,:,:4].contiguous() 221 | else: 222 | anchors = anchors[:,:,1:5].contiguous() 223 | 224 | gt_boxes = gt_boxes[:,:,:4].contiguous() 225 | 226 | gt_boxes_x = (gt_boxes[:,:,2] - gt_boxes[:,:,0] + 1) 227 | gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1) 228 | gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) 229 | 230 | anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1) 231 | anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1) 232 | anchors_area = (anchors_boxes_x * anchors_boxes_y).view(batch_size, N, 1) 233 | 234 | gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1) 235 | anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1) 236 | 237 | boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4) 238 | query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4) 239 | 240 | iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - 241 | torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1) 242 | iw[iw < 0] = 0 243 | 244 | ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - 245 | torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1) 246 | ih[ih 
< 0] = 0 247 | ua = anchors_area + gt_boxes_area - (iw * ih) 248 | 249 | overlaps = iw * ih / ua 250 | 251 | # mask the overlap here. 252 | overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), 0) 253 | overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), -1) 254 | else: 255 | raise ValueError('anchors input dimension is not correct.') 256 | 257 | return overlaps 258 | --------------------------------------------------------------------------------