├── lib ├── model │ ├── __init__.py │ ├── nms │ │ ├── __init__.py │ │ ├── _ext │ │ │ ├── __init__.py │ │ │ └── nms │ │ │ │ └── __init__.py │ │ ├── .gitignore │ │ ├── make.sh │ │ ├── src │ │ │ ├── nms_cuda_kernel.h │ │ │ ├── nms_cuda.h │ │ │ └── nms_cuda_kernel.cu │ │ ├── nms_gpu.py │ │ ├── nms_wrapper.py │ │ ├── build.py │ │ ├── nms_cpu.py │ │ └── nms_kernel.cu │ ├── rpn │ │ ├── __init__.py │ │ ├── generate_anchors.py │ │ └── rpn.py │ ├── utils │ │ ├── __init__.py │ │ ├── .gitignore │ │ ├── blob.py │ │ ├── losses.py │ │ ├── logger.py │ │ ├── bbox.pyx │ │ ├── fsod_logger.py │ │ └── net_utils.py │ ├── framework │ │ └── __init__.py │ ├── roi_align │ │ ├── __init__.py │ │ ├── _ext │ │ │ ├── __init__.py │ │ │ └── roi_align │ │ │ │ └── __init__.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── roi_align.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── roi_align.py │ │ ├── make.sh │ │ ├── src │ │ │ ├── roi_align.h │ │ │ ├── roi_align_cuda.h │ │ │ ├── roi_align_kernel.h │ │ │ ├── roi_align_cuda.c │ │ │ ├── roi_align.c │ │ │ └── roi_align_kernel.cu │ │ └── build.py │ ├── roi_crop │ │ ├── __init__.py │ │ ├── _ext │ │ │ ├── __init__.py │ │ │ ├── crop_resize │ │ │ │ └── __init__.py │ │ │ └── roi_crop │ │ │ │ └── __init__.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── roi_crop.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ ├── roi_crop.py │ │ │ ├── crop_resize.py │ │ │ └── gridgen.py │ │ ├── make.sh │ │ ├── src │ │ │ ├── roi_crop_cuda.h │ │ │ ├── roi_crop.h │ │ │ ├── roi_crop_cuda_kernel.h │ │ │ └── roi_crop_cuda.c │ │ └── build.py │ ├── roi_pooling │ │ ├── __init__.py │ │ ├── _ext │ │ │ ├── __init__.py │ │ │ └── roi_pooling │ │ │ │ └── __init__.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── roi_pool.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── roi_pool.py │ │ ├── src │ │ │ ├── roi_pooling.h │ │ │ ├── roi_pooling_cuda.h │ │ │ ├── roi_pooling_kernel.h │ │ │ ├── roi_pooling_cuda.c │ │ │ └── roi_pooling.c │ │ └── build.py │ ├── roi_layers │ │ ├── nms.py │ │ 
├── __init__.py │ │ ├── roi_pool.py │ │ └── roi_align.py │ └── csrc │ │ ├── vision.cpp │ │ ├── cpu │ │ ├── vision.h │ │ └── nms_cpu.cpp │ │ ├── nms.h │ │ ├── ROIPool.h │ │ ├── ROIAlign.h │ │ └── cuda │ │ ├── vision.h │ │ └── nms.cu ├── datasets │ ├── __init__.py │ ├── ds_utils.py │ ├── tools │ │ └── mcg_munge.py │ ├── debug.ipynb │ ├── factory.py │ ├── vg_eval.py │ └── voc_eval.py ├── roi_data_layer │ ├── __init__.py │ ├── general_test_loader.py │ ├── minibatch.py │ ├── roidb.py │ ├── allcls_fs_loader.py │ ├── inference_loader.py │ └── multiway_loader.py └── setup.py ├── images ├── prediction.jpg └── attention_visualization.jpg ├── .gitignore ├── cfgs ├── vgg16.yml ├── res101.yml ├── res101_ls.yml └── res50.yml ├── env.yml ├── inference.py ├── README.md └── train.py /lib/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/framework/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/nms/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_align/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_align/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_align/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_align/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_crop/functions/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /lib/model/roi_pooling/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/nms/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /lib/model/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /images/prediction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tung-I/Dual-awareness-Attention-for-Few-shot-Object-Detection/HEAD/images/prediction.jpg -------------------------------------------------------------------------------- /images/attention_visualization.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tung-I/Dual-awareness-Attention-for-Few-shot-Object-Detection/HEAD/images/attention_visualization.jpg -------------------------------------------------------------------------------- /lib/model/roi_pooling/src/roi_pooling.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); -------------------------------------------------------------------------------- /lib/model/roi_layers/nms.py: 
#!/usr/bin/env bash
# Build the NMS extension: compile the CUDA kernel in src/ into an object
# file with nvcc, then run build.py, which links that object into the
# Python FFI extension.

# Stop at the first failing command. Without this a failed `cd` or nvcc
# invocation would still fall through to build.py, which would then link a
# stale or missing nms_cuda_kernel.cu.o.
set -e

# CUDA_PATH=/usr/local/cuda/

# Target GPU architecture. Defaults to the previously hard-coded sm_52 and
# can be overridden from the environment: CUDA_ARCH=sm_70 ./make.sh
CUDA_ARCH="${CUDA_ARCH:-sm_52}"

cd src
echo "Compiling stnm kernels by nvcc..."
nvcc -c -o nms_cuda_kernel.cu.o nms_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch="${CUDA_ARCH}"

cd ../
python build.py
#!/usr/bin/env bash
# Build the RoI-crop extension: compile the CUDA kernel in src/ into an
# object file with nvcc, then run build.py, which links that object into the
# Python FFI extension.

# Stop at the first failing command. Without this a failed `cd` or nvcc
# invocation would still fall through to build.py, which would then link a
# stale or missing roi_crop_cuda_kernel.cu.o.
set -e

# NOTE: assigned but not referenced anywhere else in this script.
CUDA_PATH=/usr/local/cuda/

# Target GPU architecture. Defaults to the previously hard-coded sm_52 and
# can be overridden from the environment: CUDA_ARCH=sm_70 ./make.sh
CUDA_ARCH="${CUDA_ARCH:-sm_52}"

cd src
echo "Compiling my_lib kernels by nvcc..."
nvcc -c -o roi_crop_cuda_kernel.cu.o roi_crop_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch="${CUDA_ARCH}"

cd ../
python build.py
-------------------------------------------------------------------------------- /lib/model/nms/src/nms_cuda.h: -------------------------------------------------------------------------------- 1 | // int nms_cuda(THCudaTensor *keep_out, THCudaTensor *num_out, 2 | // THCudaTensor *boxes_host, THCudaTensor *nms_overlap_thresh); 3 | 4 | int nms_cuda(THCudaIntTensor *keep_out, THCudaTensor *boxes_host, 5 | THCudaIntTensor *num_out, float nms_overlap_thresh); 6 | -------------------------------------------------------------------------------- /lib/model/roi_crop/modules/roi_crop.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from ..functions.roi_crop import RoICropFunction 3 | 4 | class _RoICrop(Module): 5 | def __init__(self, layout = 'BHWD'): 6 | super(_RoICrop, self).__init__() 7 | def forward(self, input1, input2): 8 | return RoICropFunction()(input1, input2) 9 | -------------------------------------------------------------------------------- /cfgs/vgg16.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: vgg16 2 | TRAIN: 3 | HAS_RPN: True 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | BATCH_SIZE: 256 10 | LEARNING_RATE: 0.01 11 | TEST: 12 | HAS_RPN: True 13 | POOLING_MODE: align 14 | CROP_RESIZE_WITH_MAX_POOL: False 15 | -------------------------------------------------------------------------------- /lib/model/roi_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
def _import_symbols(locals):
    """Export every attribute of the compiled ``_lib`` module into ``locals``.

    Each name returned by ``dir(_lib)`` is stored in the given namespace and
    appended to the module-level ``__all__``.

    Args:
        locals: mutable mapping (the module namespace) that receives the
            exported symbols.
    """
    for symbol in dir(_lib):
        fn = getattr(_lib, symbol)
        if callable(fn):
            locals[symbol] = _wrap_function(fn, _ffi)
        else:
            # Only callables can be wrapped; anything else is exported
            # as-is, matching the sibling ``_ext`` packages in this repo.
            locals[symbol] = fn
        __all__.append(symbol)
/lib/model/roi_align/src/roi_align_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /cfgs/res101.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res101 2 | TRAIN: 3 | HAS_RPN: True 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | BATCH_SIZE: 128 11 | WEIGHT_DECAY: 0.0001 12 | DOUBLE_BIAS: False 13 | LEARNING_RATE: 0.001 14 | TEST: 15 | HAS_RPN: True 16 | POOLING_SIZE: 7 17 | POOLING_MODE: align 18 | CROP_RESIZE_WITH_MAX_POOL: False 19 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/src/roi_pooling_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax); 3 | 4 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax); -------------------------------------------------------------------------------- /lib/model/nms/_ext/nms/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._nms import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def 
def _import_symbols(locals):
    """Publish the attributes of the compiled ``_lib`` module.

    Every name listed by ``dir(_lib)`` is written into ``locals`` and
    recorded in the module-level ``__all__``. Callable attributes are passed
    through ``_wrap_function`` with ``_ffi``; all other attributes are stored
    unchanged.

    Args:
        locals: mutable mapping (the module namespace) to populate.
    """
    for name in dir(_lib):
        attr = getattr(_lib, name)
        locals[name] = _wrap_function(attr, _ffi) if callable(attr) else attr
        __all__.append(name)
13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /lib/model/roi_crop/src/roi_crop_cuda.h: -------------------------------------------------------------------------------- 1 | // Bilinear sampling is done in BHWD (coalescing is not obvious in BDHW) 2 | // we assume BHWD format in inputImages 3 | // we assume BHW(YX) format on grids 4 | 5 | int BilinearSamplerBHWD_updateOutput_cuda(THCudaTensor *inputImages, THCudaTensor *grids, THCudaTensor *output); 6 | 7 | int BilinearSamplerBHWD_updateGradInput_cuda(THCudaTensor *inputImages, THCudaTensor *grids, THCudaTensor *gradInputImages, 8 | THCudaTensor *gradGrids, THCudaTensor *gradOutput); 9 | -------------------------------------------------------------------------------- /cfgs/res101_ls.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res101 2 | TRAIN: 3 | HAS_RPN: True 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | BATCH_SIZE: 128 11 | WEIGHT_DECAY: 0.0001 12 | SCALES: [800] 13 | DOUBLE_BIAS: False 14 | LEARNING_RATE: 0.001 15 | TEST: 16 | HAS_RPN: True 17 | SCALES: [800] 18 | MAX_SIZE: 1200 19 | RPN_POST_NMS_TOP_N: 1000 20 | POOLING_SIZE: 7 21 | POOLING_MODE: align 22 | CROP_RESIZE_WITH_MAX_POOL: False 23 | -------------------------------------------------------------------------------- /lib/model/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #include "nms.h" 3 | #include "ROIAlign.h" 4 | #include "ROIPool.h" 5 | 6 | 7 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 8 | m.def("nms", &nms, "non-maximum suppression"); 9 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 10 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 11 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 12 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 13 | } 14 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/modules/roi_pool.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from ..functions.roi_pool import RoIPoolFunction 3 | 4 | 5 | class _RoIPooling(Module): 6 | def __init__(self, pooled_height, pooled_width, spatial_scale): 7 | super(_RoIPooling, self).__init__() 8 | 9 | self.pooled_width = int(pooled_width) 10 | self.pooled_height = int(pooled_height) 11 | self.spatial_scale = float(spatial_scale) 12 | 13 | def forward(self, features, rois): 14 | return RoIPoolFunction(self.pooled_height, self.pooled_width, self.spatial_scale)(features, rois) 15 | -------------------------------------------------------------------------------- /lib/model/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | -------------------------------------------------------------------------------- /lib/model/roi_crop/src/roi_crop.h: -------------------------------------------------------------------------------- 1 | int BilinearSamplerBHWD_updateOutput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *output); 2 | 3 | int BilinearSamplerBHWD_updateGradInput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *gradInputImages, 4 | THFloatTensor *gradGrids, THFloatTensor *gradOutput); 5 | 6 | 7 | 8 | int BilinearSamplerBCHW_updateOutput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *output); 9 | 10 | int BilinearSamplerBCHW_updateGradInput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *gradInputImages, 11 | THFloatTensor *gradGrids, THFloatTensor *gradOutput); 12 | -------------------------------------------------------------------------------- /lib/model/csrc/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
def nms(dets, thresh, force_cpu=False):
    """Dispatch to either CPU or GPU NMS implementations.

    Args:
        dets: detections; only ``dets.shape[0]`` is inspected here, the rest
            is interpreted by the selected implementation.
        thresh: overlap threshold forwarded to the implementation.
        force_cpu: when true, always use ``nms_cpu``.

    Returns:
        An empty list when ``dets`` has no rows, otherwise whatever the
        selected implementation returns.
    """
    if dets.shape[0] == 0:
        return []

    # ``nms_gpu`` is only imported when CUDA is available, so on CPU-only
    # machines fall back to the CPU implementation instead of failing with
    # a NameError.
    if force_cpu or not torch.cuda.is_available():
        return nms_cpu(dets, thresh)
    return nms_gpu(dets, thresh)
BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 6 | # RPN_POSITIVE_OVERLAP: 0.7 7 | # RPN_BATCHSIZE: 256 8 | # PROPOSAL_METHOD: gt 9 | # BG_THRESH_LO: 0.0 10 | # DISPLAY: 20 11 | # BATCH_SIZE: 256 12 | # WEIGHT_DECAY: 0.0001 13 | # DOUBLE_BIAS: False 14 | # SNAPSHOT_PREFIX: res50_faster_rcnn 15 | # TEST: 16 | # HAS_RPN: True 17 | # POOLING_MODE: crop 18 | 19 | EXP_DIR: res50 20 | TRAIN: 21 | HAS_RPN: True 22 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 23 | RPN_POSITIVE_OVERLAP: 0.7 24 | RPN_BATCHSIZE: 256 25 | PROPOSAL_METHOD: gt 26 | BG_THRESH_LO: 0.0 27 | DISPLAY: 20 28 | BATCH_SIZE: 128 29 | WEIGHT_DECAY: 0.0001 30 | DOUBLE_BIAS: False 31 | LEARNING_RATE: 0.001 32 | TEST: 33 | HAS_RPN: True 34 | POOLING_SIZE: 7 35 | POOLING_MODE: align 36 | CROP_RESIZE_WITH_MAX_POOL: False 37 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/src/roi_pooling_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _ROI_POOLING_KERNEL 2 | #define _ROI_POOLING_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | int ROIPoolForwardLaucher( 9 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 10 | const int width, const int channels, const int pooled_height, 11 | const int pooled_width, const float* bottom_rois, 12 | float* top_data, int* argmax_data, cudaStream_t stream); 13 | 14 | 15 | int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 16 | const int height, const int width, const int channels, const int pooled_height, 17 | const int pooled_width, const float* bottom_rois, 18 | float* bottom_diff, const int* argmax_data, cudaStream_t stream); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | 26 | -------------------------------------------------------------------------------- /lib/model/nms/build.py: 
def nms_cpu(dets, thresh):
    """Greedy non-maximum suppression on the CPU.

    Args:
        dets: 2-D tensor whose columns 0-3 are read as ``x1, y1, x2, y2``
            and column 4 as the score.
        thresh: a box is discarded when its IoU with an already kept,
            higher-scoring box is greater than this value.

    Returns:
        ``torch.IntTensor`` with the row indices of the kept boxes, ordered
        by decreasing score.
    """
    # detach()/cpu() let the function also accept tensors that require grad
    # or live on a GPU; for plain CPU tensors they are no-ops.
    dets = dets.detach().cpu().numpy()
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    # The +1 treats coordinates as inclusive pixel indices.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order.item(0)
        keep.append(i)
        rest = order[1:]

        # Intersection rectangle: the larger of the top-left corners and the
        # SMALLER of the bottom-right corners. Using maximum for the
        # bottom-right corner would report overlap for disjoint boxes.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[rest] - inter)

        # Indices are relative to ``rest``, hence the +1 to map them back
        # into ``order``.
        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return torch.IntTensor(keep)
# Build configuration for the ``_ext.roi_align`` FFI extension.
# The CPU sources are always compiled; the CUDA wrapper and the kernel
# object produced by make.sh are added only when a CUDA device is available.

sources = ['src/roi_align.c']
headers = ['src/roi_align.h']
extra_objects = []
defines = []
with_cuda = False

# Absolute directory of this script, used to locate the kernel object.
this_file = os.path.dirname(os.path.realpath(__file__))
print(this_file)

if torch.cuda.is_available():
    print('Including CUDA code.')
    sources.append('src/roi_align_cuda.c')
    headers.append('src/roi_align_cuda.h')
    defines.append(('WITH_CUDA', None))
    with_cuda = True

    # nvcc output from make.sh, linked into the extension as-is.
    extra_objects = [os.path.join(this_file, 'src/roi_align_kernel.cu.o')]

ffi = create_extension(
    '_ext.roi_align',
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    extra_objects=extra_objects,
)

if __name__ == '__main__':
    ffi.build()
class RoICropFunction(Function):
    """Autograd function backed by the ``roi_crop`` bilinear-sampler extension.

    NOTE(review): judging by the argument order of the C declarations,
    ``input1`` is presumably the image/feature tensor and ``input2`` the
    sampling grid - confirm against the extension sources.
    """

    def forward(self, input1, input2):
        """Sample ``input1`` with ``input2`` and return the new output tensor."""
        # Keep private copies of both inputs for the backward pass.
        self.input1 = input1.clone()
        self.input2 = input2.clone()

        # Output has input2's type, sized from input2's dims 0, 1, 2 with
        # input1's dim 1 in second position, and starts zero-filled.
        out_shape = (input2.size(0), input1.size(1), input2.size(1), input2.size(2))
        output = input2.new(*out_shape).zero_()

        assert output.get_device() == input1.get_device(), "output and input1 must on the same device"
        assert output.get_device() == input2.get_device(), "output and input2 must on the same device"

        # The extension writes its result into ``output`` in place.
        roi_crop.BilinearSamplerBHWD_updateOutput_cuda(input1, input2, output)
        return output

    def backward(self, grad_output):
        """Return gradients for ``input1`` and ``input2``, in that order."""
        saved1 = self.input1
        saved2 = self.input2

        # Zero-filled buffers shaped like the saved inputs; the extension
        # fills them in place.
        grad_input1 = saved1.new(saved1.size()).zero_()
        grad_input2 = saved2.new(saved2.size()).zero_()

        roi_crop.BilinearSamplerBHWD_updateGradInput_cuda(
            saved1, saved2, grad_input1, grad_input2, grad_output
        )
        return grad_input1, grad_input2
def unique_boxes(boxes, scale=1.0):
    """Return indices of unique boxes.

    Boxes are multiplied by ``scale`` and rounded to the nearest integer; two
    boxes are duplicates when every rounded coordinate matches.

    Args:
        boxes: array-like with one box per row.
        scale: factor applied to the coordinates before rounding.

    Returns:
        Sorted 1-D integer array holding, for each distinct rounded box, the
        index of its first occurrence in ``boxes``.
    """
    rounded = np.round(np.asarray(boxes) * scale)
    if rounded.shape[0] == 0:
        # Nothing to deduplicate; also avoids np.unique(axis=0) on empty input.
        return np.zeros((0,), dtype=np.intp)
    # Compare whole rows rather than a positional hash (1, 1e3, 1e6, 1e9):
    # that hash collides as soon as a rounded coordinate reaches 1000 or is
    # negative, which silently dropped distinct boxes.
    _, index = np.unique(rounded, axis=0, return_index=True)
    return np.sort(index)
def filter_small_boxes(boxes, min_size):
    """Return the indices of boxes that are at least ``min_size`` on both sides.

    Args:
        boxes: 2-D array whose columns 0..3 are read as x1, y1, x2, y2.
        min_size: minimum allowed value for both ``x2 - x1`` and ``y2 - y1``.

    Returns:
        1-D integer array of row indices that pass the filter.
    """
    # Extents are plain differences (no +1), as in the original code.
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    # Both sides use the same inclusive bound; the height test used to be a
    # strict '>' which dropped boxes exactly min_size tall.
    keep = np.where((w >= min_size) & (h >= min_size))[0]
    return keep
def munge(src_dir):
    """Move every entry of ``src_dir`` into a nested ``MCG/mat`` layout.

    An entry such as ``COCO_val2014_000000193401.mat`` ends up at
    ``MCG/mat/COCO_val2014_0/COCO_val2014_000000193/COCO_val2014_000000193401.mat``
    (relative to the current working directory): the first directory level is
    the first 14 characters of the name without extension, the second level is
    the first 22 characters.
    """
    for entry in os.listdir(src_dir):
        stem = os.path.splitext(entry)[0]
        target_dir = os.path.join('MCG', 'mat', stem[:14], stem[:22])
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        origin = os.path.join(src_dir, entry)
        destination = os.path.join(target_dir, entry)
        print('MV: {} -> {}'.format(origin, destination))
        os.rename(origin, destination)
backward(self, grad_output): 28 | grad_input1 = torch.zeros(self.input1.size()) 29 | grad_input2 = torch.zeros(self.input2.size()) 30 | #print('backward decice %d' % self.device) 31 | if not grad_output.is_cuda: 32 | roi_crop.BilinearSamplerBHWD_updateGradInput(self.input1, self.input2, grad_input1, grad_input2, grad_output) 33 | else: 34 | grad_input1 = grad_input1.cuda(self.device) 35 | grad_input2 = grad_input2.cuda(self.device) 36 | roi_crop.BilinearSamplerBHWD_updateGradInput_cuda(self.input1, self.input2, grad_input1, grad_input2, grad_output) 37 | return grad_input1, grad_input2 38 | -------------------------------------------------------------------------------- /lib/model/csrc/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 
| AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /lib/model/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.type().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /lib/model/roi_align/modules/roi_align.py: -------------------------------------------------------------------------------- 1 | from 
torch.nn.modules.module import Module 2 | from torch.nn.functional import avg_pool2d, max_pool2d 3 | from ..functions.roi_align import RoIAlignFunction 4 | 5 | 6 | class RoIAlign(Module): 7 | def __init__(self, aligned_height, aligned_width, spatial_scale): 8 | super(RoIAlign, self).__init__() 9 | 10 | self.aligned_width = int(aligned_width) 11 | self.aligned_height = int(aligned_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | return RoIAlignFunction(self.aligned_height, self.aligned_width, 16 | self.spatial_scale)(features, rois) 17 | 18 | class RoIAlignAvg(Module): 19 | def __init__(self, aligned_height, aligned_width, spatial_scale): 20 | super(RoIAlignAvg, self).__init__() 21 | 22 | self.aligned_width = int(aligned_width) 23 | self.aligned_height = int(aligned_height) 24 | self.spatial_scale = float(spatial_scale) 25 | 26 | def forward(self, features, rois): 27 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 28 | self.spatial_scale)(features, rois) 29 | return avg_pool2d(x, kernel_size=2, stride=1) 30 | 31 | class RoIAlignMax(Module): 32 | def __init__(self, aligned_height, aligned_width, spatial_scale): 33 | super(RoIAlignMax, self).__init__() 34 | 35 | self.aligned_width = int(aligned_width) 36 | self.aligned_height = int(aligned_height) 37 | self.spatial_scale = float(spatial_scale) 38 | 39 | def forward(self, features, rois): 40 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 41 | self.spatial_scale)(features, rois) 42 | return max_pool2d(x, kernel_size=2, stride=1) 43 | -------------------------------------------------------------------------------- /lib/model/roi_pooling/functions/roi_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from .._ext import roi_pooling 4 | import pdb 5 | 6 | class RoIPoolFunction(Function): 7 | def __init__(ctx, pooled_height, 
pooled_width, spatial_scale): 8 | ctx.pooled_width = pooled_width 9 | ctx.pooled_height = pooled_height 10 | ctx.spatial_scale = spatial_scale 11 | ctx.feature_size = None 12 | 13 | def forward(ctx, features, rois): 14 | ctx.feature_size = features.size() 15 | batch_size, num_channels, data_height, data_width = ctx.feature_size 16 | num_rois = rois.size(0) 17 | output = features.new(num_rois, num_channels, ctx.pooled_height, ctx.pooled_width).zero_() 18 | ctx.argmax = features.new(num_rois, num_channels, ctx.pooled_height, ctx.pooled_width).zero_().int() 19 | ctx.rois = rois 20 | if not features.is_cuda: 21 | _features = features.permute(0, 2, 3, 1) 22 | roi_pooling.roi_pooling_forward(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, 23 | _features, rois, output) 24 | else: 25 | roi_pooling.roi_pooling_forward_cuda(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, 26 | features, rois, output, ctx.argmax) 27 | 28 | return output 29 | 30 | def backward(ctx, grad_output): 31 | assert(ctx.feature_size is not None and grad_output.is_cuda) 32 | batch_size, num_channels, data_height, data_width = ctx.feature_size 33 | grad_input = grad_output.new(batch_size, num_channels, data_height, data_width).zero_() 34 | 35 | roi_pooling.roi_pooling_backward_cuda(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, 36 | grad_output, ctx.rois, grad_input, ctx.argmax) 37 | 38 | return grad_input, None 39 | -------------------------------------------------------------------------------- /lib/model/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | # from scipy.misc import imread, imresize 12 
def im_list_to_blob(ims):
    """Convert a list of images into a network input.

    Assumes images are already prepared (means subtracted, BGR order, ...).

    Args:
        ims: non-empty sequence of arrays indexed as (height, width, channel);
            the blob is allocated with 3 channels.

    Returns:
        float32 array of shape (len(ims), max height, max width, 3). Each image
        is copied into the top-left corner of its slice; the rest stays zero.

    Raises:
        ValueError: if ``ims`` is empty.
    """
    if len(ims) == 0:
        # The shape reduction below would fail with an opaque numpy message.
        raise ValueError("im_list_to_blob requires at least one image")

    # Element-wise maximum over all image shapes.
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    blob = np.zeros((len(ims), max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    for i, im in enumerate(ims):
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im

    return blob
os.path.dirname(os.path.abspath(__file__)) 16 | extensions_dir = os.path.join(this_dir, "model", "csrc") 17 | 18 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 19 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 20 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 21 | 22 | sources = main_file + source_cpu 23 | extension = CppExtension 24 | 25 | extra_compile_args = {"cxx": []} 26 | define_macros = [] 27 | 28 | if torch.cuda.is_available() and CUDA_HOME is not None: 29 | extension = CUDAExtension 30 | sources += source_cuda 31 | define_macros += [("WITH_CUDA", None)] 32 | extra_compile_args["nvcc"] = [ 33 | "-DCUDA_HAS_FP16=1", 34 | "-D__CUDA_NO_HALF_OPERATORS__", 35 | "-D__CUDA_NO_HALF_CONVERSIONS__", 36 | "-D__CUDA_NO_HALF2_OPERATORS__", 37 | ] 38 | 39 | sources = [os.path.join(extensions_dir, s) for s in sources] 40 | 41 | include_dirs = [extensions_dir] 42 | 43 | ext_modules = [ 44 | extension( 45 | "model._C", 46 | sources, 47 | include_dirs=include_dirs, 48 | define_macros=define_macros, 49 | extra_compile_args=extra_compile_args, 50 | ) 51 | ] 52 | 53 | return ext_modules 54 | 55 | 56 | setup( 57 | name="faster_rcnn", 58 | version="0.1", 59 | description="object detection in pytorch", 60 | packages=find_packages(exclude=("configs", "tests",)), 61 | # install_requires=requirements, 62 | ext_modules=get_extensions(), 63 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 64 | ) 65 | -------------------------------------------------------------------------------- /lib/model/roi_layers/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
class TripletLoss(nn.Module):
    """Triplet loss with hard positive/negative mining.

    Reference:
        Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737.

    Args:
        margin (float, optional): margin for triplet. Default is 0.3.
        global_feat: unused; kept so the parameter name stays accepted.
        labels: unused; kept so the parameter name stays accepted.
    """

    def __init__(self, margin=0.3, global_feat=None, labels=None):
        # global_feat / labels previously had no defaults after a defaulted
        # parameter, which is a SyntaxError; they are ignored by the loss.
        super(TripletLoss, self).__init__()
        self.margin = margin
        self.ranking_loss = nn.MarginRankingLoss(margin=margin)

    def forward(self, inputs, targets):
        """Compute the batch-hard triplet loss.

        Args:
            inputs (torch.Tensor): feature matrix with shape (batch_size, feat_dim).
            targets (torch.LongTensor): ground truth labels with shape (batch_size).

        Returns:
            torch.Tensor: scalar margin-ranking loss between each anchor's
            closest negative distance and farthest positive distance.
        """
        n = inputs.size(0)

        # Pairwise Euclidean distance via ||a||^2 + ||b||^2 - 2 * a.b
        dist = torch.pow(inputs, 2).sum(dim=1, keepdim=True).expand(n, n)
        dist = dist + dist.t()
        # Keyword form of addmm_: dist = 1 * dist + (-2) * inputs @ inputs.t()
        dist.addmm_(inputs, inputs.t(), beta=1, alpha=-2)
        dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability

        # mask[i][j] is true when samples i and j carry the same label
        mask = targets.expand(n, n).eq(targets.expand(n, n).t())
        dist_ap, dist_an = [], []
        for i in range(n):
            # hardest positive: farthest sample with the same label
            dist_ap.append(dist[i][mask[i]].max().unsqueeze(0))
            # hardest negative: closest sample with a different label
            dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0))
        dist_ap = torch.cat(dist_ap)
        dist_an = torch.cat(dist_an)

        # Ranking hinge loss: y = 1 asks for dist_an > dist_ap by the margin
        y = torch.ones_like(dist_an)
        return self.ranking_loss(dist_an, dist_ap, y)
# TODO use save_for_backward instead
class RoIAlignFunction(Function):
    """Legacy-style autograd function that dispatches to the ``roi_align`` extension.

    The aligned output size and the spatial scale are fixed at construction;
    the rois and the feature size seen in ``forward`` are stored on the
    instance for ``backward``.
    """

    def __init__(self, aligned_height, aligned_width, spatial_scale):
        self.aligned_height = int(aligned_height)
        self.aligned_width = int(aligned_width)
        self.spatial_scale = float(spatial_scale)
        # Filled in by forward().
        self.feature_size = None
        self.rois = None

    def forward(self, features, rois):
        """Fill and return a (num_rois, channels, aligned_h, aligned_w) buffer."""
        self.rois = rois
        self.feature_size = features.size()

        # Unpacking requires a 4-dimensional feature tensor.
        _, num_channels, _, _ = features.size()
        output = features.new(rois.size(0), num_channels,
                              self.aligned_height, self.aligned_width).zero_()

        kernel_args = (self.aligned_height, self.aligned_width,
                       self.spatial_scale, features, rois, output)
        if not features.is_cuda:
            roi_align.roi_align_forward(*kernel_args)
        else:
            roi_align.roi_align_forward_cuda(*kernel_args)

        return output

    def backward(self, grad_output):
        """Return the gradient w.r.t. the features; the rois get ``None``."""
        # Only the CUDA backward kernel is called here.
        assert(self.feature_size is not None and grad_output.is_cuda)

        batch_size, num_channels, data_height, data_width = self.feature_size
        grad_shape = (batch_size, num_channels, data_height, data_width)
        grad_input = self.rois.new(*grad_shape).zero_()
        roi_align.roi_align_backward_cuda(self.aligned_height,
                                          self.aligned_width,
                                          self.spatial_scale, grad_output,
                                          self.rois, grad_input)

        return grad_input, None
2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, 14 | const at::Tensor& rois, 15 | const float spatial_scale, 16 | const int pooled_height, 17 | const int pooled_width, 18 | const int batch_size, 19 | const int channels, 20 | const int height, 21 | const int width, 22 | const int sampling_ratio); 23 | 24 | 25 | std::tuple ROIPool_forward_cuda(const at::Tensor& input, 26 | const at::Tensor& rois, 27 | const float spatial_scale, 28 | const int pooled_height, 29 | const int pooled_width); 30 | 31 | at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, 32 | const at::Tensor& input, 33 | const at::Tensor& rois, 34 | const at::Tensor& argmax, 35 | const float spatial_scale, 36 | const int pooled_height, 37 | const int pooled_width, 38 | const int batch_size, 39 | const int channels, 40 | const int height, 41 | const int width); 42 | 43 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); 44 | 45 | 46 | at::Tensor compute_flow_cuda(const at::Tensor& boxes, 47 | const int height, 48 | const int width); 49 | -------------------------------------------------------------------------------- /lib/model/roi_layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
class _ROIAlign(Function):
    """Autograd wrapper around ``_C.roi_align_forward`` / ``_C.roi_align_backward``."""

    @staticmethod
    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
        """Call the forward kernel and stash what backward needs on ``ctx``.

        ``output_size`` may be a single int or a pair; it is normalised with
        ``_pair`` before use.
        """
        ctx.save_for_backward(roi)
        # Normalise once and use the pair everywhere: indexing the raw
        # argument fails with TypeError when a plain int is passed.
        output_size = _pair(output_size)
        ctx.output_size = output_size
        ctx.spatial_scale = spatial_scale
        ctx.sampling_ratio = sampling_ratio
        ctx.input_shape = input.size()
        output = _C.roi_align_forward(
            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
        )
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return the gradient for ``input``; the other four arguments get ``None``."""
        rois, = ctx.saved_tensors
        output_size = ctx.output_size
        spatial_scale = ctx.spatial_scale
        sampling_ratio = ctx.sampling_ratio
        # Unpacking requires the forward input to have been 4-dimensional.
        bs, ch, h, w = ctx.input_shape
        grad_input = _C.roi_align_backward(
            grad_output,
            rois,
            spatial_scale,
            output_size[0],
            output_size[1],
            bs,
            ch,
            h,
            w,
            sampling_ratio,
        )
        return grad_input, None, None, None, None
class AffineGridGenFunction(Function):
    """Legacy-style autograd function turning affine parameters into a sampling grid.

    A (height, width, 3) grid of ``[y, x, 1]`` entries is built once at
    construction; ``forward`` multiplies it with the transposed input.

    Args:
        height: number of grid rows.
        width: number of grid columns.
        lr: stored on the instance; not used by the code in this class.
    """

    def __init__(self, height, width, lr=1):
        super(AffineGridGenFunction, self).__init__()
        self.lr = lr
        self.height, self.width = height, width
        # Coordinates start at -1 and advance in steps of 2/size, so the
        # upper bound 1 itself is excluded.
        rows = np.arange(-1, 1, 2.0 / self.height)
        cols = np.arange(-1, 1, 2.0 / self.width)
        grid = np.zeros([self.height, self.width, 3], dtype=np.float32)
        grid[:, :, 0] = rows[:, np.newaxis]   # constant along each row
        grid[:, :, 1] = cols[np.newaxis, :]   # constant along each column
        grid[:, :, 2] = 1.0                   # homogeneous coordinate
        self.grid = torch.from_numpy(grid)

    def forward(self, input1):
        """Return the grid transformed by ``input1``.

        Args:
            input1: tensor used as ``bmm`` operand after transposing dims 1 and
                2; the views below require it to be (batch, 2, 3).

        Returns:
            Tensor of shape (batch, height, width, 2).
        """
        self.input1 = input1
        batch = input1.size(0)

        # One copy of the base grid per batch element, in input1's tensor type.
        self.batchgrid = input1.new(torch.Size([batch]) + self.grid.size()).zero_()
        # torch tensors have no .astype(); type_as performs the conversion.
        base_grid = self.grid.type_as(self.batchgrid)
        for i in range(batch):
            self.batchgrid[i] = base_grid

        # A single batched product covers every element; explicit batch size
        # keeps the views well-defined even for an empty batch.
        output = torch.bmm(
            self.batchgrid.view(batch, self.height * self.width, 3),
            torch.transpose(input1, 1, 2))
        return output.view(batch, self.height, self.width, 2)

    def backward(self, grad_output):
        """Return the gradient w.r.t. ``input1`` (same size as ``input1``)."""
        batch = self.input1.size(0)
        grad_input1 = self.input1.new(self.input1.size()).zero_()
        # grad_input1 = 0 + grad_output^T @ batchgrid, per batch element.
        grad_input1 = torch.baddbmm(
            grad_input1,
            torch.transpose(grad_output.view(batch, self.height * self.width, 2), 1, 2),
            self.batchgrid.view(batch, self.height * self.width, 3))
        return grad_input1
class Logger(object):
    """Write scalar, image and histogram summaries through ``tf.summary.FileWriter``."""

    def __init__(self, log_dir):
        """Create a summary writer logging to log_dir."""
        self.writer = tf.summary.FileWriter(log_dir)

    def scalar_summary(self, tag, value, step):
        """Log a scalar variable."""
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def image_summary(self, tag, images, step):
        """Log a list of images.

        Each image is PNG-encoded and written under the tag ``'<tag>/<index>'``.
        """
        img_summaries = []
        for i, img in enumerate(images):
            # Write the image to an in-memory buffer. StringIO is only bound
            # when its import succeeded; otherwise fall back to BytesIO.
            # Catch NameError only, so unrelated errors are not swallowed.
            try:
                s = StringIO()
            except NameError:
                s = BytesIO()
            # NOTE(review): scipy.misc.toimage is absent from recent SciPy
            # releases -- confirm the pinned SciPy version still provides it.
            scipy.misc.toimage(img).save(s, format="png")

            # Create an Image object
            img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
                                       height=img.shape[0],
                                       width=img.shape[1])
            # Create a Summary value
            img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum))

        # Create and write Summary
        summary = tf.Summary(value=img_summaries)
        self.writer.add_summary(summary, step)

    def histo_summary(self, tag, values, step, bins=1000):
        """Log a histogram of the tensor of values and flush the writer."""
        # Create a histogram using numpy
        counts, bin_edges = np.histogram(values, bins=bins)

        # Fill the fields of the histogram proto
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        # Drop the start of the first bin so there is one limit per bucket
        bin_edges = bin_edges[1:]

        # Add bin edges and counts
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        # Create and write Summary
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.writer.add_summary(summary, step)
        self.writer.flush()
2 | #include "cpu/vision.h" 3 | 4 | 5 | template 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data(); 30 | auto order = order_t.data(); 31 | auto x1 = x1_t.data(); 32 | auto y1 = y1_t.data(); 33 | auto x2 = x2_t.data(); 34 | auto y2 = y2_t.data(); 35 | auto areas = areas_t.data(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 
| } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { 72 | result = nms_cpu_kernel(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /lib/model/roi_crop/src/roi_crop_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | 6 | int BilinearSamplerBHWD_updateOutput_cuda_kernel(/*output->size[3]*/int oc, 7 | /*output->size[2]*/int ow, 8 | /*output->size[1]*/int oh, 9 | /*output->size[0]*/int ob, 10 | /*THCudaTensor_size(state, inputImages, 3)*/int ic, 11 | /*THCudaTensor_size(state, inputImages, 1)*/int ih, 12 | /*THCudaTensor_size(state, inputImages, 2)*/int iw, 13 | /*THCudaTensor_size(state, inputImages, 0)*/int ib, 14 | /*THCudaTensor *inputImages*/float *inputImages, int isb, int isc, int ish, int isw, 15 | /*THCudaTensor *grids*/float *grids, int gsb, int gsc, int gsh, int gsw, 16 | /*THCudaTensor *output*/float *output, int osb, int osc, int osh, int osw, 17 | /*THCState_getCurrentStream(state)*/cudaStream_t stream); 18 | 19 | int BilinearSamplerBHWD_updateGradInput_cuda_kernel(/*gradOutput->size[3]*/int goc, 20 | /*gradOutput->size[2]*/int gow, 21 | /*gradOutput->size[1]*/int goh, 22 | /*gradOutput->size[0]*/int gob, 23 | /*THCudaTensor_size(state, inputImages, 3)*/int ic, 24 | /*THCudaTensor_size(state, inputImages, 1)*/int ih, 25 | /*THCudaTensor_size(state, inputImages, 2)*/int iw, 26 | /*THCudaTensor_size(state, inputImages, 0)*/int ib, 27 | /*THCudaTensor *inputImages*/float *inputImages, int isb, int isc, int ish, int isw, 28 | /*THCudaTensor *grids*/float *grids, int gsb, int gsc, int gsh, int gsw, 29 | /*THCudaTensor *gradInputImages*/float 
class GeneralTestLoader(data.Dataset):
    """Dataset that turns one roidb entry per index into test-time tensors."""

    def __init__(self, roidb, ratio_list, ratio_index, batch_size, training=True, normalize=None):
        """Store the roidb and pre-compute one target aspect ratio per batch.

        Args:
            roidb: indexable collection of roidb entries.
            ratio_list: per-sample aspect ratios, indexed by position.
            ratio_index: only its length is used here, to count batches.
            batch_size: number of consecutive samples sharing a ratio.
            training: stored as-is on the instance.
            normalize: stored as-is on the instance.
        """
        self._roidb = roidb
        self.ratio_list = ratio_list
        self.ratio_index = ratio_index
        self.batch_size = batch_size
        self.training = training
        self.normalize = normalize

        # Trim sizes and the ground-truth box cap come from the global config.
        self.trim_height = cfg.TRAIN.TRIM_HEIGHT
        self.trim_width = cfg.TRAIN.TRIM_WIDTH
        self.max_num_box = cfg.MAX_NUM_GT_BOXES

        self.data_size = len(self.ratio_list)

        # One ratio per sample; every sample of a batch gets the same value.
        self.ratio_list_batch = torch.Tensor(self.data_size).zero_()
        last_valid = self.data_size - 1
        batch_count = int(np.ceil(len(ratio_index) / batch_size))
        for batch_no in range(batch_count):
            first = batch_no * batch_size
            last = min(first + batch_size - 1, last_valid)

            tail_ratio = ratio_list[last]
            head_ratio = ratio_list[first]
            if tail_ratio < 1:
                # batch ends below 1: keep the ratio of its leftmost sample
                shared_ratio = head_ratio
            elif head_ratio > 1:
                # batch starts above 1: keep the ratio of its rightmost sample
                shared_ratio = tail_ratio
            else:
                # batch straddles 1: use a square ratio
                shared_ratio = 1

            self.ratio_list_batch[first:last + 1] = shared_ratio

    def __getitem__(self, index):
        """Return ``(data, im_info, gt_boxes, num_boxes)`` for one sample.

        ``data`` is viewed as ``(3, H, W)``, ``im_info`` as a 3-vector
        (H, W, scale), ``gt_boxes`` is the ``'gt_boxes'`` blob as a tensor and
        ``num_boxes`` is always ``0``.
        """
        sample_blobs = get_minibatch([self._roidb[index]])

        image = torch.from_numpy(sample_blobs['data'])
        info = torch.from_numpy(sample_blobs['im_info'])  # (H, W, scale)

        height = image.size(1)
        width = image.size(2)
        # the blob has a leading singleton batch axis and channels last
        image = image.permute(0, 3, 1, 2).contiguous().view(3, height, width)
        info = info.view(3)

        boxes = torch.from_numpy(sample_blobs['gt_boxes'])
        return image, info, boxes, 0

    def __len__(self):
        """Number of entries in the roidb."""
        return len(self._roidb)
of ROIs 18 | int num_rois = THCudaTensor_size(state, rois, 0); 19 | int size_rois = THCudaTensor_size(state, rois, 1); 20 | if (size_rois != 5) 21 | { 22 | return 0; 23 | } 24 | 25 | // batch size 26 | // int batch_size = THCudaTensor_size(state, features, 0); 27 | // if (batch_size != 1) 28 | // { 29 | // return 0; 30 | // } 31 | // data height 32 | int data_height = THCudaTensor_size(state, features, 2); 33 | // data width 34 | int data_width = THCudaTensor_size(state, features, 3); 35 | // Number of channels 36 | int num_channels = THCudaTensor_size(state, features, 1); 37 | 38 | cudaStream_t stream = THCState_getCurrentStream(state); 39 | 40 | ROIPoolForwardLaucher( 41 | data_flat, spatial_scale, num_rois, data_height, 42 | data_width, num_channels, pooled_height, 43 | pooled_width, rois_flat, 44 | output_flat, argmax_flat, stream); 45 | 46 | return 1; 47 | } 48 | 49 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 50 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax) 51 | { 52 | // Grab the input tensor 53 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 54 | float * rois_flat = THCudaTensor_data(state, rois); 55 | 56 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 57 | int * argmax_flat = THCudaIntTensor_data(state, argmax); 58 | 59 | // Number of ROIs 60 | int num_rois = THCudaTensor_size(state, rois, 0); 61 | int size_rois = THCudaTensor_size(state, rois, 1); 62 | if (size_rois != 5) 63 | { 64 | return 0; 65 | } 66 | 67 | // batch size 68 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 69 | // if (batch_size != 1) 70 | // { 71 | // return 0; 72 | // } 73 | // data height 74 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 75 | // data width 76 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 77 | // Number of channels 78 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 79 | 80 | 
cudaStream_t stream = THCState_getCurrentStream(state); 81 | ROIPoolBackwardLaucher( 82 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 83 | data_width, num_channels, pooled_height, 84 | pooled_width, rois_flat, 85 | bottom_grad_flat, argmax_flat, stream); 86 | 87 | return 1; 88 | } 89 | -------------------------------------------------------------------------------- /lib/datasets/debug.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ModuleNotFoundError", 10 | "evalue": "No module named '__main__.imdb'; '__main__' is not a package", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 15 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mxml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0metree\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mElementTree\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mET\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mimdb\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mimdb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mimdb\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mROOT_DIR\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m 
\u001b[0mds_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 16 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named '__main__.imdb'; '__main__' is not a package" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "import xml.dom.minidom as minidom\n", 22 | "\n", 23 | "import os\n", 24 | "# import PIL\n", 25 | "import numpy as np\n", 26 | "import scipy.sparse\n", 27 | "import subprocess\n", 28 | "import math\n", 29 | "import glob\n", 30 | "import uuid\n", 31 | "import scipy.io as sio\n", 32 | "import xml.etree.ElementTree as ET\n", 33 | "import pickle\n", 34 | "from .imdb import imdb\n", 35 | "from .imdb import ROOT_DIR\n", 36 | "from . import ds_utils\n", 37 | "from .voc_eval import voc_eval\n", 38 | "\n", 39 | "# TODO: make fast_rcnn irrelevant\n", 40 | "# >>>> obsolete, because it depends on sth outside of this project\n", 41 | "from model.utils.config import cfg" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "pytorch-rcnn", 55 | "language": "python", 56 | "name": "pytorch-rcnn" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 3 62 | }, 63 | "file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": "python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython3", 68 | "version": "3.7.3" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 2 73 | } 74 | -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | name: DAnA 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _pytorch_select=0.1=cpu_0 9 | - absl-py=0.9.0=py36_0 10 | - blas=1.0=mkl 11 | - blinker=1.4=py_1 12 | - brotlipy=0.7.0=py36h8c4c3a4_1000 13 | - 
c-ares=1.15.0=h516909a_1001 14 | - ca-certificates=2020.6.24=0 15 | - cachetools=4.1.0=py_1 16 | - certifi=2020.6.20=py36_0 17 | - cffi=1.14.0=py36h2e261b9_0 18 | - chardet=3.0.4=py36h9f0ad1d_1006 19 | - click=7.1.2=pyh9f0ad1d_0 20 | - cryptography=2.9.2=py36h45558ae_0 21 | - cudatoolkit=10.0.130=0 22 | - freetype=2.9.1=h8a8886c_1 23 | - google-auth=1.14.3=pyh9f0ad1d_0 24 | - google-auth-oauthlib=0.4.1=py_2 25 | - grpcio=1.27.2=py36hf8bcb03_0 26 | - idna=2.9=py_1 27 | - importlib-metadata=1.6.0=py36h9f0ad1d_0 28 | - intel-openmp=2020.0=166 29 | - jpeg=9b=h024ee3a_2 30 | - ld_impl_linux-64=2.33.1=h53a641e_7 31 | - libedit=3.1.20181209=hc058e9b_0 32 | - libffi=3.2.1=hd88cf55_4 33 | - libgcc-ng=9.1.0=hdf63c60_0 34 | - libgfortran-ng=7.3.0=hdf63c60_0 35 | - libpng=1.6.37=hbc83047_0 36 | - libprotobuf=3.11.4=h8b12597_0 37 | - libstdcxx-ng=9.1.0=hdf63c60_0 38 | - libtiff=4.1.0=h2733197_0 39 | - markdown=3.2.2=py_0 40 | - mkl=2020.0=166 41 | - mkl-service=2.3.0=py36he904b0f_0 42 | - mkl_fft=1.0.15=py36ha843d7b_0 43 | - mkl_random=1.1.0=py36hd6b4f25_0 44 | - ncurses=6.2=he6710b0_0 45 | - ninja=1.9.0=py36hfd86e86_0 46 | - numpy=1.18.1=py36h4f9e942_0 47 | - numpy-base=1.18.1=py36hde5b4d6_1 48 | - oauthlib=3.0.1=py_0 49 | - olefile=0.46=py36_0 50 | - openssl=1.1.1g=h516909a_0 51 | - pillow=6.1.0=py36h34e0f95_0 52 | - pip=20.0.2=py36_1 53 | - pyasn1=0.4.8=py_0 54 | - pyasn1-modules=0.2.7=py_0 55 | - pycparser=2.20=py_0 56 | - pyjwt=1.7.1=py_0 57 | - pyopenssl=19.1.0=py_1 58 | - pysocks=1.7.1=py36h9f0ad1d_1 59 | - python=3.6.10=hcf32534_1 60 | - python_abi=3.6=1_cp36m 61 | - pytorch=1.2.0=py3.6_cuda10.0.130_cudnn7.6.2_0 62 | - readline=8.0=h7b6447c_0 63 | - requests=2.23.0=pyh8c360ce_2 64 | - requests-oauthlib=1.2.0=py_0 65 | - rsa=4.0=py_0 66 | - setuptools=46.1.3=py36_0 67 | - six=1.14.0=py36_0 68 | - sqlite=3.31.1=h7b6447c_0 69 | - tensorboard=2.1.1=py_1 70 | - tk=8.6.8=hbc83047_0 71 | - torchvision=0.4.0=py36_cu100 72 | - urllib3=1.25.9=py_0 73 | - 
def get_minibatch(roidb):
    """Build the blob dictionary for a single-image minibatch.

    Returns a dict with the keys ``'data'`` (image blob), ``'gt_boxes'``
    (rows of x1, y1, x2, y2, cls scaled to the resized image), ``'im_info'``
    (a 1x3 float32 array: blob height, blob width, scale) and ``'img_id'``.
    """
    # Called from a dataset's __getitem__, so roidb holds exactly one entry
    # regardless of the training batch size.
    image_count = len(roidb)

    # Draw one random entry of cfg.TRAIN.SCALES per image
    scale_choices = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                                size=image_count)
    assert cfg.TRAIN.BATCH_SIZE % image_count == 0, \
        'num_images ({}) must divide BATCH_SIZE ({})'.format(
            image_count, cfg.TRAIN.BATCH_SIZE)

    # Image blob, e.g. [1, 600, 899, 3], and the resize factor per image
    im_blob, im_scales = _get_image_blob(roidb, scale_choices)

    blobs = {'data': im_blob}

    assert len(im_scales) == 1, "Single batch only"
    assert len(roidb) == 1, "Single batch only"

    entry = roidb[0]
    scale = im_scales[0]

    # Pick the ground-truth rows to keep
    if cfg.TRAIN.USE_ALL_GT:
        # every box with a non-background class
        keep = np.where(entry['gt_classes'] != 0)[0]
    else:
        # additionally drop boxes having any overlap value <= -1
        # (the COCO ''iscrowd'' boxes)
        not_crowd = np.all(entry['gt_overlaps'].toarray() > -1.0, axis=1)
        keep = np.where((entry['gt_classes'] != 0) & not_crowd)[0]

    # gt boxes: (x1, y1, x2, y2, cls)
    boxes = np.empty((len(keep), 5), dtype=np.float32)
    boxes[:, 0:4] = entry['boxes'][keep, :] * scale
    boxes[:, 4] = entry['gt_classes'][keep]

    blobs['gt_boxes'] = boxes
    blobs['im_info'] = np.array(
        [[im_blob.shape[1], im_blob.shape[2], scale]],
        dtype=np.float32)
    blobs['img_id'] = entry['img_id']

    return blobs


def _get_image_blob(roidb, scale_inds):
    """Load, colour-flip, optionally mirror and rescale every roidb image.

    Returns ``(blob, im_scales)``: the batched image array produced by
    ``im_list_to_blob`` and the list of per-image scale factors.
    """
    scaled_images = []
    scale_factors = []
    for idx, entry in enumerate(roidb):
        pixels = imread(entry['image'])

        # Grayscale input: replicate the single plane into three channels
        if pixels.ndim == 2:
            plane = pixels[:, :, np.newaxis]
            pixels = np.concatenate([plane, plane, plane], axis=2)

        # Reverse the channel axis (rgb -> bgr), matching the cv2-based
        # original pipeline
        pixels = pixels[:, :, ::-1]

        if entry['flipped']:
            # mirror horizontally
            pixels = pixels[:, ::-1, :]

        target_size = cfg.TRAIN.SCALES[scale_inds[idx]]
        # subtract the pixel means and resize; also returns the factor used
        pixels, factor = prep_im_for_blob(pixels, cfg.PIXEL_MEANS, target_size,
                                          cfg.TRAIN.MAX_SIZE)
        scale_factors.append(factor)
        scaled_images.append(pixels)

    # im_list_to_blob zero-pads images of different sizes into [B, H, W, C]
    return im_list_to_blob(scaled_images), scale_factors
def generate_anchors(base_size=16, ratios=(0.5, 1, 2),
                     scales=(8, 16, 32)):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.

    ``ratios`` and ``scales`` may be any 1-D array-like (list, tuple or
    ndarray).  The defaults are immutable tuples holding the same values as
    before (``[0.5, 1, 2]`` and ``2 ** np.arange(3, 6)``), so no mutable
    object is shared between calls.

    Returns an ``(len(ratios) * len(scales), 4)`` array of
    ``(x1, y1, x2, y2)`` rows, grouped by ratio and then by scale.
    """

    ratios = np.asarray(ratios)
    scales = np.asarray(scales)

    # The reference window is given 1-based and shifted to 0-based.
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    # Iterate rows directly; no Python 2 ``xrange`` shim is needed.
    anchors = np.vstack([_scale_enum(ratio_anchor, scales)
                         for ratio_anchor in ratio_anchors])
    return anchors

def _whctrs(anchor):
    """
    Return width, height, x center, and y center for an anchor (window).

    Coordinates are inclusive, hence the ``+ 1`` in width and height.
    """

    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr

def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows), one row per
    (width, height) pair.
    """

    # Column vectors, so that hstack yields one (x1, y1, x2, y2) row each.
    ws = np.asarray(ws)[:, np.newaxis]
    hs = np.asarray(hs)[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors

def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.

    The area of ``anchor`` is kept approximately constant; widths and
    heights are rounded to whole numbers.
    """

    ratios = np.asarray(ratios)
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors

def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.

    Width and height are multiplied by every scale; the center is kept.
    """

    scales = np.asarray(scales)
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
np.zeros((N, K), dtype=DTYPE) 34 | cdef DTYPE_t iw, ih, box_area 35 | cdef DTYPE_t ua 36 | cdef unsigned int k, n 37 | for k in range(K): 38 | box_area = ( 39 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 40 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 41 | ) 42 | for n in range(N): 43 | iw = ( 44 | min(boxes[n, 2], query_boxes[k, 2]) - 45 | max(boxes[n, 0], query_boxes[k, 0]) + 1 46 | ) 47 | if iw > 0: 48 | ih = ( 49 | min(boxes[n, 3], query_boxes[k, 3]) - 50 | max(boxes[n, 1], query_boxes[k, 1]) + 1 51 | ) 52 | if ih > 0: 53 | ua = float( 54 | (boxes[n, 2] - boxes[n, 0] + 1) * 55 | (boxes[n, 3] - boxes[n, 1] + 1) + 56 | box_area - iw * ih 57 | ) 58 | overlaps[n, k] = iw * ih / ua 59 | return overlaps 60 | 61 | 62 | def bbox_intersections( 63 | np.ndarray[DTYPE_t, ndim=2] boxes, 64 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 65 | return bbox_intersections_c(boxes, query_boxes) 66 | 67 | 68 | cdef np.ndarray[DTYPE_t, ndim=2] bbox_intersections_c( 69 | np.ndarray[DTYPE_t, ndim=2] boxes, 70 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 71 | """ 72 | For each query box compute the intersection ratio covered by boxes 73 | ---------- 74 | Parameters 75 | ---------- 76 | boxes: (N, 4) ndarray of float 77 | query_boxes: (K, 4) ndarray of float 78 | Returns 79 | ------- 80 | overlaps: (N, K) ndarray of intersec between boxes and query_boxes 81 | """ 82 | cdef unsigned int N = boxes.shape[0] 83 | cdef unsigned int K = query_boxes.shape[0] 84 | cdef np.ndarray[DTYPE_t, ndim=2] intersec = np.zeros((N, K), dtype=DTYPE) 85 | cdef DTYPE_t iw, ih, box_area 86 | cdef DTYPE_t ua 87 | cdef unsigned int k, n 88 | for k in range(K): 89 | box_area = ( 90 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 91 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 92 | ) 93 | for n in range(N): 94 | iw = ( 95 | min(boxes[n, 2], query_boxes[k, 2]) - 96 | max(boxes[n, 0], query_boxes[k, 0]) + 1 97 | ) 98 | if iw > 0: 99 | ih = ( 100 | min(boxes[n, 3], query_boxes[k, 3]) - 101 | max(boxes[n, 1], 
"""Registry of dataset constructors, looked up by name via get_imdb()."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
# Maps a dataset name to a zero-argument callable that builds its imdb.
# Every lambda below captures its split/year through default arguments, so
# each entry keeps the values of the loop iteration that created it.
__sets = {}
from datasets.pascal_voc import pascal_voc
from datasets.coco import coco
from datasets.coco_split import coco_split
from datasets.imagenet import imagenet
from datasets.vg import vg
from datasets.episode import episode
from datasets.ycb2d import ycb2d

# ycb2d_replace<N>: split 'replace', with N passed as the year string
for i in [256, 240, 224, 208, 200, 192, 160, 128, 100, 96, 80, 64, 50, 48, 32, 30, 20, 16, 10]:
    name = f'ycb2d_replace{i}'
    __sets[name] = (lambda split='replace', year=str(i): ycb2d(split, year))

# ycb2d inference sets (sparse / dense, plain and 'fs' variants)
name = 'ycb2d_inference_sparse'
__sets[name] = (lambda split='inference', year='sparse': ycb2d(split, year))
name = 'ycb2d_inferencefs_sparse'
__sets[name] = (lambda split='inferencefs', year='sparse': ycb2d(split, year))
name = 'ycb2d_inference_dense'
__sets[name] = (lambda split='inference', year='dense': ycb2d(split, year))
name = 'ycb2d_inferencefs_dense'
__sets[name] = (lambda split='inferencefs', year='dense': ycb2d(split, year))
name = 'ycb2d_inference'
__sets[name] = (lambda split='inference', year='1234': ycb2d(split, year))

# ycb2d_stage<N>: N may be an int or a '<k>cls' string; both go through str()
for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, '1cls', '2cls', '3cls', '4cls']:
    name = f'ycb2d_stage{i}'
    __sets[name] = (lambda split='stage', year=str(i): ycb2d(split, year))
# ycb2d_oracle<N>
for i in [512, 256, 128, 64, 32, 16, 8]:
    name = f'ycb2d_oracle{i}'
    __sets[name] = (lambda split='oracle', year=str(i): ycb2d(split, year))
# ycb2d_oracle_dense<N> (registered with split 'oracledense')
for i in [64, 32, 16]:
    name = f'ycb2d_oracle_dense{i}'
    __sets[name] = (lambda split='oracledense', year=str(i): ycb2d(split, year))
# ycb2d_fsoracle_dense<N> (registered with split 'fsoracledense')
for i in [20, 10, 5]:
    name = f'ycb2d_fsoracle_dense{i}'
    __sets[name] = (lambda split='fsoracledense', year=str(i): ycb2d(split, year))

# ycb2d_pseudo1 .. ycb2d_pseudo9
name = 'ycb2d_pseudo'
for i in range(1, 10):
    __sets[name+str(i)] = (lambda split='pseudo', year=str(i): ycb2d(split, year))

# coco fine-tuning set: split 'shot', year '10'
__sets['coco_ft'] = (lambda split='shot', year='10': coco_split(split, year))

# coco_<split>_<set> for the '3way' / '5way' splits
for year in ['set1', 'set2']:
    for split in ['3way', '5way']:
        name = 'coco_{}_{}'.format(split, year)
        __sets[name] = (lambda split=split, year=year: coco_split(split, year))

# vis
for year in ['set1', 'set2', 'set3', 'set4']:
    for split in ['vis']:
        name = 'coco_{}_{}'.format(split, year)
        __sets[name] = (lambda split=split, year=year: coco_split(split, year))

# coco 20 evaluation
for year in ['set1', 'set2', 'set3', 'set4']:
    for split in ['20']:
        name = 'coco_{}_{}'.format(split, year)
        __sets[name] = (lambda split=split, year=year: coco_split(split, year))

# coco 60 training
for year in ['set1', 'set2', 'set3', 'set4', 'set1allcat']:
    for split in ['60']:
        name = 'coco_{}_{}'.format(split, year)
        __sets[name] = (lambda split=split, year=year: coco_split(split, year))

# episode: coco_<year>_ep<n> for n in 0..599
# (note the name puts the year before the split, unlike the sets above)
for year in ['novel', 'base', 'val']:
    for n in range(600):
        split = 'ep' + str(n)
        name = 'coco_{}_{}'.format(year, split)
        __sets[name] = (lambda split=split, year=year: episode(split, year))


# Set up voc_<year>_<split>
for year in ['2007', '2012']:
    for split in ['train', 'val', 'trainval', 'test']:
        name = 'voc_{}_{}'.format(year, split)
        __sets[name] = (lambda split=split, year=year: pascal_voc(split, year))

# Set up coco_2014_<split>
for year in ['2014']:
    for split in ['train', 'val', 'minival', 'valminusminival', 'trainval']:
        name = 'coco_{}_{}'.format(year, split)
        __sets[name] = (lambda split=split, year=year: coco(split, year))


def get_imdb(name):
    """Get an imdb (image database) by name.

    Looks ``name`` up in the registry and calls the stored constructor, so a
    new dataset object is built on every call.

    Raises:
        KeyError: if ``name`` has not been registered.
    """
    if name not in __sets:
        raise KeyError('Unknown dataset: {}'.format(name))
    return __sets[name]()


def list_imdbs():
    """List all registered imdbs.

    Returns a new list of the registered names, in registration order on
    Python versions where dicts preserve insertion order.
    """
    return list(__sets.keys())
30 | THFloatStorage_fill(THFloatTensor_storage(output), -1); 31 | 32 | // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R 33 | int index_roi = 0; 34 | int index_output = 0; 35 | int n; 36 | for (n = 0; n < num_rois; ++n) 37 | { 38 | int roi_batch_ind = rois_flat[index_roi + 0]; 39 | int roi_start_w = round(rois_flat[index_roi + 1] * spatial_scale); 40 | int roi_start_h = round(rois_flat[index_roi + 2] * spatial_scale); 41 | int roi_end_w = round(rois_flat[index_roi + 3] * spatial_scale); 42 | int roi_end_h = round(rois_flat[index_roi + 4] * spatial_scale); 43 | // CHECK_GE(roi_batch_ind, 0); 44 | // CHECK_LT(roi_batch_ind, batch_size); 45 | 46 | int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1); 47 | int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1); 48 | float bin_size_h = (float)(roi_height) / (float)(pooled_height); 49 | float bin_size_w = (float)(roi_width) / (float)(pooled_width); 50 | 51 | int index_data = roi_batch_ind * data_height * data_width * num_channels; 52 | const int output_area = pooled_width * pooled_height; 53 | 54 | int c, ph, pw; 55 | for (ph = 0; ph < pooled_height; ++ph) 56 | { 57 | for (pw = 0; pw < pooled_width; ++pw) 58 | { 59 | int hstart = (floor((float)(ph) * bin_size_h)); 60 | int wstart = (floor((float)(pw) * bin_size_w)); 61 | int hend = (ceil((float)(ph + 1) * bin_size_h)); 62 | int wend = (ceil((float)(pw + 1) * bin_size_w)); 63 | 64 | hstart = fminf(fmaxf(hstart + roi_start_h, 0), data_height); 65 | hend = fminf(fmaxf(hend + roi_start_h, 0), data_height); 66 | wstart = fminf(fmaxf(wstart + roi_start_w, 0), data_width); 67 | wend = fminf(fmaxf(wend + roi_start_w, 0), data_width); 68 | 69 | const int pool_index = index_output + (ph * pooled_width + pw); 70 | int is_empty = (hend <= hstart) || (wend <= wstart); 71 | if (is_empty) 72 | { 73 | for (c = 0; c < num_channels * output_area; c += output_area) 74 | { 75 | output_flat[pool_index + c] = 0; 76 | } 77 | } 78 | else 79 | { 80 | int h, w, c; 81 | for (h = 
def vg_eval(detpath,
            gt_roidb,
            image_index,
            classindex,
            ovthresh=0.5,
            use_07_metric=False,
            eval_attributes=False):
    """rec, prec, ap, sorted_scores, npos = vg_eval(
                                detpath,
                                gt_roidb,
                                image_index,
                                classindex,
                                [ovthresh],
                                [use_07_metric])

    Top level function that does the Visual Genome evaluation.

    detpath: Path to detections. Each line is split on single spaces into
        image id, confidence, then box coordinates (the first four are
        used as x1 y1 x2 y2).
    gt_roidb: List of ground truth structs, aligned with image_index.
    image_index: List of image ids.
    classindex: Category index
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    [eval_attributes]: Select ground-truth boxes through 'gt_attributes'
        instead of 'gt_classes' (default False)

    Returns (0, 0, 0, 0, npos) when there is no ground truth for the class
    or the detection file is empty. A detection whose image id is not in
    image_index raises KeyError.
    """
    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for item, imagename in zip(gt_roidb, image_index):
        if eval_attributes:
            keep = np.where(np.any(item['gt_attributes'].toarray() == classindex, axis=1))[0]
        else:
            keep = np.where(item['gt_classes'] == classindex)[0]
        bbox = item['boxes'][keep, :]
        # The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
        difficult = np.zeros((bbox.shape[0],), dtype=bool)
        det = [False] * bbox.shape[0]
        npos = npos + int(np.sum(~difficult))
        class_recs[str(imagename)] = {'bbox': bbox,
                                      'difficult': difficult,
                                      'det': det}
    if npos == 0:
        # No ground truth examples
        return 0, 0, 0, 0, npos

    # read dets
    with open(detpath, 'r') as f:
        lines = f.readlines()
    if len(lines) == 0:
        # No detection examples
        return 0, 0, 0, 0, npos

    splitlines = [x.strip().split(' ') for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence
    sorted_ind = np.argsort(-confidence)
    sorted_scores = -np.sort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih

            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    tp[d] = 1.
                    # each ground-truth box may be claimed only once
                    R['det'][jmax] = 1
                else:
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap, sorted_scores, npos
def prepare_roidb(imdb):
    """Enrich the imdb's roidb by adding some derived quantities that
    are useful for training. This function precomputes the maximum
    overlap, taken over ground-truth boxes, between each ROI and
    each ground-truth box. The class with maximum overlap is also
    recorded.

    Each roidb entry gains 'img_id', 'image', 'max_classes' and
    'max_overlaps'. For datasets whose name does not start with 'coco',
    'width' and 'height' are also read from the image files. The roidb is
    modified in place; nothing is returned.
    """
    roidb = imdb.roidb
    read_sizes = not imdb.name.startswith('coco')
    if read_sizes:
        # A bare `import PIL` does not load the Image submodule, so import it
        # explicitly here rather than relying on some other module having
        # done so first.
        from PIL import Image
        sizes = []
        for i in range(imdb.num_images):
            # the context manager closes each file as soon as its size is read
            with Image.open(imdb.image_path_at(i)) as im:
                sizes.append(im.size)

    for i in range(len(imdb.image_index)):
        entry = roidb[i]
        entry['img_id'] = imdb.image_id_at(i)
        entry['image'] = imdb.image_path_at(i)
        if read_sizes:
            entry['width'] = sizes[i][0]
            entry['height'] = sizes[i][1]
        # need gt_overlaps as a dense array for argmax
        # (num_obj, num_class)
        gt_overlaps = entry['gt_overlaps'].toarray()
        # max overlap with gt over classes (columns)
        max_overlaps = gt_overlaps.max(axis=1)
        # gt class that had the max overlap
        max_classes = gt_overlaps.argmax(axis=1)
        entry['max_classes'] = max_classes
        entry['max_overlaps'] = max_overlaps
        # sanity checks
        # max overlap of 0 => class should be zero (background)
        zero_inds = np.where(max_overlaps == 0)[0]
        assert all(max_classes[zero_inds] == 0)
        # max overlap > 0 => class should not be zero (must be a fg class)
        nonzero_inds = np.where(max_overlaps > 0)[0]
        assert all(max_classes[nonzero_inds] != 0)
def filter_roidb(roidb):
    """Drop every roidb entry that has no bounding boxes.

    The list is filtered in place (callers holding a reference see the
    change) and is also returned. Entries are kept when
    ``len(entry['boxes'])`` is non-zero; relative order is preserved.
    """
    print('before filtering, there are %d images...' % (len(roidb)))
    # One pass with slice assignment instead of repeated `del roidb[i]`,
    # which shifted the tail of the list on every removal (quadratic).
    roidb[:] = [entry for entry in roidb if len(entry['boxes']) != 0]
    print('after filtering, there are %d images...' % (len(roidb)))
    return roidb
// Forward pass: host-side wrapper that unpacks tensor sizes, data pointers and
// strides and hands them to BilinearSamplerBHWD_updateOutput_cuda_kernel on
// the current THC stream. `state` is the file-scope extern THCState.
// Raises a TH error ("aborting") when the kernel wrapper reports failure;
// otherwise returns 1.
int BilinearSamplerBHWD_updateOutput_cuda(THCudaTensor *inputImages, THCudaTensor *grids, THCudaTensor *output){
//  Leftover from the Lua/Torch7 binding this file was ported from:
//  THCState *state = getCutorchState(L);
//  THCudaTensor *inputImages = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
//  THCudaTensor *grids = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
//  THCudaTensor *output = (THCudaTensor *)luaT_checkudata(L, 4, "torch.CudaTensor");

  int success = 0;
  // NOTE(review): output sizes are passed in dim order 1, 3, 2, 0 while the
  // input sizes use 1, 2, 3, 0, and grid strides use 0, 3, 1, 2. The kernel
  // prototype is not visible here, so do not reorder these arguments without
  // checking roi_crop_cuda_kernel.h.
  success = BilinearSamplerBHWD_updateOutput_cuda_kernel(THCudaTensor_size(state, output, 1),
                                               THCudaTensor_size(state, output, 3),
                                               THCudaTensor_size(state, output, 2),
                                               THCudaTensor_size(state, output, 0),
                                               THCudaTensor_size(state, inputImages, 1),
                                               THCudaTensor_size(state, inputImages, 2),
                                               THCudaTensor_size(state, inputImages, 3),
                                               THCudaTensor_size(state, inputImages, 0),
                                               THCudaTensor_data(state, inputImages),
                                               THCudaTensor_stride(state, inputImages, 0),
                                               THCudaTensor_stride(state, inputImages, 1),
                                               THCudaTensor_stride(state, inputImages, 2),
                                               THCudaTensor_stride(state, inputImages, 3),
                                               THCudaTensor_data(state, grids),
                                               THCudaTensor_stride(state, grids, 0),
                                               THCudaTensor_stride(state, grids, 3),
                                               THCudaTensor_stride(state, grids, 1),
                                               THCudaTensor_stride(state, grids, 2),
                                               THCudaTensor_data(state, output),
                                               THCudaTensor_stride(state, output, 0),
                                               THCudaTensor_stride(state, output, 1),
                                               THCudaTensor_stride(state, output, 2),
                                               THCudaTensor_stride(state, output, 3),
                                               THCState_getCurrentStream(state));

  //check for errors
  if (!success) {
    THError("aborting");
  }
  return 1;
}

// Backward pass: same unpacking as the forward wrapper, plus the two gradient
// buffers (gradInputImages, gradGrids) that the kernel fills from gradOutput.
// Raises a TH error ("aborting") on kernel failure; otherwise returns 1.
int BilinearSamplerBHWD_updateGradInput_cuda(THCudaTensor *inputImages, THCudaTensor *grids, THCudaTensor *gradInputImages,
                                        THCudaTensor *gradGrids, THCudaTensor *gradOutput)
{
//  Leftover from the Lua/Torch7 binding this file was ported from:
//  THCState *state = getCutorchState(L);
//  THCudaTensor *inputImages = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
//  THCudaTensor *grids = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
//  THCudaTensor *gradInputImages = (THCudaTensor *)luaT_checkudata(L, 4, "torch.CudaTensor");
//  THCudaTensor *gradGrids = (THCudaTensor *)luaT_checkudata(L, 5, "torch.CudaTensor");
//  THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 6, "torch.CudaTensor");

  int success = 0;
  // NOTE(review): gradOutput sizes use dim order 1, 3, 2, 0 and both grids
  // and gradGrids use stride order 0, 3, 1, 2, mirroring the forward call;
  // confirm against the kernel prototype before changing.
  success = BilinearSamplerBHWD_updateGradInput_cuda_kernel(THCudaTensor_size(state, gradOutput, 1),
                                                  THCudaTensor_size(state, gradOutput, 3),
                                                  THCudaTensor_size(state, gradOutput, 2),
                                                  THCudaTensor_size(state, gradOutput, 0),
                                                  THCudaTensor_size(state, inputImages, 1),
                                                  THCudaTensor_size(state, inputImages, 2),
                                                  THCudaTensor_size(state, inputImages, 3),
                                                  THCudaTensor_size(state, inputImages, 0),
                                                  THCudaTensor_data(state, inputImages),
                                                  THCudaTensor_stride(state, inputImages, 0),
                                                  THCudaTensor_stride(state, inputImages, 1),
                                                  THCudaTensor_stride(state, inputImages, 2),
                                                  THCudaTensor_stride(state, inputImages, 3),
                                                  THCudaTensor_data(state, grids),
                                                  THCudaTensor_stride(state, grids, 0),
                                                  THCudaTensor_stride(state, grids, 3),
                                                  THCudaTensor_stride(state, grids, 1),
                                                  THCudaTensor_stride(state, grids, 2),
                                                  THCudaTensor_data(state, gradInputImages),
                                                  THCudaTensor_stride(state, gradInputImages, 0),
                                                  THCudaTensor_stride(state, gradInputImages, 1),
                                                  THCudaTensor_stride(state, gradInputImages, 2),
                                                  THCudaTensor_stride(state, gradInputImages, 3),
                                                  THCudaTensor_data(state, gradGrids),
                                                  THCudaTensor_stride(state, gradGrids, 0),
                                                  THCudaTensor_stride(state, gradGrids, 3),
                                                  THCudaTensor_stride(state, gradGrids, 1),
                                                  THCudaTensor_stride(state, gradGrids, 2),
                                                  THCudaTensor_data(state, gradOutput),
                                                  THCudaTensor_stride(state, gradOutput, 0),
                                                  THCudaTensor_stride(state, gradOutput, 1),
                                                  THCudaTensor_stride(state, gradOutput, 2),
                                                  THCudaTensor_stride(state, gradOutput, 3),
                                                  THCState_getCurrentStream(state));

  //check for errors
  if (!success) {
    THError("aborting");
  }
  return 1;
}
cfg.ANCHOR_RATIOS 25 | self.feat_stride = cfg.FEAT_STRIDE[0] 26 | 27 | # define the convrelu layers processing input feature map 28 | self.RPN_Conv = nn.Conv2d(self.din, 512, 3, 1, 1, bias=True) 29 | 30 | # define bg/fg classifcation score layer 31 | self.nc_score_out = len(self.anchor_scales) * len(self.anchor_ratios) * 2 # 2(bg/fg) * 9 (anchors) 32 | self.RPN_cls_score = nn.Conv2d(512, self.nc_score_out, 1, 1, 0) 33 | 34 | # define anchor box offset prediction layer 35 | self.nc_bbox_out = len(self.anchor_scales) * len(self.anchor_ratios) * 4 # 4(coords) * 9 (anchors) 36 | self.RPN_bbox_pred = nn.Conv2d(512, self.nc_bbox_out, 1, 1, 0) 37 | 38 | # define proposal layer 39 | self.RPN_proposal = _ProposalLayer(self.feat_stride, self.anchor_scales, self.anchor_ratios) 40 | 41 | # define anchor target layer 42 | self.RPN_anchor_target = _AnchorTargetLayer(self.feat_stride, self.anchor_scales, self.anchor_ratios) 43 | 44 | self.rpn_loss_cls = 0 45 | self.rpn_loss_box = 0 46 | 47 | @staticmethod 48 | def reshape(x, d): 49 | input_shape = x.size() 50 | x = x.view( 51 | input_shape[0], 52 | int(d), 53 | int(float(input_shape[1] * input_shape[2]) / float(d)), 54 | input_shape[3] 55 | ) 56 | return x 57 | 58 | def forward(self, base_feat, im_info, gt_boxes, num_boxes): 59 | 60 | batch_size = base_feat.size(0) 61 | 62 | # return feature map after convrelu layer 63 | rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True) 64 | # get rpn classification score 65 | rpn_cls_score = self.RPN_cls_score(rpn_conv1) # [B, 9*2, H, W] 66 | 67 | rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2) 68 | rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1) 69 | rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out) # [B, 9*2, H, W] 70 | 71 | # get rpn offsets to the anchor boxes 72 | rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1) # [B, 9*4, H, W] 73 | 74 | # proposal layer 75 | cfg_key = 'TRAIN' if self.training else 'TEST' 76 | 77 | rois = 
self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data, 78 | im_info, cfg_key)) 79 | 80 | self.rpn_loss_cls = 0 81 | self.rpn_loss_box = 0 82 | 83 | # generating training labels and build the rpn loss 84 | if self.training: 85 | assert gt_boxes is not None 86 | 87 | rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes, im_info, num_boxes)) 88 | ################## 89 | # rpn_data: list of length=4 90 | # [0]: labels [B, 1, 9*H, W] 91 | # [1]: bbox_targets [B, 9*4, H, W] 92 | # [2]: bbox_inside_weights [B, 9*4, H, W] 93 | # [3]: bbox_outside_weights [B, 9*4, H, W] 94 | ################## 95 | 96 | # compute classification loss 97 | rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2) 98 | rpn_label = rpn_data[0].view(batch_size, -1) 99 | rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1)) 100 | 101 | rpn_cls_score = torch.index_select(rpn_cls_score.view(-1,2), 0, rpn_keep) # [B*RPN_BATCHSIZE, 2] 102 | rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data) 103 | rpn_label = Variable(rpn_label.long()) # [B*RPN_BATCHSIZE] 104 | self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label) 105 | fg_cnt = torch.sum(rpn_label.data.ne(0)) 106 | 107 | rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:] 108 | 109 | # compute bbox regression loss 110 | rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights) 111 | rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights) 112 | rpn_bbox_targets = Variable(rpn_bbox_targets) 113 | 114 | self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, 115 | rpn_bbox_outside_weights, sigma=3, dim=[1,2,3]) 116 | 117 | return rois, self.rpn_loss_cls, self.rpn_loss_box 118 | -------------------------------------------------------------------------------- /lib/model/csrc/cuda/nms.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 
// Intersection-over-union of two boxes. Only the first four floats of each
// pointer are read, as [x1, y1, x2, y2]; any further values are ignored here.
__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  // The +1 counts both end coordinates; disjoint boxes clamp to zero overlap.
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}
5; 53 | int i = 0; 54 | unsigned long long t = 0; 55 | int start = 0; 56 | if (row_start == col_start) { 57 | start = threadIdx.x + 1; 58 | } 59 | for (i = start; i < col_size; i++) { 60 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 61 | t |= 1ULL << i; 62 | } 63 | } 64 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 65 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 66 | } 67 | } 68 | 69 | // boxes is a N x 5 tensor 70 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { 71 | using scalar_t = float; 72 | AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); 73 | auto scores = boxes.select(1, 4); 74 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 75 | auto boxes_sorted = boxes.index_select(0, order_t); 76 | 77 | int boxes_num = boxes.size(0); 78 | 79 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 80 | 81 | scalar_t* boxes_dev = boxes_sorted.data(); 82 | 83 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 84 | 85 | unsigned long long* mask_dev = NULL; 86 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 87 | // boxes_num * col_blocks * sizeof(unsigned long long))); 88 | 89 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 90 | 91 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 92 | THCCeilDiv(boxes_num, threadsPerBlock)); 93 | dim3 threads(threadsPerBlock); 94 | nms_kernel<<>>(boxes_num, 95 | nms_overlap_thresh, 96 | boxes_dev, 97 | mask_dev); 98 | 99 | std::vector mask_host(boxes_num * col_blocks); 100 | THCudaCheck(cudaMemcpy(&mask_host[0], 101 | mask_dev, 102 | sizeof(unsigned long long) * boxes_num * col_blocks, 103 | cudaMemcpyDeviceToHost)); 104 | 105 | std::vector remv(col_blocks); 106 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 107 | 108 | at::Tensor keep = at::empty({boxes_num}, 
boxes.options().dtype(at::kLong).device(at::kCPU)); 109 | int64_t* keep_out = keep.data(); 110 | 111 | int num_to_keep = 0; 112 | for (int i = 0; i < boxes_num; i++) { 113 | int nblock = i / threadsPerBlock; 114 | int inblock = i % threadsPerBlock; 115 | 116 | if (!(remv[nblock] & (1ULL << inblock))) { 117 | keep_out[num_to_keep++] = i; 118 | unsigned long long *p = &mask_host[0] + i * col_blocks; 119 | for (int j = nblock; j < col_blocks; j++) { 120 | remv[j] |= p[j]; 121 | } 122 | } 123 | } 124 | 125 | THCudaFree(state, mask_dev); 126 | // TODO improve this part 127 | return std::get<0>(order_t.index({ 128 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 129 | order_t.device(), keep.scalar_type()) 130 | }).sort(0, false)); 131 | } 132 | -------------------------------------------------------------------------------- /lib/model/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = 
// Builds the pairwise suppression bitmask used by the host-side NMS loop.
//
// Block (blockIdx.x, blockIdx.y) handles one tile of up to threadsPerBlock
// "column" boxes against up to threadsPerBlock "row" boxes. Each thread owns
// one row box and writes one unsigned long long to dev_mask whose bit i is
// set when the row box overlaps column box i of the tile by more than
// nms_overlap_thresh. Boxes are read with a fixed stride of 5 floats.
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  // The last tile in each direction may be partially filled.
  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage this tile's column boxes in shared memory, one box per thread.
  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  // All column boxes must be staged before any thread compares against them.
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    // On the diagonal tile only compare against later boxes, so a box never
    // flags itself or a box that precedes it.
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    // dev_mask is laid out as [box][column tile].
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}
current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | 
class FSODInferenceLogger:
    """TensorBoard logger for inference: writes GT / support / prediction image grids."""

    def __init__(self, log_dir):
        # All events of this logger are written through a single SummaryWriter.
        self.writer = SummaryWriter(log_dir)

    def write(self, save_step, gt, support, predict, save_im=False):
        """Log one step; nothing is written unless ``save_im`` is truthy."""
        if not save_im:
            return
        self._add_images(save_step, gt, support, predict)

    def close(self):
        """Close the writer.
        """
        self.writer.close()

    def _add_images(self, save_step, gt, support, predict):
        """Tile each batch into a one-column grid and log the three grids side by side."""
        # Same make_grid settings for all three batches; order is gt, support, predict.
        panels = [
            make_grid(batch, nrow=1, normalize=True, scale_each=True, pad_value=1)
            for batch in (gt, support, predict)
        ]
        # Concatenate along the last (width) axis so the grids sit next to each other.
        side_by_side = torch.cat(panels, dim=-1)
        self.writer.add_image('gt&pred', side_by_side, save_step)
49 | """ 50 | self.writer.close() 51 | 52 | def _add_scalars(self, save_step, train_log): 53 | for key in train_log.keys(): 54 | self.writer.add_scalars(key, {'train': train_log[key]}, save_step) 55 | 56 | def _add_images(self, save_step, query, supports, boxes): 57 | query = query.cpu() 58 | support = supports[:, 0, :, :, :].cpu() 59 | neg_support = supports[:, self.train_shot, :, :, :].cpu() 60 | boxes = boxes.cpu() 61 | query_ims = [] 62 | support_ims = [] 63 | neg_support_ims = [] 64 | H, W = query[0].size(1), query[0].size(2) 65 | for i in range(query.size(0)): 66 | query_im = query[i].permute(1, 2, 0).numpy() 67 | support_im = support[i].permute(1, 2, 0).numpy() 68 | neg_support_im = neg_support[i].permute(1, 2, 0).numpy() 69 | query_im = query_im[:, :, ::-1].copy() 70 | support_im = support_im[:, :, ::-1].copy() 71 | neg_support_im = neg_support_im[:, :, ::-1].copy() 72 | boxes_of_one_img = boxes[i] 73 | 74 | for ii in range(boxes_of_one_img.size(0)): 75 | box = boxes_of_one_img[ii] 76 | if box[4] == 0: 77 | continue 78 | x = box[0] 79 | y = box[1] 80 | w = box[2] - box[0] 81 | h = box[3] - box[1] 82 | query_im = cv2.rectangle(np.array(query_im), (int(x), int(y)), (int(x+w), int(y+h)), (220, 0, 50), 2) 83 | 84 | query_im = torch.from_numpy(query_im).permute(2,0,1) 85 | support_im = cv2.resize(support_im, (W, H), interpolation=cv2.INTER_LINEAR) 86 | neg_support_im = cv2.resize(neg_support_im, (W, H), interpolation=cv2.INTER_LINEAR) 87 | support_im = torch.from_numpy(support_im).permute(2,0,1) 88 | neg_support_im = torch.from_numpy(neg_support_im).permute(2,0,1) 89 | 90 | query_ims += [query_im] 91 | support_ims += [support_im] 92 | neg_support_ims += [neg_support_im] 93 | 94 | query_ims = torch.stack(query_ims, 0) 95 | support_ims = torch.stack(support_ims, 0) 96 | neg_support_ims = torch.stack(neg_support_ims, 0) 97 | 98 | train_query = make_grid(query_ims, nrow=1, normalize=True, scale_each=True, pad_value=1) 99 | train_support = make_grid(support_ims, 
class BaseLogger:
    """TensorBoard logger that writes a GT/prediction grid and a support grid."""

    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir)

    def write(self, save_step, gt, support, predict):
        """Log the image grids for step ``save_step``."""
        self._add_images(save_step, gt, support, predict)

    def close(self):
        """Close the writer.
        """
        self.writer.close()

    def _add_images(self, save_step, gt, support, predict):
        """Build one-column grids from the three batches and log them.

        ``gt`` and ``predict`` are concatenated along the last axis and logged
        under the tag 'gt&pred'; ``support`` is logged under 'support'.
        """
        gt_grid = make_grid(gt, nrow=1, normalize=True, scale_each=True, pad_value=1)
        support_grid = make_grid(support, nrow=1, normalize=True, scale_each=True, pad_value=1)
        pred_grid = make_grid(predict, nrow=1, normalize=True, scale_each=True, pad_value=1)

        gt_pred_grid = torch.cat((gt_grid, pred_grid), dim=-1)
        # Pass the step explicitly: without it every call is recorded with no
        # global step, so successive images cannot be told apart by step.
        self.writer.add_image('gt&pred', gt_pred_grid, save_step)
        self.writer.add_image('support', support_grid, save_step)
class ALLCLSFSLoader(data.Dataset):
    """Dataset yielding one query image, its boxes for one chosen class, and support images.

    For every index the loader builds a single-image minibatch, keeps only the
    ground-truth boxes of one class present in the image, and loads
    ``num_shot`` support images of that class from ``support_dir/<class name>``.
    """

    def __init__(self, imdb, roidb, ratio_list, ratio_index, support_dir,
                batch_size, num_classes, num_shot=5, training=True, normalize=None):
        """Store the dataset handles and index the support images per class.

        Raises:
            Exception: if a class directory under ``support_dir`` contains no
                ``*.jpg`` file.
        """
        self._imdb = imdb
        self._roidb = roidb
        self._num_classes = num_classes
        self.trim_height = cfg.TRAIN.TRIM_HEIGHT
        self.trim_width = cfg.TRAIN.TRIM_WIDTH
        self.max_num_box = cfg.MAX_NUM_GT_BOXES
        self.training = training
        self.normalize = normalize
        self.ratio_list = ratio_list
        self.ratio_index = ratio_index
        self.batch_size = batch_size
        self.data_size = len(self.ratio_list)

        # given the ratio_list, we want to make the ratio same for each batch.
        # NOTE(review): ratio_list_batch is filled here but not read anywhere in this class.
        self.ratio_list_batch = torch.Tensor(self.data_size).zero_()
        num_batch = int(np.ceil(len(ratio_index) / batch_size))
        for i in range(num_batch):
            left_idx = i*batch_size
            # clamp so the last (possibly partial) batch stays inside the data
            right_idx = min((i+1)*batch_size-1, self.data_size-1)

            if ratio_list[right_idx] < 1:
                # for ratio < 1, we preserve the leftmost in each batch.
                target_ratio = ratio_list[left_idx]
            elif ratio_list[left_idx] > 1:
                # for ratio > 1, we preserve the rightmost in each batch.
                target_ratio = ratio_list[right_idx]
            else:
                # for ratio cross 1, we make it to be 1.
                target_ratio = 1

            self.ratio_list_batch[left_idx:(right_idx+1)] = target_ratio

        # support_pool[label] -> list of support image paths; index 0 stays empty
        # because the loop below starts at label 1.
        self.support_pool = [[] for i in range(self._num_classes)]
        self._label_to_cls_name = dict(list(zip(list(range(self._num_classes)), self._imdb.classes)))
        for _label in range(1, self._num_classes):
            cls_name = self._label_to_cls_name[_label]
            cls_dir = os.path.join(support_dir, cls_name)
            support_im_paths = [str(_p) for _p in list(Path(cls_dir).glob('*.jpg'))]
            if len(support_im_paths) == 0:
                raise Exception(f'support data not found in {cls_dir}')
            self.support_pool[_label].extend(support_im_paths)

        # side length of the square canvas every support image is placed on
        self.support_im_size = 320
        self.testing_shot = num_shot


    def __getitem__(self, index):
        """Return ``(data, im_info, gt_boxes, num_boxes, supports, all_cls_gt_boxes)``.

        ``gt_boxes`` holds only the boxes of the chosen class, ``num_boxes`` is
        always 0, and ``all_cls_gt_boxes`` is a copy of the unfiltered boxes.
        """
        index_ratio = index

        # build a minibatch that contains only the roidb entry at this index
        minibatch_db = [self._roidb[index_ratio]]
        blobs = get_minibatch(minibatch_db)
        data = torch.from_numpy(blobs['data'])
        im_info = torch.from_numpy(blobs['im_info'])  # (H, W, scale)
        data_height, data_width = data.size(1), data.size(2)

        # move the channel axis forward and drop the leading axis
        # (assumes a single 3-channel image in the blob — TODO confirm)
        data = data.permute(0, 3, 1, 2).contiguous().view(3, data_height, data_width)
        im_info = im_info.view(3)

        # gt_boxes = torch.FloatTensor([1,1,1,1,1])
        gt_boxes = torch.from_numpy(blobs['gt_boxes'])

        # keep an untouched copy before filtering down to one class
        all_cls_gt_boxes = gt_boxes.clone()

        # collect the distinct values found in column 4 (used below as the class id)
        cur_cls_id_list = []
        for i in range(gt_boxes.size(0)):
            if gt_boxes[i, 4] not in cur_cls_id_list:
                cur_cls_id_list.append(gt_boxes[i, 4])
        # reseeding the global RNG with a constant makes the class choice
        # repeatable for a given image
        random.seed(0)
        chosen_cls = random.sample(cur_cls_id_list, k=1)[0]

        # keep only the boxes whose class equals the chosen one
        new_gt_boxes = []
        for i in range(gt_boxes.size(0)):
            if gt_boxes[i, 4] == chosen_cls:
                new_gt_boxes.append([gt_boxes[i, 0], gt_boxes[i, 1], gt_boxes[i, 2], gt_boxes[i, 3], chosen_cls])
        gt_boxes = torch.from_numpy(np.asarray(new_gt_boxes))

        # NOTE(review): returned as a constant 0, not the number of kept boxes — confirm callers expect this
        num_boxes = 0

        # get supports
        # zero-initialised canvas: areas not covered by a resized image stay 0
        support_data_all = np.zeros((self.testing_shot, 3, self.support_im_size, self.support_im_size), dtype=np.float32)
        current_gt_class_id = int(gt_boxes[0][4])
        pool = self.support_pool[current_gt_class_id]

        # seed with the sample index so the same supports are drawn for the same index
        random.seed(index)
        selected_supports = random.sample(pool, k=self.testing_shot)

        for i, _path in enumerate(selected_supports):
            support_im = imread(_path)[:,:,::-1]  # rgb -> bgr
            target_size = np.min(support_im.shape[0:2])  # don't change the size
            support_im, _ = prep_im_for_blob(support_im, cfg.PIXEL_MEANS, target_size, cfg.TRAIN.MAX_SIZE)
            _h, _w = support_im.shape[0], support_im.shape[1]
            # scale the longer side to support_im_size, keeping the aspect ratio
            if _h > _w:
                resize_scale = float(self.support_im_size) / float(_h)
                unfit_size = int(_w * resize_scale)
                support_im = cv2.resize(support_im, (unfit_size, self.support_im_size), interpolation=cv2.INTER_LINEAR)
            else:
                resize_scale = float(self.support_im_size) / float(_w)
                unfit_size = int(_h * resize_scale)
                support_im = cv2.resize(support_im, (self.support_im_size, unfit_size), interpolation=cv2.INTER_LINEAR)
            h, w = support_im.shape[0], support_im.shape[1]
            # channel axis first, written into the top-left corner of the canvas
            support_data_all[i, :, :h, :w] = np.transpose(support_im, (2, 0, 1))
        supports = torch.from_numpy(support_data_all)


        return data, im_info, gt_boxes, num_boxes, supports, all_cls_gt_boxes

    def __len__(self):
        """Number of roidb entries."""
        return len(self._roidb)
// Run non-maximum suppression over `boxes_num` boxes.
//
//   keep_out            destination for the kept box indices; written with
//                       cudaMemcpy (host -> device), boxes_num ints
//   num_out             destination for the number of kept boxes; written with
//                       cudaMemcpy (host -> device), one int
//   boxes_host          source boxes, boxes_num * boxes_dim floats
//   boxes_dim           floats per box; nms_kernel indexes boxes with a fixed
//                       stride of 5, so this is expected to be 5
//   nms_overlap_thresh  IoU above which the later box is suppressed
//
// Boxes are processed in index order: a box is kept unless an earlier kept
// box already marked it as overlapping.
void nms_cuda_compute(int* keep_out, int *num_out, float* boxes_host, int boxes_num,
                  int boxes_dim, float nms_overlap_thresh) {
  // No boxes: report zero kept boxes and skip the launch (a zero-sized grid is
  // an invalid configuration and &vec[0] on an empty vector is undefined).
  if (boxes_num <= 0) {
    const int zero = 0;
    const cudaError_t zero_err =
        cudaMemcpy(num_out, &zero, sizeof(int), cudaMemcpyHostToDevice);
    CUDA_WARN(zero_err);
    return;
  }

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  // Number of 64-bit mask words needed to cover one row of boxes.
  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  CUDA_CHECK(cudaMalloc(&boxes_dev,
                        boxes_num * boxes_dim * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(boxes_dev,
                        boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);

  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  // Bitmap of boxes already suppressed by a kept box (all clear initially).
  std::vector<unsigned long long> remv(col_blocks, 0ULL);

  // Host staging buffer for the kept indices. Zero-initialised so the tail
  // beyond num_to_keep that is copied below is well-defined, and owned by a
  // vector so it is released on every exit path.
  std::vector<int> keep_out_cpu(boxes_num, 0);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out_cpu[num_to_keep++] = i;
      // Merge this box's overlap row into the suppressed set; words before
      // nblock only concern boxes that were already visited.
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  // CUDA_WARN expands its argument more than once, so the call result is
  // stored first; passing the call itself would re-issue a failed copy.
  const cudaError_t keep_err =
      cudaMemcpy(keep_out, &keep_out_cpu[0], boxes_num * sizeof(int),
                 cudaMemcpyHostToDevice);
  CUDA_WARN(keep_err);

  const cudaError_t num_err =
      cudaMemcpy(num_out, &num_to_keep, 1 * sizeof(int),
                 cudaMemcpyHostToDevice);
  CUDA_WARN(num_err);

  // release cuda memory
  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
}
'seg_areas': array([12328.567], dtype=float32), 43 | # 'img_id': 565198, 'image': '/home/tungi/FSOD/data/coco/images/val2014/COCO_val2014_000000565198.jpg', 44 | # 'max_classes': array([79]), 'max_overlaps': array([1.], dtype=float32), 'need_crop': 0} 45 | 46 | # name_to_coco_cls_ind = {'person': 1, 'bicycle': 2, 'car': 3, 'motorcycle': 4, 'airplane': 5, 'bus': 6, 'train': 7, 47 | # 'truck': 8, 'boat': 9, 'traffic light': 10, 'fire hydrant': 11, 'stop sign': 13, 'parking meter': 14, 'bench': 15, 48 | # 'bird': 16, 'cat': 17, 'dog': 18, 'horse': 19, 'sheep': 20, 'cow': 21, 'elephant': 22, 'bear': 23, 'zebra': 24, 49 | # 'giraffe': 25, 'backpack': 27, 'umbrella': 28, 'handbag': 31, 'tie': 32, 'suitcase': 33, 'frisbee': 34, 'skis': 35, 50 | # 'snowboard': 36, 'sports ball': 37, 'kite': 38, 'baseball bat': 39, 'baseball glove': 40, 'skateboard': 41, 'surfboard': 42, 51 | # 'tennis racket': 43, 'bottle': 44, 'wine glass': 46, 'cup': 47, 'fork': 48, 'knife': 49, 'spoon': 50, 'bowl': 51, 52 | # 'banana': 52, 'apple': 53, 'sandwich': 54, 'orange': 55, 'broccoli': 56, 'carrot': 57, 'hot dog': 58, 'pizza': 59, 53 | # 'donut': 60, 'cake': 61, 'chair': 62, 'couch': 63, 'potted plant': 64, 'bed': 65, 'dining table': 67, 'toilet': 70, 'tv': 72, 54 | # 'laptop': 73, 'mouse': 74, 'remote': 75, 'keyboard': 76, 'cell phone': 77, 'microwave': 78, 'oven': 79, 'toaster': 80, 55 | # 'sink': 81, 'refrigerator': 82, 'book': 84, 'clock': 85, 'vase': 86, 'scissors': 87, 'teddy bear': 88, 'hair drier': 89, 'toothbrush': 90} 56 | ############################################################################# 57 | 58 | self.support_im_size = 320 59 | self.testing_shot = num_shot 60 | 61 | self.support_pool = [[] for i in range(self._num_classes)] 62 | self._label_to_cls_name = dict(list(zip(list(range(self._num_classes)), self._imdb.classes))) 63 | for _label in range(1, self._num_classes): 64 | cls_name = self._label_to_cls_name[_label] 65 | cls_dir = os.path.join(support_dir, cls_name) 66 | 
support_im_paths = [str(_p) for _p in list(Path(cls_dir).glob('*.jpg'))] 67 | if len(support_im_paths) == 0: 68 | raise Exception(f'support data not found in {cls_dir}') 69 | random.seed(epi_random_seed) # fix the shots 70 | support_im_paths = random.sample(support_im_paths, k=self.testing_shot) 71 | self.support_pool[_label].extend(support_im_paths) 72 | 73 | 74 | def __getitem__(self, index): 75 | # testing 76 | index_ratio = index 77 | # though it is called minibatch, in fact it contains only one img here 78 | minibatch_db = [self._roidb[index_ratio]] 79 | 80 | # load query 81 | blobs = get_minibatch(minibatch_db) 82 | data = torch.from_numpy(blobs['data']) 83 | im_info = torch.from_numpy(blobs['im_info']) # (H, W, scale) 84 | data_height, data_width = data.size(1), data.size(2) 85 | data = data.permute(0, 3, 1, 2).contiguous().view(3, data_height, data_width) 86 | im_info = im_info.view(3) 87 | gt_boxes = torch.from_numpy(blobs['gt_boxes']) 88 | num_boxes = gt_boxes.size(0) 89 | 90 | # get supports 91 | support_data_all = np.zeros((self.testing_shot, 3, self.support_im_size, self.support_im_size), dtype=np.float32) 92 | current_gt_class_id = int(gt_boxes[0][4]) 93 | selected_supports = self.support_pool[current_gt_class_id] 94 | 95 | for i, _path in enumerate(selected_supports): 96 | support_im = imread(_path)[:,:,::-1] # rgb -> bgr 97 | target_size = np.min(support_im.shape[0:2]) # don't change the size 98 | support_im, _ = prep_im_for_blob(support_im, cfg.PIXEL_MEANS, target_size, cfg.TRAIN.MAX_SIZE) 99 | _h, _w = support_im.shape[0], support_im.shape[1] 100 | if _h > _w: 101 | resize_scale = float(self.support_im_size) / float(_h) 102 | unfit_size = int(_w * resize_scale) 103 | support_im = cv2.resize(support_im, (unfit_size, self.support_im_size), interpolation=cv2.INTER_LINEAR) 104 | else: 105 | resize_scale = float(self.support_im_size) / float(_w) 106 | unfit_size = int(_h * resize_scale) 107 | support_im = cv2.resize(support_im, (self.support_im_size, 
unfit_size), interpolation=cv2.INTER_LINEAR) 108 | h, w = support_im.shape[0], support_im.shape[1] 109 | support_data_all[i, :, :h, :w] = np.transpose(support_im, (2, 0, 1)) 110 | supports = torch.from_numpy(support_data_all) 111 | 112 | 113 | return data, im_info, gt_boxes, num_boxes, supports 114 | 115 | def __len__(self): 116 | return len(self._roidb) -------------------------------------------------------------------------------- /lib/model/utils/net_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | import torchvision.models as models 7 | from model.utils.config import cfg 8 | import cv2 9 | import pdb 10 | import random 11 | 12 | def save_net(fname, net): 13 | import h5py 14 | h5f = h5py.File(fname, mode='w') 15 | for k, v in net.state_dict().items(): 16 | h5f.create_dataset(k, data=v.cpu().numpy()) 17 | 18 | def load_net(fname, net): 19 | import h5py 20 | h5f = h5py.File(fname, mode='r') 21 | for k, v in net.state_dict().items(): 22 | param = torch.from_numpy(np.asarray(h5f[k])) 23 | v.copy_(param) 24 | 25 | def weights_normal_init(model, dev=0.01): 26 | if isinstance(model, list): 27 | for m in model: 28 | weights_normal_init(m, dev) 29 | else: 30 | for m in model.modules(): 31 | if isinstance(m, nn.Conv2d): 32 | m.weight.data.normal_(0.0, dev) 33 | elif isinstance(m, nn.Linear): 34 | m.weight.data.normal_(0.0, dev) 35 | 36 | 37 | def clip_gradient(model, clip_norm): 38 | """Computes a gradient clipping coefficient based on gradient norm.""" 39 | totalnorm = 0 40 | for p in model.parameters(): 41 | if p.requires_grad and p.grad is not None: 42 | modulenorm = p.grad.norm() 43 | totalnorm += modulenorm ** 2 44 | totalnorm = torch.sqrt(totalnorm).item() 45 | norm = (clip_norm / max(totalnorm, clip_norm)) 46 | for p in model.parameters(): 47 | if p.requires_grad and p.grad is 
def adjust_learning_rate(optimizer, decay=0.1):
    """Multiply the learning rate of every parameter group by ``decay``."""
    for group in optimizer.param_groups:
        previous_lr = group['lr']
        group['lr'] = decay * previous_lr
def _crop_pool_layer(bottom, rois, max_pool=True):
    # code modified from
    # https://github.com/ruotianluo/pytorch-faster-rcnn
    # implement it using stn
    # box to affine
    # input (x1,y1,x2,y2)
    """Crop one fixed-size patch per RoI from ``bottom`` with an affine sampling grid.

    Each RoI is turned into the affine matrix

    [  x2-x1             x1 + x2 - W + 1  ]
    [  -----      0      ---------------  ]
    [  W - 1                  W - 1       ]
    [                                     ]
    [           y2-y1    y1 + y2 - H + 1  ]
    [    0      -----    ---------------  ]
    [           H - 1         H - 1       ]

    Args:
        bottom: feature map, indexed here as (batch, D, H, W).
        rois: 2-D tensor whose columns 1..4 are read as x1, y1, x2, y2.
            Coordinates are divided by a hard-coded 16.0 before use.
        max_pool: when true, sample at twice ``cfg.POOLING_SIZE`` and
            2x2 max-pool the result back down.

    Returns:
        ``(crops, grid)``: the sampled patches and the sampling grid.
    """
    rois = rois.detach()
    batch_size = bottom.size(0)
    D = bottom.size(1)
    H = bottom.size(2)
    W = bottom.size(3)
    # Floor division: expand() below only accepts integer sizes, and true
    # division yields a float under Python 3.
    roi_per_batch = rois.size(0) // batch_size
    x1 = rois[:, 1::4] / 16.0
    y1 = rois[:, 2::4] / 16.0
    x2 = rois[:, 3::4] / 16.0
    y2 = rois[:, 4::4] / 16.0

    height = H
    width = W

    # affine theta
    zero = rois.new_zeros(rois.size(0), 1)
    theta = torch.cat([
        (x2 - x1) / (width - 1),
        zero,
        (x1 + x2 - width + 1) / (width - 1),
        zero,
        (y2 - y1) / (height - 1),
        (y1 + y2 - height + 1) / (height - 1)], 1).view(-1, 2, 3)

    # With max pooling the patch is sampled at double resolution first.
    sample_size = cfg.POOLING_SIZE * 2 if max_pool else cfg.POOLING_SIZE
    grid = F.affine_grid(theta, torch.Size((rois.size(0), 1, sample_size, sample_size)))
    # Replicate the feature maps so there is one (D, H, W) map per RoI.
    bottom = bottom.view(1, batch_size, D, H, W).contiguous().expand(roi_per_batch, batch_size, D, H, W)\
        .contiguous().view(-1, D, H, W)
    crops = F.grid_sample(bottom, grid)
    if max_pool:
        crops = F.max_pool2d(crops, 2, 2)

    return crops, grid
class MultiwayLoader(data.Dataset):
    """Dataset yielding one query image plus support images for ``num_way`` classes.

    The support images of every class are fixed once in ``__init__`` (seeded
    with ``epi_random_seed``); ``__getitem__`` then picks which classes to
    present for the query image.
    """

    def __init__(self, epi_random_seed, imdb, roidb, ratio_list, ratio_index, support_dir,
                batch_size, num_classes, num_shot=5, training=True, normalize=None, num_way=1):
        """Store the dataset handles and sample ``num_shot`` support images per class.

        Raises:
            Exception: if a class directory under ``support_dir`` contains no
                ``*.jpg`` file.
        """
        self._imdb = imdb
        self._roidb = roidb
        self._num_classes = num_classes
        self.trim_height = cfg.TRAIN.TRIM_HEIGHT
        self.trim_width = cfg.TRAIN.TRIM_WIDTH
        self.max_num_box = cfg.MAX_NUM_GT_BOXES
        self.training = training
        self.normalize = normalize
        self.ratio_list = ratio_list
        self.ratio_index = ratio_index
        self.batch_size = batch_size
        self.data_size = len(self.ratio_list)
        self.epi_random_seed = epi_random_seed
        self.num_way = num_way
        #############################################################################
        # roidb:
        # {'width': 640, 'height': 484, 'boxes': array([[ 58, 152, 268, 243]], dtype=uint16),
        # 'gt_classes': array([79], dtype=int32), 'flipped': False, 'seg_areas': array([12328.567], dtype=float32),
        # 'img_id': 565198, 'image': '/home/tungi/FSOD/data/coco/images/val2014/COCO_val2014_000000565198.jpg',
        # 'max_classes': array([79]), 'max_overlaps': array([1.], dtype=float32), 'need_crop': 0}

        # name_to_coco_cls_ind = {'person': 1, 'bicycle': 2, 'car': 3, 'motorcycle': 4, 'airplane': 5, 'bus': 6, 'train': 7,
        # 'truck': 8, 'boat': 9, 'traffic light': 10, 'fire hydrant': 11, 'stop sign': 13, 'parking meter': 14, 'bench': 15,
        # 'bird': 16, 'cat': 17, 'dog': 18, 'horse': 19, 'sheep': 20, 'cow': 21, 'elephant': 22, 'bear': 23, 'zebra': 24,
        # 'giraffe': 25, 'backpack': 27, 'umbrella': 28, 'handbag': 31, 'tie': 32, 'suitcase': 33, 'frisbee': 34, 'skis': 35,
        # 'snowboard': 36, 'sports ball': 37, 'kite': 38, 'baseball bat': 39, 'baseball glove': 40, 'skateboard': 41, 'surfboard': 42,
        # 'tennis racket': 43, 'bottle': 44, 'wine glass': 46, 'cup': 47, 'fork': 48, 'knife': 49, 'spoon': 50, 'bowl': 51,
        # 'banana': 52, 'apple': 53, 'sandwich': 54, 'orange': 55, 'broccoli': 56, 'carrot': 57, 'hot dog': 58, 'pizza': 59,
        # 'donut': 60, 'cake': 61, 'chair': 62, 'couch': 63, 'potted plant': 64, 'bed': 65, 'dining table': 67, 'toilet': 70, 'tv': 72,
        # 'laptop': 73, 'mouse': 74, 'remote': 75, 'keyboard': 76, 'cell phone': 77, 'microwave': 78, 'oven': 79, 'toaster': 80,
        # 'sink': 81, 'refrigerator': 82, 'book': 84, 'clock': 85, 'vase': 86, 'scissors': 87, 'teddy bear': 88, 'hair drier': 89, 'toothbrush': 90}
        #############################################################################

        # side length of the square canvas every support image is placed on
        self.support_im_size = 320
        self.testing_shot = num_shot

        # support_pool[label] -> the num_shot support image paths of that class;
        # index 0 stays empty because the loop below starts at label 1.
        self.support_pool = [[] for i in range(self._num_classes)]
        self._label_to_cls_name = dict(list(zip(list(range(self._num_classes)), self._imdb.classes)))
        for _label in range(1, self._num_classes):
            cls_name = self._label_to_cls_name[_label]
            cls_dir = os.path.join(support_dir, cls_name)
            support_im_paths = [str(_p) for _p in list(Path(cls_dir).glob('*.jpg'))]
            if len(support_im_paths) == 0:
                raise Exception(f'support data not found in {cls_dir}')
            random.seed(epi_random_seed)  # fix the shots
            # random.sample raises ValueError when a class has fewer images than num_shot
            support_im_paths = random.sample(support_im_paths, k=self.testing_shot)
            self.support_pool[_label].extend(support_im_paths)


    def __getitem__(self, index):
        """Return ``(data, im_info, gt_boxes, num_boxes, supports, selected_ways)``.

        ``supports`` stacks ``testing_shot`` images for each of the ``num_way``
        classes listed in ``selected_ways``, in that order.
        """
        # testing
        index_ratio = index
        # though it is called minibatch, in fact it contains only one img here
        minibatch_db = [self._roidb[index_ratio]]

        # load query
        blobs = get_minibatch(minibatch_db)
        data = torch.from_numpy(blobs['data'])
        im_info = torch.from_numpy(blobs['im_info'])  # (H, W, scale)
        data_height, data_width = data.size(1), data.size(2)
        # move the channel axis forward and drop the leading axis
        # (assumes a single 3-channel image in the blob — TODO confirm)
        data = data.permute(0, 3, 1, 2).contiguous().view(3, data_height, data_width)
        im_info = im_info.view(3)
        gt_boxes = torch.from_numpy(blobs['gt_boxes'])
        num_boxes = gt_boxes.size(0)
        # distinct class ids (column 4 of gt_boxes) present in this image
        all_cls_in_im = []
        for i in range(num_boxes):
            _cls = int(gt_boxes[i, 4])
            all_cls_in_im.append(_cls)
        all_cls_in_im = list(set(all_cls_in_im))
        if len(all_cls_in_im) > self.num_way:
            # more classes in the image than ways: keep a seeded random subset
            random.seed(self.epi_random_seed)  # fix
            selected_ways = random.sample(all_cls_in_im, k=self.num_way)
        else:
            # fewer (or equal) classes than ways: pad with classes absent from the image
            other_cls = list(range(self._num_classes))
            other_cls.remove(0)
            # list.remove raises ValueError if a class id is not in other_cls (e.g. id 0)
            for _cls_ind in all_cls_in_im:
                other_cls.remove(_cls_ind)
            random.seed(self.epi_random_seed)  # fix
            random_neg_cls = random.sample(other_cls, k=(self.num_way - len(all_cls_in_im)))
            # same list object as all_cls_in_im; the classes of the image come first
            selected_ways = all_cls_in_im
            selected_ways.extend(random_neg_cls)

        # get supports
        # zero-initialised canvas: areas not covered by a resized image stay 0
        support_data_all = np.zeros((self.testing_shot * self.num_way, 3, self.support_im_size, self.support_im_size), dtype=np.float32)

        for n in range(self.num_way):
            selected_supports = self.support_pool[selected_ways[n]]

            for i, _path in enumerate(selected_supports):
                support_im = imread(_path)[:,:,::-1]  # rgb -> bgr
                target_size = np.min(support_im.shape[0:2])  # don't change the size
                support_im, _ = prep_im_for_blob(support_im, cfg.PIXEL_MEANS, target_size, cfg.TRAIN.MAX_SIZE)
                _h, _w = support_im.shape[0], support_im.shape[1]
                # scale the longer side to support_im_size, keeping the aspect ratio
                if _h > _w:
                    resize_scale = float(self.support_im_size) / float(_h)
                    unfit_size = int(_w * resize_scale)
                    support_im = cv2.resize(support_im, (unfit_size, self.support_im_size), interpolation=cv2.INTER_LINEAR)
                else:
                    resize_scale = float(self.support_im_size) / float(_w)
                    unfit_size = int(_h * resize_scale)
                    support_im = cv2.resize(support_im, (self.support_im_size, unfit_size), interpolation=cv2.INTER_LINEAR)
                h, w = support_im.shape[0], support_im.shape[1]
                # slot layout: way n occupies rows [testing_shot*n, testing_shot*(n+1))
                support_data_all[self.testing_shot*n+i, :, :h, :w] = np.transpose(support_im, (2, 0, 1))
        supports = torch.from_numpy(support_data_all)


        return data, im_info, gt_boxes, num_boxes, supports, selected_ways

    def __len__(self):
        """Number of roidb entries."""
        return len(self._roidb)
if __name__ == '__main__':
    # Few-shot inference entry point: load a checkpoint, run the detector over
    # every image of the validation imdb together with its support images, and
    # hand the collected detections to the imdb evaluator.

    args = parse_args()
    print(args)
    cfg_from_file(args.cfg_file)
    cfg_from_list(args.set_cfgs)

    # prepare roidb (loaded once; the original loaded the same roidb twice)
    cfg.TRAIN.USE_FLIPPED = False
    imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdbval_name, False)
    imdb.competition_mode(on=True)
    CWD = os.getcwd()
    support_dir = os.path.join(CWD, 'data/supports', args.sup_dir)

    # load dir
    input_dir = os.path.join(args.load_dir, "train/checkpoints")
    if not os.path.exists(input_dir):
        raise Exception('There is no input directory for loading network from ' + input_dir)
    load_name = os.path.join(input_dir,
                             'model_{}_{}.pth'.format(args.checkepoch, args.checkpoint))

    # initialize the network
    classes = ['fg', 'bg']
    model = get_model(args.net, pretrained=False, way=args.way, shot=args.shot, classes=classes)
    print("load checkpoint %s" % (load_name))
    checkpoint = torch.load(load_name)
    model.load_state_dict(checkpoint['model'])
    # The model built above is not wrapped in DataParallel, so unconditionally
    # reading `.module` would raise AttributeError; unwrap only a real wrapper.
    if args.mGPUs and isinstance(model, nn.DataParallel):
        model = model.module
    if 'pooling_mode' in checkpoint.keys():
        cfg.POOLING_MODE = checkpoint['pooling_mode']
    print('load model successfully!')
    cfg.CUDA = True
    model.cuda()
    model.eval()

    # initialize the tensor holders
    holders = prepare_var(support=True)
    im_data = holders[0]
    im_info = holders[1]
    num_boxes = holders[2]
    gt_boxes = holders[3]
    support_ims = holders[4]

    # prepare holder for predicted boxes: all_boxes[class][image] -> (N, 5) array
    thresh = 0.05
    num_images = len(imdb.image_index)
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(imdb.num_classes)]
    empty_array = np.transpose(np.array([[], [], [], [], []]), (1, 0))  # shape (0, 5)

    dataset = InferenceLoader(0, imdb, roidb, ratio_list, ratio_index, support_dir,
                              1, len(imdb._classes), num_shot=args.shot, training=False, normalize=False)

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0, pin_memory=True)
    data_iter = iter(dataloader)

    for i in tqdm(range(num_images)):
        data = next(data_iter)
        with torch.no_grad():
            im_data.resize_(data[0].size()).copy_(data[0])
            im_info.resize_(data[1].size()).copy_(data[1])
            gt_boxes.resize_(data[2].size()).copy_(data[2])
            num_boxes.resize_(data[3].size()).copy_(data[3])
            support_ims.resize_(data[4].size()).copy_(data[4])

        det_tic = time.time()
        with torch.no_grad():
            rois, cls_prob, bbox_pred, \
                rpn_loss_cls, rpn_loss_box, \
                RCNN_loss_cls, RCNN_loss_bbox, \
                rois_label = model(im_data, im_info, gt_boxes, num_boxes, support_ims)
        det_toc = time.time()
        detect_time = det_toc - det_tic
        misc_tic = time.time()

        scores = cls_prob.data
        boxes = rois.data[:, :, 1:5]

        # Apply bounding-box regression deltas
        box_deltas = bbox_pred.data
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Undo the precomputed mean/stdev normalisation of the targets
            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            box_deltas = box_deltas.view(1, -1, 4)

        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)

        # re-scale boxes to the original image scale
        pred_boxes /= data[1][0][2].item()

        scores = scores.squeeze()
        pred_boxes = pred_boxes.squeeze()

        # Only the class of the first ground-truth box receives detections;
        # every other class gets an empty result for this image.
        target_cls = gt_boxes[0, 0, 4]
        for j in range(1, imdb.num_classes):
            if j != target_cls:
                all_boxes[j][i] = empty_array
                continue
            inds = torch.nonzero(scores[:, 1] > thresh).view(-1)
            if inds.numel() > 0:
                cls_scores = scores[:, 1][inds]
                cls_boxes = pred_boxes[inds, :]
                cls_dets = NMS(cls_boxes, cls_scores)
                all_boxes[j][i] = cls_dets.cpu().numpy()
            else:
                all_boxes[j][i] = empty_array

        misc_toc = time.time()
        nms_time = misc_toc - misc_tic

        sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s   \r'
                         .format(i + 1, num_images, detect_time, nms_time))
        sys.stdout.flush()

    output_dir = os.path.join(CWD, 'inference_output', args.eval_dir)
    os.makedirs(output_dir, exist_ok=True)  # no exists()/makedirs() race
    det_file = os.path.join(output_dir, 'detections.pkl')
    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)
    print('Evaluating detections')
    imdb.evaluate_detections(all_boxes, output_dir)
def parse_rec(filename):
    """Parse a PASCAL VOC xml file.

    Returns a list with one dict per ``<object>`` element, holding the keys
    ``name``, ``pose``, ``truncated``, ``difficult`` and ``bbox``
    (``[xmin, ymin, xmax, ymax]`` as ints).

    The optional tags ``pose``, ``truncated`` and ``difficult`` default to
    ``'Unspecified'``, ``0`` and ``0`` when absent, so annotations that omit
    them no longer crash the parser.
    """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text,
            'pose': obj.findtext('pose', default='Unspecified'),
            'truncated': int(obj.findtext('truncated', default='0')),
            'difficult': int(obj.findtext('difficult', default='0')),
            'bbox': [int(bbox.find(tag).text)
                     for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
        })

    return objects


def voc_ap(rec, prec, use_07_metric=False):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true, uses the
    VOC 07 11 point method (default:False).
    """
    if use_07_metric:
        # 11 point metric: mean of the best precision at recall >= t
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap = ap + p / 11.
    else:
        # correct AP calculation
        # first append sentinel values at the end
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        # compute the precision envelope (make precision non-increasing)
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

        # to calculate area under PR curve, look for points
        # where X axis (recall) changes value
        i = np.where(mrec[1:] != mrec[:-1])[0]

        # and sum (\Delta recall) * prec
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap


def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                cachedir,
                                [ovthresh],
                                [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)

    Returns the cumulative recall and precision arrays (one entry per
    detection, in descending confidence order) and the average precision.
    """
    # first load gt
    os.makedirs(cachedir, exist_ok=True)  # also handles nested / existing dirs
    # NOTE(review): when imagesetfile is an absolute path, os.path.join drops
    # cachedir and the cache lands next to the imageset file. Left unchanged so
    # existing caches stay valid.
    cachefile = os.path.join(cachedir, '%s_annots.pkl' % imagesetfile)
    # read list of images, ignoring blank lines
    with open(imagesetfile, 'r') as f:
        imagenames = [x.strip() for x in f if x.strip()]

    if not os.path.isfile(cachefile):
        # load annotations
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                print('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
        # save
        print('Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f)
    else:
        # load
        # NOTE: pickle is only safe on trusted files; the cache is expected to
        # have been written by this function.
        with open(cachefile, 'rb') as f:
            try:
                recs = pickle.load(f)
            except UnicodeDecodeError:
                # Cache written by Python 2: rewind before retrying, otherwise
                # the second load starts in the middle of the stream.
                f.seek(0)
                recs = pickle.load(f, encoding='bytes')

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        # `np.bool` was removed in NumPy 1.24; the builtin is equivalent.
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        npos = npos + int(np.sum(~difficult))
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets: "<image id> <confidence> <xmin> <ymin> <xmax> <ymax>"
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        splitlines = [x.strip().split(' ') for x in f if x.strip()]

    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)

    if BB.shape[0] > 0:
        # sort by confidence (descending)
        sorted_ind = np.argsort(-confidence)
        BB = BB[sorted_ind, :]
        image_ids = [image_ids[x] for x in sorted_ind]

        # go down dets and mark TPs and FPs
        for d in range(nd):
            R = class_recs[image_ids[d]]
            bb = BB[d, :].astype(float)
            ovmax = -np.inf
            BBGT = R['bbox'].astype(float)

            if BBGT.size > 0:
                # compute overlaps
                # intersection
                ixmin = np.maximum(BBGT[:, 0], bb[0])
                iymin = np.maximum(BBGT[:, 1], bb[1])
                ixmax = np.minimum(BBGT[:, 2], bb[2])
                iymax = np.minimum(BBGT[:, 3], bb[3])
                iw = np.maximum(ixmax - ixmin + 1., 0.)
                ih = np.maximum(iymax - iymin + 1., 0.)
                inters = iw * ih

                # union
                uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                       (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                       (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

                overlaps = inters / uni
                ovmax = np.max(overlaps)
                jmax = np.argmax(overlaps)

            if ovmax > ovthresh:
                # Matches on "difficult" objects count neither as TP nor FP.
                if not R['difficult'][jmax]:
                    if not R['det'][jmax]:
                        tp[d] = 1.
                        R['det'][jmax] = True
                    else:
                        # duplicate detection of an already-matched object
                        fp[d] = 1.
            else:
                fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap
7 | Table of Contents 8 |
    9 |
  1. 10 | Introduction 11 |
  2. 12 |
  3. 13 | Getting Started 14 | 20 |
  4. 21 |
  5. Train
  6. 22 |
  7. Inference
  8. 23 |
  9. Acknowledgements
  10. 24 |
25 |
26 | 27 | 28 | ## Introduction 29 | 30 | While recent progress has significantly boosted few-shot classification (FSC) performance, few-shot object detection (FSOD) remains challenging for modern learning systems. 31 | Therefore, we propose the DAnA (Dual-awareness Attention) mechanism, which is adaptable to various existing object detection networks and enhances FSOD performance by paying adaptable attention to support images conditioned on the given query information. The proposed method achieves SOTA results on the COCO benchmark, outperforming the strongest baseline by 47% in performance.\ 32 | Paper link: https://arxiv.org/abs/2102.12152 33 | 34 |
35 |

36 | 38 | prediction 39 | 40 |

41 | 42 | 43 | ## Getting Started 44 | ### Prerequisites 45 | * Python 3.6 46 | * Cuda 10.0 or 10.1 47 | * Pytorch 1.2.0 or higher 48 | 49 | ### Data Preparation 50 | 1. First, clone the repository and create a data folder: 51 | ``` 52 | cd Dual-awareness-Attention-for-Few-shot-Object-Detection && mkdir data 53 | ``` 54 | 2. Download the COCO dataset. Please follow the instruction in [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn#beyond-the-demo-installation-for-training-and-testing-models). 55 | Create the symlinks to datasets. 56 | ``` 57 | $ cd data 58 | 59 | For VOC 2007 60 | $ ln -s [your-path-to]/VOC2007/VOCdevkit VOCdevkit2007 61 | 62 | For COCO 63 | $ ln -s [your-path-to]/coco coco 64 | ``` 65 | 66 | 3. The COCO dataset must be preprocessed to conform to the problem setting of FSOD. At training, we must remove the labels of novel instances in each query image. For testing, we should fix the target category of each query image to ensure the results are reproducible. For your convenience, we provide the preprocessed .json files of COCO for both training and testing. Users can process the COCO annotation to construct customized datasets for their research purposes as well. 67 | * 60 base classes for training (https://drive.google.com/file/d/10mXvdpgSjFYML_9J-zMDLPuBYrSrG2ub/view?usp=sharing) 68 | * 20 novel classes for testing (https://drive.google.com/file/d/1FZJhC-Ob-IXTKf5heNeNAN00V8OUJXi2/view?usp=sharing) 69 | To use them, simply put the folder into *COCO annotations*. 70 | ``` 71 | $ mv coco60_train [yout-path-to]/coco/annotations/coco60_train 72 | ``` 73 | For those who want to apply customized annotations, please refer to lib/datasets/factory.py and lib/datasets/coco_split.py. 74 | 75 | 4. At training, the support images are image patches randomly cropped from other query images according to box annotations. At testing, to ensure the results are reproducible, a set of support images of 80 categories should be constructed in advance. 
The support image set we used is available [here](https://drive.google.com/file/d/1nl9-DEpBBJ5w6hxVdijY6hFxoQdz8aso/view?usp=sharing). To use them: 76 | ``` 77 | Create the soft link of support imgs 78 | $ ln -s /your/path/to/supports supports 79 | ``` 80 | 5. Create the folder to save model weights 81 | ``` 82 | $ mkdir models 83 | ``` 84 | 85 | ### Pretrained Weights 86 | Please download the pretrained backbone models (e.g., res50, vgg16) and put them into data/pretrained_model, which can be found in [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn#beyond-the-demo-installation-for-training-and-testing-models). 87 | ``` 88 | $ mkdir data/pretrained_model && cd data/pretrained_model 89 | $ ln -s /your/path/to/res50.pth res50.pth 90 | ``` 91 | **NOTE**. We would suggest to use Caffe pretrained models to reproduce our results. 92 | **If you want to use pytorch pre-trained models, please remember to transpose images from BGR to RGB, and also use the same data transformer (minus mean and normalize) as used in pretrained model.** 93 | 94 | For those who would like to test the model only, the weights of DAnA can be download [here](https://drive.google.com/file/d/1JaYF-Ep-C6b5X01_e9tFRzFgRXMJQYQ7/view?usp=sharing). **NOTE**. The provided fine-tuned model weights "cisa_coco_ft30" was fine-tuned on 30-shot novel object classes without using BA block. Therefore, to use them, please set get_model(..., use_BA_block=False) at train.py. 95 | ``` 96 | $ cd models 97 | $ ln -s [your-path-to]/cisa_coco_ft30 cisa_coco_ft30 98 | ``` 99 | 100 | ### Installation 101 | Install the conda environment. 102 | ``` 103 | $ conda env create -f env.yml 104 | $ source activate [NAME_OF_THE_ENV] 105 | ``` 106 | Compile COCO API. 107 | ``` 108 | $ cd lib 109 | $ git clone https://github.com/pdollar/coco.git 110 | $ cd coco/PythonAPI 111 | $ make && make install 112 | put pycocotools under data/ 113 | $ mv cocoapi/PythonAPI/pycocotools . 
114 | ``` 115 | Compile the CUDA dependencies using the following commands. 116 | ``` 117 | $ cd lib 118 | $ python setup.py build develop 119 | ``` 120 | If you encounter errors during compilation, you may have forgotten to export the CUDA paths to your environment. 121 | 122 | ## Train 123 | 124 | 125 | ***To train from scratch*** 126 | ``` 127 | $ python train.py --dataset coco_base --flip --net DAnA --lr 0.001 --lr_decay_step 12 --bs 4 --epochs 16 --disp_interval 20 --save_dir models/DAnA --way 2 --shot 3 128 | ``` 129 | 130 | ***To resume*** 131 | ``` 132 | $ python train.py --dataset coco_base --flip --net DAnA --lr 0.001 --lr_decay_step 12 --bs 4 --epochs 16 --disp_interval 20 --save_dir models/DAnA --way 2 --shot 3 --r --load_dir models/DAnA --checkepoch 12 --checkpoint 4307 133 | ``` 134 | 135 | 144 | 145 | ## Inference 146 | ``` 147 | $ python inference.py --eval --dataset val2014_novel --net DAnA --r --load_dir models/DAnA_coco_ft30 --checkepoch 4 --checkpoint 299 --bs 1 --shot 3 --eval_dir dana 148 | ``` 149 | 150 | ## Attention Visualization 151 |
152 |

153 | 155 | attention_visualization 156 | 157 |

158 | 159 | ## Acknowledgements 160 | This work was supported in part by the Ministry of Science and Technology, Taiwan, under Grant MOST 110-2634-F-002-026. We benefit from NVIDIA DGX-1 AI Supercomputer and are grateful to the National Center for High-performance Computing. The code is mainly build on [faster-rcnn.pytorch](https://github.com/jwyang/faster-rcnn.pytorch/tree/pytorch-1.0). 161 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import argparse 5 | import time 6 | import random 7 | import cv2 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | from torch.autograd import Variable 12 | from tqdm import tqdm 13 | 14 | from roi_data_layer.roidb import combined_roidb 15 | from roi_data_layer.fs_loader import FewShotLoader, sampler 16 | from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir 17 | from model.utils.net_utils import weights_normal_init, save_net, load_net, \ 18 | adjust_learning_rate, save_checkpoint, clip_gradient 19 | from model.utils.fsod_logger import FSODLogger 20 | 21 | from utils import * 22 | 23 | 24 | if __name__ == '__main__': 25 | 26 | args = parse_args() 27 | print(args) 28 | 29 | cfg_from_file(args.cfg_file) 30 | cfg_from_list(args.set_cfgs) 31 | 32 | # make results determinable 33 | random_seed = 1996 34 | np.random.seed(random_seed) 35 | random.seed(random_seed) 36 | torch.manual_seed(random_seed) 37 | torch.cuda.manual_seed_all(random_seed) 38 | torch.backends.cudnn.deterministic = True 39 | torch.backends.cudnn.benchmark = False 40 | cfg.CUDA = True 41 | 42 | # prepare output dir 43 | output_dir = os.path.join(args.save_dir, "train/checkpoints") 44 | if not os.path.exists(output_dir): 45 | os.makedirs(output_dir) 46 | 47 | # prepare dataloader 48 | cfg.TRAIN.USE_FLIPPED = args.use_flip 49 | 
if __name__ == '__main__':
    # Few-shot training entry point: build the episodic data loader, the
    # model and the optimizer, optionally resume from a checkpoint, then train
    # and save one checkpoint per epoch.

    args = parse_args()
    print(args)

    cfg_from_file(args.cfg_file)
    cfg_from_list(args.set_cfgs)

    # make results deterministic
    random_seed = 1996
    np.random.seed(random_seed)
    random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    cfg.CUDA = True

    # prepare output dir
    output_dir = os.path.join(args.save_dir, "train/checkpoints")
    os.makedirs(output_dir, exist_ok=True)

    # prepare dataloader
    cfg.TRAIN.USE_FLIPPED = args.use_flip
    cfg.USE_GPU_NMS = True
    imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdb_name)

    dataset = FewShotLoader(roidb, ratio_list, ratio_index, args.batch_size,
                            imdb.num_classes, training=True, num_way=args.way, num_shot=args.shot)
    train_size = len(roidb)
    print('{:d} roidb entries'.format(len(roidb)))
    sampler_batch = sampler(train_size, args.batch_size)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                             sampler=sampler_batch, num_workers=args.num_workers)

    # initialize the tensor holders
    holders = prepare_var(support=True)
    im_data = holders[0]
    im_info = holders[1]
    num_boxes = holders[2]
    gt_boxes = holders[3]
    support_ims = holders[4]

    # initialize the network (pretrained backbone only for a fresh run)
    pre_weight = not args.resume
    classes = ['fg', 'bg']
    model = get_model(args.net, pretrained=pre_weight, way=args.way, shot=args.shot, classes=classes)
    model.cuda()

    # optimizer: bias parameters get their own learning rate / weight decay
    lr = args.lr
    params = []
    for key, value in dict(model.named_parameters()).items():
        if not value.requires_grad:
            continue
        if 'bias' in key:
            bias_decay = cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
            params += [{'params': [value], 'lr': lr * (cfg.TRAIN.DOUBLE_BIAS + 1),
                        'weight_decay': bias_decay}]
        else:
            params += [{'params': [value], 'lr': lr, 'weight_decay': cfg.TRAIN.WEIGHT_DECAY}]
    if args.optimizer == "adam":
        optimizer = torch.optim.Adam(params)
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM)
    else:
        # Fail here instead of with a NameError at the first optimizer use.
        raise ValueError('unsupported optimizer: {!r} (expected "adam" or "sgd")'.format(args.optimizer))

    # load checkpoints
    if args.resume:
        load_dir = os.path.join(args.load_dir, "train/checkpoints")
        load_name = os.path.join(load_dir, f'model_{args.checkepoch}_{args.checkpoint}.pth')
        checkpoint = torch.load(load_name)
        args.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr = optimizer.param_groups[0]['lr']
        if 'pooling_mode' in checkpoint.keys():
            cfg.POOLING_MODE = checkpoint['pooling_mode']
        print(f'loaded checkpoint: {load_name}')

    if args.mGPUs:
        model = nn.DataParallel(model)

    # initialize logger
    if not args.dlog:
        logger_save_dir = os.path.join(args.save_dir, "train")
        tb_logger = FSODLogger(logger_save_dir)

    # training
    iters_per_epoch = train_size // args.batch_size
    for epoch in range(args.start_epoch, args.max_epochs + 1):
        model.train()
        loss_temp = 0
        start_time = time.time()
        if epoch % (args.lr_decay_step + 1) == 0:
            adjust_learning_rate(optimizer, args.lr_decay_gamma)
            lr *= args.lr_decay_gamma
        data_iter = iter(dataloader)
        for step in range(iters_per_epoch):
            data = next(data_iter)
            with torch.no_grad():
                im_data.resize_(data[0].size()).copy_(data[0])
                im_info.resize_(data[1].size()).copy_(data[1])
                gt_boxes.resize_(data[2].size()).copy_(data[2])
                num_boxes.resize_(data[3].size()).copy_(data[3])
                support_ims.resize_(data[4].size()).copy_(data[4])

            rois, cls_prob, bbox_pred, \
                rpn_loss_cls, rpn_loss_box, \
                RCNN_loss_cls, RCNN_loss_bbox, \
                rois_label = model(im_data, im_info, gt_boxes, num_boxes, support_ims)

            loss = rpn_loss_cls.mean() + rpn_loss_box.mean() \
                + RCNN_loss_cls.mean() + RCNN_loss_bbox.mean()
            loss_temp += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % args.disp_interval == 0:
                end_time = time.time()
                if step > 0:
                    # Exactly `disp_interval` losses were accumulated since the
                    # last report; dividing by disp_interval + 1 under-reported
                    # the running average.
                    loss_temp /= args.disp_interval
                # .mean() reduces the per-GPU vector under DataParallel and is
                # a no-op on a scalar loss, so one code path serves both modes.
                loss_rpn_cls = rpn_loss_cls.mean().item()
                loss_rpn_box = rpn_loss_box.mean().item()
                loss_rcnn_cls = RCNN_loss_cls.mean().item()
                loss_rcnn_box = RCNN_loss_bbox.mean().item()
                fg_cnt = torch.sum(rois_label.data.ne(0))
                bg_cnt = rois_label.data.numel() - fg_cnt

                print("[epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e"
                      % (epoch, step, iters_per_epoch, loss_temp, lr))
                print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end_time - start_time))
                print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f"
                      % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))

                info = {
                    'loss': loss_temp,
                    'loss_rpn_cls': loss_rpn_cls,
                    'loss_rpn_box': loss_rpn_box,
                    'loss_rcnn_cls': loss_rcnn_cls,
                    'loss_rcnn_box': loss_rcnn_box
                }
                loss_temp = 0
                start_time = time.time()
                if not args.dlog:
                    tb_logger.write(epoch, info, save_im=args.imlog)

        # one checkpoint per epoch, named after the last step of the epoch
        save_name = os.path.join(output_dir, 'model_{}_{}.pth'.format(epoch, step))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.module.state_dict() if args.mGPUs else model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'pooling_mode': cfg.POOLING_MODE,
        }, save_name)
        print('save model: {}'.format(save_name))
top_data); 10 | 11 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 12 | const int height, const int width, const int channels, 13 | const int aligned_height, const int aligned_width, const float * bottom_rois, 14 | float* top_data); 15 | 16 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale, 17 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output) 18 | { 19 | //Grab the input tensor 20 | float * data_flat = THFloatTensor_data(features); 21 | float * rois_flat = THFloatTensor_data(rois); 22 | 23 | float * output_flat = THFloatTensor_data(output); 24 | 25 | // Number of ROIs 26 | int num_rois = THFloatTensor_size(rois, 0); 27 | int size_rois = THFloatTensor_size(rois, 1); 28 | if (size_rois != 5) 29 | { 30 | return 0; 31 | } 32 | 33 | // data height 34 | int data_height = THFloatTensor_size(features, 2); 35 | // data width 36 | int data_width = THFloatTensor_size(features, 3); 37 | // Number of channels 38 | int num_channels = THFloatTensor_size(features, 1); 39 | 40 | // do ROIAlignForward 41 | ROIAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width, num_channels, 42 | aligned_height, aligned_width, rois_flat, output_flat); 43 | 44 | return 1; 45 | } 46 | 47 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale, 48 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad) 49 | { 50 | //Grab the input tensor 51 | float * top_grad_flat = THFloatTensor_data(top_grad); 52 | float * rois_flat = THFloatTensor_data(rois); 53 | 54 | float * bottom_grad_flat = THFloatTensor_data(bottom_grad); 55 | 56 | // Number of ROIs 57 | int num_rois = THFloatTensor_size(rois, 0); 58 | int size_rois = THFloatTensor_size(rois, 1); 59 | if (size_rois != 5) 60 | { 61 | return 0; 62 | } 63 | 64 | // batch size 65 | // int batch_size = THFloatTensor_size(bottom_grad, 0); 66 | // data height 67 | int data_height = 
THFloatTensor_size(bottom_grad, 2); 68 | // data width 69 | int data_width = THFloatTensor_size(bottom_grad, 3); 70 | // Number of channels 71 | int num_channels = THFloatTensor_size(bottom_grad, 1); 72 | 73 | // do ROIAlignBackward 74 | ROIAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height, 75 | data_width, num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat); 76 | 77 | return 1; 78 | } 79 | 80 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 81 | const int height, const int width, const int channels, 82 | const int aligned_height, const int aligned_width, const float * bottom_rois, 83 | float* top_data) 84 | { 85 | const int output_size = num_rois * aligned_height * aligned_width * channels; 86 | 87 | int idx = 0; 88 | for (idx = 0; idx < output_size; ++idx) 89 | { 90 | // (n, c, ph, pw) is an element in the aligned output 91 | int pw = idx % aligned_width; 92 | int ph = (idx / aligned_width) % aligned_height; 93 | int c = (idx / aligned_width / aligned_height) % channels; 94 | int n = idx / aligned_width / aligned_height / channels; 95 | 96 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 97 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 98 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 99 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 100 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 101 | 102 | // Force malformed ROI to be 1x1 103 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 104 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 105 | float bin_size_h = roi_height / (aligned_height - 1.); 106 | float bin_size_w = roi_width / (aligned_width - 1.); 107 | 108 | float h = (float)(ph) * bin_size_h + roi_start_h; 109 | float w = (float)(pw) * bin_size_w + roi_start_w; 110 | 111 | int hstart = fminf(floor(h), height - 2); 112 | int wstart = fminf(floor(w), width - 2); 113 | 114 | int img_start = 
roi_batch_ind * channels * height * width; 115 | 116 | // bilinear interpolation 117 | if (h < 0 || h >= height || w < 0 || w >= width) 118 | { 119 | top_data[idx] = 0.; 120 | } 121 | else 122 | { 123 | float h_ratio = h - (float)(hstart); 124 | float w_ratio = w - (float)(wstart); 125 | int upleft = img_start + (c * height + hstart) * width + wstart; 126 | int upright = upleft + 1; 127 | int downleft = upleft + width; 128 | int downright = downleft + 1; 129 | 130 | top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 131 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 132 | + bottom_data[downleft] * h_ratio * (1. - w_ratio) 133 | + bottom_data[downright] * h_ratio * w_ratio; 134 | } 135 | } 136 | } 137 | 138 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 139 | const int height, const int width, const int channels, 140 | const int aligned_height, const int aligned_width, const float * bottom_rois, 141 | float* bottom_diff) 142 | { 143 | const int output_size = num_rois * aligned_height * aligned_width * channels; 144 | 145 | int idx = 0; 146 | for (idx = 0; idx < output_size; ++idx) 147 | { 148 | // (n, c, ph, pw) is an element in the aligned output 149 | int pw = idx % aligned_width; 150 | int ph = (idx / aligned_width) % aligned_height; 151 | int c = (idx / aligned_width / aligned_height) % channels; 152 | int n = idx / aligned_width / aligned_height / channels; 153 | 154 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 155 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 156 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 157 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 158 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 159 | 160 | // Force malformed ROI to be 1x1 161 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 162 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 163 | float bin_size_h = roi_height / 
(aligned_height - 1.); 164 | float bin_size_w = roi_width / (aligned_width - 1.); 165 | 166 | float h = (float)(ph) * bin_size_h + roi_start_h; 167 | float w = (float)(pw) * bin_size_w + roi_start_w; 168 | 169 | int hstart = fminf(floor(h), height - 2); 170 | int wstart = fminf(floor(w), width - 2); 171 | 172 | int img_start = roi_batch_ind * channels * height * width; 173 | 174 | // bilinear interpolation 175 | if (h < 0 || h >= height || w < 0 || w >= width) 176 | { 177 | float h_ratio = h - (float)(hstart); 178 | float w_ratio = w - (float)(wstart); 179 | int upleft = img_start + (c * height + hstart) * width + wstart; 180 | int upright = upleft + 1; 181 | int downleft = upleft + width; 182 | int downright = downleft + 1; 183 | 184 | bottom_diff[upleft] += top_diff[idx] * (1. - h_ratio) * (1. - w_ratio); 185 | bottom_diff[upright] += top_diff[idx] * (1. - h_ratio) * w_ratio; 186 | bottom_diff[downleft] += top_diff[idx] * h_ratio * (1. - w_ratio); 187 | bottom_diff[downright] += top_diff[idx] * h_ratio * w_ratio; 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /lib/model/roi_align/src/roi_align_kernel.cu: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include "roi_align_kernel.h" 9 | 10 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 12 | i += blockDim.x * gridDim.x) 13 | 14 | 15 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width, 16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) { 17 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 18 | // (n, c, ph, pw) is an element in the aligned output 19 | // int n = index; 20 | // int pw = n % aligned_width; 21 | // n /= 
aligned_width; 22 | // int ph = n % aligned_height; 23 | // n /= aligned_height; 24 | // int c = n % channels; 25 | // n /= channels; 26 | 27 | int pw = index % aligned_width; 28 | int ph = (index / aligned_width) % aligned_height; 29 | int c = (index / aligned_width / aligned_height) % channels; 30 | int n = index / aligned_width / aligned_height / channels; 31 | 32 | // bottom_rois += n * 5; 33 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 34 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 35 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 36 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 37 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 38 | 39 | // Force malformed ROIs to be 1x1 40 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 41 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 42 | float bin_size_h = roi_height / (aligned_height - 1.); 43 | float bin_size_w = roi_width / (aligned_width - 1.); 44 | 45 | float h = (float)(ph) * bin_size_h + roi_start_h; 46 | float w = (float)(pw) * bin_size_w + roi_start_w; 47 | 48 | int hstart = fminf(floor(h), height - 2); 49 | int wstart = fminf(floor(w), width - 2); 50 | 51 | int img_start = roi_batch_ind * channels * height * width; 52 | 53 | // bilinear interpolation 54 | if (h < 0 || h >= height || w < 0 || w >= width) { 55 | top_data[index] = 0.; 56 | } else { 57 | float h_ratio = h - (float)(hstart); 58 | float w_ratio = w - (float)(wstart); 59 | int upleft = img_start + (c * height + hstart) * width + wstart; 60 | int upright = upleft + 1; 61 | int downleft = upleft + width; 62 | int downright = downleft + 1; 63 | 64 | top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 65 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 66 | + bottom_data[downleft] * h_ratio * (1. 
- w_ratio) 67 | + bottom_data[downright] * h_ratio * w_ratio; 68 | } 69 | } 70 | } 71 | 72 | 73 | int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width, 74 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) { 75 | const int kThreadsPerBlock = 1024; 76 | const int output_size = num_rois * aligned_height * aligned_width * channels; 77 | cudaError_t err; 78 | 79 | 80 | ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 81 | output_size, bottom_data, spatial_scale, height, width, channels, 82 | aligned_height, aligned_width, bottom_rois, top_data); 83 | 84 | err = cudaGetLastError(); 85 | if(cudaSuccess != err) { 86 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 87 | exit( -1 ); 88 | } 89 | 90 | return 1; 91 | } 92 | 93 | 94 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width, 95 | const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) { 96 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 97 | 98 | // (n, c, ph, pw) is an element in the aligned output 99 | int pw = index % aligned_width; 100 | int ph = (index / aligned_width) % aligned_height; 101 | int c = (index / aligned_width / aligned_height) % channels; 102 | int n = index / aligned_width / aligned_height / channels; 103 | 104 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 105 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 106 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 107 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 108 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 109 | /* int roi_start_w = round(bottom_rois[1] * spatial_scale); */ 110 | /* int roi_start_h = 
round(bottom_rois[2] * spatial_scale); */ 111 | /* int roi_end_w = round(bottom_rois[3] * spatial_scale); */ 112 | /* int roi_end_h = round(bottom_rois[4] * spatial_scale); */ 113 | 114 | // Force malformed ROIs to be 1x1 115 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 116 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 117 | float bin_size_h = roi_height / (aligned_height - 1.); 118 | float bin_size_w = roi_width / (aligned_width - 1.); 119 | 120 | float h = (float)(ph) * bin_size_h + roi_start_h; 121 | float w = (float)(pw) * bin_size_w + roi_start_w; 122 | 123 | int hstart = fminf(floor(h), height - 2); 124 | int wstart = fminf(floor(w), width - 2); 125 | 126 | int img_start = roi_batch_ind * channels * height * width; 127 | 128 | // bilinear interpolation 129 | if (!(h < 0 || h >= height || w < 0 || w >= width)) { 130 | float h_ratio = h - (float)(hstart); 131 | float w_ratio = w - (float)(wstart); 132 | int upleft = img_start + (c * height + hstart) * width + wstart; 133 | int upright = upleft + 1; 134 | int downleft = upleft + width; 135 | int downright = downleft + 1; 136 | 137 | atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio)); 138 | atomicAdd(bottom_diff + upright, top_diff[index] * (1. 
- h_ratio) * w_ratio); 139 | atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio)); 140 | atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio); 141 | } 142 | } 143 | } 144 | 145 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width, 146 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) { 147 | const int kThreadsPerBlock = 1024; 148 | const int output_size = num_rois * aligned_height * aligned_width * channels; 149 | cudaError_t err; 150 | 151 | ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 152 | output_size, top_diff, spatial_scale, height, width, channels, 153 | aligned_height, aligned_width, bottom_diff, bottom_rois); 154 | 155 | err = cudaGetLastError(); 156 | if(cudaSuccess != err) { 157 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 158 | exit( -1 ); 159 | } 160 | 161 | return 1; 162 | } 163 | 164 | 165 | #ifdef __cplusplus 166 | } 167 | #endif 168 | --------------------------------------------------------------------------------