├── utils ├── __init__.py ├── net_utils.py ├── cpu_nms.pyx └── box_utils.py ├── hello.py ├── model ├── roi_align │ ├── __init__.py │ ├── _ext │ │ ├── __init__.py │ │ └── roi_align │ │ │ └── __init__.py │ ├── functions │ │ ├── __init__.py │ │ └── roi_align.py │ ├── modules │ │ ├── __init__.py │ │ └── roi_align.py │ ├── src │ │ ├── roi_align_kernel.cu.o │ │ ├── roi_align_cuda.h │ │ ├── roi_align_kernel.h │ │ ├── roi_align_cuda.c │ │ └── roi_align_kernel.cu │ ├── make.sh │ └── build.py ├── roi_pooling │ ├── __init__.py │ ├── _ext │ │ ├── __init__.py │ │ └── roi_pooling │ │ │ └── __init__.py │ ├── modules │ │ ├── __init__.py │ │ └── roi_pool.py │ ├── functions │ │ ├── __init__.py │ │ └── roi_pool.py │ ├── src │ │ ├── roi_pooling.cu.o │ │ ├── roi_pooling.h │ │ ├── roi_pooling_cuda.h │ │ ├── roi_pooling_kernel.h │ │ ├── roi_pooling_cuda.c │ │ ├── roi_pooling.c │ │ └── roi_pooling_kernel.cu │ └── build.py ├── __init__.py └── wsddn_vgg16.py ├── README.md ├── .gitignore ├── frcnn_eval ├── __init__.py ├── imdb.py ├── pascal_voc.py └── voc_eval.py ├── datasets ├── __init__.py ├── voc_loader.py └── wsddn_dataset.py ├── make.sh ├── LICENSE ├── setup.py ├── train.py └── eval.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hello.py: -------------------------------------------------------------------------------- 1 | from scipy.misc -------------------------------------------------------------------------------- /model/roi_align/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WSDDN.pytorch 2 | -------------------------------------------------------------------------------- /model/roi_align/_ext/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/roi_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/roi_align/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/roi_align/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/roi_pooling/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/roi_pooling/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/roi_pooling/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | data 3 | .idea 4 | .idea/* 5 | *.pyc 6 | *~ 7 | *.so 8 | -------------------------------------------------------------------------------- /model/roi_pooling/src/roi_pooling.cu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deneb2016/WSDDN.pytorch/HEAD/model/roi_pooling/src/roi_pooling.cu.o -------------------------------------------------------------------------------- /model/roi_align/src/roi_align_kernel.cu.o: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deneb2016/WSDDN.pytorch/HEAD/model/roi_align/src/roi_align_kernel.cu.o -------------------------------------------------------------------------------- /model/roi_pooling/src/roi_pooling.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); -------------------------------------------------------------------------------- /model/roi_align/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling my_lib kernels by nvcc..." 7 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 8 | 9 | cd ../ 10 | python build.py 11 | -------------------------------------------------------------------------------- /frcnn_eval/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # PyTorch WSDDN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Seungkwan Lee 5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang 6 | # -------------------------------------------------------- 
-------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # PyTorch WSDDN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Seungkwan Lee 5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang 6 | # -------------------------------------------------------- -------------------------------------------------------------------------------- /model/roi_align/src/roi_align_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /model/roi_pooling/src/roi_pooling_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax); 3 | 4 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax); -------------------------------------------------------------------------------- /model/roi_align/_ext/roi_align/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_align import lib as _lib, ffi as _ffi 4 | 5 | __all__ 
= [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /model/roi_pooling/_ext/roi_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_pooling import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /model/roi_pooling/modules/roi_pool.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from ..functions.roi_pool import RoIPoolFunction 3 | 4 | 5 | class _RoIPooling(Module): 6 | def __init__(self, pooled_height, pooled_width, spatial_scale): 7 | super(_RoIPooling, self).__init__() 8 | 9 | self.pooled_width = int(pooled_width) 10 | self.pooled_height = int(pooled_height) 11 | self.spatial_scale = float(spatial_scale) 12 | 13 | def forward(self, features, rois): 14 | return RoIPoolFunction(self.pooled_height, self.pooled_width, self.spatial_scale)(features, rois) 15 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/ 4 | export CXXFLAGS="-std=c++11" 5 | export CFLAGS="-std=c99" 6 | export PATH=$CUDA_PATH/bin:$PATH 7 | 8 | python setup.py build_ext --inplace 9 | rm -rf 
build 10 | 11 | 12 | CUDA_ARCH="-gencode arch=compute_52,code=sm_52 -arch=sm_52" 13 | 14 | 15 | # compile roi_pooling 16 | cd model/roi_pooling/src 17 | echo "Compiling roi pooling kernels by nvcc..." 18 | nvcc -c -o roi_pooling.cu.o roi_pooling_kernel.cu \ 19 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC $CUDA_ARCH 20 | cd ../ 21 | python build.py 22 | 23 | # compile roi_align 24 | cd ../../ 25 | cd model/roi_align/src 26 | echo "Compiling roi align kernels by nvcc..." 27 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu \ 28 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC $CUDA_ARCH 29 | cd ../ 30 | python build.py 31 | -------------------------------------------------------------------------------- /model/roi_pooling/src/roi_pooling_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _ROI_POOLING_KERNEL 2 | #define _ROI_POOLING_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | int ROIPoolForwardLaucher( 9 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 10 | const int width, const int channels, const int pooled_height, 11 | const int pooled_width, const float* bottom_rois, 12 | float* top_data, int* argmax_data, cudaStream_t stream); 13 | 14 | 15 | int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 16 | const int height, const int width, const int channels, const int pooled_height, 17 | const int pooled_width, const float* bottom_rois, 18 | float* bottom_diff, const int* argmax_data, cudaStream_t stream); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | 26 | -------------------------------------------------------------------------------- /model/roi_pooling/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 
| 6 | 7 | sources = ['src/roi_pooling.c'] 8 | headers = ['src/roi_pooling.h'] 9 | defines = [] 10 | with_cuda = False 11 | 12 | if torch.cuda.is_available(): 13 | print('Including CUDA code.') 14 | sources += ['src/roi_pooling_cuda.c'] 15 | headers += ['src/roi_pooling_cuda.h'] 16 | defines += [('WITH_CUDA', None)] 17 | with_cuda = True 18 | 19 | this_file = os.path.dirname(os.path.realpath(__file__)) 20 | print(this_file) 21 | extra_objects = ['src/roi_pooling.cu.o'] 22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 23 | 24 | ffi = create_extension( 25 | '_ext.roi_pooling', 26 | headers=headers, 27 | sources=sources, 28 | define_macros=defines, 29 | relative_to=__file__, 30 | with_cuda=with_cuda, 31 | extra_objects=extra_objects 32 | ) 33 | 34 | if __name__ == '__main__': 35 | ffi.build() 36 | -------------------------------------------------------------------------------- /model/roi_align/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 | 6 | # sources = ['src/roi_align.c'] 7 | # headers = ['src/roi_align.h'] 8 | sources = [] 9 | headers = [] 10 | defines = [] 11 | with_cuda = False 12 | 13 | if torch.cuda.is_available(): 14 | print('Including CUDA code.') 15 | sources += ['src/roi_align_cuda.c'] 16 | headers += ['src/roi_align_cuda.h'] 17 | defines += [('WITH_CUDA', None)] 18 | with_cuda = True 19 | 20 | this_file = os.path.dirname(os.path.realpath(__file__)) 21 | print(this_file) 22 | extra_objects = ['src/roi_align_kernel.cu.o'] 23 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 24 | 25 | ffi = create_extension( 26 | '_ext.roi_align', 27 | headers=headers, 28 | sources=sources, 29 | define_macros=defines, 30 | relative_to=__file__, 31 | with_cuda=with_cuda, 32 | extra_objects=extra_objects 33 | ) 34 | 35 | if __name__ == '__main__': 36 | 
ffi.build() 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Seungkwan Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/net_utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # PyTorch WSDDN 3 | # Copyright 2018. 
def clip_gradient(model, clip_norm):
    """Rescale the gradients of ``model`` in place to a bounded global L2 norm.

    The global norm is the square root of the sum of the squared L2 norms of
    every gradient that belongs to a parameter with ``requires_grad`` set and
    a gradient present.  When that norm exceeds ``clip_norm`` every such
    gradient is multiplied by ``clip_norm / global_norm``; otherwise the
    gradients are left untouched.

    Args:
        model: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        clip_norm: upper bound for the global gradient norm.

    Returns:
        None.  Gradients are modified in place.
    """
    # Collect the gradients once so the parameter list is not walked twice.
    grads = [p.grad for p in model.parameters()
             if p.requires_grad and p.grad is not None]
    if not grads:
        return

    totalnorm = 0
    for grad in grads:
        modulenorm = grad.detach().norm().item()
        totalnorm = totalnorm + modulenorm ** 2
    totalnorm = float(np.sqrt(totalnorm))

    # Nothing to do when the norm is already within the bound.  Returning
    # here also avoids the 0 / 0 division (NaN scale) that used to poison
    # the gradients when both the norm and clip_norm were zero.
    if totalnorm <= clip_norm:
        return

    norm = clip_norm / totalnorm
    for grad in grads:
        grad.mul_(norm)


def adjust_learning_rate(optimizer, decay=0.1):
    """Multiply the learning rate of every parameter group by ``decay``.

    Args:
        optimizer: object exposing ``param_groups``, a sequence of dicts that
            each hold an ``'lr'`` entry (e.g. a ``torch.optim.Optimizer``).
        decay: multiplicative factor applied to each learning rate.
    """
    for param_group in optimizer.param_groups:
        param_group['lr'] = decay * param_group['lr']


def save_checkpoint(state, filename):
    """Serialize ``state`` to ``filename`` with ``torch.save``."""
    torch.save(state, filename)
const float spatial_scale, const int height, const int width, 21 | const int channels, const int aligned_height, const int aligned_width, 22 | float* bottom_diff, const float* bottom_rois); 23 | 24 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 25 | const int height, const int width, const int channels, const int aligned_height, 26 | const int aligned_width, const float* bottom_rois, 27 | float* bottom_diff, cudaStream_t stream); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /model/roi_align/modules/roi_align.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from torch.nn.functional import avg_pool2d, max_pool2d 3 | from ..functions.roi_align import RoIAlignFunction 4 | 5 | 6 | class RoIAlign(Module): 7 | def __init__(self, aligned_height, aligned_width, spatial_scale): 8 | super(RoIAlign, self).__init__() 9 | 10 | self.aligned_width = int(aligned_width) 11 | self.aligned_height = int(aligned_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | return RoIAlignFunction(self.aligned_height, self.aligned_width, 16 | self.spatial_scale)(features, rois) 17 | 18 | class RoIAlignAvg(Module): 19 | def __init__(self, aligned_height, aligned_width, spatial_scale): 20 | super(RoIAlignAvg, self).__init__() 21 | 22 | self.aligned_width = int(aligned_width) 23 | self.aligned_height = int(aligned_height) 24 | self.spatial_scale = float(spatial_scale) 25 | 26 | def forward(self, features, rois): 27 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 28 | self.spatial_scale)(features, rois) 29 | return avg_pool2d(x, kernel_size=2, stride=1) 30 | 31 | class RoIAlignMax(Module): 32 | def __init__(self, aligned_height, aligned_width, spatial_scale): 33 
| super(RoIAlignMax, self).__init__() 34 | 35 | self.aligned_width = int(aligned_width) 36 | self.aligned_height = int(aligned_height) 37 | self.spatial_scale = float(spatial_scale) 38 | 39 | def forward(self, features, rois): 40 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 41 | self.spatial_scale)(features, rois) 42 | return max_pool2d(x, kernel_size=2, stride=1) 43 | -------------------------------------------------------------------------------- /model/roi_pooling/functions/roi_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from .._ext import roi_pooling 4 | import pdb 5 | 6 | class RoIPoolFunction(Function): 7 | def __init__(ctx, pooled_height, pooled_width, spatial_scale): 8 | ctx.pooled_width = pooled_width 9 | ctx.pooled_height = pooled_height 10 | ctx.spatial_scale = spatial_scale 11 | ctx.feature_size = None 12 | 13 | def forward(ctx, features, rois): 14 | ctx.feature_size = features.size() 15 | batch_size, num_channels, data_height, data_width = ctx.feature_size 16 | num_rois = rois.size(0) 17 | output = features.new(num_rois, num_channels, ctx.pooled_height, ctx.pooled_width).zero_() 18 | ctx.argmax = features.new(num_rois, num_channels, ctx.pooled_height, ctx.pooled_width).zero_().int() 19 | ctx.rois = rois 20 | if not features.is_cuda: 21 | _features = features.permute(0, 2, 3, 1) 22 | roi_pooling.roi_pooling_forward(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, 23 | _features, rois, output) 24 | else: 25 | roi_pooling.roi_pooling_forward_cuda(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, 26 | features, rois, output, ctx.argmax) 27 | 28 | return output 29 | 30 | def backward(ctx, grad_output): 31 | assert(ctx.feature_size is not None and grad_output.is_cuda) 32 | batch_size, num_channels, data_height, data_width = ctx.feature_size 33 | grad_input = grad_output.new(batch_size, num_channels, data_height, 
# TODO use save_for_backward instead
class RoIAlignFunction(Function):
    """Autograd function wrapping the compiled RoI-align CUDA extension.

    An instance is configured with the output grid size and the spatial
    scale, then called as ``RoIAlignFunction(h, w, scale)(features, rois)``.
    The rois and the feature size seen in ``forward`` are kept on the
    instance so that ``backward`` can use them.
    """

    def __init__(self, aligned_height, aligned_width, spatial_scale):
        self.aligned_width = int(aligned_width)
        self.aligned_height = int(aligned_height)
        self.spatial_scale = float(spatial_scale)
        self.rois = None
        self.feature_size = None

    def forward(self, features, rois):
        """Pool ``features`` over ``rois`` into a fixed-size output grid.

        Args:
            features: 4-D CUDA tensor, unpacked below as
                (batch, channels, height, width).
            rois: 2-D tensor with 5 columns; the extension rejects any
                other column count.

        Returns:
            Tensor of shape (num_rois, channels, aligned_height,
            aligned_width).

        Raises:
            NotImplementedError: if ``features`` is not a CUDA tensor.
            ValueError: if ``rois`` does not have shape (num_rois, 5).
        """
        if not features.is_cuda:
            raise NotImplementedError

        # The C entry point returns 0 without launching the kernel when the
        # rois do not have 5 columns, which would leave the output silently
        # zero-filled.  Fail loudly instead.
        if rois.dim() != 2 or rois.size(1) != 5:
            raise ValueError(
                'rois must have shape (num_rois, 5), got %s'
                % (tuple(rois.size()),))

        self.rois = rois
        self.feature_size = features.size()

        batch_size, num_channels, data_height, data_width = features.size()
        num_rois = rois.size(0)

        output = features.new(num_rois, num_channels,
                              self.aligned_height, self.aligned_width).zero_()
        roi_align.roi_align_forward_cuda(self.aligned_height,
                                         self.aligned_width,
                                         self.spatial_scale, features,
                                         rois, output)
        return output

    def backward(self, grad_output):
        """Propagate ``grad_output`` back to the feature map.

        Returns:
            ``(grad_input, None)``: the gradient with the shape recorded in
            ``forward`` and no gradient for the rois.

        Raises:
            RuntimeError: if ``forward`` has not been run or ``grad_output``
                is not a CUDA tensor.
        """
        # Explicit checks instead of ``assert`` so they survive ``python -O``.
        if self.feature_size is None:
            raise RuntimeError('backward called before forward')
        if not grad_output.is_cuda:
            raise RuntimeError('RoIAlign backward requires a CUDA gradient')

        batch_size, num_channels, data_height, data_width = self.feature_size

        grad_input = self.rois.new(batch_size, num_channels, data_height,
                                   data_width).zero_()
        roi_align.roi_align_backward_cuda(self.aligned_height,
                                          self.aligned_width,
                                          self.spatial_scale, grad_output,
                                          self.rois, grad_input)

        return grad_input, None
class imdb(object):
    """Image database.

    Base class holding a dataset name, its class list and its image index.
    Subclasses are expected to implement ``image_path_at``,
    ``default_roidb`` and ``evaluate_detections``.
    """

    def __init__(self, name):
        self._name = name
        self._num_classes = 0
        self._classes = []
        self._image_index = []
        # Use this dict for storing dataset specific config options
        self.config = {}

    @property
    def name(self):
        """Name given to the database at construction time."""
        return self._name

    @property
    def num_classes(self):
        """Number of entries in ``classes``."""
        return len(self._classes)

    @property
    def classes(self):
        """List of class names."""
        return self._classes

    @property
    def image_index(self):
        """List of image identifiers."""
        return self._image_index

    @property
    def cache_path(self):
        """Absolute path of the cache directory, created on first access.

        The directory is ``<data_dir>/cache``.  ``data_dir`` is taken from
        ``self.config['data_dir']`` when that key is set; otherwise it
        falls back to a ``data`` directory next to the package that holds
        this module.  The previous implementation read ``cfg.DATA_DIR``,
        but no ``cfg`` is imported in this module, so the property always
        raised ``NameError``.
        NOTE(review): the fallback assumes the usual repository layout
        (``<root>/data``) -- confirm against the dataset loaders.
        """
        data_dir = self.config.get('data_dir')
        if data_dir is None:
            data_dir = osp.join(osp.dirname(osp.abspath(__file__)),
                                os.pardir, 'data')
        cache_path = osp.abspath(osp.join(data_dir, 'cache'))
        # Create-then-check instead of check-then-create: tolerates another
        # process creating the directory at the same time.
        try:
            os.makedirs(cache_path)
        except OSError:
            if not osp.isdir(cache_path):
                raise
        return cache_path

    @property
    def num_images(self):
        """Number of entries in ``image_index``."""
        return len(self.image_index)

    def image_path_at(self, i):
        """Return the path of image ``i``; must be provided by subclasses."""
        raise NotImplementedError

    def default_roidb(self):
        """Return the default roidb; must be provided by subclasses."""
        raise NotImplementedError

    def evaluate_detections(self, all_boxes, output_dir=None):
        """
        all_boxes is a list of length number-of-classes.
        Each list element is a list of length number-of-images.
        Each of those list elements is either an empty list []
        or a numpy array of detection.

        all_boxes[class][image] = [] or np.array of shape #dets x 5
        """
        raise NotImplementedError

    def competition_mode(self, on):
        """Turn competition mode on or off."""
        pass
for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep -------------------------------------------------------------------------------- /model/roi_align/src/roi_align_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "roi_align_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 16 | // Number of ROIs 17 | int num_rois = THCudaTensor_size(state, rois, 0); 18 | int size_rois = THCudaTensor_size(state, rois, 1); 19 | if (size_rois != 5) 20 | { 21 | return 0; 22 | } 23 | 24 | // data height 25 | int data_height = THCudaTensor_size(state, features, 2); 26 | // data width 27 | int data_width = THCudaTensor_size(state, features, 3); 28 | // Number of channels 29 | int num_channels = THCudaTensor_size(state, features, 1); 30 | 31 | cudaStream_t stream = THCState_getCurrentStream(state); 32 | 33 | ROIAlignForwardLaucher( 34 | data_flat, spatial_scale, num_rois, data_height, 35 | data_width, num_channels, aligned_height, 36 | aligned_width, rois_flat, 37 | output_flat, stream); 38 | 39 | 
return 1; 40 | } 41 | 42 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad) 44 | { 45 | // Grab the input tensor 46 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 47 | float * rois_flat = THCudaTensor_data(state, rois); 48 | 49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois = THCudaTensor_size(state, rois, 1); 54 | if (size_rois != 5) 55 | { 56 | return 0; 57 | } 58 | 59 | // batch size 60 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 61 | // data height 62 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 63 | // data width 64 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 65 | // Number of channels 66 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 67 | 68 | cudaStream_t stream = THCState_getCurrentStream(state); 69 | ROIAlignBackwardLaucher( 70 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 71 | data_width, num_channels, aligned_height, 72 | aligned_width, rois_flat, 73 | bottom_grad_flat, stream); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /model/roi_pooling/src/roi_pooling_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "roi_pooling_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 
int * argmax_flat = THCudaIntTensor_data(state, argmax); 16 | 17 | // Number of ROIs 18 | int num_rois = THCudaTensor_size(state, rois, 0); 19 | int size_rois = THCudaTensor_size(state, rois, 1); 20 | if (size_rois != 5) 21 | { 22 | return 0; 23 | } 24 | 25 | // batch size 26 | // int batch_size = THCudaTensor_size(state, features, 0); 27 | // if (batch_size != 1) 28 | // { 29 | // return 0; 30 | // } 31 | // data height 32 | int data_height = THCudaTensor_size(state, features, 2); 33 | // data width 34 | int data_width = THCudaTensor_size(state, features, 3); 35 | // Number of channels 36 | int num_channels = THCudaTensor_size(state, features, 1); 37 | 38 | cudaStream_t stream = THCState_getCurrentStream(state); 39 | 40 | ROIPoolForwardLaucher( 41 | data_flat, spatial_scale, num_rois, data_height, 42 | data_width, num_channels, pooled_height, 43 | pooled_width, rois_flat, 44 | output_flat, argmax_flat, stream); 45 | 46 | return 1; 47 | } 48 | 49 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale, 50 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax) 51 | { 52 | // Grab the input tensor 53 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 54 | float * rois_flat = THCudaTensor_data(state, rois); 55 | 56 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 57 | int * argmax_flat = THCudaIntTensor_data(state, argmax); 58 | 59 | // Number of ROIs 60 | int num_rois = THCudaTensor_size(state, rois, 0); 61 | int size_rois = THCudaTensor_size(state, rois, 1); 62 | if (size_rois != 5) 63 | { 64 | return 0; 65 | } 66 | 67 | // batch size 68 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 69 | // if (batch_size != 1) 70 | // { 71 | // return 0; 72 | // } 73 | // data height 74 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 75 | // data width 76 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 77 | // Number of channels 78 
def _box_areas(boxes):
    """Return the (n,) areas of (xmin, ymin, xmax, ymax) boxes.

    Coordinates are treated as inclusive pixel indices, hence the ``+ 1``
    on each side length.
    """
    return (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)


def element_wise_iou(boxes_a, boxes_b):
    """
    Compute the IoU of boxes paired row by row.
    :param boxes_a: (n, 4) tensor, (xmin, ymin, xmax, ymax) form.
    :param boxes_b: (n, 4) tensor, (xmin, ymin, xmax, ymax) form.
    :return: (n) tensor, IoU of boxes_a[i] and boxes_b[i]
    """
    upper = torch.min(boxes_a[:, 2:], boxes_b[:, 2:])
    lower = torch.max(boxes_a[:, :2], boxes_b[:, :2])
    # disjoint boxes give a negative extent, which is clamped to zero
    inter_wh = torch.clamp(upper - lower + 1, min=0)
    inter = inter_wh[:, 0] * inter_wh[:, 1]
    union = _box_areas(boxes_a) + _box_areas(boxes_b) - inter
    return inter / union


def all_pair_iou(boxes_a, boxes_b):
    """
    Compute the IoU of every pair drawn from two box sets.
    :param boxes_a: (n, 4) tensor, (xmin, ymin, xmax, ymax) form.
    :param boxes_b: (m, 4) tensor, (xmin, ymin, xmax, ymax) form.
    :return: (n, m) tensor, entry [i, j] is the IoU of boxes_a[i] and boxes_b[j]
    """
    # (n, 1, 2) against (1, m, 2) broadcasts to (n, m, 2)
    upper = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])
    lower = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])
    inter_wh = torch.clamp(upper - lower + 1, min=0)
    inter = inter_wh[:, :, 0] * inter_wh[:, :, 1]
    union = _box_areas(boxes_a)[:, None] + _box_areas(boxes_b)[None, :] - inter
    return inter / union


def transform(boxes, transform_param):
    """
    Apply per-box offsets to center/size boxes.
    The center is shifted by a fraction of the box size and the size is
    scaled by the exponential of the last two parameters.
    :param boxes: (n, 4) tensor, (cx, cy, w, h) form.
    :param transform_param: (n, 4) tensor, (dx, dy, dw, dh) per box.
    :return: (n, 4) transformed boxes, (cx, cy, w, h) form.
    """
    cx = boxes[:, 0] + transform_param[:, 0] * boxes[:, 2]
    cy = boxes[:, 1] + transform_param[:, 1] * boxes[:, 3]
    w = boxes[:, 2] * torch.exp(transform_param[:, 2])
    h = boxes[:, 3] * torch.exp(transform_param[:, 3])
    return torch.stack([cx, cy, w, h], 1)


def to_cwh_form(boxes):
    """
    Convert corner boxes to center/size boxes.
    :param boxes: (n, 4) tensor, (xmin, ymin, xmax, ymax) form.
    :return: (n, 4) tensor, (cx, cy, w, h) form
    """
    cx = (boxes[:, 0] + boxes[:, 2]) / 2
    cy = (boxes[:, 1] + boxes[:, 3]) / 2
    w = boxes[:, 2] - boxes[:, 0] + 1
    h = boxes[:, 3] - boxes[:, 1] + 1
    return torch.stack([cx, cy, w, h], 1)


def to_minmax_form(boxes):
    """
    Convert center/size boxes to corner boxes (inverse of to_cwh_form).
    :param boxes: (n, 4) tensor, (cx, cy, w, h) form.
    :return: (n, 4) tensor, (xmin, ymin, xmax, ymax) form
    """
    half_w = boxes[:, 2] / 2
    half_h = boxes[:, 3] / 2
    # the 0.5 terms undo the "+ 1" inclusive-pixel convention of to_cwh_form
    xmin = boxes[:, 0] - half_w + 0.5
    ymin = boxes[:, 1] - half_h + 0.5
    xmax = boxes[:, 0] + half_w - 0.5
    ymax = boxes[:, 1] + half_h - 0.5
    return torch.stack([xmin, ymin, xmax, ymax], 1)
def find_in_path(name, path):
    """Find a file in a search path.

    :param name: file name to look for.
    :param path: directories separated by ``os.pathsep``.
    :return: absolute path of the first match, or None when nothing matches.
    """
    # Adapted from
    # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
    for directory in path.split(os.pathsep):
        candidate = os.path.join(directory, name)
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None


def _locate_nvcc():
    """Return the nvcc executable to use for ``.cu`` sources.

    Lookup order: a module-level ``CUDA`` dict with an ``'nvcc'`` entry (if
    one is defined), ``$CUDAHOME/bin/nvcc``, then every directory on
    ``$PATH``. Raises EnvironmentError when none of them yields a file.
    """
    configured = globals().get('CUDA')
    if isinstance(configured, dict) and configured.get('nvcc'):
        return configured['nvcc']

    home = os.environ.get('CUDAHOME')
    if home:
        candidate = os.path.join(home, 'bin', 'nvcc')
        if os.path.exists(candidate):
            return os.path.abspath(candidate)

    found = find_in_path('nvcc', os.environ.get('PATH', ''))
    if found is None:
        raise EnvironmentError('The nvcc binary could not be located in your $PATH. '
                               'Either add it to your path, or set $CUDAHOME')
    return found


def customize_compiler_for_nvcc(self):
    """Patch a distutils compiler instance so ``.cu`` files go through nvcc.

    The compiler's ``_compile`` method is replaced by a wrapper that swaps
    ``compiler_so`` to nvcc for ``.cu`` sources and then delegates to the
    original method. ``extra_postargs`` may be a dict with ``'gcc'`` and
    ``'nvcc'`` keys (the matching list is forwarded) or a plain list, which
    is forwarded unchanged.

    :param self: the compiler object to patch in place (not a class instance
        of this module; the name is kept for interface compatibility).
    """
    # tell the compiler it can process .cu files
    self.src_extensions.append('.cu')

    # keep the defaults so they can be restored after every compile call
    default_compiler_so = self.compiler_so
    original_compile = self._compile

    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
        is_cuda = os.path.splitext(src)[1] == '.cu'
        if is_cuda:
            # nvcc is resolved lazily so pure C/Cython builds never need CUDA
            self.set_executable('compiler_so', _locate_nvcc())
        if isinstance(extra_postargs, dict):
            postargs = extra_postargs['nvcc' if is_cuda else 'gcc']
        else:
            postargs = extra_postargs

        try:
            original_compile(obj, src, ext, cc_args, postargs, pp_opts)
        finally:
            # undo the nvcc switch even when the compile step fails
            self.compiler_so = default_compiler_so

    # inject the wrapper into this compiler instance
    self._compile = _compile
VOC_CLASSES = [
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor']


class VOCLoader:
    """In-memory index of one PASCAL VOC image set plus its box proposals.

    After construction ``self.items`` holds one dict per image with the keys
    ``id``, ``boxes`` (float32, 0-based xmin/ymin/xmax/ymax), ``categories``
    (int64 indices into VOC_CLASSES), ``img_path``, ``proposals`` and
    ``prop_scores``.
    """

    def __init__(self, root, prop_method, min_prop_scale, year, name):
        """
        :param root: data directory containing ``proposals/`` and ``VOCdevkit2007/``.
        :param prop_method: 'eb' (edge boxes) or 'ss' (selective search).
        :param min_prop_scale: a proposal is kept only when both of its side
            extents (max - min) are at least this large.
        :param year: VOC year string, e.g. '2007'.
        :param name: image-set name, e.g. 'trainval' or 'test'.
        :raises ValueError: when ``prop_method`` is neither 'eb' nor 'ss'.
        """
        if prop_method not in ('eb', 'ss'):
            # previously this surfaced later as a KeyError on the first image
            raise ValueError('Undefined proposal method %s' % prop_method)

        self.items = []
        self.name_to_index = dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        print('VOC %s %s dataset loading...' % (year, name))

        proposals = {}
        prop_scores = {}
        if prop_method == 'eb':
            raw_data = loadmat(os.path.join(root, 'proposals', 'edge_boxes_voc_%s_%s.mat' % (year, name)))
            for i in range(len(raw_data['images'][0])):
                image_id = raw_data['images'][0][i][0]
                scores = raw_data['boxScores'][0][i][:, 0]
                proposals[image_id], prop_scores[image_id] = self._filter_proposals(
                    raw_data['boxes'][0][i], scores, min_prop_scale)
        else:
            raw_data = loadmat(os.path.join(root, 'proposals', 'selective_search_voc_%s_%s.mat' % (year, name)))
            # NOTE(review): images are indexed [i][0][0] here but boxes [0][i];
            # kept exactly as in the original -- confirm against the .mat layout.
            for i in range(len(raw_data['images'])):
                image_id = raw_data['images'][i][0][0]
                raw_boxes = raw_data['boxes'][0][i]
                # this branch reads no scores from the file, so every proposal scores 0
                scores = np.zeros(len(raw_boxes))
                proposals[image_id], prop_scores[image_id] = self._filter_proposals(
                    raw_boxes, scores, min_prop_scale)

        # NOTE(review): the devkit folder name is fixed to 'VOCdevkit2007'
        # regardless of `year`; kept as in the original.
        rootpath = os.path.join(root, 'VOCdevkit2007', 'VOC' + year)
        with open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')) as image_set:
            image_ids = [line.strip() for line in image_set if line.strip()]

        for image_id in image_ids:
            target = ET.parse(os.path.join(rootpath, 'Annotations', image_id + '.xml'))

            box_set = []
            category_set = []
            for obj in target.iter('object'):
                cls_name = obj.find('name').text.strip().lower()
                bbox = obj.find('bndbox')
                # annotation coordinates are shifted by one to become 0-based
                corners = [int(bbox.find(tag).text) - 1 for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
                box_set.append(corners)
                category_set.append(self.name_to_index[cls_name])

            self.items.append({
                'id': image_id,
                # reshape keeps a (0, 4) array for images without objects
                'boxes': np.array(box_set, np.float32).reshape(-1, 4),
                'categories': np.array(category_set, np.int64),
                'img_path': os.path.join(rootpath, 'JPEGImages', image_id + '.jpg'),
                'proposals': proposals[image_id],
                'prop_scores': prop_scores[image_id],
            })

        print('VOC %s %s dataset loading complete' % (year, name))

    @staticmethod
    def _filter_proposals(raw_boxes, scores, min_prop_scale):
        """Shift boxes to 0-based, drop small ones and swap the column pairs.

        Columns (0, 1, 2, 3) of the raw boxes are emitted as (1, 0, 3, 2).
        Returns the filtered ``(boxes, scores)`` pair.
        """
        boxes = raw_boxes.astype(np.float64) - 1
        is_good = ((boxes[:, 2] >= boxes[:, 0] + min_prop_scale)
                   & (boxes[:, 3] >= boxes[:, 1] + min_prop_scale))
        keep = np.nonzero(is_good)[0]
        return boxes[keep][:, [1, 0, 3, 2]], scores[keep]

    def __len__(self):
        """Number of images in the loaded image set."""
        return len(self.items)
input tensor 8 | float * data_flat = THFloatTensor_data(features); 9 | float * rois_flat = THFloatTensor_data(rois); 10 | 11 | float * output_flat = THFloatTensor_data(output); 12 | 13 | // Number of ROIs 14 | int num_rois = THFloatTensor_size(rois, 0); 15 | int size_rois = THFloatTensor_size(rois, 1); 16 | // batch size 17 | int batch_size = THFloatTensor_size(features, 0); 18 | if(batch_size != 1) 19 | { 20 | return 0; 21 | } 22 | // data height 23 | int data_height = THFloatTensor_size(features, 1); 24 | // data width 25 | int data_width = THFloatTensor_size(features, 2); 26 | // Number of channels 27 | int num_channels = THFloatTensor_size(features, 3); 28 | 29 | // Set all element of the output tensor to -inf. 30 | THFloatStorage_fill(THFloatTensor_storage(output), -1); 31 | 32 | // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R 33 | int index_roi = 0; 34 | int index_output = 0; 35 | int n; 36 | for (n = 0; n < num_rois; ++n) 37 | { 38 | int roi_batch_ind = rois_flat[index_roi + 0]; 39 | int roi_start_w = round(rois_flat[index_roi + 1] * spatial_scale); 40 | int roi_start_h = round(rois_flat[index_roi + 2] * spatial_scale); 41 | int roi_end_w = round(rois_flat[index_roi + 3] * spatial_scale); 42 | int roi_end_h = round(rois_flat[index_roi + 4] * spatial_scale); 43 | // CHECK_GE(roi_batch_ind, 0); 44 | // CHECK_LT(roi_batch_ind, batch_size); 45 | 46 | int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1); 47 | int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1); 48 | float bin_size_h = (float)(roi_height) / (float)(pooled_height); 49 | float bin_size_w = (float)(roi_width) / (float)(pooled_width); 50 | 51 | int index_data = roi_batch_ind * data_height * data_width * num_channels; 52 | const int output_area = pooled_width * pooled_height; 53 | 54 | int c, ph, pw; 55 | for (ph = 0; ph < pooled_height; ++ph) 56 | { 57 | for (pw = 0; pw < pooled_width; ++pw) 58 | { 59 | int hstart = (floor((float)(ph) * bin_size_h)); 60 | int wstart = 
(floor((float)(pw) * bin_size_w)); 61 | int hend = (ceil((float)(ph + 1) * bin_size_h)); 62 | int wend = (ceil((float)(pw + 1) * bin_size_w)); 63 | 64 | hstart = fminf(fmaxf(hstart + roi_start_h, 0), data_height); 65 | hend = fminf(fmaxf(hend + roi_start_h, 0), data_height); 66 | wstart = fminf(fmaxf(wstart + roi_start_w, 0), data_width); 67 | wend = fminf(fmaxf(wend + roi_start_w, 0), data_width); 68 | 69 | const int pool_index = index_output + (ph * pooled_width + pw); 70 | int is_empty = (hend <= hstart) || (wend <= wstart); 71 | if (is_empty) 72 | { 73 | for (c = 0; c < num_channels * output_area; c += output_area) 74 | { 75 | output_flat[pool_index + c] = 0; 76 | } 77 | } 78 | else 79 | { 80 | int h, w, c; 81 | for (h = hstart; h < hend; ++h) 82 | { 83 | for (w = wstart; w < wend; ++w) 84 | { 85 | for (c = 0; c < num_channels; ++c) 86 | { 87 | const int index = (h * data_width + w) * num_channels + c; 88 | if (data_flat[index_data + index] > output_flat[pool_index + c * output_area]) 89 | { 90 | output_flat[pool_index + c * output_area] = data_flat[index_data + index]; 91 | } 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | 99 | // Increment ROI index 100 | index_roi += size_rois; 101 | index_output += pooled_height * pooled_width * num_channels; 102 | } 103 | return 1; 104 | } -------------------------------------------------------------------------------- /datasets/wsddn_dataset.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # PyTorch WSDDN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Seungkwan Lee 5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang 6 | # -------------------------------------------------------- 7 | import torch.utils.data as data 8 | import torch 9 | 10 | from scipy.misc import imread 11 | import numpy as np 12 | import cv2 13 | from datasets.voc_loader 
class WSDDNDataset(data.Dataset):
    """Concatenation of one or more VOC image sets with box proposals.

    A global index runs over the loaders in the order given by
    ``dataset_names``; the first ``len(loader_0)`` indices belong to the
    first loader, and so on.
    """

    def __init__(self, dataset_names, data_dir, prop_method, num_classes=20, min_prop_scale=20):
        """
        :param dataset_names: iterable of 'voc07_trainval' and/or 'voc07_test'.
        :param data_dir: root directory handed to VOCLoader.
        :param prop_method: proposal method handed to VOCLoader.
        :param num_classes: length of the image-level label vector.
        :param min_prop_scale: minimum proposal size handed to VOCLoader.
        :raises Exception: for a dataset name that is not supported.
        """
        self._dataset_loaders = []
        self.num_classes = num_classes
        for name in dataset_names:
            if name == 'voc07_trainval':
                self._dataset_loaders.append(VOCLoader(data_dir, prop_method, min_prop_scale, '2007', 'trainval'))
            elif name == 'voc07_test':
                self._dataset_loaders.append(VOCLoader(data_dir, prop_method, min_prop_scale, '2007', 'test'))
            else:
                raise Exception('Undefined dataset %s' % name)

    def _locate(self, index):
        """Map a global index to ``(item, loader_index)``.

        :raises IndexError: when ``index`` is past the last item.
        """
        remaining = index
        for loader_index, loader in enumerate(self._dataset_loaders):
            if remaining < len(loader):
                return loader.items[remaining], loader_index
            remaining -= len(loader)
        raise IndexError('index %d is out of range for a dataset of %d items' % (index, len(self)))

    @staticmethod
    def _flip_boxes_horizontally(boxes, width):
        """Mirror the x coordinates of (xmin, ymin, xmax, ymax) boxes in place."""
        # NOTE(review): with 0-based inclusive coordinates the usual mirror is
        # ``width - x - 1``; the original ``width - x`` is kept because other
        # code outside this view may rely on it -- confirm before changing.
        flipped_xmin = width - boxes[:, 2]
        flipped_xmax = width - boxes[:, 0]
        boxes[:, 0] = flipped_xmin
        boxes[:, 2] = flipped_xmax

    def get_data(self, index, h_flip=False, target_im_size=688, min_resize=False):
        """Load one sample, preprocess it and convert everything to tensors.

        The image is converted RGB -> BGR, optionally mirrored, mean
        subtracted and rescaled so that its longer side (or shorter side when
        ``min_resize`` is set) equals ``target_im_size``; the longer side is
        capped at 2000 pixels. Boxes and proposals are scaled by the same
        factor.

        :return: tuple ``(image (3, H, W) float tensor, gt_boxes, gt_categories,
            proposals, prop_scores, image_level_label (num_classes,) uint8,
            im_scale, raw_img (unscaled RGB array, mirrored when h_flip),
            image id)``.
        """
        im, gt_boxes, gt_categories, proposals, prop_scores, image_id, _ = self.get_raw_data(index)
        raw_img = im.copy()

        # rgb -> bgr
        im = im[:, :, ::-1]

        # horizontal flip
        if h_flip:
            im = im[:, ::-1, :]
            raw_img = raw_img[:, ::-1, :].copy()
            self._flip_boxes_horizontally(gt_boxes, im.shape[1])
            self._flip_boxes_horizontally(proposals, im.shape[1])

        # cast to float type and mean subtraction
        im = im.astype(np.float32, copy=False)
        im -= np.array([[[102.9801, 115.9465, 122.7717]]])

        # image rescale
        im_size_min = np.min(im.shape[0:2])
        im_size_max = np.max(im.shape[0:2])
        reference_size = im_size_min if min_resize else im_size_max
        im_scale = target_im_size / float(reference_size)
        if im_size_max * im_scale > 2000:
            im_scale = 2000 / im_size_max
        im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)

        gt_boxes = gt_boxes * im_scale
        proposals = proposals * im_scale

        # to tensor; HWC -> CHW
        im_tensor = torch.tensor(im, dtype=torch.float32).permute(2, 0, 1).contiguous()
        gt_boxes = torch.tensor(gt_boxes, dtype=torch.float32)
        proposals = torch.tensor(proposals, dtype=torch.float32)
        prop_scores = torch.tensor(prop_scores, dtype=torch.float32)
        gt_categories = torch.tensor(gt_categories, dtype=torch.long)

        # multi-hot vector of the classes present in the image
        image_level_label = torch.zeros(self.num_classes, dtype=torch.uint8)
        image_level_label[gt_categories] = 1
        return im_tensor, gt_boxes, gt_categories, proposals, prop_scores, image_level_label, im_scale, raw_img, image_id

    def get_raw_proposal(self, index):
        """Return a copy of the unscaled proposals of sample ``index``.

        :raises IndexError: when ``index`` is past the last item.
        """
        item, _ = self._locate(index)
        return item['proposals'].copy()

    def get_raw_data(self, index):
        """Read the image of sample ``index`` and copy its annotations.

        :return: ``(image (H, W, 3) array, gt_boxes, gt_categories, proposals,
            prop_scores, image id, loader_index)``.
        :raises IndexError: when ``index`` is past the last item.
        """
        item, loader_index = self._locate(index)
        im = imread(item['img_path'])

        # gray to rgb
        if len(im.shape) == 2:
            im = im[:, :, np.newaxis]
            im = np.concatenate((im, im, im), axis=2)

        gt_boxes = item['boxes'].copy()
        gt_categories = item['categories'].copy()
        proposals = item['proposals'].copy()
        prop_scores = item['prop_scores'].copy()
        return im, gt_boxes, gt_categories, proposals, prop_scores, item['id'], loader_index

    def __len__(self):
        """Total number of samples over all loaders."""
        return sum(len(loader) for loader in self._dataset_loaders)
class voc_eval_kit(imdb):
    """Writes detections in VOC result-file format and scores them.

    Thin evaluation front end around ``voc_eval``: detections are dumped to
    one text file per class below ``<devkit>/results/VOC<year>/Main`` and
    the per-class AP plus the mean AP are printed.
    """

    def __init__(self, image_set, year, devkit_path):
        """
        :param image_set: image-set name, e.g. 'test'.
        :param year: VOC year string, e.g. '2007'.
        :param devkit_path: devkit directory containing ``VOC<year>``.
        """
        imdb.__init__(self, 'voc_' + year + '_' + image_set)
        self._year = year
        self._image_set = image_set
        self._devkit_path = devkit_path
        self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
        self._classes = ('aeroplane', 'bicycle', 'bird', 'boat',
                         'bottle', 'bus', 'car', 'cat', 'chair',
                         'cow', 'diningtable', 'dog', 'horse',
                         'motorbike', 'person', 'pottedplant',
                         'sheep', 'sofa', 'train', 'tvmonitor')
        self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
        self._image_ext = '.jpg'
        self._image_index = self._load_image_set_index()
        # random suffix so each instance writes its own set of result files
        self._salt = str(uuid.uuid4())

        assert os.path.exists(self._devkit_path), 'VOCdevkit path does not exist: {}'.format(self._devkit_path)
        assert os.path.exists(self._data_path), 'Path does not exist: {}'.format(self._data_path)

    def _load_image_set_index(self):
        """
        Load the indexes listed in this dataset's image set file.
        """
        # Example path to image set file:
        # self._devkit_path + /VOC2007/ImageSets/Main/val.txt
        image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
                                      self._image_set + '.txt')
        assert os.path.exists(image_set_file), \
            'Path does not exist: {}'.format(image_set_file)
        with open(image_set_file) as f:
            image_index = [x.strip() for x in f.readlines()]
        return image_index

    def _get_voc_results_file_template(self):
        """Return the result-file path with a ``{:s}`` slot for the class name."""
        # <devkit>/results/VOC2007/Main/det_test_<salt>_aeroplane.txt
        filename = 'det_' + self._image_set + '_' + self._salt + '_{:s}.txt'
        path = os.path.join(
            self._devkit_path,
            'results',
            'VOC' + self._year,
            'Main',
            filename)
        return path

    def _write_voc_results_file(self, all_boxes):
        """Write one detection file per class.

        :param all_boxes: ``all_boxes[class][image]`` is either an empty
            sequence or an array whose rows are (x1, y1, x2, y2, ..., score);
            the last column is written as the confidence.
        """
        template = self._get_voc_results_file_template()
        # the results folder is not part of a fresh devkit checkout
        os.makedirs(os.path.dirname(template), exist_ok=True)
        for cls_ind, cls in enumerate(self.classes):
            print('Writing {} VOC results file'.format(cls))
            filename = template.format(cls)
            with open(filename, 'wt') as f:
                for im_ind, index in enumerate(self.image_index):
                    dets = all_boxes[cls_ind][im_ind]
                    # len() covers both the [] placeholder and empty arrays
                    if len(dets) == 0:
                        continue
                    # the VOCdevkit expects 1-based indices
                    for k in range(dets.shape[0]):
                        f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
                                format(index, dets[k, -1],
                                       dets[k, 0] + 1, dets[k, 1] + 1,
                                       dets[k, 2] + 1, dets[k, 3] + 1))

    def _do_python_eval(self):
        """Run ``voc_eval`` for every class and print the AP table."""
        annopath = os.path.join(
            self._devkit_path,
            'VOC' + self._year,
            'Annotations',
            '{:s}.xml')
        imagesetfile = os.path.join(
            self._devkit_path,
            'VOC' + self._year,
            'ImageSets',
            'Main',
            self._image_set + '.txt')
        cachedir = os.path.join(self._devkit_path, 'annotations_cache')
        aps = []
        # The PASCAL VOC metric changed in 2010
        use_07_metric = int(self._year) < 2010
        print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
        for cls in self._classes:
            filename = self._get_voc_results_file_template().format(cls)
            rec, prec, ap = voc_eval(filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, use_07_metric=use_07_metric)
            aps += [ap]
            print('AP for {} = {:.4f}'.format(cls, ap))
        print('Mean AP = {:.4f}'.format(np.mean(aps)))
        print('~~~~~~~~')
        print('Results:')
        for ap in aps:
            print('{:.3f}'.format(ap))
        print('{:.3f}'.format(np.mean(aps)))
        print('~~~~~~~~')
        print('')
        print('--------------------------------------------------------------')
        print('Results computed with the **unofficial** Python eval code.')
        print('Results should be very close to the official MATLAB eval code.')
        print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
        print('-- Thanks, The Management')
        print('--------------------------------------------------------------')

    def evaluate_detections(self, all_boxes):
        """Write the result files for ``all_boxes`` and print the evaluation.

        The per-class result files are left on disk afterwards.
        """
        self._write_voc_results_file(all_boxes)
        self._do_python_eval()
tree.findall('object'): 20 | obj_struct = {} 21 | obj_struct['name'] = obj.find('name').text 22 | obj_struct['pose'] = obj.find('pose').text 23 | obj_struct['truncated'] = int(obj.find('truncated').text) 24 | obj_struct['difficult'] = int(obj.find('difficult').text) 25 | bbox = obj.find('bndbox') 26 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 27 | int(bbox.find('ymin').text), 28 | int(bbox.find('xmax').text), 29 | int(bbox.find('ymax').text)] 30 | objects.append(obj_struct) 31 | 32 | return objects 33 | 34 | 35 | def voc_ap(rec, prec, use_07_metric=False): 36 | """ ap = voc_ap(rec, prec, [use_07_metric]) 37 | Compute VOC AP given precision and recall. 38 | If use_07_metric is true, uses the 39 | VOC 07 11 point method (default:False). 40 | """ 41 | if use_07_metric: 42 | # 11 point metric 43 | ap = 0. 44 | for t in np.arange(0., 1.1, 0.1): 45 | if np.sum(rec >= t) == 0: 46 | p = 0 47 | else: 48 | p = np.max(prec[rec >= t]) 49 | ap = ap + p / 11. 50 | else: 51 | # correct AP calculation 52 | # first append sentinel values at the end 53 | mrec = np.concatenate(([0.], rec, [1.])) 54 | mpre = np.concatenate(([0.], prec, [0.])) 55 | 56 | # compute the precision envelope 57 | for i in range(mpre.size - 1, 0, -1): 58 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 59 | 60 | # to calculate area under PR curve, look for points 61 | # where X axis (recall) changes value 62 | i = np.where(mrec[1:] != mrec[:-1])[0] 63 | 64 | # and sum (\Delta recall) * prec 65 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 66 | return ap 67 | 68 | 69 | def voc_eval(detpath, 70 | annopath, 71 | imagesetfile, 72 | classname, 73 | cachedir, 74 | ovthresh=0.5, 75 | use_07_metric=False): 76 | """rec, prec, ap = voc_eval(detpath, 77 | annopath, 78 | imagesetfile, 79 | classname, 80 | [ovthresh], 81 | [use_07_metric]) 82 | 83 | Top level function that does the PASCAL VOC evaluation. 
84 | 85 | detpath: Path to detections 86 | detpath.format(classname) should produce the detection results file. 87 | annopath: Path to annotations 88 | annopath.format(imagename) should be the xml annotations file. 89 | imagesetfile: Text file containing the list of images, one image per line. 90 | classname: Category name (duh) 91 | cachedir: Directory for caching the annotations 92 | [ovthresh]: Overlap threshold (default = 0.5) 93 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 94 | (default False) 95 | """ 96 | # assumes detections are in detpath.format(classname) 97 | # assumes annotations are in annopath.format(imagename) 98 | # assumes imagesetfile is a text file with each line an image name 99 | # cachedir caches the annotations in a pickle file 100 | 101 | # first load gt 102 | if not os.path.isdir(cachedir): 103 | os.mkdir(cachedir) 104 | cachefile = os.path.join(cachedir, 'annots.pkl') 105 | # read list of images 106 | with open(imagesetfile, 'r') as f: 107 | lines = f.readlines() 108 | imagenames = [x.strip() for x in lines] 109 | 110 | if not os.path.isfile(cachefile): 111 | # load annots 112 | recs = {} 113 | for i, imagename in enumerate(imagenames): 114 | recs[imagename] = parse_rec(annopath.format(imagename)) 115 | if i % 100 == 0: 116 | print('Reading annotation for {:d}/{:d}'.format(i + 1, len(imagenames))) 117 | # save 118 | print('Saving cached annotations to {:s}'.format(cachefile)) 119 | with open(cachefile, 'wb') as f: 120 | pickle.dump(recs, f) 121 | else: 122 | # load 123 | with open(cachefile, 'rb') as f: 124 | recs = pickle.load(f) 125 | 126 | # extract gt objects for this class 127 | class_recs = {} 128 | npos = 0 129 | for imagename in imagenames: 130 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 131 | bbox = np.array([x['bbox'] for x in R]) 132 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 133 | det = [False] * len(R) 134 | npos = npos + sum(~difficult) 135 | 
class_recs[imagename] = {'bbox': bbox, 136 | 'difficult': difficult, 137 | 'det': det} 138 | 139 | # read dets 140 | detfile = detpath.format(classname) 141 | with open(detfile, 'r') as f: 142 | lines = f.readlines() 143 | 144 | splitlines = [x.strip().split(' ') for x in lines] 145 | image_ids = [x[0] for x in splitlines] 146 | confidence = np.array([float(x[1]) for x in splitlines]) 147 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 148 | print(BB, '@@@',confidence) 149 | # sort by confidence 150 | sorted_ind = np.argsort(-confidence) 151 | sorted_scores = np.sort(-confidence) 152 | BB = BB[sorted_ind, :] 153 | image_ids = [image_ids[x] for x in sorted_ind] 154 | 155 | # go down dets and mark TPs and FPs 156 | nd = len(image_ids) 157 | tp = np.zeros(nd) 158 | fp = np.zeros(nd) 159 | for d in range(nd): 160 | R = class_recs[image_ids[d]] 161 | bb = BB[d, :].astype(float) 162 | ovmax = -np.inf 163 | BBGT = R['bbox'].astype(float) 164 | 165 | if BBGT.size > 0: 166 | # compute overlaps 167 | # intersection 168 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 169 | iymin = np.maximum(BBGT[:, 1], bb[1]) 170 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 171 | iymax = np.minimum(BBGT[:, 3], bb[3]) 172 | iw = np.maximum(ixmax - ixmin + 1., 0.) 173 | ih = np.maximum(iymax - iymin + 1., 0.) 174 | inters = iw * ih 175 | 176 | # union 177 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 178 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 179 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 180 | 181 | overlaps = inters / uni 182 | ovmax = np.max(overlaps) 183 | jmax = np.argmax(overlaps) 184 | 185 | if ovmax > ovthresh: 186 | if not R['difficult'][jmax]: 187 | if not R['det'][jmax]: 188 | tp[d] = 1. 189 | R['det'][jmax] = 1 190 | else: 191 | fp[d] = 1. 192 | else: 193 | fp[d] = 1. 
class WSDDN_VGG16(nn.Module):
    """Two-stream weakly supervised detection head on a VGG16 backbone.

    ``forward`` pools one feature vector per proposal box, scores it with a
    classification stream (softmax over classes) and a detection stream
    (softmax over proposals) and multiplies the two. Summing that product
    over proposals gives image-level class scores, which are trained with a
    binary cross entropy against the image-level labels.
    """

    def __init__(self, pretrained_model_path=None, num_class=20):
        """Build the backbone, both scoring streams and the RoI operators.

        Args:
            pretrained_model_path: Path to a VGG16 ``state_dict`` checkpoint,
                or ``None`` to keep torchvision's fresh initialisation.
            num_class: Number of classes scored by each stream.
        """
        super(WSDDN_VGG16, self).__init__()
        vgg = torchvision.models.vgg16()
        if pretrained_model_path is None:
            print("Create WSDDN_VGG16 without pretrained weights")
        else:
            print("Loading pretrained VGG16 weights from %s" % (pretrained_model_path))
            # Deserialise onto the CPU so a checkpoint written on a GPU box
            # still loads on a CPU-only one; load_state_dict copies the values
            # into the model's own parameters afterwards.
            state_dict = torch.load(pretrained_model_path,
                                    map_location=lambda storage, loc: storage)
            # Build the reference key set once instead of once per entry.
            known_keys = vgg.state_dict()
            vgg.load_state_dict({k: v for k, v in state_dict.items() if k in known_keys})

        # Drop the last module of each VGG stage: the final layer of
        # `features` and the final layer of `classifier`.
        self.base = nn.Sequential(*list(vgg.features._modules.values())[:-1])
        self.top = nn.Sequential(*list(vgg.classifier._modules.values())[:-1])
        self.num_classes = num_class

        self.fc8c = nn.Linear(4096, self.num_classes)  # classification stream
        self.fc8d = nn.Linear(4096, self.num_classes)  # detection stream

        self.roi_pooling = _RoIPooling(7, 7, 1.0 / 16.0)
        self.roi_align = RoIAlignAvg(7, 7, 1.0 / 16.0)
        self._init_weights()

    def _init_weights(self):
        """Initialise both stream layers with N(0, 0.01) weights and zero bias."""
        def normal_init(m, mean, stddev, truncated=False):
            if truncated:
                m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
            else:
                m.weight.data.normal_(mean, stddev)
            m.bias.data.zero_()

        normal_init(self.fc8c, 0, 0.01, False)
        normal_init(self.fc8d, 0, 0.01, False)

    def adjust_roi_offset(self, rois):
        """Return a copy of ``rois`` mapped to stride-16 feature-map cells.

        The first two columns are shifted by ``-8.5 + 9.5`` and rounded to the
        nearest cell; the last two are shifted by ``-8.5 - 9.5`` and rounded
        after subtracting half a cell. The input tensor is left untouched.
        """
        rois = rois.clone()
        o0 = 8.5
        o1 = 9.5
        rois[:, 0] = torch.floor((rois[:, 0] - o0 + o1) / 16 + 0.5)
        rois[:, 1] = torch.floor((rois[:, 1] - o0 + o1) / 16 + 0.5)
        rois[:, 2] = torch.floor((rois[:, 2] - o0 - o1) / 16 - 0.5)
        rois[:, 3] = torch.floor((rois[:, 3] - o0 - o1) / 16 - 0.5)
        return rois

    def forward(self, im_data, rois, prop_scores=None, image_level_label=None):
        """Score every proposal and optionally compute the training losses.

        Args:
            im_data: Input image batch fed to the backbone. Every RoI is
                tagged with batch index 0, so a single image is assumed.
            rois: ``(N, 4)`` proposal boxes.
            prop_scores: Optional ``N`` proposal scores; when given, pooled
                features are scaled by ``10 * score + 1``.
            image_level_label: Optional per-class 0/1 labels for the image.

        Returns:
            ``scores`` of shape ``(N, num_classes)`` when no label is given,
            otherwise the tuple ``(scores, loss, reg)`` where ``loss`` is the
            summed binary cross entropy of the image-level scores and ``reg``
            is the spatial regulariser.
        """
        N = rois.size(0)
        feature_map = self.base(im_data)
        # The RoI operators expect (batch_index, box) rows; prepend a zero
        # column matching rois' dtype and device.
        zero_padded_rois = torch.cat([torch.zeros(N, 1).to(rois), rois], 1)
        pooled_feat = self.roi_pooling(feature_map, zero_padded_rois).view(N, -1)

        if prop_scores is not None:
            pooled_feat = pooled_feat * (prop_scores.view(N, 1) * 10 + 1)

        fc7 = self.top(pooled_feat)
        fc8c = self.fc8c(fc7)
        # NOTE(review): detection logits are halved before the softmax; the
        # reason is not documented in this file.
        fc8d = self.fc8d(fc7) / 2

        cls = F.softmax(fc8c, dim=1)  # normalise over classes
        det = F.softmax(fc8d, dim=0)  # normalise over proposals
        scores = cls * det

        if image_level_label is None:
            return scores

        image_level_scores = torch.sum(scores, 0)

        # To avoid numerical error: the sum can drift marginally outside [0, 1].
        image_level_scores = torch.clamp(image_level_scores, min=0, max=1)

        # size_average=False (summed loss) is kept for compatibility with the
        # PyTorch generation the compiled RoI extensions target.
        loss = F.binary_cross_entropy(image_level_scores, image_level_label.to(torch.float32), size_average=False)
        reg = self.spatial_regulariser(rois, fc7, scores, image_level_label)

        return scores, loss, reg

    def region_aware_softmax(self, rois, det_score):
        """Softmax over proposals with a distance-weighted denominator.

        Every proposal's exponentiated score is divided by the sum of all
        exponentiated scores weighted by ``exp(-d)``, where ``d`` is the
        centre distance of the pair normalised by the summed widths/heights.

        Args:
            rois: ``(N, 4)`` proposal boxes.
            det_score: ``(N, num_classes)`` detection logits.

        Returns:
            ``(N, num_classes)`` tensor of normalised detection scores.
        """
        # presumably columns are (cx, cy, w, h) - verify against to_cwh_form
        cwh_form_rois = to_cwh_form(rois)
        pair_wise_dx = cwh_form_rois[:, 0].view(1, -1) - cwh_form_rois[:, 0].view(-1, 1)
        pair_wise_dy = cwh_form_rois[:, 1].view(1, -1) - cwh_form_rois[:, 1].view(-1, 1)

        pair_wise_wsum = cwh_form_rois[:, 2].view(1, -1) + cwh_form_rois[:, 2].view(-1, 1)
        pair_wise_hsum = cwh_form_rois[:, 3].view(1, -1) + cwh_form_rois[:, 3].view(-1, 1)

        pair_wise_dx = pair_wise_dx / pair_wise_wsum
        pair_wise_dy = pair_wise_dy / pair_wise_hsum

        pair_wise_dist = torch.sqrt(pair_wise_dx * pair_wise_dx + pair_wise_dy * pair_wise_dy)
        pair_wise_weight = torch.exp(-pair_wise_dist)

        det_score = torch.exp(det_score)
        # Entry [i, c] of the product is sum_j weight[i, j] * det_score[j, c],
        # i.e. the weighted denominator for every proposal and class at once.
        output = det_score / torch.mm(pair_wise_weight, det_score)

        # Diagnostic dump when every normalised score has collapsed.
        if output.max() < 0.001:
            det_score = torch.log(det_score)
            print(det_score)
            print(pair_wise_weight)
            print(det_score.max(), det_score.min())
            print(pair_wise_weight.max(), pair_wise_weight.min())
            print(pair_wise_dist.max(), pair_wise_dist.min())

        return output

    def spatial_regulariser(self, rois, fc7, scores, image_level_label):
        """Pull features of boxes overlapping the top box toward its feature.

        For each class present in the image, the (up to) 10 highest scoring
        proposals are taken; those whose IoU with the best one exceeds 0.6
        contribute half the squared, score-weighted feature difference to the
        best proposal.

        Args:
            rois: ``(N, 4)`` proposal boxes.
            fc7: ``(N, D)`` proposal features.
            scores: ``(N, num_classes)`` proposal scores.
            image_level_label: Per-class 0/1 labels for the image.

        Returns:
            A scalar tensor; zero when no class is present or there are no
            proposals.
        """
        # topk raises when asked for more entries than exist, so cap K at N.
        K = min(10, rois.size(0))
        th = 0.6
        # Start from a tensor so callers can always use tensor methods on the
        # result, even when the loop below never accumulates anything.
        ret = fc7.new_zeros(())
        if K == 0:
            return ret

        for cls in range(self.num_classes):
            if image_level_label[cls].item() == 0:
                continue

            topk_scores, topk_indices = scores[:, cls].topk(K, dim=0)
            topk_boxes = rois[topk_indices]
            topk_features = fc7[topk_indices]

            mask = all_pair_iou(topk_boxes[0:1, :], topk_boxes).view(K).gt(th).float()

            # Scores act as constant weights: no gradient flows through them.
            diff = (topk_features - topk_features[0]) * topk_scores.detach().view(K, 1)

            ret = (torch.pow(diff, 2).sum(1) * mask).sum() * 0.5 + ret

        return ret
// Each thread starts at its global index and advances by the total number of
// launched threads until it passes n.
#define CUDA_1D_KERNEL_LOOP(i, n)                                \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;   \
         i += blockDim.x * gridDim.x)


// Forward RoIAlign: one output element per loop iteration.
//   nthreads      number of output elements (num_rois * channels * aligned_height * aligned_width)
//   bottom_data   input feature maps, indexed as ((batch * channels + c) * height + h) * width + w
//   bottom_rois   5 floats per RoI: batch index, then start_w, start_h, end_w, end_h
//                 (multiplied by spatial_scale to reach feature-map coordinates)
//   top_data      output, indexed as ((n * channels + c) * aligned_height + ph) * aligned_width + pw
__global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width,
                                const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) {
    CUDA_1D_KERNEL_LOOP(index, nthreads) {
        // (n, c, ph, pw) is an element in the aligned output
        int pw = index % aligned_width;
        int ph = (index / aligned_width) % aligned_height;
        int c  = (index / aligned_width / aligned_height) % channels;
        int n  = index / aligned_width / aligned_height / channels;

        // The batch index is stored as a float but is used to build an
        // element offset: convert it once so the offset is computed in exact
        // integer arithmetic rather than in 24-bit float precision.
        int roi_batch_ind = (int)bottom_rois[n * 5 + 0];
        float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
        float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
        float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
        float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

        // Clamp the extent of malformed (inverted) ROIs to be non-negative
        float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
        float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
        // Sample points span the ROI end to end; with a single sample there is
        // no spacing (and the division would be by zero), so sample the start.
        float bin_size_h = aligned_height > 1 ? roi_height / (aligned_height - 1.) : 0.;
        float bin_size_w = aligned_width > 1 ? roi_width / (aligned_width - 1.) : 0.;

        float h = (float)(ph) * bin_size_h + roi_start_h;
        float w = (float)(pw) * bin_size_w + roi_start_w;

        // Top-left neighbour, limited so the 2x2 neighbourhood stays inside
        // the feature map on the bottom/right border.
        int hstart = fminf(floor(h), height - 2);
        int wstart = fminf(floor(w), width - 2);

        int img_start = roi_batch_ind * channels * height * width;

        // bilinear interpolation
        if (h < 0 || h >= height || w < 0 || w >= width) {
            // Sample point lies outside the feature map
            top_data[index] = 0.;
        } else {
            float h_ratio = h - (float)(hstart);
            float w_ratio = w - (float)(wstart);
            int upleft = img_start + (c * height + hstart) * width + wstart;
            int upright = upleft + 1;
            int downleft = upleft + width;
            int downright = downleft + 1;

            top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)
                + bottom_data[upright] * (1. - h_ratio) * w_ratio
                + bottom_data[downleft] * h_ratio * (1. - w_ratio)
                + bottom_data[downright] * h_ratio * w_ratio;
        }
    }
}


// Launch ROIAlignForward on `stream` with one thread per output element.
// Returns 1 on success; a launch error terminates the process.
int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width,
                           const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) {
    const int kThreadsPerBlock = 1024;
    const int output_size = num_rois * aligned_height * aligned_width * channels;
    cudaError_t err;

    // A grid of zero blocks is an invalid launch configuration and would be
    // reported below as a fatal error; an empty output simply has no work.
    if (output_size <= 0) {
        return 1;
    }

    ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
        output_size, bottom_data, spatial_scale, height, width, channels,
        aligned_height, aligned_width, bottom_rois, top_data);

    err = cudaGetLastError();
    if(cudaSuccess != err) {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}


// Backward RoIAlign: scatters each output gradient onto the four input cells
// it was interpolated from, using the same bilinear weights as the forward
// pass. Several outputs can hit one input cell, hence atomicAdd; the kernel
// only accumulates into bottom_diff and never clears it.
__global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width,
                                 const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) {
    CUDA_1D_KERNEL_LOOP(index, nthreads) {

        // (n, c, ph, pw) is an element in the aligned output
        int pw = index % aligned_width;
        int ph = (index / aligned_width) % aligned_height;
        int c  = (index / aligned_width / aligned_height) % channels;
        int n  = index / aligned_width / aligned_height / channels;

        // Integer batch index so the element offset is exact (see forward).
        int roi_batch_ind = (int)bottom_rois[n * 5 + 0];
        float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
        float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
        float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
        float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

        // Clamp the extent of malformed (inverted) ROIs to be non-negative
        float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
        float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
        // Same single-sample guard as the forward kernel.
        float bin_size_h = aligned_height > 1 ? roi_height / (aligned_height - 1.) : 0.;
        float bin_size_w = aligned_width > 1 ? roi_width / (aligned_width - 1.) : 0.;

        float h = (float)(ph) * bin_size_h + roi_start_h;
        float w = (float)(pw) * bin_size_w + roi_start_w;

        int hstart = fminf(floor(h), height - 2);
        int wstart = fminf(floor(w), width - 2);

        int img_start = roi_batch_ind * channels * height * width;

        // bilinear interpolation; samples outside the map received no input
        // in the forward pass and therefore propagate no gradient
        if (!(h < 0 || h >= height || w < 0 || w >= width)) {
            float h_ratio = h - (float)(hstart);
            float w_ratio = w - (float)(wstart);
            int upleft = img_start + (c * height + hstart) * width + wstart;
            int upright = upleft + 1;
            int downleft = upleft + width;
            int downright = downleft + 1;

            atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio));
            atomicAdd(bottom_diff + upright, top_diff[index] * (1. - h_ratio) * w_ratio);
            atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio));
            atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio);
        }
    }
}

// Launch ROIAlignBackward on `stream` with one thread per output-gradient
// element. `batch_size` is accepted for interface symmetry but not used.
// Returns 1 on success; a launch error terminates the process.
int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width,
                            const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) {
    const int kThreadsPerBlock = 1024;
    const int output_size = num_rois * aligned_height * aligned_width * channels;
    cudaError_t err;

    // Nothing to scatter; also avoids an invalid zero-block launch.
    if (output_size <= 0) {
        return 1;
    }

    ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
        output_size, top_diff, spatial_scale, height, width, channels,
        aligned_height, aligned_width, bottom_diff, bottom_rois);

    err = cudaGetLastError();
    if(cudaSuccess != err) {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}


#ifdef __cplusplus
}
#endif
def parse_args(argv=None):
    """Parse the training command line.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description='Train')
    parser.add_argument('--net', default='WSDDN_VGG16', type=str)
    parser.add_argument('--start_epoch', help='starting epoch', default=1, type=int)
    parser.add_argument('--epochs', dest='max_epochs', help='number of epochs', default=20, type=int)
    parser.add_argument('--disp_interval', help='number of iterations to display loss', default=1000, type=int)
    parser.add_argument('--save_interval', dest='save_interval', help='number of epochs to save', default=5, type=int)
    parser.add_argument('--save_dir', help='directory to save models', default="../repo/wsddn")
    parser.add_argument('--data_dir', help='directory to load data', default='../data', type=str)

    parser.add_argument('--prop_method', help='ss or eb', default='eb', type=str)
    parser.add_argument('--use_prop_score', action='store_true')
    parser.add_argument('--min_prop', help='minimum proposal box size', default=20, type=int)
    parser.add_argument('--alpha', help='alpha for spatial regularization', default=0.0001, type=float)

    parser.add_argument('--lr', help='starting learning rate', default=0.00001, type=float)
    parser.add_argument('--s', dest='session', help='training session', default=1, type=int)
    parser.add_argument('--bs', help='training batch size', default=1, type=int)
    parser.add_argument('--bavg', help='batch average', action='store_true')

    # resume trained model
    parser.add_argument('--r', dest='resume', help='resume checkpoint or not', action='store_true')
    parser.add_argument('--checksession', dest='checksession', help='checksession to load model', default=0, type=int)
    parser.add_argument('--checkepoch', dest='checkepoch', help='checkepoch to load model', default=0, type=int)

    return parser.parse_args(argv)


def draw_box(boxes, col=None):
    """Outline every ``(xmin, ymin, xmax, ymax)`` box on the current pyplot axes.

    Args:
        boxes: Iterable of 4-element boxes.
        col: Colour used for all boxes; ``None`` picks a random RGB colour
            per box.
    """
    for xmin, ymin, xmax, ymax in boxes:
        c = np.random.rand(3) if col is None else col
        plt.hlines(ymin, xmin, xmax, colors=c, lw=2)
        plt.hlines(ymax, xmin, xmax, colors=c, lw=2)
        plt.vlines(xmin, ymin, ymax, colors=c, lw=2)
        plt.vlines(xmax, ymin, ymax, colors=c, lw=2)


def train():
    """Train WSDDN on VOC07 trainval, one image at a time.

    Gradients are accumulated over ``--bs`` images before each optimizer
    step. The learning rate is scaled by 0.1 after epoch 10, and a checkpoint
    is written every ``--save_interval`` epochs. Progress is printed and
    appended to a log file in ``--save_dir``.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    np.random.seed(3)
    torch.manual_seed(4)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(5)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    output_dir = args.save_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    train_dataset = WSDDNDataset(dataset_names=['voc07_trainval'], data_dir=args.data_dir, prop_method=args.prop_method,
                                 num_classes=20, min_prop_scale=args.min_prop)

    lr = args.lr

    if args.net == 'WSDDN_VGG16':
        model = WSDDN_VGG16(os.path.join(args.data_dir, 'pretrained_model/vgg16_caffe.pth'), 20)
    else:
        raise Exception('network is not defined')

    # Move the model before the optimizer is built and before its state is
    # restored: restored optimizer state follows the device the parameters
    # are on at load time, so doing this later would strand the momentum
    # buffers on the CPU when resuming on a GPU.
    model.to(device)

    # Biases train at twice the learning rate and without weight decay.
    params = []
    for key, value in model.named_parameters():
        if value.requires_grad:
            if 'bias' in key:
                params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
            else:
                params += [{'params': [value], 'lr': lr, 'weight_decay': 0.0005}]

    optimizer = torch.optim.SGD(params, momentum=0.9)

    if args.resume:
        load_name = os.path.join(output_dir, '{}_{}_{}.pth'.format(args.net, args.checksession, args.checkepoch))
        print("loading checkpoint %s" % (load_name))
        # Deserialise onto the CPU; load_state_dict then copies the values
        # onto whatever device the model and optimizer state live on.
        checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)
        assert args.net == checkpoint['net']
        args.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr = optimizer.param_groups[0]['lr']
        print("loaded checkpoint %s" % (load_name))

    log_file_name = os.path.join(output_dir, 'log_{}_{}.txt'.format(args.net, args.session))
    # Append when resuming so the earlier history is kept; the context
    # manager closes the file even if training raises.
    with open(log_file_name, 'a' if args.resume else 'w') as log_file:
        log_file.write(str(args))
        log_file.write('\n')

        for epoch in range(args.start_epoch, args.max_epochs + 1):
            model.train()
            loss_sum = 0
            reg_sum = 0
            iter_sum = 0
            num_prop = 0
            start = time.time()

            optimizer.zero_grad()
            rand_perm = np.random.permutation(len(train_dataset))
            for step in range(1, len(train_dataset) + 1):
                index = rand_perm[step - 1]
                apply_h_flip = np.random.rand() > 0.5
                target_im_size = np.random.choice([480, 576, 688, 864, 1200])
                im_data, gt_boxes, box_labels, proposals, prop_scores, image_level_label, im_scale, raw_img, im_id = \
                    train_dataset.get_data(index, apply_h_flip, target_im_size)

                im_data = im_data.unsqueeze(0).to(device)
                rois = proposals.to(device)
                image_level_label = image_level_label.to(device)

                if args.use_prop_score:
                    prop_scores = prop_scores.to(device)
                else:
                    prop_scores = None
                scores, loss, reg = model(im_data, rois, prop_scores, image_level_label)
                reg = reg * args.alpha
                num_prop += proposals.size(0)
                loss_sum += loss.item()
                reg_sum += reg.item()
                loss = loss + reg
                if args.bavg:
                    loss = loss / args.bs
                loss.backward()

                # Gradient accumulation: step once every `bs` images.
                if step % args.bs == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    iter_sum += 1

                if step % args.disp_interval == 0:
                    end = time.time()
                    # No optimizer step may have happened since the last
                    # report (bs larger than disp_interval); avoid dividing
                    # by zero in that case.
                    steps = max(iter_sum, 1)
                    message = "[net %s][session %d][epoch %2d][iter %4d] loss: %.4f, reg: %.4f, num_prop: %.1f, lr: %.2e, time: %.1f" % \
                        (args.net, args.session, epoch, step, loss_sum / steps, reg_sum / steps, num_prop / steps, lr, end - start)
                    print(message)
                    log_file.write(message + "\n")
                    loss_sum = 0
                    reg_sum = 0
                    num_prop = 0
                    iter_sum = 0
                    start = time.time()

            log_file.flush()
            if epoch == 10:
                adjust_learning_rate(optimizer, 0.1)
                lr *= 0.1

            if epoch % args.save_interval == 0:
                save_name = os.path.join(output_dir, '{}_{}_{}.pth'.format(args.net, args.session, epoch))
                checkpoint = dict()
                checkpoint['net'] = args.net
                checkpoint['session'] = args.session
                # Stored as the epoch to resume from.
                checkpoint['epoch'] = epoch + 1
                checkpoint['model'] = model.state_dict()
                checkpoint['optimizer'] = optimizer.state_dict()

                save_checkpoint(checkpoint, save_name)
                print('save model: {}'.format(save_name))


if __name__ == '__main__':
    train()
// Forward RoI max pooling: one pooled output element per loop iteration.
//   nthreads      number of output elements (num_rois * channels * pooled_height * pooled_width)
//   bottom_data   input feature maps, indexed as ((batch * channels + c) * height + h) * width + w
//   bottom_rois   5 floats per RoI: batch index, then start_w, start_h, end_w, end_h
//                 (multiplied by spatial_scale and rounded to feature-map cells)
//   top_data      pooled maxima
//   argmax_data   optional (may be NULL): flat bottom_data index of each maximum, -1 if none
__global__ void ROIPoolForward(const int nthreads, const float* bottom_data,
    const float spatial_scale, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const float* bottom_rois, float* top_data, int* argmax_data)
{
    CUDA_KERNEL_LOOP(index, nthreads)
    {
        // (n, c, ph, pw) is an element in the pooled output
        int pw = index % pooled_width;
        int ph = (index / pooled_width) % pooled_height;
        int c  = (index / pooled_width / pooled_height) % channels;
        int n  = index / pooled_width / pooled_height / channels;

        int roi_batch_ind = bottom_rois[n * 5 + 0];
        int roi_start_w = round(bottom_rois[n * 5 + 1] * spatial_scale);
        int roi_start_h = round(bottom_rois[n * 5 + 2] * spatial_scale);
        int roi_end_w = round(bottom_rois[n * 5 + 3] * spatial_scale);
        int roi_end_h = round(bottom_rois[n * 5 + 4] * spatial_scale);

        // Force malformed ROIs to be 1x1
        int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
        int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
        float bin_size_h = (float)(roi_height) / (float)(pooled_height);
        float bin_size_w = (float)(roi_width) / (float)(pooled_width);

        // Bin extent relative to the ROI origin; floor/ceil let neighbouring
        // bins overlap rather than leave gaps.
        int hstart = (int)(floor((float)(ph) * bin_size_h));
        int wstart = (int)(floor((float)(pw) * bin_size_w));
        int hend = (int)(ceil((float)(ph + 1) * bin_size_h));
        int wend = (int)(ceil((float)(pw + 1) * bin_size_w));

        // Add roi offsets and clip to input boundaries
        hstart = fminf(fmaxf(hstart + roi_start_h, 0), height);
        hend = fminf(fmaxf(hend + roi_start_h, 0), height);
        wstart = fminf(fmaxf(wstart + roi_start_w, 0), width);
        wend = fminf(fmaxf(wend + roi_start_w, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Define an empty pooling region to be zero
        float maxval = is_empty ? 0 : -FLT_MAX;
        // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
        int maxidx = -1;

        int bottom_data_batch_offset = roi_batch_ind * channels * height * width;
        int bottom_data_offset = bottom_data_batch_offset + c * height * width;

        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                int bottom_index = h * width + w;
                if (bottom_data[bottom_data_offset + bottom_index] > maxval) {
                    maxval = bottom_data[bottom_data_offset + bottom_index];
                    maxidx = bottom_data_offset + bottom_index;
                }
            }
        }
        top_data[index] = maxval;
        if (argmax_data != NULL)
            argmax_data[index] = maxidx;
    }
}

// Launch ROIPoolForward on `stream` with one thread per pooled output element.
// Returns 1 on success; a launch error terminates the process.
int ROIPoolForwardLaucher(
    const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* top_data, int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    int output_size = num_rois * pooled_height * pooled_width * channels;
    cudaError_t err;

    // A grid of zero blocks is an invalid launch configuration and would be
    // reported below as a fatal error; an empty output simply has no work.
    if (output_size <= 0)
    {
        return 1;
    }

    ROIPoolForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, bottom_data, spatial_scale, height, width, channels, pooled_height,
      pooled_width, bottom_rois, top_data, argmax_data);

    err = cudaGetLastError();
    if(cudaSuccess != err)
    {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}


// Backward RoI max pooling: one *input* element per loop iteration. For each
// input cell, sums the output gradients of every pooled unit whose recorded
// argmax is that cell, and overwrites bottom_diff with the total.
__global__ void ROIPoolBackward(const int nthreads, const float* top_diff,
    const int* argmax_data, const int num_rois, const float spatial_scale,
    const int height, const int width, const int channels,
    const int pooled_height, const int pooled_width, float* bottom_diff,
    const float* bottom_rois) {
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {

        // (n, c, h, w) is an element in the bottom (input) feature map
        int n = index;
        int w = n % width;
        n /= width;
        int h = n % height;
        n /= height;
        int c = n % channels;
        n /= channels;

        float gradient = 0;
        // Accumulate gradient over all ROIs that pooled this element
        for (int roi_n = 0; roi_n < num_rois; ++roi_n)
        {
            const float* offset_bottom_rois = bottom_rois + roi_n * 5;
            int roi_batch_ind = offset_bottom_rois[0];
            // Skip if ROI's batch index doesn't match n
            if (n != roi_batch_ind) {
                continue;
            }

            int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
            int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
            int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
            int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

            // Skip if ROI doesn't include (h, w)
            const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                                 h >= roi_start_h && h <= roi_end_h);
            if (!in_roi) {
                continue;
            }

            int offset = roi_n * pooled_height * pooled_width * channels;
            const float* offset_top_diff = top_diff + offset;
            const int* offset_argmax_data = argmax_data + offset;

            // Compute feasible set of pooled units that could have pooled
            // this bottom unit

            // Force malformed ROIs to be 1x1
            int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
            int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);

            float bin_size_h = (float)(roi_height) / (float)(pooled_height);
            float bin_size_w = (float)(roi_width) / (float)(pooled_width);

            int phstart = floor((float)(h - roi_start_h) / bin_size_h);
            int phend = ceil((float)(h - roi_start_h + 1) / bin_size_h);
            int pwstart = floor((float)(w - roi_start_w) / bin_size_w);
            int pwend = ceil((float)(w - roi_start_w + 1) / bin_size_w);

            phstart = fminf(fmaxf(phstart, 0), pooled_height);
            phend = fminf(fmaxf(phend, 0), pooled_height);
            pwstart = fminf(fmaxf(pwstart, 0), pooled_width);
            pwend = fminf(fmaxf(pwend, 0), pooled_width);

            for (int ph = phstart; ph < phend; ++ph) {
                for (int pw = pwstart; pw < pwend; ++pw) {
                    // Only the unit whose maximum came from this cell passes
                    // its gradient back.
                    if (offset_argmax_data[(c * pooled_height + ph) * pooled_width + pw] == index)
                    {
                        gradient += offset_top_diff[(c * pooled_height + ph) * pooled_width + pw];
                    }
                }
            }
        }
        bottom_diff[index] = gradient;
    }
}

// Launch ROIPoolBackward on `stream` with one thread per input element.
// Returns 1 on success; a launch error terminates the process.
int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* bottom_diff, const int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    int output_size = batch_size * height * width * channels;
    cudaError_t err;

    // No input elements to fill; also avoids an invalid zero-block launch.
    if (output_size <= 0)
    {
        return 1;
    }

    ROIPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
      output_size, top_diff, argmax_data, num_rois, spatial_scale, height, width, channels, pooled_height,
      pooled_width, bottom_diff, bottom_rois);

    err = cudaGetLastError();
    if(cudaSuccess != err)
    {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}


// #ifdef __cplusplus
// }
// #endif
args = parse_args()

# Number of object classes in the detection arrays built below.
_NUM_CLASSES = 20
# Overlap threshold handed to NMS.
_NMS_THRESH = 0.4


def draw_box(boxes, col=None):
    """Draw rectangles on the current matplotlib figure.

    Args:
        boxes: iterable of ``(xmin, ymin, xmax, ymax)`` entries.
        col: colour used for every box. When ``None`` each box gets its own
            random RGB colour drawn from ``np.random``.
    """
    for xmin, ymin, xmax, ymax in boxes:
        c = np.random.rand(3) if col is None else col
        plt.hlines(ymin, xmin, xmax, colors=c, lw=2)
        plt.hlines(ymax, xmin, xmax, colors=c, lw=2)
        plt.vlines(xmin, ymin, ymax, colors=c, lw=2)
        plt.vlines(xmax, ymin, ymax, colors=c, lw=2)


def _select_device():
    """Seed the NumPy / PyTorch RNGs and return the device to run on."""
    np.random.seed(3)
    torch.manual_seed(4)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(5)
        return torch.device('cuda')
    return torch.device('cpu')


def _load_model(device):
    """Restore the checkpoint named by ``args`` and return it in eval mode on ``device``."""
    load_name = os.path.join(args.save_dir, 'wsddn', '{}.pth'.format(args.model_name))
    print("loading checkpoint %s" % (load_name))
    # map_location lets a checkpoint saved from GPU tensors be restored when
    # this script falls back to the CPU device.
    checkpoint = torch.load(load_name, map_location=device)
    if checkpoint['net'] != 'WSDDN_VGG16':
        raise Exception('network is not defined')
    model = WSDDN_VGG16(None, _NUM_CLASSES)
    model.load_state_dict(checkpoint['model'])
    print("loaded checkpoint %s" % (load_name))

    model.to(device)
    model.eval()
    return model


def _detection_result_path():
    """Return the pickle path used to store / reload raw detections."""
    return os.path.join(args.save_dir, 'detection_result', '{}.pkl'.format(args.model_name))


def _save_detections(all_boxes):
    """Pickle ``all_boxes`` to the detection result file, creating its directory."""
    save_name = _detection_result_path()
    os.makedirs(os.path.dirname(save_name), exist_ok=True)
    with open(save_name, 'wb') as f:
        pickle.dump(all_boxes, f)


def _apply_nms(all_boxes, verbose=False):
    """Run per-class, per-image NMS over ``all_boxes`` in place.

    Entries with no detections (empty list or zero-row array) are left as is.
    With ``verbose`` set, progress is printed every 500 images and per class.
    """
    for cls in range(_NUM_CLASSES):
        for index, dets in enumerate(all_boxes[cls]):
            # len() works for both the initial [] placeholder and ndarrays;
            # comparing an ndarray with == [] is not a valid emptiness test.
            if len(dets) == 0:
                continue
            keep = nms(dets, _NMS_THRESH)
            all_boxes[cls][index] = dets[keep, :].copy()
            if verbose and index % 500 == 499:
                print(index)
        if verbose:
            print('nms: cls %d complete' % cls)


def eval():
    """Run detection on the VOC07 test split and evaluate the result.

    For every image the model scores are summed over the configured
    (flip, size) combinations, scaled by 1000, and the top ``max_per_image``
    boxes per class are kept. A per-class min-heap limits the total number of
    detections to ``max_per_set`` and yields the final per-class score
    threshold. Detections are pickled, passed through NMS and handed to the
    VOC evaluation kit.

    The name shadows the ``eval`` builtin; it is kept because it is this
    module's entry point.
    """
    print('Called with args:')
    print(args)

    device = _select_device()

    eval_kit = voc_eval_kit('test', '2007', os.path.join(args.data_dir, 'VOCdevkit2007'))

    test_dataset = WSDDNDataset(dataset_names=['voc07_test'], data_dir=args.data_dir, prop_method=args.prop_method,
                                num_classes=_NUM_CLASSES, min_prop_scale=args.min_prop)

    model = _load_model(device)

    start = time.time()

    num_images = len(test_dataset)
    # heuristic: keep an average of 40 detections per class per image prior
    # to NMS
    max_per_set = 40 * num_images
    # heuristic: keep at most 100 detections per class per image prior to NMS
    max_per_image = 100
    # detection threshold for each class (this is adaptively set based on the
    # max_per_set constraint)
    thresh = -np.inf * np.ones(_NUM_CLASSES)
    # top_scores holds one min-heap of scores per class (used to enforce
    # the max_per_set constraint)
    top_scores = [[] for _ in range(_NUM_CLASSES)]
    # all detections are collected into:
    #    all_boxes[cls][image] = N x 5 array of detections in
    #    (x1, y1, x2, y2, score)
    all_boxes = [[[] for _ in range(num_images)] for _ in range(_NUM_CLASSES)]

    # (horizontal flip, image size) settings whose scores are summed per image.
    # Materialised once: itertools.product returns a single-use iterator.
    if args.multiscale:
        comb = list(itertools.product([False, True], [480, 576, 688, 864, 1200]))
    else:
        comb = list(itertools.product([False], [688]))

    for index in range(num_images):
        scores = 0
        for h_flip, im_size in comb:
            (im_data, _gt_boxes, _box_labels, proposals, prop_scores, _image_level_label,
             _im_scale_ratio, _raw_img, _im_id) = test_dataset.get_data(index, h_flip, im_size, args.min_resize)

            im_data = im_data.unsqueeze(0).to(device)
            rois = proposals.to(device)
            prop_scores = prop_scores.to(device) if args.use_prop_score else None

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                local_scores = model(im_data, rois, prop_scores, None).detach().cpu().numpy()
            scores = scores + local_scores

        scores = scores * 1000
        boxes = test_dataset.get_raw_proposal(index)

        for cls in range(_NUM_CLASSES):
            inds = np.where((scores[:, cls] > thresh[cls]))[0]
            cls_scores = scores[inds, cls]
            cls_boxes = boxes[inds].copy()
            top_inds = np.argsort(-cls_scores)[:max_per_image]
            cls_scores = cls_scores[top_inds]
            cls_boxes = cls_boxes[top_inds, :]

            # push new scores onto the min-heap
            for val in cls_scores:
                heapq.heappush(top_scores[cls], val)
            # if we've collected more than the max number of detections,
            # then pop items off the min-heap and update the class threshold
            if len(top_scores[cls]) > max_per_set:
                while len(top_scores[cls]) > max_per_set:
                    heapq.heappop(top_scores[cls])
                thresh[cls] = top_scores[cls][0]

            all_boxes[cls][index] = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32, copy=False)

        if index % 100 == 99:
            print('%d images complete, elapsed time:%.1f' % (index + 1, time.time() - start))

    # Re-apply the final per-class thresholds to every image.
    for j in range(_NUM_CLASSES):
        for i in range(num_images):
            inds = np.where(all_boxes[j][i][:, -1] > thresh[j])[0]
            all_boxes[j][i] = all_boxes[j][i][inds, :]

    _save_detections(all_boxes)

    print('Detection Complete, elapsed time: %.1f' % (time.time() - start))

    _apply_nms(all_boxes)
    print('NMS complete, elapsed time: %.1f' % (time.time() - start))

    eval_kit.evaluate_detections(all_boxes)


def my_eval():
    """Single-scale variant of :func:`eval` without score thresholding.

    Each image is processed once at size 688 without flipping. For every
    class all proposals are kept, sorted by descending score, with boxes
    divided by the image scale ratio returned by the dataset. The detections
    are pickled, passed through NMS and evaluated.
    """
    print('Called with args:')
    print(args)

    device = _select_device()

    eval_kit = voc_eval_kit('test', '2007', os.path.join(args.data_dir, 'VOCdevkit2007'))

    test_dataset = WSDDNDataset(dataset_names=['voc07_test'], data_dir=args.data_dir, prop_method=args.prop_method,
                                num_classes=_NUM_CLASSES, min_prop_scale=args.min_prop)

    model = _load_model(device)

    start = time.time()

    num_images = len(test_dataset)
    all_boxes = [[[] for _ in range(num_images)] for _ in range(_NUM_CLASSES)]

    for index in range(num_images):
        (im_data, _gt_boxes, _box_labels, proposals, prop_scores, _image_level_label,
         im_scale_ratio, _raw_img, _im_id) = test_dataset.get_data(index, False, 688)

        im_data = im_data.unsqueeze(0).to(device)
        rois = proposals.to(device)
        prop_scores = prop_scores.to(device) if args.use_prop_score else None

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            scores = model(im_data, rois, prop_scores, None)

        # Sort every class column independently, then gather the boxes in
        # that order (one row of box indices per class after the permute).
        sorted_scores, sorted_indices = torch.sort(scores.detach(), dim=0, descending=True)
        sorted_boxes = rois[sorted_indices.permute(1, 0)] / im_scale_ratio

        for cls in range(_NUM_CLASSES):
            here = torch.cat((sorted_boxes[cls], sorted_scores[:, cls:cls + 1]), 1).cpu()
            all_boxes[cls][index] = here.numpy()

        if index % 500 == 499:
            print('%d images complete, elapsed time:%.1f' % (index + 1, time.time() - start))

    _save_detections(all_boxes)

    print('Detection Complete, elapsed time: %.1f' % (time.time() - start))

    _apply_nms(all_boxes)
    print('NMS complete, elapsed time: %.1f' % (time.time() - start))

    eval_kit.evaluate_detections(all_boxes)


def eval_saved_result():
    """Reload previously pickled detections, apply NMS and evaluate them."""
    eval_kit = voc_eval_kit('test', '2007', os.path.join(args.data_dir, 'VOCdevkit2007'))

    save_name = _detection_result_path()

    # NOTE: unpickling can execute arbitrary code; only load result files
    # produced by this project.
    with open(save_name, 'rb') as f:
        all_boxes = pickle.load(f, encoding='latin1')

    _apply_nms(all_boxes, verbose=True)

    eval_kit.evaluate_detections(all_boxes)


if __name__ == '__main__':
    # Call eval_saved_result() here instead to re-score a saved detection file.
    eval()