├── utils
    ├── __init__.py
    ├── net_utils.py
    ├── cpu_nms.pyx
    └── box_utils.py
├── hello.py
├── model
    ├── roi_align
    │   ├── __init__.py
    │   ├── _ext
    │   │   ├── __init__.py
    │   │   └── roi_align
    │   │   │   └── __init__.py
    │   ├── functions
    │   │   ├── __init__.py
    │   │   └── roi_align.py
    │   ├── modules
    │   │   ├── __init__.py
    │   │   └── roi_align.py
    │   ├── src
    │   │   ├── roi_align_kernel.cu.o
    │   │   ├── roi_align_cuda.h
    │   │   ├── roi_align_kernel.h
    │   │   ├── roi_align_cuda.c
    │   │   └── roi_align_kernel.cu
    │   ├── make.sh
    │   └── build.py
    ├── roi_pooling
    │   ├── __init__.py
    │   ├── _ext
    │   │   ├── __init__.py
    │   │   └── roi_pooling
    │   │   │   └── __init__.py
    │   ├── modules
    │   │   ├── __init__.py
    │   │   └── roi_pool.py
    │   ├── functions
    │   │   ├── __init__.py
    │   │   └── roi_pool.py
    │   ├── src
    │   │   ├── roi_pooling.cu.o
    │   │   ├── roi_pooling.h
    │   │   ├── roi_pooling_cuda.h
    │   │   ├── roi_pooling_kernel.h
    │   │   ├── roi_pooling_cuda.c
    │   │   ├── roi_pooling.c
    │   │   └── roi_pooling_kernel.cu
    │   └── build.py
    ├── __init__.py
    └── wsddn_vgg16.py
├── README.md
├── .gitignore
├── frcnn_eval
    ├── __init__.py
    ├── imdb.py
    ├── pascal_voc.py
    └── voc_eval.py
├── datasets
    ├── __init__.py
    ├── voc_loader.py
    └── wsddn_dataset.py
├── make.sh
├── LICENSE
├── setup.py
├── train.py
└── eval.py


/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/hello.py:
--------------------------------------------------------------------------------
1 | from scipy.misc


--------------------------------------------------------------------------------
/model/roi_align/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WSDDN.pytorch
2 | 


--------------------------------------------------------------------------------
/model/roi_align/_ext/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/roi_pooling/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/roi_align/functions/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/roi_align/modules/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/roi_pooling/_ext/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/roi_pooling/modules/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model/roi_pooling/functions/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/*
2 | data
3 | .idea
4 | .idea/*
5 | *.pyc
6 | *~
7 | *.so
8 | 


--------------------------------------------------------------------------------
/model/roi_pooling/src/roi_pooling.cu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deneb2016/WSDDN.pytorch/HEAD/model/roi_pooling/src/roi_pooling.cu.o


--------------------------------------------------------------------------------
/model/roi_align/src/roi_align_kernel.cu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deneb2016/WSDDN.pytorch/HEAD/model/roi_align/src/roi_align_kernel.cu.o


--------------------------------------------------------------------------------
/model/roi_pooling/src/roi_pooling.h:
--------------------------------------------------------------------------------
/*
 * CPU (THFloatTensor) forward entry point for RoI pooling.
 *
 * This header is listed in build.py's `headers`, and the function is invoked
 * from functions/roi_pool.py whenever `features` is not a CUDA tensor.  That
 * caller passes `features.permute(0, 2, 3, 1)` and a zero-initialised
 * `output` tensor.
 *
 * NOTE(review): the implementation (src/roi_pooling.c) is not visible from
 * here, so the meaning of the int return value is unconfirmed.
 */
int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale,
                        THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output);


--------------------------------------------------------------------------------
/model/roi_align/make.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | CUDA_PATH=/usr/local/cuda/
 4 | 
 5 | cd src
 6 | echo "Compiling my_lib kernels by nvcc..."
 7 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52
 8 | 
 9 | cd ../
10 | python build.py
11 | 


--------------------------------------------------------------------------------
/frcnn_eval/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # PyTorch WSDDN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Seungkwan Lee
5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
6 | # --------------------------------------------------------


--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # PyTorch WSDDN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Seungkwan Lee
5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
6 | # --------------------------------------------------------


--------------------------------------------------------------------------------
/model/roi_align/src/roi_align_cuda.h:
--------------------------------------------------------------------------------
/*
 * CUDA RoI-align forward pass (implemented in roi_align_cuda.c).
 *
 * Reads dimensions 1, 2 and 3 of `features` as channels, height and width.
 * Returns 0 without launching anything when the second dimension of `rois`
 * is not 5; otherwise calls ROIAlignForwardLaucher on the current THC stream
 * and returns 1.
 */
int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale,
                        THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output);

/*
 * CUDA RoI-align backward pass (implemented in roi_align_cuda.c).
 *
 * Batch size, channels, height and width are taken from dimensions 0-3 of
 * `bottom_grad`.  Returns 0 without launching anything when the second
 * dimension of `rois` is not 5; otherwise calls ROIAlignBackwardLaucher on
 * the current THC stream and returns 1.
 */
int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale,
                        THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad);
6 | 


--------------------------------------------------------------------------------
/model/roi_pooling/src/roi_pooling_cuda.h:
--------------------------------------------------------------------------------
/*
 * CUDA RoI-pooling forward entry point.
 *
 * Called from functions/roi_pool.py for CUDA `features`; that caller passes a
 * zero-initialised `output` and an int `argmax` tensor of the same shape.
 * NOTE(review): the implementation (src/roi_pooling_cuda.c) is not visible
 * from here, so the meaning of the return value is unconfirmed.
 */
int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale,
                        THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax);

/*
 * CUDA RoI-pooling backward entry point.
 *
 * Called from functions/roi_pool.py with the incoming gradient as `top_grad`,
 * a zero-initialised `bottom_grad` shaped like the forward input, and the
 * `argmax` tensor stored during the forward call.
 * NOTE(review): return-value semantics unconfirmed (implementation not visible).
 */
int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale,
                        THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax);


--------------------------------------------------------------------------------
/model/roi_align/_ext/roi_align/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from torch.utils.ffi import _wrap_function
 3 | from ._roi_align import lib as _lib, ffi as _ffi
 4 | 
 5 | __all__ = []
 6 | def _import_symbols(locals):
 7 |     for symbol in dir(_lib):
 8 |         fn = getattr(_lib, symbol)
 9 |         if callable(fn):
10 |             locals[symbol] = _wrap_function(fn, _ffi)
11 |         else:
12 |             locals[symbol] = fn
13 |         __all__.append(symbol)
14 | 
15 | _import_symbols(locals())
16 | 


--------------------------------------------------------------------------------
/model/roi_pooling/_ext/roi_pooling/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from torch.utils.ffi import _wrap_function
 3 | from ._roi_pooling import lib as _lib, ffi as _ffi
 4 | 
 5 | __all__ = []
 6 | def _import_symbols(locals):
 7 |     for symbol in dir(_lib):
 8 |         fn = getattr(_lib, symbol)
 9 |         if callable(fn):
10 |             locals[symbol] = _wrap_function(fn, _ffi)
11 |         else:
12 |             locals[symbol] = fn
13 |         __all__.append(symbol)
14 | 
15 | _import_symbols(locals())
16 | 


--------------------------------------------------------------------------------
/model/roi_pooling/modules/roi_pool.py:
--------------------------------------------------------------------------------
 1 | from torch.nn.modules.module import Module
 2 | from ..functions.roi_pool import RoIPoolFunction
 3 | 
 4 | 
 5 | class _RoIPooling(Module):
 6 |     def __init__(self, pooled_height, pooled_width, spatial_scale):
 7 |         super(_RoIPooling, self).__init__()
 8 | 
 9 |         self.pooled_width = int(pooled_width)
10 |         self.pooled_height = int(pooled_height)
11 |         self.spatial_scale = float(spatial_scale)
12 | 
13 |     def forward(self, features, rois):
14 |         return RoIPoolFunction(self.pooled_height, self.pooled_width, self.spatial_scale)(features, rois)
15 | 


--------------------------------------------------------------------------------
/make.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | CUDA_PATH=/usr/local/cuda/
 4 | export CXXFLAGS="-std=c++11"
 5 | export CFLAGS="-std=c99"
 6 | export PATH=$CUDA_PATH/bin:$PATH
 7 | 
 8 | python setup.py build_ext --inplace
 9 | rm -rf build
10 | 
11 | 
12 | CUDA_ARCH="-gencode arch=compute_52,code=sm_52 -arch=sm_52"
13 | 
14 | 
15 | # compile roi_pooling
16 | cd model/roi_pooling/src
17 | echo "Compiling roi pooling kernels by nvcc..."
18 | nvcc -c -o roi_pooling.cu.o roi_pooling_kernel.cu \
19 | 	 -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC $CUDA_ARCH
20 | cd ../
21 | python build.py
22 | 
23 | # compile roi_align
24 | cd ../../
25 | cd model/roi_align/src
26 | echo "Compiling roi align kernels by nvcc..."
27 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu \
28 | 	 -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC $CUDA_ARCH
29 | cd ../
30 | python build.py
31 | 


--------------------------------------------------------------------------------
/model/roi_pooling/src/roi_pooling_kernel.h:
--------------------------------------------------------------------------------
// Include guard for the RoI-pooling kernel launcher declarations.
// This header includes nothing itself, so cudaStream_t must already be
// declared by whichever file includes it.
#ifndef _ROI_POOLING_KERNEL
#define _ROI_POOLING_KERNEL

// Give the launchers C linkage when the header is compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif

// Forward launcher.  Takes the raw input feature pointer, the RoI pointer and
// the output / argmax pointers, together with the tensor dimensions and the
// stream to run on.
// NOTE(review): the definition is not visible here; return-value semantics
// are unconfirmed.  The "Laucher" spelling is part of the symbol name.
int ROIPoolForwardLaucher(
    const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* top_data, int* argmax_data, cudaStream_t stream);


// Backward launcher.  Unlike the forward launcher it additionally takes
// batch_size, reads argmax_data as const, and writes into bottom_diff.
// NOTE(review): return-value semantics unconfirmed (definition not visible).
int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* bottom_diff, const int* argmax_data, cudaStream_t stream);

#ifdef __cplusplus
}
#endif

#endif
25 | 
26 | 


--------------------------------------------------------------------------------
/model/roi_pooling/build.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import os
 3 | import torch
 4 | from torch.utils.ffi import create_extension
 5 | 
 6 | 
 7 | sources = ['src/roi_pooling.c']
 8 | headers = ['src/roi_pooling.h']
 9 | defines = []
10 | with_cuda = False
11 | 
12 | if torch.cuda.is_available():
13 |     print('Including CUDA code.')
14 |     sources += ['src/roi_pooling_cuda.c']
15 |     headers += ['src/roi_pooling_cuda.h']
16 |     defines += [('WITH_CUDA', None)]
17 |     with_cuda = True
18 | 
19 | this_file = os.path.dirname(os.path.realpath(__file__))
20 | print(this_file)
21 | extra_objects = ['src/roi_pooling.cu.o']
22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
23 | 
24 | ffi = create_extension(
25 |     '_ext.roi_pooling',
26 |     headers=headers,
27 |     sources=sources,
28 |     define_macros=defines,
29 |     relative_to=__file__,
30 |     with_cuda=with_cuda,
31 |     extra_objects=extra_objects
32 | )
33 | 
34 | if __name__ == '__main__':
35 |     ffi.build()
36 | 


--------------------------------------------------------------------------------
/model/roi_align/build.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import os
 3 | import torch
 4 | from torch.utils.ffi import create_extension
 5 | 
 6 | # sources = ['src/roi_align.c']
 7 | # headers = ['src/roi_align.h']
 8 | sources = []
 9 | headers = []
10 | defines = []
11 | with_cuda = False
12 | 
13 | if torch.cuda.is_available():
14 |     print('Including CUDA code.')
15 |     sources += ['src/roi_align_cuda.c']
16 |     headers += ['src/roi_align_cuda.h']
17 |     defines += [('WITH_CUDA', None)]
18 |     with_cuda = True
19 | 
20 | this_file = os.path.dirname(os.path.realpath(__file__))
21 | print(this_file)
22 | extra_objects = ['src/roi_align_kernel.cu.o']
23 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
24 | 
25 | ffi = create_extension(
26 |     '_ext.roi_align',
27 |     headers=headers,
28 |     sources=sources,
29 |     define_macros=defines,
30 |     relative_to=__file__,
31 |     with_cuda=with_cuda,
32 |     extra_objects=extra_objects
33 | )
34 | 
35 | if __name__ == '__main__':
36 |     ffi.build()
37 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Seungkwan Lee
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/utils/net_utils.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # PyTorch WSDDN
 3 | # Copyright 2018. Seungkwan Lee
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Written by Seungkwan Lee
 6 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
 7 | # --------------------------------------------------------
 8 | import torch
 9 | import numpy as np
10 | 
11 | 
def clip_gradient(model, clip_norm):
    """Rescale gradients in place so their global L2 norm is at most ``clip_norm``.

    The global norm is the square root of the sum of squared per-parameter
    gradient norms, taken over every parameter that requires grad and
    currently has a gradient.

    Args:
        model: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        clip_norm: maximum allowed global gradient norm.

    Returns:
        None. Gradients are modified in place.
    """
    # Collect the gradients once so both passes see the same tensors.
    grads = [p.grad for p in model.parameters()
             if p.requires_grad and p.grad is not None]

    totalnorm = np.sqrt(sum(g.data.norm().item() ** 2 for g in grads))

    # Already within the limit: leave the gradients untouched.  This also
    # avoids evaluating 0 / 0 (-> NaN) when both the norm and clip_norm are 0.
    if totalnorm <= clip_norm:
        return

    scale = float(clip_norm / totalnorm)
    for g in grads:
        g.mul_(scale)
24 | 
25 | 
def adjust_learning_rate(optimizer, decay=0.1):
    """Multiply the ``'lr'`` entry of every parameter group by ``decay``.

    Args:
        optimizer: object exposing ``param_groups``, an iterable of dicts that
            each hold an ``'lr'`` entry.
        decay: factor applied to each learning rate (default ``0.1``).
    """
    for group in optimizer.param_groups:
        current_lr = group['lr']
        group['lr'] = decay * current_lr
29 | 
30 | 
def save_checkpoint(state, filename):
    """Serialize ``state`` to ``filename`` using ``torch.save``.

    Args:
        state: object to store; it is handed to ``torch.save`` unchanged.
        filename: destination, forwarded as ``torch.save``'s second argument.
    """
    destination = filename
    torch.save(state, destination)
33 | 


--------------------------------------------------------------------------------
/model/roi_align/src/roi_align_kernel.h:
--------------------------------------------------------------------------------
// Include guard for the RoI-align kernel and launcher declarations.
// This header includes nothing itself; roi_align_cuda.c includes <THC/THC.h>
// before it, so cudaStream_t and __global__ are expected to come from there.
#ifndef _ROI_ALIGN_KERNEL
#define _ROI_ALIGN_KERNEL

// Give the declarations C linkage when the header is compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif

// Forward kernel declaration (definition not visible here).
__global__ void ROIAlignForward(const int nthreads, const float* bottom_data,
    const float spatial_scale, const int height, const int width,
    const int channels, const int aligned_height, const int aligned_width,
    const float* bottom_rois, float* top_data);

// Host-side forward launcher.  roi_align_forward_cuda() calls it with the raw
// feature / RoI / output pointers, the feature dimensions, the number of RoIs
// and the current THC stream, and ignores the int it returns.
// The "Laucher" spelling is part of the symbol name used by that caller.
int ROIAlignForwardLaucher(
    const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
    const int width, const int channels, const int aligned_height,
    const int aligned_width, const float* bottom_rois,
    float* top_data, cudaStream_t stream);

// Backward kernel declaration (definition not visible here).
__global__ void ROIAlignBackward(const int nthreads, const float* top_diff,
    const float spatial_scale, const int height, const int width,
    const int channels, const int aligned_height, const int aligned_width,
    float* bottom_diff, const float* bottom_rois);

// Host-side backward launcher.  roi_align_backward_cuda() calls it with the
// incoming gradient pointer, the dimensions of the gradient buffer to fill
// (including batch_size), the RoI pointer and the current THC stream, and
// ignores the int it returns.
int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int aligned_height,
    const int aligned_width, const float* bottom_rois,
    float* bottom_diff, cudaStream_t stream);

#ifdef __cplusplus
}
#endif

#endif
34 | 
35 | 


--------------------------------------------------------------------------------
/model/roi_align/modules/roi_align.py:
--------------------------------------------------------------------------------
 1 | from torch.nn.modules.module import Module
 2 | from torch.nn.functional import avg_pool2d, max_pool2d
 3 | from ..functions.roi_align import RoIAlignFunction
 4 | 
 5 | 
 6 | class RoIAlign(Module):
 7 |     def __init__(self, aligned_height, aligned_width, spatial_scale):
 8 |         super(RoIAlign, self).__init__()
 9 | 
10 |         self.aligned_width = int(aligned_width)
11 |         self.aligned_height = int(aligned_height)
12 |         self.spatial_scale = float(spatial_scale)
13 | 
14 |     def forward(self, features, rois):
15 |         return RoIAlignFunction(self.aligned_height, self.aligned_width,
16 |                                 self.spatial_scale)(features, rois)
17 | 
class RoIAlignAvg(Module):
    """RoI align followed by a 2x2, stride-1 average pooling.

    ``RoIAlignFunction`` is asked for a map one element larger in each
    direction (``aligned_height + 1`` x ``aligned_width + 1``), which is then
    passed through ``avg_pool2d(kernel_size=2, stride=1)``.
    """

    def __init__(self, aligned_height, aligned_width, spatial_scale):
        super(RoIAlignAvg, self).__init__()

        self.aligned_width = int(aligned_width)
        self.aligned_height = int(aligned_height)
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois):
        """Align to the enlarged size, then average-pool the result."""
        align = RoIAlignFunction(self.aligned_height + 1,
                                 self.aligned_width + 1,
                                 self.spatial_scale)
        enlarged = align(features, rois)
        return avg_pool2d(enlarged, kernel_size=2, stride=1)
30 | 
class RoIAlignMax(Module):
    """RoI align followed by a 2x2, stride-1 max pooling.

    ``RoIAlignFunction`` is asked for a map one element larger in each
    direction (``aligned_height + 1`` x ``aligned_width + 1``), which is then
    passed through ``max_pool2d(kernel_size=2, stride=1)``.
    """

    def __init__(self, aligned_height, aligned_width, spatial_scale):
        super(RoIAlignMax, self).__init__()

        self.aligned_width = int(aligned_width)
        self.aligned_height = int(aligned_height)
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois):
        """Align to the enlarged size, then max-pool the result."""
        align = RoIAlignFunction(self.aligned_height + 1,
                                 self.aligned_width + 1,
                                 self.spatial_scale)
        enlarged = align(features, rois)
        return max_pool2d(enlarged, kernel_size=2, stride=1)
43 | 


--------------------------------------------------------------------------------
/model/roi_pooling/functions/roi_pool.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.autograd import Function
 3 | from .._ext import roi_pooling
 4 | import pdb
 5 | 
 6 | class RoIPoolFunction(Function):
 7 |     def __init__(ctx, pooled_height, pooled_width, spatial_scale):
 8 |         ctx.pooled_width = pooled_width
 9 |         ctx.pooled_height = pooled_height
10 |         ctx.spatial_scale = spatial_scale
11 |         ctx.feature_size = None
12 | 
13 |     def forward(ctx, features, rois): 
14 |         ctx.feature_size = features.size()           
15 |         batch_size, num_channels, data_height, data_width = ctx.feature_size
16 |         num_rois = rois.size(0)
17 |         output = features.new(num_rois, num_channels, ctx.pooled_height, ctx.pooled_width).zero_()
18 |         ctx.argmax = features.new(num_rois, num_channels, ctx.pooled_height, ctx.pooled_width).zero_().int()
19 |         ctx.rois = rois
20 |         if not features.is_cuda:
21 |             _features = features.permute(0, 2, 3, 1)
22 |             roi_pooling.roi_pooling_forward(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale,
23 |                                             _features, rois, output)
24 |         else:
25 |             roi_pooling.roi_pooling_forward_cuda(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale,
26 |                                                  features, rois, output, ctx.argmax)
27 | 
28 |         return output
29 | 
30 |     def backward(ctx, grad_output):
31 |         assert(ctx.feature_size is not None and grad_output.is_cuda)
32 |         batch_size, num_channels, data_height, data_width = ctx.feature_size
33 |         grad_input = grad_output.new(batch_size, num_channels, data_height, data_width).zero_()
34 | 
35 |         roi_pooling.roi_pooling_backward_cuda(ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale,
36 |                                               grad_output, ctx.rois, grad_input, ctx.argmax)
37 | 
38 |         return grad_input, None
39 | 


--------------------------------------------------------------------------------
/model/roi_align/functions/roi_align.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.autograd import Function
 3 | from .._ext import roi_align
 4 | 
 5 | 
 6 | # TODO use save_for_backward instead
 7 | class RoIAlignFunction(Function):
 8 |     def __init__(self, aligned_height, aligned_width, spatial_scale):
 9 |         self.aligned_width = int(aligned_width)
10 |         self.aligned_height = int(aligned_height)
11 |         self.spatial_scale = float(spatial_scale)
12 |         self.rois = None
13 |         self.feature_size = None
14 | 
15 |     def forward(self, features, rois):
16 |         self.rois = rois
17 |         self.feature_size = features.size()
18 | 
19 |         batch_size, num_channels, data_height, data_width = features.size()
20 |         num_rois = rois.size(0)
21 | 
22 |         output = features.new(num_rois, num_channels, self.aligned_height, self.aligned_width).zero_()
23 |         if features.is_cuda:
24 |             roi_align.roi_align_forward_cuda(self.aligned_height,
25 |                                              self.aligned_width,
26 |                                              self.spatial_scale, features,
27 |                                              rois, output)
28 |         else:
29 |             raise NotImplementedError
30 | 
31 |         return output
32 | 
33 |     def backward(self, grad_output):
34 |         assert(self.feature_size is not None and grad_output.is_cuda)
35 | 
36 |         batch_size, num_channels, data_height, data_width = self.feature_size
37 | 
38 |         grad_input = self.rois.new(batch_size, num_channels, data_height,
39 |                                   data_width).zero_()
40 |         roi_align.roi_align_backward_cuda(self.aligned_height,
41 |                                           self.aligned_width,
42 |                                           self.spatial_scale, grad_output,
43 |                                           self.rois, grad_input)
44 | 
45 |         # print grad_input
46 | 
47 |         return grad_input, None
48 | 


--------------------------------------------------------------------------------
/frcnn_eval/imdb.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # Fast R-CNN
 3 | # Copyright (c) 2015 Microsoft
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Written by Ross Girshick
 6 | #
 7 | # Modified by Seungkwan Lee for WSDDN
 8 | # --------------------------------------------------------
 9 | 
10 | import os
11 | import os.path as osp
12 | 
13 | 
class imdb(object):
    """Image database base class.

    Holds the dataset name, its class list and its image index.  Subclasses
    are expected to implement ``image_path_at``, ``default_roidb`` and
    ``evaluate_detections``.
    """

    def __init__(self, name):
        self._name = name
        self._num_classes = 0
        self._classes = []
        self._image_index = []
        # Use this dict for storing dataset specific config options
        self.config = {}

    @property
    def name(self):
        """Dataset name given at construction."""
        return self._name

    @property
    def num_classes(self):
        """Number of entries in ``classes``."""
        return len(self._classes)

    @property
    def classes(self):
        """Class list (``self._classes``)."""
        return self._classes

    @property
    def image_index(self):
        """Image index list (``self._image_index``)."""
        return self._image_index

    @property
    def cache_path(self):
        """Absolute path of the ``cache`` directory, created on first access.

        The data directory is read from ``self.config['data_dir']``.  When that
        entry is absent, the ``data`` directory next to this package is used.
        This replaces a reference to a global ``cfg`` object that is neither
        defined nor imported in this module and therefore always raised
        ``NameError``.
        """
        data_dir = self.config.get('data_dir')
        if data_dir is None:
            # NOTE(review): default assumed to be <repo>/data; confirm.
            package_dir = osp.dirname(osp.abspath(__file__))
            data_dir = osp.join(package_dir, '..', 'data')
        cache_path = osp.abspath(osp.join(data_dir, 'cache'))
        if not os.path.exists(cache_path):
            os.makedirs(cache_path)
        return cache_path

    @property
    def num_images(self):
        """Number of entries in ``image_index``."""
        return len(self.image_index)

    def image_path_at(self, i):
        raise NotImplementedError

    def default_roidb(self):
        raise NotImplementedError

    def evaluate_detections(self, all_boxes, output_dir=None):
        """
        all_boxes is a list of length number-of-classes.
        Each list element is a list of length number-of-images.
        Each of those list elements is either an empty list []
        or a numpy array of detection.

        all_boxes[class][image] = [] or np.array of shape #dets x 5
        """
        raise NotImplementedError

    def competition_mode(self, on):
        """Turn competition mode on or off."""
        pass
72 | 


--------------------------------------------------------------------------------
/utils/cpu_nms.pyx:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # Fast R-CNN
 3 | # Copyright (c) 2015 Microsoft
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Written by Ross Girshick
 6 | # --------------------------------------------------------
 7 | 
 8 | import numpy as np
 9 | cimport numpy as np
10 | 
cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
    # Typed float32 maximum; returns `a` when a >= b, otherwise `b`.
    if a >= b:
        return a
    return b
13 | 
cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
    # Typed float32 minimum; returns `a` when a <= b, otherwise `b`.
    if a <= b:
        return a
    return b
16 | 
def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, double thresh):
    """Greedy non-maximum suppression.

    ``dets`` is a float32 array whose columns 0-4 are read as
    x1, y1, x2, y2 and score.  Boxes are visited in descending score order;
    each kept box suppresses every later box whose overlap with it (IoU,
    computed with the "+1" inclusive-pixel convention) is >= ``thresh``.

    Returns the list of kept row indices, in descending score order.
    """
    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]

    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    # argsort() yields intp indices, so the buffer is typed as intp_t; int_t
    # (C long) does not match on platforms where long is 32-bit.
    cdef np.ndarray[np.intp_t, ndim=1] order = scores.argsort()[::-1]

    cdef int ndets = dets.shape[0]
    # np.intp replaces the np.int alias, which was removed in NumPy 1.24.
    cdef np.ndarray[np.intp_t, ndim=1] suppressed = \
            np.zeros((ndets), dtype=np.intp)

    # nominal indices
    cdef int _i, _j
    # sorted indices
    cdef int i, j
    # temp variables for box i's (the box currently under consideration)
    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
    # variables for computing overlap with box j (lower scoring box)
    cdef np.float32_t xx1, yy1, xx2, yy2
    cdef np.float32_t w, h
    cdef np.float32_t inter, ovr

    keep = []
    for _i in range(ndets):
        i = order[_i]
        if suppressed[i] == 1:
            continue
        keep.append(i)
        ix1 = x1[i]
        iy1 = y1[i]
        ix2 = x2[i]
        iy2 = y2[i]
        iarea = areas[i]
        for _j in range(_i + 1, ndets):
            j = order[_j]
            if suppressed[j] == 1:
                continue
            # Intersection rectangle of box i and box j.
            xx1 = max(ix1, x1[j])
            yy1 = max(iy1, y1[j])
            xx2 = min(ix2, x2[j])
            yy2 = min(iy2, y2[j])
            w = max(0.0, xx2 - xx1 + 1)
            h = max(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (iarea + areas[j] - inter)
            if ovr >= thresh:
                suppressed[j] = 1

    return keep


--------------------------------------------------------------------------------
/model/roi_align/src/roi_align_cuda.c:
--------------------------------------------------------------------------------
 1 | #include <THC/THC.h>
 2 | #include <math.h>
 3 | #include "roi_align_kernel.h"
 4 | 
 5 | extern THCState *state;
 6 | 
 7 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale,
 8 |                         THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output)
 9 | {
10 |     // Grab the input tensor
11 |     float * data_flat = THCudaTensor_data(state, features);
12 |     float * rois_flat = THCudaTensor_data(state, rois);
13 | 
14 |     float * output_flat = THCudaTensor_data(state, output);
15 | 
16 |     // Number of ROIs
17 |     int num_rois = THCudaTensor_size(state, rois, 0);
18 |     int size_rois = THCudaTensor_size(state, rois, 1);
19 |     if (size_rois != 5)
20 |     {
21 |         return 0;
22 |     }
23 | 
24 |     // data height
25 |     int data_height = THCudaTensor_size(state, features, 2);
26 |     // data width
27 |     int data_width = THCudaTensor_size(state, features, 3);
28 |     // Number of channels
29 |     int num_channels = THCudaTensor_size(state, features, 1);
30 | 
31 |     cudaStream_t stream = THCState_getCurrentStream(state);
32 | 
33 |     ROIAlignForwardLaucher(
34 |         data_flat, spatial_scale, num_rois, data_height,
35 |         data_width, num_channels, aligned_height,
36 |         aligned_width, rois_flat,
37 |         output_flat, stream);
38 | 
39 |     return 1;
40 | }
41 | 
42 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale,
43 |                         THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad)
44 | {
45 |     // Grab the input tensor
46 |     float * top_grad_flat = THCudaTensor_data(state, top_grad);
47 |     float * rois_flat = THCudaTensor_data(state, rois);
48 | 
49 |     float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad);
50 | 
51 |     // Number of ROIs
52 |     int num_rois = THCudaTensor_size(state, rois, 0);
53 |     int size_rois = THCudaTensor_size(state, rois, 1);
54 |     if (size_rois != 5)
55 |     {
56 |         return 0;
57 |     }
58 | 
59 |     // batch size
60 |     int batch_size = THCudaTensor_size(state, bottom_grad, 0);
61 |     // data height
62 |     int data_height = THCudaTensor_size(state, bottom_grad, 2);
63 |     // data width
64 |     int data_width = THCudaTensor_size(state, bottom_grad, 3);
65 |     // Number of channels
66 |     int num_channels = THCudaTensor_size(state, bottom_grad, 1);
67 | 
68 |     cudaStream_t stream = THCState_getCurrentStream(state);
69 |     ROIAlignBackwardLaucher(
70 |         top_grad_flat, spatial_scale, batch_size, num_rois, data_height,
71 |         data_width, num_channels, aligned_height,
72 |         aligned_width, rois_flat,
73 |         bottom_grad_flat, stream);
74 | 
75 |     return 1;
76 | }
77 | 


--------------------------------------------------------------------------------
/model/roi_pooling/src/roi_pooling_cuda.c:
--------------------------------------------------------------------------------
 1 | #include <THC/THC.h>
 2 | #include <math.h>
 3 | #include "roi_pooling_kernel.h"
 4 | 
 5 | extern THCState *state;
 6 | 
 7 | int roi_pooling_forward_cuda(int pooled_height, int pooled_width, float spatial_scale,
 8 |                         THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output, THCudaIntTensor * argmax)
 9 | {
10 |     // Grab the input tensor
11 |     float * data_flat = THCudaTensor_data(state, features);
12 |     float * rois_flat = THCudaTensor_data(state, rois);
13 | 
14 |     float * output_flat = THCudaTensor_data(state, output);
15 |     int * argmax_flat = THCudaIntTensor_data(state, argmax);
16 | 
17 |     // Number of ROIs
18 |     int num_rois = THCudaTensor_size(state, rois, 0);
19 |     int size_rois = THCudaTensor_size(state, rois, 1);
20 |     if (size_rois != 5)
21 |     {
22 |         return 0;
23 |     }
24 | 
25 |     // batch size
26 |     // int batch_size = THCudaTensor_size(state, features, 0);
27 |     // if (batch_size != 1)
28 |     // {
29 |     //     return 0;
30 |     // }
31 |     // data height
32 |     int data_height = THCudaTensor_size(state, features, 2);
33 |     // data width
34 |     int data_width = THCudaTensor_size(state, features, 3);
35 |     // Number of channels
36 |     int num_channels = THCudaTensor_size(state, features, 1);
37 | 
38 |     cudaStream_t stream = THCState_getCurrentStream(state);
39 | 
40 |     ROIPoolForwardLaucher(
41 |         data_flat, spatial_scale, num_rois, data_height,
42 |         data_width, num_channels, pooled_height,
43 |         pooled_width, rois_flat,
44 |         output_flat, argmax_flat, stream);
45 | 
46 |     return 1;
47 | }
48 | 
49 | int roi_pooling_backward_cuda(int pooled_height, int pooled_width, float spatial_scale,
50 |                         THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad, THCudaIntTensor * argmax)
51 | {
52 |     // Grab the input tensor
53 |     float * top_grad_flat = THCudaTensor_data(state, top_grad);
54 |     float * rois_flat = THCudaTensor_data(state, rois);
55 | 
56 |     float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad);
57 |     int * argmax_flat = THCudaIntTensor_data(state, argmax);
58 | 
59 |     // Number of ROIs
60 |     int num_rois = THCudaTensor_size(state, rois, 0);
61 |     int size_rois = THCudaTensor_size(state, rois, 1);
62 |     if (size_rois != 5)
63 |     {
64 |         return 0;
65 |     }
66 | 
67 |     // batch size
68 |     int batch_size = THCudaTensor_size(state, bottom_grad, 0);
69 |     // if (batch_size != 1)
70 |     // {
71 |     //     return 0;
72 |     // }
73 |     // data height
74 |     int data_height = THCudaTensor_size(state, bottom_grad, 2);
75 |     // data width
76 |     int data_width = THCudaTensor_size(state, bottom_grad, 3);
77 |     // Number of channels
78 |     int num_channels = THCudaTensor_size(state, bottom_grad, 1);
79 | 
80 |     cudaStream_t stream = THCState_getCurrentStream(state);
81 |     ROIPoolBackwardLaucher(
82 |         top_grad_flat, spatial_scale, batch_size, num_rois, data_height,
83 |         data_width, num_channels, pooled_height,
84 |         pooled_width, rois_flat,
85 |         bottom_grad_flat, argmax_flat, stream);
86 | 
87 |     return 1;
88 | }
89 | 


--------------------------------------------------------------------------------
/utils/box_utils.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # PyTorch WSDDN
 3 | # Copyright 2018. Seungkwan Lee
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Written by Seungkwan Lee
 6 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
 7 | # --------------------------------------------------------
 8 | import torch
 9 | 
10 | 
def element_wise_iou(boxes_a, boxes_b):
    """
    Compute the IoU of boxes_a[i] with boxes_b[i] for every row i.

    Coordinates are treated as inclusive, so widths and heights are
    ``max - min + 1``.

    :param boxes_a: (n, 4) minmax form boxes
    :param boxes_b: (n, 4) minmax form boxes
    :return: (n) iou
    """

    def _area(boxes):
        return (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)

    # Corners of the overlap rectangle.
    lower = torch.max(boxes_a[:, :2], boxes_b[:, :2])
    upper = torch.min(boxes_a[:, 2:], boxes_b[:, 2:])

    # Negative extents mean "no overlap" and are clamped to zero.
    overlap_wh = torch.clamp(upper - lower + 1, min=0)
    intersection = overlap_wh[:, 0] * overlap_wh[:, 1]

    union = _area(boxes_a) + _area(boxes_b) - intersection
    return intersection / union
26 | 
27 | 
def all_pair_iou(boxes_a, boxes_b):
    """
    Compute the IoU between every box of one set and every box of another.

    Coordinates are treated as inclusive, so widths and heights are
    ``max - min + 1``.

    :param boxes_a: (n, 4) minmax form boxes
    :param boxes_b: (m, 4) minmax form boxes
    :return: (n, m) iou of all pairs of two set
    """

    def _area(boxes):
        return (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)

    # Insert singleton axes so the two sets broadcast to (n, m, 2).
    a_rows = boxes_a.unsqueeze(1)
    b_cols = boxes_b.unsqueeze(0)

    upper = torch.min(a_rows[:, :, 2:], b_cols[:, :, 2:])
    lower = torch.max(a_rows[:, :, :2], b_cols[:, :, :2])

    # Negative extents mean "no overlap" and are clamped to zero.
    overlap_wh = torch.clamp(upper - lower + 1, min=0)
    intersection = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]

    union = _area(boxes_a).unsqueeze(1) + _area(boxes_b).unsqueeze(0) - intersection
    return intersection / union
47 | 
48 | 
def transform(boxes, transform_param):
    """
    Apply per-box offsets to boxes in center form.

    Centers are shifted by the first two parameters scaled by the box
    size; sizes are multiplied by the exponential of the last two.

    :param boxes: (n, 4) tensor, (cx, cy, w, h) form.
    :param transform_param: (n, 4) tensor.
    :return: (n, 4) transformed boxes, (cx, cy, w, h) form.
    """
    centers = boxes[:, :2]
    sizes = boxes[:, 2:4]

    shifted_centers = centers + transform_param[:, :2] * sizes
    scaled_sizes = sizes * torch.exp(transform_param[:, 2:4])

    return torch.cat([shifted_centers, scaled_sizes], 1)
63 | 
64 | 
def to_cwh_form(boxes):
    """
    Convert boxes from minmax form to center form.

    Widths and heights use the inclusive convention ``max - min + 1``.

    :param boxes: (n, 4) tensor, (xmin, ymin, xmax, ymax) form.
    :return: (n, 4) tensor, (cx, cy, w, h) form
    """
    x_min, y_min, x_max, y_max = (boxes[:, col] for col in range(4))

    center_x = (x_min + x_max) / 2
    center_y = (y_min + y_max) / 2
    width = x_max - x_min + 1
    height = y_max - y_min + 1
    return torch.stack([center_x, center_y, width, height], 1)
76 | 
77 | 
def to_minmax_form(boxes):
    """
    Convert boxes from center form to minmax form.

    Uses the inclusive convention, i.e. a box of width ``w`` spans
    ``w - 1`` between xmin and xmax.

    :param boxes: (n, 4) tensor, (cx, cy, w, h) form.
    :return: (n, 4) tensor, (xmin, ymin, xmax, ymax) form
    """
    center_x, center_y = boxes[:, 0], boxes[:, 1]
    half_w = boxes[:, 2] / 2
    half_h = boxes[:, 3] / 2

    corners = [
        center_x - half_w + 0.5,
        center_y - half_h + 0.5,
        center_x + half_w - 0.5,
        center_y + half_h - 0.5,
    ]
    return torch.stack(corners, 1)


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # Fast R-CNN
 3 | # Copyright (c) 2015 Microsoft
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Written by Ross Girshick
 6 | # --------------------------------------------------------
 7 | 
 8 | import os
 9 | from os.path import join as pjoin
10 | from setuptools import setup
11 | from distutils.extension import Extension
12 | from Cython.Distutils import build_ext
13 | import subprocess
14 | import numpy as np
15 | 
def find_in_path(name, path):
    """Find a file in a search path.

    ``path`` is a string of directories separated by ``os.pathsep``. The
    absolute path of the first existing ``<directory>/<name>`` is
    returned, or ``None`` when no directory contains ``name``.
    """
    # Adapted from
    # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
    candidates = (pjoin(entry, name) for entry in path.split(os.pathsep))
    matches = (os.path.abspath(candidate)
               for candidate in candidates
               if os.path.exists(candidate))
    return next(matches, None)
25 | 
26 | 
27 | 
# Obtain the numpy include directory.  This logic works across numpy versions.
# The result is passed as an include dir to the extension declared further down.
try:
    numpy_include = np.get_include()
except AttributeError:
    # Fallback for numpy builds that do not expose get_include().
    numpy_include = np.get_numpy_include()
33 | 
def customize_compiler_for_nvcc(self):
    """Inject deep into distutils to customize how the dispatch
    to gcc/nvcc works.

    If you subclass UnixCCompiler, it's not trivial to get your subclass
    injected in, and still have the right customizations (i.e.
    distutils.sysconfig.customize_compiler) run on it. So instead of going
    the OO route, this patches the compiler instance in place; it is kind
    of like a weird functional subclassing.

    :param self: the distutils compiler instance to patch. Its
        ``src_extensions`` gains ``'.cu'`` and its ``_compile`` method is
        replaced by a wrapper that picks the compiler per source file.

    The wrapper accepts ``extra_postargs`` either as a dict keyed by
    ``'gcc'`` / ``'nvcc'`` (missing keys mean "no extra flags") or as a
    plain list, which is then used for every source file.

    :raises EnvironmentError: (from the wrapper) when a ``.cu`` file has
        to be compiled but no ``nvcc`` binary is found on ``$PATH``.
    """

    # tell the compiler it can process .cu
    self.src_extensions.append('.cu')

    # save references to the default compiler_so and _compile methods
    default_compiler_so = self.compiler_so
    default_compile = self._compile

    def _postargs_for(extra_postargs, compiler_key):
        # Extensions may pass a per-compiler dict or an ordinary flag list.
        if isinstance(extra_postargs, dict):
            return extra_postargs.get(compiler_key, [])
        return extra_postargs

    # now redefine the _compile method. This gets executed for each
    # object but distutils doesn't have the ability to change compilers
    # based on source extension: we add it.
    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
        if os.path.splitext(src)[1] == '.cu':
            # Locate nvcc lazily so that CPU-only builds never need CUDA.
            nvcc = find_in_path('nvcc', os.environ.get('PATH', ''))
            if nvcc is None:
                raise EnvironmentError(
                    'The nvcc binary could not be located in your $PATH; '
                    'it is required to compile %s' % src)
            # use the cuda compiler for .cu files
            self.set_executable('compiler_so', nvcc)
            # use only a subset of the extra_postargs, which are 1-1 translated
            # from the extra_compile_args in the Extension class
            postargs = _postargs_for(extra_postargs, 'nvcc')
        else:
            postargs = _postargs_for(extra_postargs, 'gcc')

        try:
            default_compile(obj, src, ext, cc_args, postargs, pp_opts)
        finally:
            # reset the default compiler_so, which we might have changed for
            # cuda, even when the compilation itself failed
            self.compiler_so = default_compiler_so

    # inject our redefined _compile method into the instance
    self._compile = _compile
69 | 
70 | 
# run the customize_compiler
class custom_build_ext(build_ext):
    """build_ext command that patches the compiler before building.

    The compiler instance is passed through customize_compiler_for_nvcc
    and then the regular build_ext.build_extensions is run.
    """

    def build_extensions(self):
        # Patch first so every extension is compiled by the wrapped _compile.
        customize_compiler_for_nvcc(self.compiler)
        build_ext.build_extensions(self)
76 | 
77 | 
# Extensions to build: a single Cython module, utils.cpu_nms.
ext_modules = [
    Extension(
        "utils.cpu_nms",
        ["utils/cpu_nms.pyx"],
        # A dict keyed by compiler name rather than a flat list; the patched
        # _compile installed by customize_compiler_for_nvcc picks the entry.
        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
        include_dirs = [numpy_include]
    ),
]

setup(
    name='fast_rcnn',
    ext_modules=ext_modules,
    # inject our custom trigger
    cmdclass={'build_ext': custom_build_ext},
)


--------------------------------------------------------------------------------
/datasets/voc_loader.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # PyTorch WSDDN
 3 | # Licensed under The MIT License [see LICENSE for details]
 4 | # Written by Seungkwan Lee
 5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
 6 | # --------------------------------------------------------
 7 | from scipy.misc import imread
 8 | from scipy.io import loadmat
 9 | import numpy as np
10 | import sys
11 | import os
12 | import xml.etree.ElementTree as ET
13 | 
# The 20 PASCAL VOC object categories; the list position is the class index.
VOC_CLASSES = [
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor']


class VOCLoader:
    """In-memory index of one PASCAL VOC image set plus its box proposals.

    After construction ``self.items`` holds one dict per image with the keys
    ``id``, ``boxes`` (ground truth, 0-based (xmin, ymin, xmax, ymax)),
    ``categories`` (indices into VOC_CLASSES), ``img_path``, ``proposals``
    and ``prop_scores``.
    """

    def __init__(self, root, prop_method, min_prop_scale, year, name):
        """
        :param root: data directory containing ``proposals/`` and ``VOCdevkit2007/``.
        :param prop_method: ``'eb'`` (edge boxes) or ``'ss'`` (selective search).
        :param min_prop_scale: minimum extent a proposal needs along both axes.
        :param year: VOC year as a string, e.g. ``'2007'``.
        :param name: image set name, e.g. ``'trainval'`` or ``'test'``.
        :raises ValueError: if ``prop_method`` is neither ``'eb'`` nor ``'ss'``.
        """
        self.items = []
        self.name_to_index = dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        print('VOC %s %s dataset loading...' % (year, name))

        proposals, prop_scores = self._load_proposals(root, prop_method, min_prop_scale, year, name)

        rootpath = os.path.join(root, 'VOCdevkit2007', 'VOC' + year)
        # Read the id list up front so the file handle is closed promptly;
        # blank lines (e.g. a trailing newline) are ignored.
        with open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')) as image_set:
            image_ids = [line.strip() for line in image_set if line.strip()]

        for image_id in image_ids:
            target = ET.parse(os.path.join(rootpath, 'Annotations', image_id + '.xml'))

            box_set = []
            category_set = []
            for obj in target.iter('object'):
                cls_name = obj.find('name').text.strip().lower()
                bbox = obj.find('bndbox')

                # Annotation coordinates are shifted by one to become 0-based.
                corners = [int(bbox.find(tag).text) - 1 for tag in ('xmin', 'ymin', 'xmax', 'ymax')]

                box_set.append(np.array(corners, np.float32))
                category_set.append(self.name_to_index[cls_name])

            self.items.append({
                'id': image_id,
                'boxes': np.array(box_set),
                'categories': np.array(category_set, np.int64),
                'img_path': os.path.join(rootpath, 'JPEGImages', image_id + '.jpg'),
                'proposals': proposals[image_id],
                'prop_scores': prop_scores[image_id],
            })

        print('VOC %s %s dataset loading complete' % (year, name))

    @staticmethod
    def _filter_proposals(raw_boxes, scores, min_prop_scale):
        """Shift boxes to 0-based, drop too-small ones and swap column pairs.

        Returns ``(boxes, scores)`` restricted to the boxes whose extent is at
        least ``min_prop_scale`` along both axes, with columns reordered from
        (0, 1, 2, 3) to (1, 0, 3, 2).
        """
        boxes = raw_boxes.astype(np.float64) - 1
        big_enough = ((boxes[:, 2] >= boxes[:, 0] + min_prop_scale)
                      & (boxes[:, 3] >= boxes[:, 1] + min_prop_scale))
        keep = np.nonzero(big_enough)[0]
        boxes = boxes[keep]
        # presumably the .mat files store (ymin, xmin, ymax, xmax); the swap
        # yields the same column order as the ground-truth boxes -- TODO confirm
        reordered = np.concatenate([boxes[:, 1:2], boxes[:, 0:1], boxes[:, 3:4], boxes[:, 2:3]], 1)
        return reordered, scores[keep]

    @staticmethod
    def _load_proposals(root, prop_method, min_prop_scale, year, name):
        """Load the proposal .mat file and return ``(proposals, prop_scores)`` dicts keyed by image id."""
        proposals = {}
        prop_scores = {}
        if prop_method == 'eb':
            raw_data = loadmat(os.path.join(root, 'proposals', 'edge_boxes_voc_%s_%s.mat' % (year, name)))
            for i in range(len(raw_data['images'][0])):
                image_id = raw_data['images'][0][i][0]
                scores = raw_data['boxScores'][0][i][:, 0]
                proposals[image_id], prop_scores[image_id] = VOCLoader._filter_proposals(
                    raw_data['boxes'][0][i], scores, min_prop_scale)

        elif prop_method == 'ss':
            raw_data = loadmat(os.path.join(root, 'proposals', 'selective_search_voc_%s_%s.mat' % (year, name)))
            for i in range(len(raw_data['images'])):
                image_id = raw_data['images'][i][0][0]
                raw_boxes = raw_data['boxes'][0][i]
                # Selective search provides no scores; use zeros instead.
                scores = np.zeros(len(raw_boxes))
                proposals[image_id], prop_scores[image_id] = VOCLoader._filter_proposals(
                    raw_boxes, scores, min_prop_scale)

        else:
            raise ValueError("Unknown proposal method %r (expected 'eb' or 'ss')" % (prop_method,))

        return proposals, prop_scores

    def __len__(self):
        """Number of images in this image set."""
        return len(self.items)
89 | 


--------------------------------------------------------------------------------
/model/roi_pooling/src/roi_pooling.c:
--------------------------------------------------------------------------------
  1 | #include <TH/TH.h>
  2 | #include <math.h>
  3 | 
  4 | int roi_pooling_forward(int pooled_height, int pooled_width, float spatial_scale,
  5 |                         THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output)
  6 | {
  7 |     // Grab the input tensor
  8 |     float * data_flat = THFloatTensor_data(features);
  9 |     float * rois_flat = THFloatTensor_data(rois);
 10 | 
 11 |     float * output_flat = THFloatTensor_data(output);
 12 | 
 13 |     // Number of ROIs
 14 |     int num_rois = THFloatTensor_size(rois, 0);
 15 |     int size_rois = THFloatTensor_size(rois, 1);
 16 |     // batch size
 17 |     int batch_size = THFloatTensor_size(features, 0);
 18 |     if(batch_size != 1)
 19 |     {
 20 |         return 0;
 21 |     }
 22 |     // data height
 23 |     int data_height = THFloatTensor_size(features, 1);
 24 |     // data width
 25 |     int data_width = THFloatTensor_size(features, 2);
 26 |     // Number of channels
 27 |     int num_channels = THFloatTensor_size(features, 3);
 28 | 
 29 |     // Set all element of the output tensor to -inf.
 30 |     THFloatStorage_fill(THFloatTensor_storage(output), -1);
 31 | 
 32 |     // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
 33 |     int index_roi = 0;
 34 |     int index_output = 0;
 35 |     int n;
 36 |     for (n = 0; n < num_rois; ++n)
 37 |     {
 38 |         int roi_batch_ind = rois_flat[index_roi + 0];
 39 |         int roi_start_w = round(rois_flat[index_roi + 1] * spatial_scale);
 40 |         int roi_start_h = round(rois_flat[index_roi + 2] * spatial_scale);
 41 |         int roi_end_w = round(rois_flat[index_roi + 3] * spatial_scale);
 42 |         int roi_end_h = round(rois_flat[index_roi + 4] * spatial_scale);
 43 |         //      CHECK_GE(roi_batch_ind, 0);
 44 |         //      CHECK_LT(roi_batch_ind, batch_size);
 45 | 
 46 |         int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
 47 |         int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
 48 |         float bin_size_h = (float)(roi_height) / (float)(pooled_height);
 49 |         float bin_size_w = (float)(roi_width) / (float)(pooled_width);
 50 | 
 51 |         int index_data = roi_batch_ind * data_height * data_width * num_channels;
 52 |         const int output_area = pooled_width * pooled_height;
 53 | 
 54 |         int c, ph, pw;
 55 |         for (ph = 0; ph < pooled_height; ++ph)
 56 |         {
 57 |             for (pw = 0; pw < pooled_width; ++pw)
 58 |             {
 59 |                 int hstart = (floor((float)(ph) * bin_size_h));
 60 |                 int wstart = (floor((float)(pw) * bin_size_w));
 61 |                 int hend = (ceil((float)(ph + 1) * bin_size_h));
 62 |                 int wend = (ceil((float)(pw + 1) * bin_size_w));
 63 | 
 64 |                 hstart = fminf(fmaxf(hstart + roi_start_h, 0), data_height);
 65 |                 hend = fminf(fmaxf(hend + roi_start_h, 0), data_height);
 66 |                 wstart = fminf(fmaxf(wstart + roi_start_w, 0), data_width);
 67 |                 wend = fminf(fmaxf(wend + roi_start_w, 0), data_width);
 68 | 
 69 |                 const int pool_index = index_output + (ph * pooled_width + pw);
 70 |                 int is_empty = (hend <= hstart) || (wend <= wstart);
 71 |                 if (is_empty)
 72 |                 {
 73 |                     for (c = 0; c < num_channels * output_area; c += output_area)
 74 |                     {
 75 |                         output_flat[pool_index + c] = 0;
 76 |                     }
 77 |                 }
 78 |                 else
 79 |                 {
 80 |                     int h, w, c;
 81 |                     for (h = hstart; h < hend; ++h)
 82 |                     {
 83 |                         for (w = wstart; w < wend; ++w)
 84 |                         {
 85 |                             for (c = 0; c < num_channels; ++c)
 86 |                             {
 87 |                                 const int index = (h * data_width + w) * num_channels + c;
 88 |                                 if (data_flat[index_data + index] > output_flat[pool_index + c * output_area])
 89 |                                 {
 90 |                                     output_flat[pool_index + c * output_area] = data_flat[index_data + index];
 91 |                                 }
 92 |                             }
 93 |                         }
 94 |                     }
 95 |                 }
 96 |             }
 97 |         }
 98 | 
 99 |         // Increment ROI index
100 |         index_roi += size_rois;
101 |         index_output += pooled_height * pooled_width * num_channels;
102 |     }
103 |     return 1;
104 | }


--------------------------------------------------------------------------------
/datasets/wsddn_dataset.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # PyTorch WSDDN
  3 | # Licensed under The MIT License [see LICENSE for details]
  4 | # Written by Seungkwan Lee
  5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
  6 | # --------------------------------------------------------
  7 | import torch.utils.data as data
  8 | import torch
  9 | 
 10 | from scipy.misc import imread
 11 | import numpy as np
 12 | import cv2
 13 | from datasets.voc_loader import VOCLoader
 14 | 
 15 | 
 16 | class WSDDNDataset(data.Dataset):
 17 |     def __init__(self, dataset_names, data_dir, prop_method, num_classes=20, min_prop_scale=20):
 18 |         self._dataset_loaders = []
 19 |         self.num_classes = num_classes
 20 |         for name in dataset_names:
 21 |             if name == 'voc07_trainval':
 22 |                 self._dataset_loaders.append(VOCLoader(data_dir, prop_method, min_prop_scale, '2007', 'trainval'))
 23 |             elif name == 'voc07_test':
 24 |                 self._dataset_loaders.append(VOCLoader(data_dir, prop_method, min_prop_scale, '2007', 'test'))
 25 |             else:
 26 |                 raise Exception('Undefined dataset %s' % name)
 27 | 
 28 |     def get_data(self, index, h_flip=False, target_im_size=688, min_resize=False):
 29 |         im, gt_boxes, gt_categories, proposals, prop_scores, id, loader_index = self.get_raw_data(index)
 30 |         raw_img = im.copy()
 31 | 
 32 |         # rgb -> bgr
 33 |         im = im[:, :, ::-1]
 34 | 
 35 |         # horizontal flip
 36 |         if h_flip:
 37 |             im = im[:, ::-1, :]
 38 |             raw_img = raw_img[:, ::-1, :].copy()
 39 | 
 40 |             flipped_xmin = im.shape[1] - gt_boxes[:, 2]
 41 |             flipped_xmax = im.shape[1] - gt_boxes[:, 0]
 42 |             gt_boxes[:, 0] = flipped_xmin
 43 |             gt_boxes[:, 2] = flipped_xmax
 44 | 
 45 |             flipped_xmin = im.shape[1] - proposals[:, 2]
 46 |             flipped_xmax = im.shape[1] - proposals[:, 0]
 47 |             proposals[:, 0] = flipped_xmin
 48 |             proposals[:, 2] = flipped_xmax
 49 | 
 50 |         # cast to float type and mean subtraction
 51 |         im = im.astype(np.float32, copy=False)
 52 |         im -= np.array([[[102.9801, 115.9465, 122.7717]]])
 53 | 
 54 |         # image rescale
 55 |         im_shape = im.shape
 56 |         im_size_min = np.min(im_shape[0:2])
 57 |         im_size_max = np.max(im_shape[0:2])
 58 | 
 59 |         if min_resize:
 60 |             im_scale = target_im_size / float(im_size_min)
 61 |         else:
 62 |             im_scale = target_im_size / float(im_size_max)
 63 | 
 64 |         if im_size_max * im_scale > 2000:
 65 |             im_scale = 2000 / im_size_max
 66 |         im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
 67 | 
 68 |         gt_boxes = gt_boxes * im_scale
 69 |         proposals = proposals * im_scale
 70 | 
 71 |         # to tensor
 72 |         data = torch.tensor(im, dtype=torch.float32)
 73 |         data = data.permute(2, 0, 1).contiguous()
 74 |         gt_boxes = torch.tensor(gt_boxes, dtype=torch.float32)
 75 |         proposals = torch.tensor(proposals, dtype=torch.float32)
 76 |         prop_scores = torch.tensor(prop_scores, dtype=torch.float32)
 77 |         gt_categories = torch.tensor(gt_categories, dtype=torch.long)
 78 | 
 79 |         image_level_label = torch.zeros(self.num_classes, dtype=torch.uint8)
 80 |         for label in gt_categories:
 81 |             image_level_label[label] = 1
 82 |         return data, gt_boxes, gt_categories, proposals, prop_scores, image_level_label, im_scale, raw_img, id
 83 | 
 84 |     def get_raw_proposal(self, index):
 85 |         here = None
 86 |         loader_index = 0
 87 | 
 88 |         # select proper data loader by index
 89 |         for loader in self._dataset_loaders:
 90 |             if index < len(loader):
 91 |                 here = loader.items[index]
 92 |                 break
 93 |             else:
 94 |                 index -= len(loader)
 95 |                 loader_index += 1
 96 | 
 97 |         proposals = here['proposals'].copy()
 98 |         return proposals
 99 | 
100 |     def get_raw_data(self, index):
101 |         here = None
102 |         loader_index = 0
103 | 
104 |         # select proper data loader by index
105 |         for loader in self._dataset_loaders:
106 |             if index < len(loader):
107 |                 here = loader.items[index]
108 |                 break
109 |             else:
110 |                 index -= len(loader)
111 |                 loader_index += 1
112 | 
113 |         assert here is not None
114 |         im = imread(here['img_path'])
115 | 
116 |         # gray to rgb
117 |         if len(im.shape) == 2:
118 |             im = im[:, :, np.newaxis]
119 |             im = np.concatenate((im, im, im), axis=2)
120 | 
121 |         gt_boxes = here['boxes'].copy()
122 |         gt_categories = here['categories'].copy()
123 |         proposals = here['proposals'].copy()
124 |         prop_scores = here['prop_scores'].copy()
125 |         id = here['id']
126 |         return im, gt_boxes, gt_categories, proposals, prop_scores, id, loader_index
127 | 
128 |     def __len__(self):
129 |         tot_len = 0
130 |         for loader in self._dataset_loaders:
131 |             tot_len += len(loader)
132 |         return tot_len
133 | 


--------------------------------------------------------------------------------
/frcnn_eval/pascal_voc.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # Fast R-CNN
  3 | # Copyright (c) 2015 Microsoft
  4 | # Licensed under The MIT License [see LICENSE for details]
  5 | # Written by Ross Girshick
  6 | #
  7 | # Modified by Seungkwan Lee for WSDDN
  8 | # --------------------------------------------------------
  9 | 
 10 | import os
 11 | from frcnn_eval.imdb import imdb
 12 | import numpy as np
 13 | from frcnn_eval.voc_eval import voc_eval
 14 | import uuid
 15 | 
 16 | class voc_eval_kit(imdb):
 17 |     def __init__(self, image_set, year, devkit_path):
 18 |         imdb.__init__(self, 'voc_' + year + '_' + image_set)
 19 |         self._year = year
 20 |         self._image_set = image_set
 21 |         self._devkit_path = devkit_path
 22 |         self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
 23 |         self._classes = ('aeroplane', 'bicycle', 'bird', 'boat',
 24 |                          'bottle', 'bus', 'car', 'cat', 'chair',
 25 |                          'cow', 'diningtable', 'dog', 'horse',
 26 |                          'motorbike', 'person', 'pottedplant',
 27 |                          'sheep', 'sofa', 'train', 'tvmonitor')
 28 |         self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
 29 |         self._image_ext = '.jpg'
 30 |         self._image_index = self._load_image_set_index()
 31 |         self._salt = str(uuid.uuid4())
 32 | 
 33 | 
 34 |         assert os.path.exists(self._devkit_path), 'VOCdevkit path does not exist: {}'.format(self._devkit_path)
 35 |         assert os.path.exists(self._data_path), 'Path does not exist: {}'.format(self._data_path)
 36 | 
 37 |     def _load_image_set_index(self):
 38 |         """
 39 |         Load the indexes listed in this dataset's image set file.
 40 |         """
 41 |         # Example path to image set file:
 42 |         # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
 43 |         image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
 44 |                                       self._image_set + '.txt')
 45 |         assert os.path.exists(image_set_file), \
 46 |                 'Path does not exist: {}'.format(image_set_file)
 47 |         with open(image_set_file) as f:
 48 |             image_index = [x.strip() for x in f.readlines()]
 49 |         return image_index
 50 | 
 51 |     def _get_voc_results_file_template(self):
 52 |         # VOCdevkit/results/VOC2007/Main/<comp_id>_det_test_aeroplane.txt
 53 |         filename = 'det_' + self._image_set + '_' + self._salt + '_{:s}.txt'
 54 |         path = os.path.join(
 55 |             self._devkit_path,
 56 |             'results',
 57 |             'VOC' + self._year,
 58 |             'Main',
 59 |             filename)
 60 |         return path
 61 | 
 62 |     def _write_voc_results_file(self, all_boxes):
 63 |         for cls_ind, cls in enumerate(self.classes):
 64 |             print('Writing {} VOC results file'.format(cls))
 65 |             filename = self._get_voc_results_file_template().format(cls)
 66 |             with open(filename, 'wt') as f:
 67 |                 for im_ind, index in enumerate(self.image_index):
 68 |                     dets = all_boxes[cls_ind][im_ind]
 69 |                     if dets == []:
 70 |                         continue
 71 |                     # the VOCdevkit expects 1-based indices
 72 |                     for k in range(dets.shape[0]):
 73 |                         f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
 74 |                                 format(index, dets[k, -1],
 75 |                                        dets[k, 0] + 1, dets[k, 1] + 1,
 76 |                                        dets[k, 2] + 1, dets[k, 3] + 1))
 77 | 
 78 |     def _do_python_eval(self):
 79 |         annopath = os.path.join(
 80 |             self._devkit_path,
 81 |             'VOC' + self._year,
 82 |             'Annotations',
 83 |             '{:s}.xml')
 84 |         imagesetfile = os.path.join(
 85 |             self._devkit_path,
 86 |             'VOC' + self._year,
 87 |             'ImageSets',
 88 |             'Main',
 89 |             self._image_set + '.txt')
 90 |         cachedir = os.path.join(self._devkit_path, 'annotations_cache')
 91 |         aps = []
 92 |         # The PASCAL VOC metric changed in 2010
 93 |         use_07_metric = True if int(self._year) < 2010 else False
 94 |         print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
 95 |         for i, cls in enumerate(self._classes):
 96 |             filename = self._get_voc_results_file_template().format(cls)
 97 |             rec, prec, ap = voc_eval(filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, use_07_metric=use_07_metric)
 98 |             aps += [ap]
 99 |             print('AP for {} = {:.4f}'.format(cls, ap))
100 |         print('Mean AP = {:.4f}'.format(np.mean(aps)))
101 |         print('~~~~~~~~')
102 |         print('Results:')
103 |         for ap in aps:
104 |             print('{:.3f}'.format(ap))
105 |         print('{:.3f}'.format(np.mean(aps)))
106 |         print('~~~~~~~~')
107 |         print('')
108 |         print('--------------------------------------------------------------')
109 |         print('Results computed with the **unofficial** Python eval code.')
110 |         print('Results should be very close to the official MATLAB eval code.')
111 |         print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
112 |         print('-- Thanks, The Management')
113 |         print('--------------------------------------------------------------')
114 | 
115 |     def evaluate_detections(self, all_boxes):
116 |         self._write_voc_results_file(all_boxes)
117 |         self._do_python_eval()
118 |         for cls in self._classes:
119 |             filename = self._get_voc_results_file_template().format(cls)
120 | #            os.remove(filename)
121 | 


--------------------------------------------------------------------------------
/frcnn_eval/voc_eval.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # Fast/er R-CNN
  3 | # Licensed under The MIT License [see LICENSE for details]
  4 | # Written by Bharath Hariharan
  5 | #
  6 | # Modified by Seungkwan Lee for WSDDN
  7 | # --------------------------------------------------------
  8 | 
  9 | import xml.etree.ElementTree as ET
 10 | import os
 11 | import pickle
 12 | import numpy as np
 13 | 
 14 | 
 15 | def parse_rec(filename):
 16 |     """ Parse a PASCAL VOC xml file """
 17 |     tree = ET.parse(filename)
 18 |     objects = []
 19 |     for obj in tree.findall('object'):
 20 |         obj_struct = {}
 21 |         obj_struct['name'] = obj.find('name').text
 22 |         obj_struct['pose'] = obj.find('pose').text
 23 |         obj_struct['truncated'] = int(obj.find('truncated').text)
 24 |         obj_struct['difficult'] = int(obj.find('difficult').text)
 25 |         bbox = obj.find('bndbox')
 26 |         obj_struct['bbox'] = [int(bbox.find('xmin').text),
 27 |                               int(bbox.find('ymin').text),
 28 |                               int(bbox.find('xmax').text),
 29 |                               int(bbox.find('ymax').text)]
 30 |         objects.append(obj_struct)
 31 | 
 32 |     return objects
 33 | 
 34 | 
 35 | def voc_ap(rec, prec, use_07_metric=False):
 36 |     """ ap = voc_ap(rec, prec, [use_07_metric])
 37 |     Compute VOC AP given precision and recall.
 38 |     If use_07_metric is true, uses the
 39 |     VOC 07 11 point method (default:False).
 40 |     """
 41 |     if use_07_metric:
 42 |         # 11 point metric
 43 |         ap = 0.
 44 |         for t in np.arange(0., 1.1, 0.1):
 45 |             if np.sum(rec >= t) == 0:
 46 |                 p = 0
 47 |             else:
 48 |                 p = np.max(prec[rec >= t])
 49 |             ap = ap + p / 11.
 50 |     else:
 51 |         # correct AP calculation
 52 |         # first append sentinel values at the end
 53 |         mrec = np.concatenate(([0.], rec, [1.]))
 54 |         mpre = np.concatenate(([0.], prec, [0.]))
 55 | 
 56 |         # compute the precision envelope
 57 |         for i in range(mpre.size - 1, 0, -1):
 58 |             mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
 59 | 
 60 |         # to calculate area under PR curve, look for points
 61 |         # where X axis (recall) changes value
 62 |         i = np.where(mrec[1:] != mrec[:-1])[0]
 63 | 
 64 |         # and sum (\Delta recall) * prec
 65 |         ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
 66 |     return ap
 67 | 
 68 | 
 69 | def voc_eval(detpath,
 70 |              annopath,
 71 |              imagesetfile,
 72 |              classname,
 73 |              cachedir,
 74 |              ovthresh=0.5,
 75 |              use_07_metric=False):
 76 |     """rec, prec, ap = voc_eval(detpath,
 77 |                                 annopath,
 78 |                                 imagesetfile,
 79 |                                 classname,
 80 |                                 [ovthresh],
 81 |                                 [use_07_metric])
 82 | 
 83 |     Top level function that does the PASCAL VOC evaluation.
 84 | 
 85 |     detpath: Path to detections
 86 |         detpath.format(classname) should produce the detection results file.
 87 |     annopath: Path to annotations
 88 |         annopath.format(imagename) should be the xml annotations file.
 89 |     imagesetfile: Text file containing the list of images, one image per line.
 90 |     classname: Category name (duh)
 91 |     cachedir: Directory for caching the annotations
 92 |     [ovthresh]: Overlap threshold (default = 0.5)
 93 |     [use_07_metric]: Whether to use VOC07's 11 point AP computation
 94 |         (default False)
 95 |     """
 96 |     # assumes detections are in detpath.format(classname)
 97 |     # assumes annotations are in annopath.format(imagename)
 98 |     # assumes imagesetfile is a text file with each line an image name
 99 |     # cachedir caches the annotations in a pickle file
100 | 
101 |     # first load gt
102 |     if not os.path.isdir(cachedir):
103 |         os.mkdir(cachedir)
104 |     cachefile = os.path.join(cachedir, 'annots.pkl')
105 |     # read list of images
106 |     with open(imagesetfile, 'r') as f:
107 |         lines = f.readlines()
108 |     imagenames = [x.strip() for x in lines]
109 | 
110 |     if not os.path.isfile(cachefile):
111 |         # load annots
112 |         recs = {}
113 |         for i, imagename in enumerate(imagenames):
114 |             recs[imagename] = parse_rec(annopath.format(imagename))
115 |             if i % 100 == 0:
116 |                 print('Reading annotation for {:d}/{:d}'.format(i + 1, len(imagenames)))
117 |         # save
118 |         print('Saving cached annotations to {:s}'.format(cachefile))
119 |         with open(cachefile, 'wb') as f:
120 |             pickle.dump(recs, f)
121 |     else:
122 |         # load
123 |         with open(cachefile, 'rb') as f:
124 |             recs = pickle.load(f)
125 | 
126 |     # extract gt objects for this class
127 |     class_recs = {}
128 |     npos = 0
129 |     for imagename in imagenames:
130 |         R = [obj for obj in recs[imagename] if obj['name'] == classname]
131 |         bbox = np.array([x['bbox'] for x in R])
132 |         difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
133 |         det = [False] * len(R)
134 |         npos = npos + sum(~difficult)
135 |         class_recs[imagename] = {'bbox': bbox,
136 |                                  'difficult': difficult,
137 |                                  'det': det}
138 | 
139 |     # read dets
140 |     detfile = detpath.format(classname)
141 |     with open(detfile, 'r') as f:
142 |         lines = f.readlines()
143 | 
144 |     splitlines = [x.strip().split(' ') for x in lines]
145 |     image_ids = [x[0] for x in splitlines]
146 |     confidence = np.array([float(x[1]) for x in splitlines])
147 |     BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
148 |     print(BB, '@@@',confidence)
149 |     # sort by confidence
150 |     sorted_ind = np.argsort(-confidence)
151 |     sorted_scores = np.sort(-confidence)
152 |     BB = BB[sorted_ind, :]
153 |     image_ids = [image_ids[x] for x in sorted_ind]
154 | 
155 |     # go down dets and mark TPs and FPs
156 |     nd = len(image_ids)
157 |     tp = np.zeros(nd)
158 |     fp = np.zeros(nd)
159 |     for d in range(nd):
160 |         R = class_recs[image_ids[d]]
161 |         bb = BB[d, :].astype(float)
162 |         ovmax = -np.inf
163 |         BBGT = R['bbox'].astype(float)
164 | 
165 |         if BBGT.size > 0:
166 |             # compute overlaps
167 |             # intersection
168 |             ixmin = np.maximum(BBGT[:, 0], bb[0])
169 |             iymin = np.maximum(BBGT[:, 1], bb[1])
170 |             ixmax = np.minimum(BBGT[:, 2], bb[2])
171 |             iymax = np.minimum(BBGT[:, 3], bb[3])
172 |             iw = np.maximum(ixmax - ixmin + 1., 0.)
173 |             ih = np.maximum(iymax - iymin + 1., 0.)
174 |             inters = iw * ih
175 | 
176 |             # union
177 |             uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
178 |                    (BBGT[:, 2] - BBGT[:, 0] + 1.) *
179 |                    (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
180 | 
181 |             overlaps = inters / uni
182 |             ovmax = np.max(overlaps)
183 |             jmax = np.argmax(overlaps)
184 | 
185 |         if ovmax > ovthresh:
186 |             if not R['difficult'][jmax]:
187 |                 if not R['det'][jmax]:
188 |                     tp[d] = 1.
189 |                     R['det'][jmax] = 1
190 |                 else:
191 |                     fp[d] = 1.
192 |         else:
193 |             fp[d] = 1.
194 | 
195 |     # compute precision recall
196 |     fp = np.cumsum(fp)
197 |     tp = np.cumsum(tp)
198 |     rec = tp / float(npos)
199 |     # avoid divide by zero in case the first detection matches a difficult
200 |     # ground truth
201 |     prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
202 |     ap = voc_ap(rec, prec, use_07_metric)
203 | 
204 |     return rec, prec, ap
205 | 


--------------------------------------------------------------------------------
/model/wsddn_vgg16.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # PyTorch WSDDN
  3 | # Licensed under The MIT License [see LICENSE for details]
  4 | # Written by Seungkwan Lee
  5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
  6 | # --------------------------------------------------------
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | import torch
 10 | from model.roi_align.modules.roi_align import RoIAlignAvg
 11 | from model.roi_pooling.modules.roi_pool import _RoIPooling
 12 | from utils.box_utils import *
 13 | import torchvision
 14 | 
 15 | 
 16 | class WSDDN_VGG16(nn.Module):
 17 |     def __init__(self, pretrained_model_path=None, num_class=20):
 18 |         super(WSDDN_VGG16, self).__init__()
 19 |         vgg = torchvision.models.vgg16()
 20 |         if pretrained_model_path is None:
 21 |             print("Create WSDDN_VGG16 without pretrained weights")
 22 |         else:
 23 |             print("Loading pretrained VGG16 weights from %s" % (pretrained_model_path))
 24 |             state_dict = torch.load(pretrained_model_path)
 25 |             vgg.load_state_dict({k: v for k, v in state_dict.items() if k in vgg.state_dict()})
 26 | 
 27 |         self.base = nn.Sequential(*list(vgg.features._modules.values())[:-1])
 28 |         self.top = nn.Sequential(*list(vgg.classifier._modules.values())[:-1])
 29 |         self.num_classes = num_class
 30 | 
 31 |         self.fc8c = nn.Linear(4096, self.num_classes)
 32 |         self.fc8d = nn.Linear(4096, self.num_classes)
 33 |         self.roi_pooling = _RoIPooling(7, 7, 1.0 / 16.0)
 34 |         self.roi_align = RoIAlignAvg(7, 7, 1.0 / 16.0)
 35 |         self.num_classes = self.num_classes
 36 |         self._init_weights()
 37 | 
 38 |     def _init_weights(self):
 39 |         def normal_init(m, mean, stddev, truncated=False):
 40 |             if truncated:
 41 |                 m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation
 42 |             else:
 43 |                 m.weight.data.normal_(mean, stddev)
 44 |                 m.bias.data.zero_()
 45 | 
 46 |         normal_init(self.fc8c, 0, 0.01, False)
 47 |         normal_init(self.fc8d, 0, 0.01, False)
 48 | 
 49 |     def adjust_roi_offset(self, rois):
 50 |         rois = rois.clone()
 51 |         o0 = 8.5
 52 |         o1 = 9.5
 53 |         rois[:, 0] = torch.floor((rois[:, 0] - o0 + o1) / 16 + 0.5)
 54 |         rois[:, 1] = torch.floor((rois[:, 1] - o0 + o1) / 16 + 0.5)
 55 |         rois[:, 2] = torch.floor((rois[:, 2] - o0 - o1) / 16 - 0.5)
 56 |         rois[:, 3] = torch.floor((rois[:, 3] - o0 - o1) / 16 - 0.5)
 57 |         return rois
 58 | 
 59 |     def forward(self, im_data, rois, prop_scores=None, image_level_label=None):
 60 |         #rois = self.adjust_roi_offset(rois)
 61 |         N = rois.size(0)
 62 |         feature_map = self.base(im_data)
 63 |         zero_padded_rois = torch.cat([torch.zeros(N, 1).to(rois), rois], 1)
 64 |         pooled_feat = self.roi_pooling(feature_map, zero_padded_rois).view(N, -1)
 65 | 
 66 |         if prop_scores is not None:
 67 |             pooled_feat = pooled_feat * (prop_scores.view(N, 1) * 10 + 1)
 68 | 
 69 |         fc7 = self.top(pooled_feat)
 70 |         fc8c = self.fc8c(fc7)
 71 |         fc8d = self.fc8d(fc7) / 2
 72 | 
 73 |         cls = F.softmax(fc8c, dim=1)
 74 |         det = F.softmax(fc8d, dim=0)
 75 |         #det = self.region_aware_softmax(rois, fc8d)
 76 |         scores = cls * det
 77 | 
 78 |         if image_level_label is None:
 79 |             return scores
 80 | 
 81 |         image_level_scores = torch.sum(scores, 0)
 82 | 
 83 |         # To avoid numerical error
 84 |         image_level_scores = torch.clamp(image_level_scores, min=0, max=1)
 85 | 
 86 |         loss = F.binary_cross_entropy(image_level_scores, image_level_label.to(torch.float32), size_average=False)
 87 |         reg = self.spatial_regulariser(rois, fc7, scores, image_level_label)
 88 | 
 89 |         return scores, loss, reg
 90 | 
 91 |     def region_aware_softmax(self, rois, det_score):
 92 |         N = rois.size(0)
 93 |         C = self.num_classes
 94 |         cwh_form_rois = to_cwh_form(rois)
 95 |         pair_wise_dx = cwh_form_rois[:, 0].view(1, -1) - cwh_form_rois[:, 0].view(-1, 1)
 96 |         pair_wise_dy = cwh_form_rois[:, 1].view(1, -1) - cwh_form_rois[:, 1].view(-1, 1)
 97 | 
 98 |         pair_wise_wsum = cwh_form_rois[:, 2].view(1, -1) + cwh_form_rois[:, 2].view(-1, 1)
 99 |         pair_wise_hsum = cwh_form_rois[:, 3].view(1, -1) + cwh_form_rois[:, 3].view(-1, 1)
100 | 
101 |         pair_wise_dx = pair_wise_dx / pair_wise_wsum
102 |         pair_wise_dy = pair_wise_dy / pair_wise_hsum
103 | 
104 |         pair_wise_dist = torch.sqrt(pair_wise_dx * pair_wise_dx + pair_wise_dy * pair_wise_dy)
105 |         pair_wise_weight = torch.exp(-pair_wise_dist)
106 | 
107 |         det_score = torch.exp(det_score)
108 |         output = []
109 | 
110 |         for cls in range(self.num_classes):
111 |             weighted_det_sum = torch.sum(det_score[:, cls] * pair_wise_weight, 1)
112 |             here = det_score[:, cls] / weighted_det_sum
113 |             output.append(here)
114 | 
115 |         output = torch.stack(output, 1)
116 | 
117 |         if output.max() < 0.001:
118 |             det_score = torch.log(det_score)
119 |             print(det_score)
120 |             print(pair_wise_weight)
121 |             print(det_score.max(), det_score.min())
122 |             print(pair_wise_weight.max(), pair_wise_weight.min())
123 |             print(pair_wise_dist.max(), pair_wise_dist.min())
124 | 
125 |         return output
126 | 
127 | 
128 |     # def spatial_regulariser(self, rois, fc7, scores, image_level_label):
129 |     #     N = rois.size(0)
130 |     #     ret = 0
131 |     #     C = 0
132 |     #     for cls in range(self.num_classes):
133 |     #         if image_level_label[cls].item() == 0:
134 |     #             continue
135 |     #
136 |     #         max_score, max_score_index = torch.max(scores[:, cls], 0)
137 |     #         max_score_box = rois[max_score_index]
138 |     #         max_feature = fc7[max_score_index]
139 |     #
140 |     #         iou = all_pair_iou(max_score_box.view(1, 4), rois).view(N)
141 |     #         adjacent_indices = iou.gt(0.6).nonzero().squeeze()
142 |     #         adjacent_features = fc7[adjacent_indices]
143 |     #
144 |     #         diff = adjacent_features - max_feature
145 |     #         diff = diff * max_score
146 |     #
147 |     #         ret = torch.sum(diff * diff) + ret
148 |     #         C = C + 1
149 |     #     return ret / C
150 | 
151 |     # def spatial_regulariser(self, rois, fc7, scores, image_level_label):
152 |     #     N = rois.size(0)
153 |     #     ret = 0
154 |     #     for cls in range(self.num_classes):
155 |     #         if image_level_label[cls].item() == 0:
156 |     #             continue
157 |     #
158 |     #         max_score, max_score_index = torch.max(scores[:, cls], 0)
159 |     #         max_score_box = rois[max_score_index]
160 |     #         max_feature = fc7[max_score_index]
161 |     #
162 |     #         iou = all_pair_iou(max_score_box.view(1, 4), rois).view(N)
163 |     #         adjacent_indices = iou.gt(0.6).nonzero().squeeze()
164 |     #         adjacent_features = fc7[adjacent_indices]
165 |     #
166 |     #         diff = adjacent_features - max_feature
167 |     #         diff = diff * max_score
168 |     #
169 |     #         ret = torch.sum(diff * diff) * 0.5 + ret
170 |     #
171 |     #     return ret
172 | 
173 |     def spatial_regulariser(self, rois, fc7, scores, image_level_label):
174 |         K = 10
175 |         th = 0.6
176 |         N = rois.size(0)
177 |         ret = 0
178 |         for cls in range(self.num_classes):
179 |             if image_level_label[cls].item() == 0:
180 |                 continue
181 | 
182 |             topk_scores, topk_indices = scores[:, cls].topk(K, dim=0)
183 |             topk_boxes = rois[topk_indices]
184 |             topk_featres = fc7[topk_indices]
185 | 
186 |             mask = all_pair_iou(topk_boxes[0:1, :], topk_boxes).view(K).gt(th).float()
187 | 
188 |             diff = topk_featres - topk_featres[0]
189 |             diff = diff * topk_scores.detach().view(K, 1)
190 | 
191 |             ret = (torch.pow(diff, 2).sum(1) * mask).sum() * 0.5 + ret
192 | 
193 |         return ret


--------------------------------------------------------------------------------
/model/roi_align/src/roi_align_kernel.cu:
--------------------------------------------------------------------------------
  1 | #ifdef __cplusplus
  2 | extern "C" {
  3 | #endif
  4 | 
  5 | #include <stdio.h>
  6 | #include <math.h>
  7 | #include <float.h>
  8 | #include "roi_align_kernel.h"
  9 | 
// Grid-stride loop: i starts at the thread's global index and advances by the
// total number of threads in the grid until it reaches n.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
            i += blockDim.x * gridDim.x)
 13 | 
 14 | 
    // Forward pass: computes every element of the aligned output by bilinear
    // interpolation of bottom_data at a sample point inside the element's ROI.
    //   nthreads     number of output elements (loop bound)
    //   bottom_data  input features, indexed as (batch, channel, height, width)
    //   bottom_rois  5 floats per ROI: batch index, start_w, start_h, end_w, end_h
    //   top_data     output, indexed as (roi, channel, aligned_height, aligned_width)
    __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width,
                                    const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) {
        CUDA_1D_KERNEL_LOOP(index, nthreads) {
            // Decompose the flat index into (n, c, ph, pw): ROI, channel and
            // position inside the aligned output.
            int pw = index % aligned_width;
            int ph = (index / aligned_width) % aligned_height;
            int c  = (index / aligned_width / aligned_height) % channels;
            int n  = index / aligned_width / aligned_height / channels;

            // ROI corners scaled by spatial_scale into feature-map coordinates
            float roi_batch_ind = bottom_rois[n * 5 + 0];
            float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
            float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
            float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
            float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

            // Clamp the extent of malformed ROIs (end before start) to 0
            float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
            float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
            // Sample spacing: index 0 maps to the ROI start and the last index
            // to start + extent.
            float bin_size_h = roi_height / (aligned_height - 1.);
            float bin_size_w = roi_width / (aligned_width - 1.);

            // Sample point of this output element
            float h = (float)(ph) * bin_size_h + roi_start_h;
            float w = (float)(pw) * bin_size_w + roi_start_w;

            // Top-left cell of the 2x2 neighbourhood, limited so that the
            // +1 row/column stays inside the map
            int hstart = fminf(floor(h), height - 2);
            int wstart = fminf(floor(w), width - 2);

            // Offset of the ROI's image inside bottom_data
            int img_start = roi_batch_ind * channels * height * width;

            // bilinear interpolation; sample points outside the map yield 0
            if (h < 0 || h >= height || w < 0 || w >= width) {
                top_data[index] = 0.;
            } else {
                float h_ratio = h - (float)(hstart);
                float w_ratio = w - (float)(wstart);
                int upleft = img_start + (c * height + hstart) * width + wstart;
                int upright = upleft + 1;
                int downleft = upleft + width;
                int downright = downleft + 1;

                top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)
                    + bottom_data[upright] * (1. - h_ratio) * w_ratio
                    + bottom_data[downleft] * h_ratio * (1. - w_ratio)
                    + bottom_data[downright] * h_ratio * w_ratio;
            }
        }
    }
 71 | 
 72 | 
 73 |     int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width,
 74 |                                const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) {
 75 |         const int kThreadsPerBlock = 1024;
 76 |         const int output_size = num_rois * aligned_height * aligned_width * channels;
 77 |         cudaError_t err;
 78 | 
 79 | 
 80 |         ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
 81 |           output_size, bottom_data, spatial_scale, height, width, channels,
 82 |           aligned_height, aligned_width, bottom_rois, top_data);
 83 | 
 84 |         err = cudaGetLastError();
 85 |         if(cudaSuccess != err) {
 86 |             fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
 87 |             exit( -1 );
 88 |         }
 89 | 
 90 |         return 1;
 91 |     }
 92 | 
 93 | 
    // Backward pass: scatters each output gradient top_diff[index] onto the
    // four input cells used by the forward interpolation, with the same
    // bilinear weights.
    //   nthreads     number of output elements (loop bound)
    //   top_diff     gradient w.r.t. the aligned output
    //   bottom_diff  gradient w.r.t. the input features; accumulated into
    //   bottom_rois  5 floats per ROI: batch index, start_w, start_h, end_w, end_h
    __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width,
                                     const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) {
        CUDA_1D_KERNEL_LOOP(index, nthreads) {

            // (n, c, ph, pw) is an element in the aligned output
            int pw = index % aligned_width;
            int ph = (index / aligned_width) % aligned_height;
            int c  = (index / aligned_width / aligned_height) % channels;
            int n  = index / aligned_width / aligned_height / channels;

            // ROI corners scaled by spatial_scale into feature-map coordinates
            float roi_batch_ind = bottom_rois[n * 5 + 0];
            float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
            float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
            float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
            float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

            // Clamp the extent of malformed ROIs (end before start) to 0
            float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
            float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
            // Sample spacing: index 0 maps to the ROI start and the last index
            // to start + extent.
            float bin_size_h = roi_height / (aligned_height - 1.);
            float bin_size_w = roi_width / (aligned_width - 1.);

            // Sample point of this output element
            float h = (float)(ph) * bin_size_h + roi_start_h;
            float w = (float)(pw) * bin_size_w + roi_start_w;

            // Top-left cell of the 2x2 neighbourhood, limited so that the
            // +1 row/column stays inside the map
            int hstart = fminf(floor(h), height - 2);
            int wstart = fminf(floor(w), width - 2);

            // Offset of the ROI's image inside bottom_diff
            int img_start = roi_batch_ind * channels * height * width;

            // bilinear interpolation; sample points outside the map get no gradient
            if (!(h < 0 || h >= height || w < 0 || w >= width)) {
                float h_ratio = h - (float)(hstart);
                float w_ratio = w - (float)(wstart);
                int upleft = img_start + (c * height + hstart) * width + wstart;
                int upright = upleft + 1;
                int downleft = upleft + width;
                int downright = downleft + 1;

                // atomic adds: several output elements can touch the same input cell
                atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio));
                atomicAdd(bottom_diff + upright, top_diff[index] * (1. - h_ratio) * w_ratio);
                atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio));
                atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio);
            }
        }
    }
144 | 
145 |     int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width,
146 |                                 const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) {
147 |         const int kThreadsPerBlock = 1024;
148 |         const int output_size = num_rois * aligned_height * aligned_width * channels;
149 |         cudaError_t err;
150 | 
151 |         ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
152 |           output_size, top_diff, spatial_scale, height, width, channels,
153 |           aligned_height, aligned_width, bottom_diff, bottom_rois);
154 | 
155 |         err = cudaGetLastError();
156 |         if(cudaSuccess != err) {
157 |             fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
158 |             exit( -1 );
159 |         }
160 | 
161 |         return 1;
162 |     }
163 | 
164 | 
165 | #ifdef __cplusplus
166 | }
167 | #endif
168 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # PyTorch WSDDN
  3 | # Licensed under The MIT License [see LICENSE for details]
  4 | # Written by Seungkwan Lee
  5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, Jianwei Yang
  6 | # --------------------------------------------------------
  7 | import os
  8 | import numpy as np
  9 | import argparse
 10 | import time
 11 | 
 12 | import torch
 13 | 
 14 | from utils.net_utils import adjust_learning_rate, save_checkpoint, clip_gradient
 15 | from model.wsddn_vgg16 import WSDDN_VGG16
 16 | from datasets.wsddn_dataset import WSDDNDataset
 17 | from matplotlib import pyplot as plt
 18 | import torch.nn.functional as F
 19 | import math
 20 | 
 21 | 
 22 | def parse_args():
 23 |     parser = argparse.ArgumentParser(description='Train')
 24 |     parser.add_argument('--net', default='WSDDN_VGG16', type=str)
 25 |     parser.add_argument('--start_epoch', help='starting epoch', default=1, type=int)
 26 |     parser.add_argument('--epochs', dest='max_epochs', help='number of epochs', default=20, type=int)
 27 |     parser.add_argument('--disp_interval', help='number of iterations to display loss', default=1000, type=int)
 28 |     parser.add_argument('--save_interval', dest='save_interval', help='number of epochs to save', default=5, type=int)
 29 |     parser.add_argument('--save_dir', help='directory to save models', default="../repo/wsddn")
 30 |     parser.add_argument('--data_dir', help='directory to load data', default='../data', type=str)
 31 | 
 32 |     parser.add_argument('--prop_method', help='ss or eb', default='eb', type=str)
 33 |     parser.add_argument('--use_prop_score', action='store_true')
 34 |     parser.add_argument('--min_prop', help='minimum proposal box size', default=20, type=int)
 35 |     parser.add_argument('--alpha', help='alpha for spatial regularization', default=0.0001, type=float)
 36 | 
 37 |     parser.add_argument('--lr', help='starting learning rate', default=0.00001, type=float)
 38 |     parser.add_argument('--s', dest='session', help='training session', default=1, type=int)
 39 |     parser.add_argument('--bs', help='training batch size', default=1, type=int)
 40 |     parser.add_argument('--bavg', help='batch average', action='store_true')
 41 | 
 42 |     # resume trained model
 43 |     parser.add_argument('--r', dest='resume', help='resume checkpoint or not', action='store_true')
 44 |     parser.add_argument('--checksession', dest='checksession', help='checksession to load model', default=0, type=int)
 45 |     parser.add_argument('--checkepoch', dest='checkepoch', help='checkepoch to load model', default=0, type=int)
 46 | 
 47 |     args = parser.parse_args()
 48 |     return args
 49 | 
 50 | 
 51 | def draw_box(boxes, col=None):
 52 |     for j, (xmin, ymin, xmax, ymax) in enumerate(boxes):
 53 |         if col is None:
 54 |             c = np.random.rand(3)
 55 |         else:
 56 |             c = col
 57 |         plt.hlines(ymin, xmin, xmax, colors=c, lw=2)
 58 |         plt.hlines(ymax, xmin, xmax, colors=c, lw=2)
 59 |         plt.vlines(xmin, ymin, ymax, colors=c, lw=2)
 60 |         plt.vlines(xmax, ymin, ymax, colors=c, lw=2)
 61 | 
 62 | 
 63 | def train():
 64 |     args = parse_args()
 65 |     print('Called with args:')
 66 |     print(args)
 67 | 
 68 |     np.random.seed(3)
 69 |     torch.manual_seed(4)
 70 |     if torch.cuda.is_available():
 71 |         torch.cuda.manual_seed(5)
 72 |         device = torch.device('cuda')
 73 |     else:
 74 |         device = torch.device('cpu')
 75 | 
 76 |     output_dir = args.save_dir
 77 |     if not os.path.exists(output_dir):
 78 |         os.makedirs(output_dir)
 79 | 
 80 |     train_dataset = WSDDNDataset(dataset_names=['voc07_trainval'], data_dir=args.data_dir, prop_method=args.prop_method,
 81 |                                  num_classes=20, min_prop_scale=args.min_prop)
 82 | 
 83 |     lr = args.lr
 84 | 
 85 |     if args.net == 'WSDDN_VGG16':
 86 |         model = WSDDN_VGG16(os.path.join(args.data_dir, 'pretrained_model/vgg16_caffe.pth'), 20)
 87 | 
 88 |     else:
 89 |         raise Exception('network is not defined')
 90 | 
 91 |     params = []
 92 |     for key, value in dict(model.named_parameters()).items():
 93 |         if value.requires_grad:
 94 |             if 'bias' in key:
 95 |                 params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
 96 |             else:
 97 |                 params += [{'params': [value], 'lr': lr, 'weight_decay': 0.0005}]
 98 | 
 99 |     optimizer = torch.optim.SGD(params, momentum=0.9)
100 | 
101 |     if args.resume:
102 |         load_name = os.path.join(output_dir, '{}_{}_{}.pth'.format(args.net, args.checksession, args.checkepoch))
103 |         print("loading checkpoint %s" % (load_name))
104 |         checkpoint = torch.load(load_name)
105 |         assert args.net == checkpoint['net']
106 |         args.start_epoch = checkpoint['epoch']
107 |         model.load_state_dict(checkpoint['model'])
108 |         optimizer.load_state_dict(checkpoint['optimizer'])
109 |         lr = optimizer.param_groups[0]['lr']
110 |         print("loaded checkpoint %s" % (load_name))
111 | 
112 |     log_file_name = os.path.join(output_dir, 'log_{}_{}.txt'.format(args.net, args.session))
113 |     if args.resume:
114 |         log_file = open(log_file_name, 'a')
115 |     else:
116 |         log_file = open(log_file_name, 'w')
117 |     log_file.write(str(args))
118 |     log_file.write('\n')
119 | 
120 |     model.to(device)
121 | 
122 |     for epoch in range(args.start_epoch, args.max_epochs + 1):
123 |         model.train()
124 |         loss_sum = 0
125 |         reg_sum = 0
126 |         iter_sum = 0
127 |         num_prop = 0
128 |         start = time.time()
129 | 
130 |         optimizer.zero_grad()
131 |         rand_perm = np.random.permutation(len(train_dataset))
132 |         for step in range(1, len(train_dataset) + 1):
133 |             index = rand_perm[step - 1]
134 |             apply_h_flip = np.random.rand() > 0.5
135 |             target_im_size = np.random.choice([480, 576, 688, 864, 1200])
136 |             im_data, gt_boxes, box_labels, proposals, prop_scores, image_level_label, im_scale, raw_img, im_id = \
137 |                 train_dataset.get_data(index, apply_h_flip, target_im_size)
138 | 
139 |             # plt.imshow(raw_img)
140 |             # draw_box(proposals / im_scale)
141 |             # draw_box(gt_boxes / im_scale, 'black')
142 |             # plt.show()
143 | 
144 |             im_data = im_data.unsqueeze(0).to(device)
145 |             rois = proposals.to(device)
146 |             image_level_label = image_level_label.to(device)
147 | 
148 |             if args.use_prop_score:
149 |                 prop_scores = prop_scores.to(device)
150 |             else:
151 |                 prop_scores = None
152 |             scores, loss, reg = model(im_data, rois, prop_scores, image_level_label)
153 |             reg = reg * args.alpha
154 |             num_prop += proposals.size(0)
155 |             loss_sum += loss.item()
156 |             reg_sum += reg.item()
157 |             loss = loss + reg
158 |             if args.bavg:
159 |                 loss = loss / args.bs
160 |             loss.backward()
161 | 
162 |             if step % args.bs == 0:
163 |                 optimizer.step()
164 |                 optimizer.zero_grad()
165 |             iter_sum += 1
166 | 
167 |             if step % args.disp_interval == 0:
168 |                 end = time.time()
169 | 
170 |                 print("[net %s][session %d][epoch %2d][iter %4d] loss: %.4f, reg: %.4f, num_prop: %.1f, lr: %.2e, time: %.1f" %
171 |                       (args.net, args.session, epoch, step, loss_sum / iter_sum,  reg_sum / iter_sum, num_prop / iter_sum, lr,  end - start))
172 |                 log_file.write("[net %s][session %d][epoch %2d][iter %4d] loss: %.4f, reg: %.4f, num_prop: %.1f, lr: %.2e, time: %.1f\n" %
173 |                                (args.net, args.session, epoch, step, loss_sum / iter_sum, reg_sum / iter_sum, num_prop / iter_sum, lr,  end - start))
174 |                 loss_sum = 0
175 |                 reg_sum = 0
176 |                 num_prop = 0
177 |                 iter_sum = 0
178 |                 start = time.time()
179 | 
180 |         log_file.flush()
181 |         if epoch == 10:
182 |             adjust_learning_rate(optimizer, 0.1)
183 |             lr *= 0.1
184 | 
185 |         if epoch % args.save_interval == 0:
186 |             save_name = os.path.join(output_dir, '{}_{}_{}.pth'.format(args.net, args.session, epoch))
187 |             checkpoint = dict()
188 |             checkpoint['net'] = args.net
189 |             checkpoint['session'] = args.session
190 |             checkpoint['epoch'] = epoch + 1
191 |             checkpoint['model'] = model.state_dict()
192 |             checkpoint['optimizer'] = optimizer.state_dict()
193 | 
194 |             save_checkpoint(checkpoint, save_name)
195 |             print('save model: {}'.format(save_name))
196 | 
197 |     log_file.close()
198 | 
199 | 
# Script entry point: run training when this file is executed directly.
if __name__ == '__main__':
    train()


--------------------------------------------------------------------------------
/model/roi_pooling/src/roi_pooling_kernel.cu:
--------------------------------------------------------------------------------
  1 | // #ifdef __cplusplus
  2 | // extern "C" {
  3 | // #endif
  4 | 
  5 | #include <stdio.h>
  6 | #include <vector>
  7 | #include <math.h>
  8 | #include <float.h>
  9 | #include "roi_pooling_kernel.h"
 10 | 
 11 | 
 12 | #define DIVUP(m, n) ((m) / (m) + ((m) % (n) > 0))
 13 | 
 14 | #define CUDA_1D_KERNEL_LOOP(i, n)                            \
 15 |   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
 16 |        i += blockDim.x * gridDim.x)
 17 | 
// CUDA: grid stride looping.
// Expands to a `for` header: `i` starts at this thread's global 1-D index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number
// of threads in the grid (blockDim.x * gridDim.x) while i < n.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
 23 | 
 24 | __global__ void ROIPoolForward(const int nthreads, const float* bottom_data,
 25 |     const float spatial_scale, const int height, const int width,
 26 |     const int channels, const int pooled_height, const int pooled_width,
 27 |     const float* bottom_rois, float* top_data, int* argmax_data)
 28 | {
 29 |     CUDA_KERNEL_LOOP(index, nthreads)
 30 |     {
 31 |         // (n, c, ph, pw) is an element in the pooled output
 32 |         // int n = index;
 33 |         // int pw = n % pooled_width;
 34 |         // n /= pooled_width;
 35 |         // int ph = n % pooled_height;
 36 |         // n /= pooled_height;
 37 |         // int c = n % channels;
 38 |         // n /= channels;
 39 |         int pw = index % pooled_width;
 40 |         int ph = (index / pooled_width) % pooled_height;
 41 |         int c  = (index / pooled_width / pooled_height) % channels;
 42 |         int n  = index / pooled_width / pooled_height / channels;
 43 | 
 44 |         // bottom_rois += n * 5;
 45 |         int roi_batch_ind = bottom_rois[n * 5 + 0];
 46 |         int roi_start_w = round(bottom_rois[n * 5 + 1] * spatial_scale);
 47 |         int roi_start_h = round(bottom_rois[n * 5 + 2] * spatial_scale);
 48 |         int roi_end_w = round(bottom_rois[n * 5 + 3] * spatial_scale);
 49 |         int roi_end_h = round(bottom_rois[n * 5 + 4] * spatial_scale);
 50 | 
 51 |         // Force malformed ROIs to be 1x1
 52 |         int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
 53 |         int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
 54 |         float bin_size_h = (float)(roi_height) / (float)(pooled_height);
 55 |         float bin_size_w = (float)(roi_width) / (float)(pooled_width);
 56 | 
 57 |         int hstart = (int)(floor((float)(ph) * bin_size_h));
 58 |         int wstart = (int)(floor((float)(pw) * bin_size_w));
 59 |         int hend = (int)(ceil((float)(ph + 1) * bin_size_h));
 60 |         int wend = (int)(ceil((float)(pw + 1) * bin_size_w));
 61 | 
 62 |         // Add roi offsets and clip to input boundaries
 63 |         hstart = fminf(fmaxf(hstart + roi_start_h, 0), height);
 64 |         hend = fminf(fmaxf(hend + roi_start_h, 0), height);
 65 |         wstart = fminf(fmaxf(wstart + roi_start_w, 0), width);
 66 |         wend = fminf(fmaxf(wend + roi_start_w, 0), width);
 67 |         bool is_empty = (hend <= hstart) || (wend <= wstart);
 68 | 
 69 |         // Define an empty pooling region to be zero
 70 |         float maxval = is_empty ? 0 : -FLT_MAX;
 71 |         // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
 72 |         int maxidx = -1;
 73 |         // bottom_data += roi_batch_ind * channels * height * width;
 74 | 
 75 |         int bottom_data_batch_offset = roi_batch_ind * channels * height * width;
 76 |         int bottom_data_offset = bottom_data_batch_offset + c * height * width;
 77 | 
 78 |         for (int h = hstart; h < hend; ++h) {
 79 |             for (int w = wstart; w < wend; ++w) {
 80 |                 // int bottom_index = (h * width + w) * channels + c;
 81 |                 // int bottom_index = (c * height + h) * width + w;
 82 |                 int bottom_index = h * width + w;
 83 |                 if (bottom_data[bottom_data_offset + bottom_index] > maxval) {
 84 |                     maxval = bottom_data[bottom_data_offset + bottom_index];
 85 |                     maxidx = bottom_data_offset + bottom_index;
 86 |                 }
 87 |             }
 88 |         }
 89 |         top_data[index] = maxval;
 90 |         if (argmax_data != NULL)
 91 |             argmax_data[index] = maxidx;
 92 |     }
 93 | }
 94 | 
 95 | int ROIPoolForwardLaucher(
 96 |     const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
 97 |     const int width, const int channels, const int pooled_height,
 98 |     const int pooled_width, const float* bottom_rois,
 99 |     float* top_data, int* argmax_data, cudaStream_t stream)
100 | {
101 |     const int kThreadsPerBlock = 1024;
102 |     int output_size = num_rois * pooled_height * pooled_width * channels;
103 |     cudaError_t err;
104 | 
105 |     ROIPoolForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
106 |       output_size, bottom_data, spatial_scale, height, width, channels, pooled_height,
107 |       pooled_width, bottom_rois, top_data, argmax_data);
108 | 
109 |     // dim3 blocks(DIVUP(output_size, kThreadsPerBlock),
110 |     //             DIVUP(output_size, kThreadsPerBlock));
111 |     // dim3 threads(kThreadsPerBlock);
112 |     //
113 |     // ROIPoolForward<<<blocks, threads, 0, stream>>>(
114 |     //   output_size, bottom_data, spatial_scale, height, width, channels, pooled_height,
115 |     //   pooled_width, bottom_rois, top_data, argmax_data);
116 | 
117 |     err = cudaGetLastError();
118 |     if(cudaSuccess != err)
119 |     {
120 |         fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
121 |         exit( -1 );
122 |     }
123 | 
124 |     return 1;
125 | }
126 | 
127 | 
// ROI max-pooling, backward pass.
//
// Each loop iteration computes the gradient of one bottom (input) element.
// `index` is decoded as (n, c, h, w) with w varying fastest. For every ROI
// whose batch index equals n and whose scaled, rounded extent contains
// (h, w), the pooled bins that could have covered (h, w) are visited, and
// top_diff is accumulated for each bin whose recorded argmax equals `index`.
// The sum is written to bottom_diff[index].
__global__ void ROIPoolBackward(const int nthreads, const float* top_diff,
    const int* argmax_data, const int num_rois, const float spatial_scale,
    const int height, const int width, const int channels,
    const int pooled_height, const int pooled_width, float* bottom_diff,
    const float* bottom_rois) {
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {
        // (n, c, h, w) is an element of the bottom (input) blob.
        const int w = index % width;
        const int h = (index / width) % height;
        const int c = (index / width / height) % channels;
        const int n = index / width / height / channels;

        float grad_sum = 0;
        // Accumulate gradient over all ROIs that pooled this element.
        for (int roi_n = 0; roi_n < num_rois; ++roi_n)
        {
            const float* roi = bottom_rois + roi_n * 5;

            // Only ROIs belonging to this element's batch entry can contribute.
            const int roi_batch_ind = roi[0];
            if (roi_batch_ind != n) {
                continue;
            }

            const int roi_start_w = round(roi[1] * spatial_scale);
            const int roi_start_h = round(roi[2] * spatial_scale);
            const int roi_end_w = round(roi[3] * spatial_scale);
            const int roi_end_h = round(roi[4] * spatial_scale);

            // Skip ROIs that do not contain (h, w).
            const bool outside_roi = (w < roi_start_w || w > roi_end_w ||
                                      h < roi_start_h || h > roi_end_h);
            if (outside_roi) {
                continue;
            }

            // Start of this ROI's channel-c slice in top_diff / argmax_data.
            const int roi_offset = roi_n * pooled_height * pooled_width * channels;
            const int channel_offset = c * pooled_height * pooled_width;
            const float* roi_top_diff = top_diff + roi_offset + channel_offset;
            const int* roi_argmax = argmax_data + roi_offset + channel_offset;

            // Malformed ROIs (end before start) are treated as 1x1.
            const int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
            const int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);

            const float bin_size_h = (float)(roi_height) / (float)(pooled_height);
            const float bin_size_w = (float)(roi_width) / (float)(pooled_width);

            // Range of pooled bins that could have pooled this bottom unit.
            int phstart = floor((float)(h - roi_start_h) / bin_size_h);
            int phend = ceil((float)(h - roi_start_h + 1) / bin_size_h);
            int pwstart = floor((float)(w - roi_start_w) / bin_size_w);
            int pwend = ceil((float)(w - roi_start_w + 1) / bin_size_w);

            phstart = fminf(fmaxf(phstart, 0), pooled_height);
            phend = fminf(fmaxf(phend, 0), pooled_height);
            pwstart = fminf(fmaxf(pwstart, 0), pooled_width);
            pwend = fminf(fmaxf(pwend, 0), pooled_width);

            for (int ph = phstart; ph < phend; ++ph) {
                for (int pw = pwstart; pw < pwend; ++pw) {
                    const int bin = ph * pooled_width + pw;
                    // Only the bin whose maximum came from this element
                    // passes its gradient back.
                    if (roi_argmax[bin] == index)
                    {
                        grad_sum += roi_top_diff[bin];
                    }
                }
            }
        }
        bottom_diff[index] = grad_sum;
    }
}
204 | 
// Host-side launcher for ROIPoolBackward.
//
// Launches one thread per bottom-gradient element
// (batch_size * height * width * channels) on `stream`, using blocks of 1024
// threads. Returns 1 on success. If the launch reports an error, the message
// is printed to stderr and the process exits with -1. When the bottom
// gradient has no elements no kernel is launched and 1 is returned: a grid
// with zero blocks is an invalid launch configuration and would otherwise
// terminate the process.
int ROIPoolBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
    const int height, const int width, const int channels, const int pooled_height,
    const int pooled_width, const float* bottom_rois,
    float* bottom_diff, const int* argmax_data, cudaStream_t stream)
{
    const int kThreadsPerBlock = 1024;
    const int output_size = batch_size * height * width * channels;

    if (output_size <= 0)
    {
        return 1;
    }

    // Round up so that every bottom element is covered by a thread.
    const int num_blocks = (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock;

    ROIPoolBackward<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
      output_size, top_diff, argmax_data, num_rois, spatial_scale, height, width, channels, pooled_height,
      pooled_width, bottom_diff, bottom_rois);

    const cudaError_t err = cudaGetLastError();
    if(cudaSuccess != err)
    {
        fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
        exit( -1 );
    }

    return 1;
}
235 | 
236 | 
237 | // #ifdef __cplusplus
238 | // }
239 | // #endif
240 | 


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # PyTorch WSDDN
  3 | # Licensed under The MIT License [see LICENSE for details]
  4 | # Written by Seungkwan Lee
  5 | # Some parts of this implementation are based on code from Ross Girshick, Jiasen Lu, and Jianwei Yang
  6 | # --------------------------------------------------------
  7 | import os
  8 | import numpy as np
  9 | import argparse
 10 | import time
 11 | 
 12 | import torch
 13 | 
 14 | from model.wsddn_vgg16 import WSDDN_VGG16
 15 | from datasets.wsddn_dataset import WSDDNDataset
 16 | from matplotlib import pyplot as plt
 17 | import torch.nn.functional as F
 18 | import math
 19 | import pickle
 20 | from utils.cpu_nms import cpu_nms as nms
 21 | import heapq
 22 | import itertools
 23 | from frcnn_eval.pascal_voc import voc_eval_kit
 24 | 
 25 | def parse_args():
 26 |     parser = argparse.ArgumentParser(description='Eval')
 27 |     parser.add_argument('--save_dir', help='directory to load model and save detection results', default="../repo")
 28 |     parser.add_argument('--data_dir', help='directory to load data', default='./data', type=str)
 29 | 
 30 |     parser.add_argument('--prop_method', help='ss or eb', default='eb', type=str)
 31 |     parser.add_argument('--use_prop_score', action='store_true')
 32 |     parser.add_argument('--multiscale', action='store_true')
 33 |     parser.add_argument('--min_resize', action='store_true')
 34 | 
 35 |     parser.add_argument('--min_prop', help='minimum proposal box size', default=20, type=int)
 36 |     parser.add_argument('--model_name', default='WSDDN_VGG16_1_20', type=str)
 37 | 
 38 |     args = parser.parse_args()
 39 |     return args
 40 | 
# Parsed once at import time; eval(), my_eval() and eval_saved_result() all
# read this module-level namespace.
args = parse_args()
 42 | 
 43 | def draw_box(boxes, col=None):
 44 |     for j, (xmin, ymin, xmax, ymax) in enumerate(boxes):
 45 |         if col is None:
 46 |             c = np.random.rand(3)
 47 |         else:
 48 |             c = col
 49 |         plt.hlines(ymin, xmin, xmax, colors=c, lw=2)
 50 |         plt.hlines(ymax, xmin, xmax, colors=c, lw=2)
 51 |         plt.vlines(xmin, ymin, ymax, colors=c, lw=2)
 52 |         plt.vlines(xmax, ymin, ymax, colors=c, lw=2)
 53 | 
 54 | 
 55 | def eval():
 56 |     print('Called with args:')
 57 |     print(args)
 58 | 
 59 |     np.random.seed(3)
 60 |     torch.manual_seed(4)
 61 |     if torch.cuda.is_available():
 62 |         torch.cuda.manual_seed(5)
 63 |         device = torch.device('cuda')
 64 |     else:
 65 |         device = torch.device('cpu')
 66 | 
 67 |     eval_kit = voc_eval_kit('test', '2007', os.path.join(args.data_dir, 'VOCdevkit2007'))
 68 | 
 69 |     test_dataset = WSDDNDataset(dataset_names=['voc07_test'], data_dir=args.data_dir, prop_method=args.prop_method,
 70 |                                 num_classes=20, min_prop_scale=args.min_prop)
 71 | 
 72 |     load_name = os.path.join(args.save_dir, 'wsddn', '{}.pth'.format(args.model_name))
 73 |     print("loading checkpoint %s" % (load_name))
 74 |     checkpoint = torch.load(load_name)
 75 |     if checkpoint['net'] == 'WSDDN_VGG16':
 76 |         model = WSDDN_VGG16(None, 20)
 77 |     else:
 78 |         raise Exception('network is not defined')
 79 |     model.load_state_dict(checkpoint['model'])
 80 |     print("loaded checkpoint %s" % (load_name))
 81 | 
 82 |     model.to(device)
 83 |     model.eval()
 84 | 
 85 |     start = time.time()
 86 | 
 87 |     num_images = len(test_dataset)
 88 |     # heuristic: keep an average of 40 detections per class per images prior
 89 |     # to NMS
 90 |     max_per_set = 40 * num_images
 91 |     # heuristic: keep at most 100 detection per class per image prior to NMS
 92 |     max_per_image = 100
 93 |     # detection thresold for each class (this is adaptively set based on the
 94 |     # max_per_set constraint)
 95 |     thresh = -np.inf * np.ones(20)
 96 |     # thresh = 0.1 * np.ones(imdb.num_classes)
 97 |     # top_scores will hold one minheap of scores per class (used to enforce
 98 |     # the max_per_set constraint)
 99 |     top_scores = [[] for _ in range(20)]
100 |     # all detections are collected into:
101 |     #    all_boxes[cls][image] = N x 5 array of detections in
102 |     #    (x1, y1, x2, y2, score)
103 |     all_boxes = [[[] for _ in range(num_images)] for _ in range(20)]
104 | 
105 |     for index in range(len(test_dataset)):
106 |         scores = 0
107 |         if args.multiscale:
108 |             comb = itertools.product([False, True], [480, 576, 688, 864, 1200])
109 |         else:
110 |             comb = itertools.product([False], [688])
111 |         for h_flip, im_size in comb:
112 |             im_data, gt_boxes, box_labels, proposals, prop_scores, image_level_label, im_scale_ratio, raw_img, im_id = test_dataset.get_data(index, h_flip, im_size, args.min_resize)
113 | 
114 |             im_data = im_data.unsqueeze(0).to(device)
115 |             rois = proposals.to(device)
116 | 
117 |             if args.use_prop_score:
118 |                 prop_scores = prop_scores.to(device)
119 |             else:
120 |                 prop_scores = None
121 |             local_scores = model(im_data, rois, prop_scores, None).detach().cpu().numpy()
122 |             scores = scores + local_scores
123 | 
124 |         scores = scores * 1000
125 |         boxes = test_dataset.get_raw_proposal(index)
126 | 
127 |         for cls in range(20):
128 |             inds = np.where((scores[:, cls] > thresh[cls]))[0]
129 |             cls_scores = scores[inds, cls]
130 |             cls_boxes = boxes[inds].copy()
131 |             top_inds = np.argsort(-cls_scores)[:max_per_image]
132 |             cls_scores = cls_scores[top_inds]
133 |             cls_boxes = cls_boxes[top_inds, :]
134 | 
135 |             # if cls_scores[0] > 0.001:
136 |             #     #print(cls)
137 |             #     plt.imshow(raw_img)
138 |             #     draw_box(cls_boxes[0:10, :])
139 |             #     draw_box(gt_boxes / im_scale, 'black')
140 |             #     plt.show()
141 | 
142 |             # push new scores onto the minheap
143 |             for val in cls_scores:
144 |                 heapq.heappush(top_scores[cls], val)
145 |             # if we've collected more than the max number of detection,
146 |             # then pop items off the minheap and update the class threshold
147 |             if len(top_scores[cls]) > max_per_set:
148 |                 while len(top_scores[cls]) > max_per_set:
149 |                     heapq.heappop(top_scores[cls])
150 |                 thresh[cls] = top_scores[cls][0]
151 | 
152 |             all_boxes[cls][index] = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32, copy=False)
153 | 
154 |         # sorted_scores, sorted_indices = torch.sort(scores.detach(), dim=0, descending=True)
155 |         # sorted_boxes = rois[sorted_indices.permute(1, 0)]
156 |         #
157 |         # for cls in range(20):
158 |         #     here = torch.cat((sorted_boxes[cls], sorted_scores[:, cls:cls + 1]), 1).cpu()
159 |         #     print(here)
160 |         #     all_boxes[cls][index] = here.numpy()
161 | 
162 |         if index % 100 == 99:
163 |            print('%d images complete, elapsed time:%.1f' % (index + 1, time.time() - start))
164 | 
165 |     for j in range(20):
166 |         for i in range(len(test_dataset)):
167 |             inds = np.where(all_boxes[j][i][:, -1] > thresh[j])[0]
168 |             all_boxes[j][i] = all_boxes[j][i][inds, :]
169 | 
170 |     save_name = os.path.join(args.save_dir, 'detection_result', '{}.pkl'.format(args.model_name))
171 |     pickle.dump(all_boxes, open(save_name, 'wb'))
172 | 
173 |     print('Detection Complete, elapsed time: %.1f', time.time() - start)
174 | 
175 |     for cls in range(20):
176 |         for index in range(len(test_dataset)):
177 |             dets = all_boxes[cls][index]
178 |             if dets == []:
179 |                 continue
180 |             keep = nms(dets, 0.4)
181 |             all_boxes[cls][index] = dets[keep, :].copy()
182 |     print('NMS complete, elapsed time: %.1f', time.time() - start)
183 | 
184 |     eval_kit.evaluate_detections(all_boxes)
185 | 
186 | 
def my_eval():
    """Single-scale evaluation that keeps every proposal for every class.

    For each test image the model is run once (no flip, size 688). For each
    class, all proposals are sorted by descending score, divided by the image
    scale ratio and stored with their score as an N x 5 array. The detections
    are pickled to ``<save_dir>/detection_result/<model_name>.pkl``, filtered
    with NMS (threshold 0.4) and passed to the VOC eval kit.

    Raises:
        Exception: if the checkpoint's ``net`` entry is not a known network.
    """
    print('Called with args:')
    print(args)

    np.random.seed(3)
    torch.manual_seed(4)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(5)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    eval_kit = voc_eval_kit('test', '2007', os.path.join(args.data_dir, 'VOCdevkit2007'))

    test_dataset = WSDDNDataset(dataset_names=['voc07_test'], data_dir=args.data_dir, prop_method=args.prop_method,
                                num_classes=20, min_prop_scale=args.min_prop)

    load_name = os.path.join(args.save_dir, 'wsddn', '{}.pth'.format(args.model_name))
    print("loading checkpoint %s" % (load_name))
    # map_location lets a GPU-trained checkpoint be evaluated on a CPU-only host.
    checkpoint = torch.load(load_name, map_location=device)
    if checkpoint['net'] == 'WSDDN_VGG16':
        model = WSDDN_VGG16(None, 20)
    else:
        raise Exception('network is not defined')
    model.load_state_dict(checkpoint['model'])
    print("loaded checkpoint %s" % (load_name))

    model.to(device)
    model.eval()

    start = time.time()

    num_classes = 20
    num_images = len(test_dataset)
    all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]

    for index in range(num_images):
        im_data, _, _, proposals, prop_scores, _, im_scale_ratio, _, _ = test_dataset.get_data(
            index, False, 688)

        im_data = im_data.unsqueeze(0).to(device)
        rois = proposals.to(device)

        if args.use_prop_score:
            prop_scores = prop_scores.to(device)
        else:
            prop_scores = None

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            scores = model(im_data, rois, prop_scores, None)

        # Sort proposals independently per class (column) by descending score.
        sorted_scores, sorted_indices = torch.sort(scores.detach(), dim=0, descending=True)
        # sorted_boxes[cls] holds the boxes in that class's score order,
        # divided by the image scale ratio.
        sorted_boxes = rois[sorted_indices.permute(1, 0)] / im_scale_ratio

        for cls in range(num_classes):
            here = torch.cat((sorted_boxes[cls], sorted_scores[:, cls:cls + 1]), 1).cpu()
            all_boxes[cls][index] = here.numpy()

        if index % 500 == 499:
            print('%d images complete, elapsed time:%.1f' % (index + 1, time.time() - start))

    result_dir = os.path.join(args.save_dir, 'detection_result')
    os.makedirs(result_dir, exist_ok=True)
    save_name = os.path.join(result_dir, '{}.pkl'.format(args.model_name))
    with open(save_name, 'wb') as result_file:
        pickle.dump(all_boxes, result_file)

    print('Detection Complete, elapsed time: %.1f' % (time.time() - start))

    for cls in range(num_classes):
        for index in range(num_images):
            dets = all_boxes[cls][index]
            # len() works for both the initial [] placeholder and an ndarray
            # with zero rows; `dets == []` is not a reliable emptiness test
            # for arrays.
            if len(dets) == 0:
                continue
            keep = nms(dets, 0.4)
            all_boxes[cls][index] = dets[keep, :].copy()
    print('NMS complete, elapsed time: %.1f' % (time.time() - start))

    eval_kit.evaluate_detections(all_boxes)
259 | 
def eval_saved_result():
    """Re-evaluate detections previously pickled by this script.

    Loads ``<save_dir>/detection_result/<model_name>.pkl``, applies NMS
    (threshold 0.4) to every non-empty per-class, per-image detection array
    and passes the result to the VOC eval kit. The model is not run.
    """
    eval_kit = voc_eval_kit('test', '2007', os.path.join(args.data_dir, 'VOCdevkit2007'))

    save_name = os.path.join(args.save_dir, 'detection_result', '{}.pkl'.format(args.model_name))

    # NOTE: unpickling can execute arbitrary code; only load result files
    # produced by a trusted run.
    with open(save_name, 'rb') as result_file:
        all_boxes = pickle.load(result_file, encoding='latin1')

    num_images = len(all_boxes[0])
    for cls in range(20):
        for index in range(num_images):
            dets = all_boxes[cls][index]
            # len() works for both an empty list and an ndarray with zero
            # rows; `dets == []` is not a reliable emptiness test for arrays.
            if len(dets) == 0:
                continue
            keep = nms(dets, 0.4)
            all_boxes[cls][index] = dets[keep, :].copy()
            if index % 500 == 499:
                print(index)
        print('nms: cls %d complete' % cls)

    eval_kit.evaluate_detections(all_boxes)
280 | 
281 | 
# Script entry point: run the full detection + evaluation pipeline.
if __name__ == '__main__':
    eval()
    # Alternative: evaluate an already saved detection pickle instead.
    #eval_saved_result()


--------------------------------------------------------------------------------