├── utils ├── __init__.py ├── nms │ ├── __init__.py │ ├── gpu_nms.hpp │ ├── py_cpu_nms.py │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── cpu_nms.pyx ├── averageMeter.py ├── timer.py ├── nms_wrapper.py ├── collections2.py ├── build.py ├── get_class_map.py └── convert_darknet.py ├── dcn ├── modules │ ├── __init__.py │ ├── deform_conv.py │ └── deform_pool.py ├── functions │ ├── __init__.py │ ├── deform_pool.py │ └── deform_conv.py ├── __init__.py ├── setup.py └── src │ └── deform_pool_cuda.cpp ├── layers ├── __init__.py ├── functions │ ├── __init__.py │ ├── prior_layer.py │ ├── prior_box.py │ └── detection.py └── modules │ ├── __init__.py │ ├── focal_loss_sigmoid.py │ ├── weight_smooth_l1_loss.py │ ├── weight_softmax_loss.py │ ├── focal_loss_softmax.py │ ├── multibox_loss.py │ ├── refine_multibox_loss.py │ └── refine_multibox_loss_seperate.py ├── .gitignore ├── data ├── __init__.py ├── scripts │ ├── VOC2012.sh │ └── VOC2007.sh ├── voc_eval.py ├── data_augment.py └── voc0712.py ├── compile.sh ├── make.sh ├── configs ├── EFGRNet_vgg_coco_dcn_512.yaml ├── EFGRNet_vgg_coco_dcn.yaml └── config.py ├── README.md ├── models ├── resnet.py ├── model_builder.py ├── vgg.py ├── model_builder_vgg.py └── model_builder_resnet.py └── eval_dcn.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dcn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dcn/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .idea/ 3 | *.so 4 | /eval/ 5 | /utils/nms/cpu_nms.so 6 | /utils/nms/gpu_nms.so 7 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .voc0712 import VOCDetection, detection_collate 3 | from .coco import * 4 | from .data_augment import * 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | PYTHON=${PYTHON:-"python"} 2 | 3 | echo "Building dcn..." 
4 | cd ./dcn 5 | if [ -d "build" ]; then 6 | rm -r build 7 | fi 8 | $PYTHON setup.py build_ext --inplace 9 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | # from .refine_prior_box import RefinePriorBox 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | # if you use anaconda3 maybe you need add this 8 | # change code like https://github.com/rbgirshick/py-faster-rcnn/issues/706 9 | mv nms/cpu_nms.cpython-36m-x86_64-linux-gnu.so nms/cpu_nms.so 10 | mv nms/gpu_nms.cpython-36m-x86_64-linux-gnu.so nms/gpu_nms.so 11 | cd .. 12 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .weight_smooth_l1_loss import WeightSmoothL1Loss 2 | from .weight_softmax_loss import WeightSoftmaxLoss 3 | from .multibox_loss import MultiBoxLoss 4 | from .refine_multibox_loss import RefineMultiBoxLoss 5 | from .focal_loss_sigmoid import FocalLossSigmoid 6 | from .focal_loss_softmax import FocalLossSoftmax 7 | 8 | 9 | 10 | __all__ = ['MultiBoxLoss', 'WeightSoftmaxLoss', ] 11 | -------------------------------------------------------------------------------- /utils/averageMeter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | 4 | def __init__(self): 5 | self.reset() 6 | 7 | def reset(self): 8 | self.val = 0 9 | self.avg = 0 10 | self.sum = 0 11 | self.count = 0 12 | 13 | def update(self, val, n=1): 14 | self.val = val 15 | self.sum += val * n 16 | self.count += n 17 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /dcn/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions.deform_conv import deform_conv, modulated_deform_conv 2 | from .functions.deform_pool import deform_roi_pooling 3 | from .modules.deform_conv import (DeformConv, ModulatedDeformConv, 4 | ModulatedDeformConvPack) 5 | from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, 6 | ModulatedDeformRoIPoolingPack) 7 | 8 | __all__ = [ 9 | 'DeformConv', 'DeformRoIPooling', 'DeformRoIPoolingPack', 10 | 'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv', 11 | 'ModulatedDeformConvPack', 'deform_conv', 12 | 'modulated_deform_conv', 'deform_roi_pooling' 13 | ] 14 | -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 
20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /dcn/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | nvcc_ARCH = ['-arch=sm_52'] 5 | nvcc_ARCH += ["-gencode=arch=compute_61,code=\"compute_61\""] 6 | # nvcc_ARCH += ["-gencode=arch=compute_75,code=\"sm_75\""] 7 | # nvcc_ARCH += ["-gencode=arch=compute_70,code=\"sm_70\""] 8 | nvcc_ARCH += ["-gencode=arch=compute_61,code=\"sm_61\""] 9 | nvcc_ARCH += ["-gencode=arch=compute_52,code=\"sm_52\""] 10 | extra_compile_args = { 11 | 'cxx': ['-Wno-unused-function', '-Wno-write-strings'], 12 | 'nvcc': nvcc_ARCH,} 13 | 14 | setup( 15 | name='deform_conv', 16 | ext_modules=[ 17 | CUDAExtension('deform_conv_cuda', [ 18 | 'src/deform_conv_cuda.cpp', 19 | 'src/deform_conv_cuda_kernel.cu', 20 | ], 21 | extra_compile_args=extra_compile_args, 22 | ), 23 | CUDAExtension('deform_pool_cuda', [ 24 | 'src/deform_pool_cuda.cpp', 'src/deform_pool_cuda_kernel.cu' 25 | ]), 26 | ], 27 | cmdclass={'build_ext': BuildExtension}) 28 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 
36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /layers/modules/focal_loss_sigmoid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSigmoid(nn.Module): 11 | ''' 12 | sigmoid version focal loss 13 | ''' 14 | 15 | def __init__(self, alpha=0.25, gamma=2, size_average=False): 16 | super(FocalLossSigmoid, self).__init__() 17 | self.alpha = alpha 18 | self.gamma = gamma 19 | self.size_average = size_average 20 | 21 | def forward(self, inputs, targets): 22 | N = inputs.size(0) 23 | C = inputs.size(1) 24 | P = torch.sigmoid(inputs) 25 | alpha_mask = self.alpha * targets 26 | loss_pos = -1. * torch.pow( 27 | 1 - P, self.gamma) * torch.log(P) * targets * alpha_mask 28 | loss_neg = -1. * torch.pow(1 - P, self.gamma) * torch.log(1 - P) * ( 29 | 1 - targets) * (1 - alpha_mask) 30 | batch_loss = loss_neg + loss_pos 31 | if self.size_average: 32 | loss = batch_loss.mean() 33 | else: 34 | loss = batch_loss.sum() 35 | return loss 36 | -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef 
np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | 14 | def __init__(self): 15 | self.total_time = 0. 16 | self.calls = 0 17 | self.start_time = 0. 18 | self.diff = 0. 19 | self.average_time = 0. 20 | 21 | def tic(self): 22 | # using time.time instead of time.clock because time time.clock 23 | # does not normalize for multithreading 24 | self.start_time = time.time() 25 | 26 | def toc(self, average=True): 27 | self.diff = time.time() - self.start_time 28 | self.total_time += self.diff 29 | self.calls += 1 30 | self.average_time = self.total_time / self.calls 31 | if average: 32 | return self.average_time 33 | else: 34 | return self.diff 35 | 36 | def clear(self): 37 | self.total_time = 0. 38 | self.calls = 0 39 | self.start_time = 0. 40 | self.diff = 0. 41 | self.average_time = 0. 42 | -------------------------------------------------------------------------------- /utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | # def nms(dets, thresh, force_cpu=False): 12 | # """Dispatch to either CPU or GPU NMS implementations.""" 13 | 14 | # if dets.shape[0] == 0: 15 | # return [] 16 | # if cfg.USE_GPU_NMS and not force_cpu: 17 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 18 | # else: 19 | # return cpu_nms(dets, thresh) 20 | 21 | 22 | def nms(dets, thresh, force_cpu=False): 23 | """Dispatch to either CPU or GPU NMS implementations.""" 24 | 25 | if dets.shape[0] == 0: 26 | return [] 27 | if force_cpu: 28 | #return cpu_soft_nms(dets, thresh, method = 0) 29 | return cpu_nms(dets, thresh) 30 | return gpu_nms(dets, thresh) 31 | 32 | 33 | def soft_nms(dets, Nt=0.3, sigma=0.5, thresh=0.001, method=1): 34 | """Dispatch to either CPU or GPU NMS implementations.""" 35 | 36 | if dets.shape[0] == 0: 37 | return [] 38 | return cpu_soft_nms(dets, sigma, Nt, thresh, method) -------------------------------------------------------------------------------- /layers/modules/weight_smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | 
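# The forward pass below applies the standard Smooth L1 (Huber) penalty to the
# four box offsets -- 0.5 * x**2 where |x| < 1 and |x| - 0.5 otherwise -- and scales
# each prior's contribution by a caller-supplied weight. A minimal usage sketch
# (variable names here are illustrative, not taken from this repo):
#   criterion = WeightSmoothL1Loss(class_num=81, size_average=False)
#   loss_l = criterion(loc_pred, loc_target, weights)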
class WeightSmoothL1Loss(nn.Module): 11 | def __init__(self, class_num, size_average=False): 12 | super(WeightSmoothL1Loss, self).__init__() 13 | self.class_num = class_num 14 | self.size_average = size_average 15 | 16 | def forward(self, inputs, targets, weights): 17 | N = inputs.size(0) 18 | loc_num = inputs.size(1) 19 | abs_out = torch.abs(inputs - targets) 20 | 21 | if inputs.is_cuda and not weights.is_cuda: 22 | weights = weights.cuda() 23 | 24 | weights = weights.view(-1, 1) 25 | 26 | weights = torch.cat((weights, weights, weights, weights), 1) 27 | mask_big = abs_out >= 1. 28 | mask_small = abs_out < 1. 29 | loss_big = weights[mask_big] * (abs_out[mask_big] - 0.5) 30 | loss_small = weights[mask_small] * 0.5 * torch.pow( 31 | abs_out[mask_small], 2) 32 | loss_sum = loss_big.sum() + loss_small.sum() 33 | 34 | if self.size_average: 35 | loss = loss_sum / N * loc_num 36 | else: 37 | loss = loss_sum 38 | return loss 39 | -------------------------------------------------------------------------------- /layers/modules/weight_softmax_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSoftmaxLoss(nn.Module): 11 | def __init__(self, class_num, gamma=2, size_average=True): 12 | super(WeightSoftmaxLoss, self).__init__() 13 | # if isinstance(weights, Variable): 14 | # self.weights = weights 15 | # else: 16 | # self.weights = Variable(weights) 17 | 18 | self.class_num = class_num 19 | self.gamma = gamma 20 | self.size_average = size_average 21 | 22 | def forward(self, inputs, targets, weights): 23 | N = inputs.size(0) 24 | C = inputs.size(1) 25 | P = F.softmax(inputs) 26 | 27 | class_mask = inputs.data.new(N, C).fill_(0) 28 | class_mask = Variable(class_mask) 29 | ids = targets.view(-1, 1) 30 | class_mask.scatter_(1, ids.data, 1.) 31 | if inputs.is_cuda and not weights.is_cuda: 32 | weights = weights.cuda() 33 | probs = (P * class_mask).sum(1).view(-1, 1) 34 | 35 | log_p = probs.log() 36 | weights = weights.view(-1, 1) 37 | batch_loss = -weights * log_p 38 | 39 | if self.size_average: 40 | loss = batch_loss.mean() 41 | else: 42 | loss = batch_loss.sum() 43 | return loss -------------------------------------------------------------------------------- /layers/modules/focal_loss_softmax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSoftmax(nn.Module): 11 | ''' 12 | softmax version focal loss 13 | ''' 14 | 15 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 16 | super(FocalLossSoftmax, self).__init__() 17 | if alpha is None: 18 | self.alpha = Variable(torch.ones(class_num, 1)) 19 | else: 20 | if isinstance(alpha, Variable): 21 | self.alpha = alpha 22 | else: 23 | self.alpha = Variable(alpha) 24 | self.gamma = gamma 25 | self.class_num = class_num 26 | self.size_average = size_average 27 | 28 | def forward(self, inputs, targets): 29 | N = inputs.size(0) 30 | C = inputs.size(1) 31 | P = F.softmax(inputs) 32 | 33 | class_mask = inputs.data.new(N, C).fill_(0) 34 | class_mask = Variable(class_mask) 35 | ids = targets.view(-1, 1) 36 | class_mask.scatter_(1, ids.data, 1.) 
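# class_mask is now a one-hot encoding of the target labels, so the sum below
# picks out p_t, the predicted probability of the true class, which feeds the
# focal term -alpha * (1 - p_t)**gamma * log(p_t).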
37 | 38 | if inputs.is_cuda and not self.alpha.is_cuda: 39 | self.alpha = self.alpha.cuda() 40 | alpha = self.alpha[ids.data.view(-1)] 41 | probs = (P * class_mask).sum(1).view(-1, 1) 42 | log_p = probs.log() 43 | batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p 44 | 45 | if self.size_average: 46 | loss = batch_loss.mean() 47 | else: 48 | loss = batch_loss.sum() 49 | return loss -------------------------------------------------------------------------------- /configs/EFGRNet_vgg_coco_dcn_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '512' 4 | REFINE: True 5 | CONV_BODY: efrgnet_vgg.refine_vgg 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 5 18 | TRAIN_ON: True 19 | 20 | BIG: 21 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 22 | ARM_CHANNELS: [512, 1024, 256, 256] 23 | ODM_CHANNELS: [256, 256, 256, 256] 24 | NUM_ANCHORS: [3, 3, 3, 3] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 26 | MIN_SIZES: [30, 64, 128, 256] 27 | MAX_SIZES: [64, 128, 256, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 29 | CLIP: True 30 | IMG_WH: [512, 512] 31 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 32 | USE_MAX_SIZE: False 33 | 34 | SOLVER: 35 | WEIGHT_DECAY: 0.0005 36 | BASE_LR: 0.002 37 | GAMMA: 0.1 38 | MOMENTUM: 0.9 39 | # EPOCH_STEPS: [0, 90, 120, 140] 40 | LR: [0.004, 0.002, 0.0004, 0.00004, 0.000004] 41 | EPOCH_STEPS: [90, 110, 130, 150, 160] 42 | END_EPOCH: 160 43 | START_EPOCH: 0 44 | 45 | DATASETS: 46 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 47 | VAL_TYPE: [['2014', 'minival']] 48 | # VAL_TYPE: [['2015', 'test-dev']] 49 | DATAROOT: '/media/jnie/Storage/ubuntu/DataSets/coco/' 50 | DATA_TYPE: 'COCO' 51 | SETS: 52 | VOC: [['2007', 'trainval'], ['2012', 'trainval']] 53 | VOC0712PLUS: [['2007', 'trainval'], ['2012', 'trainval'],['2007', 'test']] 54 | VOC0712: [['2012', '2012_trainval']] 55 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 56 | VOC2007: [['2007', 'test']] 57 | COCOval: [['2014', 'minival']] 58 | VOCROOT: 'raid/jing/data/VOCdevkit/' 59 | COCOROOT: 'data/coco' 60 | 61 | TEST: 62 | INPUT_WH: [512, 512] 63 | CONFIDENCE_THRESH: 0.01 64 | NMS_OVERLAP: 0.45 65 | BATCH_SIZE: 1 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /configs/EFGRNet_vgg_coco_dcn.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | # CONV_BODY: efrgnet_vgg_reduce.refine_vgg 6 | CONV_BODY: efrgnet_vgg.refine_vgg 7 | NUM_CLASSES: 81 8 | LOAD_PRETRAINED_WEIGHTS: False 9 | PRETRAIN_WEIGHTS: './weights/vgg16_reducedfc.pth' 10 | 11 | TRAIN: 12 | OVERLAP: 0.5 13 | BGR_MEAN: [104, 117, 123] 14 | BATCH_SIZE: 32 15 | OHEM: True 16 | NEG_RATIO: 3 17 | WARMUP: True 18 | WARMUP_EPOCH: 5 19 | TRAIN_ON: True 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 
0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | SOLVER: 37 | WEIGHT_DECAY: 0.0005 38 | BASE_LR: 0.002 39 | GAMMA: 0.1 40 | MOMENTUM: 0.9 41 | # EPOCH_STEPS: [0, 90, 120, 140] 42 | LR: [0.004, 0.002, 0.0004, 0.00004, 0.000004] 43 | EPOCH_STEPS: [90, 110, 130, 150, 160] 44 | END_EPOCH: 160 45 | START_EPOCH: 0 46 | 47 | DATASETS: 48 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 49 | VAL_TYPE: [['2014', 'minival']] 50 | DATAROOT: '/media/jnie/Storage/ubuntu/DataSets/coco/' 51 | DATA_TYPE: 'COCO' 52 | SETS: 53 | VOC: [['2007', 'trainval'], ['2012', 'trainval']] 54 | VOC0712PLUS: [['2007', 'trainval'], ['2012', 'trainval'],['2007', 'test']] 55 | VOC0712: [['2012', '2012_trainval']] 56 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 57 | VOC2007: [['2007', 'test']] 58 | COCOval: [['2014', 'minival']] 59 | VOCROOT: 'raid/jing/data/VOCdevkit/' 60 | COCOROOT: 'data/coco' 61 | 62 | TEST: 63 | INPUT_WH: [320, 320] 64 | CONFIDENCE_THRESH: 0.01 65 | NMS_OVERLAP: 0.45 66 | BATCH_SIZE: 1 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /utils/collections2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | """A simple attribute dictionary used for representing configuration options.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | 22 | 23 | class AttrDict(dict): 24 | 25 | IMMUTABLE = '__immutable__' 26 | 27 | def __init__(self, *args, **kwargs): 28 | super(AttrDict, self).__init__(*args, **kwargs) 29 | self.__dict__[AttrDict.IMMUTABLE] = False 30 | 31 | def __getattr__(self, name): 32 | if name in self.__dict__: 33 | return self.__dict__[name] 34 | elif name in self: 35 | return self[name] 36 | else: 37 | raise AttributeError(name) 38 | 39 | def __setattr__(self, name, value): 40 | if not self.__dict__[AttrDict.IMMUTABLE]: 41 | if name in self.__dict__: 42 | self.__dict__[name] = value 43 | else: 44 | self[name] = value 45 | else: 46 | raise AttributeError( 47 | 'Attempted to set "{}" to "{}", but AttrDict is immutable'. 48 | format(name, value)) 49 | 50 | def immutable(self, is_immutable): 51 | """Set immutability to is_immutable and recursively apply the setting 52 | to all nested AttrDicts. 
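The flag is propagated both to AttrDicts stored as attributes (in __dict__) and to those stored as ordinary dict items.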
53 | """ 54 | self.__dict__[AttrDict.IMMUTABLE] = is_immutable 55 | # Recursively set immutable state 56 | for v in self.__dict__.values(): 57 | if isinstance(v, AttrDict): 58 | v.immutable(is_immutable) 59 | for v in self.values(): 60 | if isinstance(v, AttrDict): 61 | v.immutable(is_immutable) 62 | 63 | def is_immutable(self): 64 | return self.__dict__[AttrDict.IMMUTABLE] 65 | -------------------------------------------------------------------------------- /dcn/functions/deform_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | 4 | from .. import deform_pool_cuda 5 | 6 | 7 | class DeformRoIPoolingFunction(Function): 8 | 9 | @staticmethod 10 | def forward(ctx, 11 | data, 12 | rois, 13 | offset, 14 | spatial_scale, 15 | out_size, 16 | out_channels, 17 | no_trans, 18 | group_size=1, 19 | part_size=None, 20 | sample_per_part=4, 21 | trans_std=.0): 22 | ctx.spatial_scale = spatial_scale 23 | ctx.out_size = out_size 24 | ctx.out_channels = out_channels 25 | ctx.no_trans = no_trans 26 | ctx.group_size = group_size 27 | ctx.part_size = out_size if part_size is None else part_size 28 | ctx.sample_per_part = sample_per_part 29 | ctx.trans_std = trans_std 30 | 31 | assert 0.0 <= ctx.trans_std <= 1.0 32 | if not data.is_cuda: 33 | raise NotImplementedError 34 | 35 | n = rois.shape[0] 36 | output = data.new_empty(n, out_channels, out_size, out_size) 37 | output_count = data.new_empty(n, out_channels, out_size, out_size) 38 | deform_pool_cuda.deform_psroi_pooling_cuda_forward( 39 | data, rois, offset, output, output_count, ctx.no_trans, 40 | ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, 41 | ctx.part_size, ctx.sample_per_part, ctx.trans_std) 42 | 43 | if data.requires_grad or rois.requires_grad or offset.requires_grad: 44 | ctx.save_for_backward(data, rois, offset) 45 | ctx.output_count = output_count 46 | 47 | return output 48 | 49 | @staticmethod 50 | def backward(ctx, grad_output): 51 | if not grad_output.is_cuda: 52 | raise NotImplementedError 53 | 54 | data, rois, offset = ctx.saved_tensors 55 | output_count = ctx.output_count 56 | grad_input = torch.zeros_like(data) 57 | grad_rois = None 58 | grad_offset = torch.zeros_like(offset) 59 | 60 | deform_pool_cuda.deform_psroi_pooling_cuda_backward( 61 | grad_output, data, rois, offset, output_count, grad_input, 62 | grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, 63 | ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, 64 | ctx.trans_std) 65 | return (grad_input, grad_rois, grad_offset, None, None, None, None, 66 | None, None, None, None) 67 | 68 | 69 | deform_roi_pooling = DeformRoIPoolingFunction.apply 70 | -------------------------------------------------------------------------------- /layers/functions/prior_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from math import ceil 4 | import torch.nn as nn 5 | from itertools import product as product 6 | 7 | 8 | class PriorLayer(nn.Module): 9 | def __init__(self, cfg): 10 | super(PriorLayer, self).__init__() 11 | self.size = cfg.MODEL.SIZE 12 | if self.size == '300': 13 | size_cfg = cfg.SMALL 14 | else: 15 | size_cfg = cfg.BIG 16 | self.img_wh = size_cfg.IMG_WH 17 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 18 | self.feature_maps = size_cfg.FEATURE_MAPS 19 | self.variance = size_cfg.VARIANCE or [0.1] 20 | self.min_sizes = size_cfg.MIN_SIZES 21 | 
self.use_max_sizes = size_cfg.USE_MAX_SIZE 22 | if self.use_max_sizes: 23 | self.max_sizes = size_cfg.MAX_SIZES 24 | self.steps = size_cfg.STEPS 25 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 26 | self.clip = size_cfg.CLIP 27 | for v in self.variance: 28 | if v <= 0: 29 | raise ValueError('Variances must be greater than 0') 30 | 31 | def forward(self, img_wh, feature_maps_wh): 32 | self.img_wh = img_wh 33 | self.feature_maps_wh = feature_maps_wh 34 | mean = [] 35 | for k, f in enumerate(self.feature_maps_wh): 36 | grid_h, grid_w = f[1], f[0] 37 | for i in range(grid_h): 38 | for j in range(grid_w): 39 | f_k_h = self.img_wh[1] / self.steps[k][1] 40 | f_k_w = self.img_wh[0] / self.steps[k][0] 41 | # unit center x,y 42 | cx = (j + 0.5) / f_k_w 43 | cy = (i + 0.5) / f_k_h 44 | 45 | # aspect_ratio: 1 46 | # rel size: min_size 47 | s_k_h = self.min_sizes[k] / self.img_wh[1] 48 | s_k_w = self.min_sizes[k] / self.img_wh[0] 49 | mean += [cx, cy, s_k_w, s_k_h] 50 | 51 | # aspect_ratio: 1 52 | # rel size: sqrt(s_k * s_(k+1)) 53 | if self.use_max_sizes: 54 | s_k_prime_w = sqrt( 55 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 56 | s_k_prime_h = sqrt( 57 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 58 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 59 | 60 | for ar in self.aspect_ratios[k]: 61 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 62 | 63 | output = torch.Tensor(mean).view(-1, 4) 64 | if self.clip: 65 | output.clamp_(max=1, min=0) 66 | return output 67 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | 5 | 6 | class PriorBox(object): 7 | """Compute priorbox coordinates in center-offset form for each source 8 | feature map. 9 | Note: 10 | This 'layer' has changed between versions of the original SSD 11 | paper, so we include both versions, but note v2 is the most tested and most 12 | recent version of the paper. 
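For example, with the SMALL config in EFGRNet_vgg_coco_dcn.yaml above (320x320 input, step [8, 8] and min_size 30 for the first 40x40 map), cell (i, j) produces a prior centred at ((j + 0.5) / 40, (i + 0.5) / 40) with relative size 30 / 320 = 0.09375, and each aspect ratio ar adds a prior of width 0.09375 * sqrt(ar) and height 0.09375 / sqrt(ar).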
13 | 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(PriorBox, self).__init__() 18 | self.size = cfg.MODEL.SIZE 19 | if self.size == '300': 20 | size_cfg = cfg.SMALL 21 | else: 22 | size_cfg = cfg.BIG 23 | self.img_wh = size_cfg.IMG_WH 24 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 25 | self.feature_maps = size_cfg.FEATURE_MAPS 26 | self.variance = size_cfg.VARIANCE or [0.1] 27 | self.min_sizes = size_cfg.MIN_SIZES 28 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 29 | if self.use_max_sizes: 30 | self.max_sizes = size_cfg.MAX_SIZES 31 | self.steps = size_cfg.STEPS 32 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 33 | self.clip = size_cfg.CLIP 34 | for v in self.variance: 35 | if v <= 0: 36 | raise ValueError('Variances must be greater than 0') 37 | 38 | def forward(self): 39 | mean = [] 40 | for k, f in enumerate(self.feature_maps): 41 | grid_h, grid_w = f[1], f[0] 42 | for i in range(grid_h): 43 | for j in range(grid_w): 44 | f_k_h = self.img_wh[1] / self.steps[k][1] 45 | f_k_w = self.img_wh[0] / self.steps[k][0] 46 | # unit center x,y 47 | cx = (j + 0.5) / f_k_w 48 | cy = (i + 0.5) / f_k_h 49 | 50 | # aspect_ratio: 1 51 | # rel size: min_size 52 | s_k_h = self.min_sizes[k] / self.img_wh[1] 53 | s_k_w = self.min_sizes[k] / self.img_wh[0] 54 | mean += [cx, cy, s_k_w, s_k_h] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | if self.use_max_sizes: 59 | s_k_prime_w = sqrt( 60 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 61 | s_k_prime_h = sqrt( 62 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 63 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 64 | 65 | for ar in self.aspect_ratios[k]: 66 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 67 | 68 | # back to torch land 69 | output = torch.Tensor(mean).view(-1, 4) 70 | if self.clip: 71 | output.clamp_(max=1, min=0) 72 | # print(output.size()) 73 | return output 74 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from torch.autograd import Function 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from utils.box_utils import decode, center_size 8 | import time 9 | 10 | 11 | class Detect(Function): 12 | """At test time, Detect is the final layer of SSD. Decode location preds, 13 | apply non-maximum suppression to location predictions based on conf 14 | scores and threshold to a top_k number of output predictions for both 15 | confidence score and locations. 16 | """ 17 | 18 | def __init__(self, cfg): 19 | self.cfg = cfg 20 | self.num_classes = cfg.MODEL.NUM_CLASSES 21 | #self.thresh = thresh 22 | self.size = cfg.MODEL.SIZE 23 | if self.size == '300': 24 | size_cfg = cfg.SMALL 25 | else: 26 | size_cfg = cfg.BIG 27 | # Parameters used in nms. 
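# variance rescales the predicted offsets when decoding boxes; object_score is the
# ARM objectness threshold below which a prior is treated as background.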
28 | self.variance = size_cfg.VARIANCE 29 | self.object_score = cfg.MODEL.OBJECT_SCORE 30 | 31 | def forward(self, predictions): 32 | """ 33 | Args: 34 | loc_data: (tensor) Loc preds from loc layers 35 | Shape: [batch,num_priors*4] 36 | conf_data: (tensor) Shape: Conf preds from conf layers 37 | Shape: [batch*num_priors,num_classes] 38 | prior_data: (tensor) Prior boxes and variances from priorbox layers 39 | Shape: [1,num_priors,4] 40 | """ 41 | # loc, conf, priors = predictions 42 | if self.cfg.MODEL.REFINE: 43 | # start_time = time.time() 44 | arm_loc, arm_conf, loc, conf, priors = predictions 45 | arm_conf = F.softmax(arm_conf.view(-1, 2), 1) 46 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 47 | arm_loc_data = arm_loc.data 48 | arm_conf_data = arm_conf.data 49 | arm_object_conf = arm_conf_data[:, 1:] 50 | no_object_index = arm_object_conf <= self.object_score 51 | conf.data[no_object_index.expand_as(conf.data)] = 0 52 | # time1 = time.time() - start_time 53 | # print('prediction_time_first:', time1) 54 | else: 55 | loc, conf, priors = predictions 56 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 57 | 58 | # start_time2 = time.time() 59 | loc_data = loc.data 60 | conf_data = conf.data 61 | # prior_data = priors.data 62 | prior_data = priors[:loc_data.size(1), :] 63 | 64 | num = loc_data.size(0) # batch size 65 | 66 | self.num_priors = prior_data.size(0) 67 | # time2 = time.time() - start_time2 68 | # print('prepare_time:', time2) 69 | 70 | # start_time3 = time.time() 71 | self.boxes = torch.zeros(num, self.num_priors, 4) 72 | self.scores = torch.zeros(num, self.num_priors, self.num_classes) 73 | conf_preds = conf_data.view(num, self.num_priors, self.num_classes) 74 | batch_prior = prior_data.view(-1, self.num_priors, 4).expand( 75 | (num, self.num_priors, 4)) 76 | batch_prior = batch_prior.contiguous().view(-1, 4) 77 | # time3 = time.time() - start_time3 78 | # print('prepare_time2:', time3) 79 | 80 | # start_time4 = time.time() 81 | if self.cfg.MODEL.REFINE: 82 | default = decode( 83 | arm_loc_data.view(-1, 4), batch_prior, self.variance) 84 | default = center_size(default) 85 | decoded_boxes = decode( 86 | loc_data.view(-1, 4), default, self.variance) 87 | else: 88 | decoded_boxes = decode( 89 | loc_data.view(-1, 4), batch_prior, self.variance) 90 | 91 | self.scores = conf_preds.view(num, self.num_priors, self.num_classes) 92 | self.boxes = decoded_boxes.view(num, self.num_priors, 4) 93 | 94 | # time4 = time.time() - start_time4 95 | # print('prediction_time2:', time4) 96 | return self.boxes, self.scores -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Enriched Feature Guided Refinement Network for Object Detection 2 | 3 | By Jing Nie1†, Rao Muhammad Anwer†, Hisham Cholakkal, Fahad Shahbaz Khan, Yanwei Pang1‡, Ling Shao \ 4 | † denotes equal contribution,‡ Corresponding author 5 | 6 | 7 | ### Introduction 8 | We propose a single-stage detection framework that 9 | jointly tackles the problem of multi-scale object detection and class imbalance. 10 | Rather than designing deeper networks, we introduce a simple yet effective feature enrichment scheme to produce multi-scale contextual features. 11 | We further introduce a cascaded refinement scheme which first instills multi-scale contextual features into the prediction layers of the single-stage detector 12 | in order to enrich their discriminative power for multi-scale detection. 
Second, the cascaded refinement scheme counters the class imbalance problem by refining the 13 | anchors and enriched features to improve classification and regression. 14 | 15 | ## Installation 16 | - Clone this repository. This repository is mainly based on [SSD_Pytorch](https://github.com/yqyao/SSD_Pytorch.git) 17 | 18 | ```Shell 19 | EFGR_ROOT=/path/to/clone/EFGR 20 | git clone https://github.com/Ranchentx/EFGRNet.git $EFGR_ROOT 21 | ``` 22 | 23 | - The code was tested on Ubuntu 16.04, with [Anaconda](https://www.anaconda.com/download) Python 3.6 and [PyTorch](http://pytorch.org/) v0.4.1. 24 | NVIDIA GPUs are needed for testing. After installing Anaconda, create a new conda environment, activate it and install PyTorch 0.4.1. 25 | 26 | ```Shell 27 | conda create -n EFGRNet python=3.6 28 | source activate EFGRNet 29 | conda install pytorch=0.4.1 torchvision -c pytorch 30 | ``` 31 | 32 | 33 | - Install OpenCV and the COCO API. 34 | ```Shell 35 | pip install opencv-python 36 | pip install pycocotools 37 | ``` 38 | 39 | - Compile NMS: 40 | 41 | ```Shell 42 | cd $EFGR_ROOT/ 43 | ./make.sh 44 | ``` 45 | 46 | - Compile DCN: 47 | 48 | ```Shell 49 | ./compile.sh 50 | ``` 51 | 52 | 53 | ## Download 54 | To evaluate the performance reported in the paper, the Pascal VOC and COCO datasets as well as our trained models need to be downloaded. 55 | 56 | 57 | ### VOC Dataset 58 | - Directly download the images and annotations from the [VOC website](http://host.robots.ox.ac.uk/pascal/VOC/) and put them into $EFGR_ROOT/data/VOCdevkit/. 59 | - Create the `VOCdevkit` folder and make the data folder (or create symlinks) like: 60 | 61 | ~~~ 62 | ${$EFGR_ROOT} 63 | |-- data 64 | `-- |-- VOCdevkit 65 | `-- |-- VOC2007 66 | | |-- annotations 67 | | |-- ImageSets 68 | | |-- JPEGImages 69 | |-- VOC2012 70 | | |-- annotations 71 | | |-- ImageSets 72 | | |-- JPEGImages 73 | |-- results 74 | ~~~ 75 | 76 | ### COCO Dataset 77 | - Download the images and annotation files from the [COCO website](http://cocodataset.org/#download). 78 | - Place the data (or create symlinks) to make the data folder like: 79 | 80 | ~~~ 81 | ${$EFGR_ROOT} 82 | |-- data 83 | `-- |-- coco 84 | `-- |-- annotations 85 | | |-- instances_train2014.json 86 | | |-- instances_val2014.json 87 | | |-- image_info_test-dev2015.json 88 | `-- images 89 | | |-- train2014 90 | | |-- val2014 91 | | |-- test2015 92 | `-- cache 93 | ~~~ 94 | 95 | ## Training 96 | 97 | 98 | 99 | ```Shell 100 | python train_coco.py --cfg ./configs/EFGRNet_vgg_coco_dcn.yaml 101 | ``` 102 | 103 | 104 | ## Testing 105 | 106 | - Note: 107 | All testing configs are in EFGRNet_vgg_coco_dcn.yaml; you can change them as needed.
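For example, the `TEST` block of that file controls the test-time resolution and NMS behaviour; the values below are the ones shipped in EFGRNet_vgg_coco_dcn.yaml, and the inline notes give the usual meaning of each field:

```yaml
TEST:
  INPUT_WH: [320, 320]      # test image width/height
  CONFIDENCE_THRESH: 0.01   # minimum score for a detection to be kept
  NMS_OVERLAP: 0.45         # IoU threshold used by NMS
  BATCH_SIZE: 1
```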
108 | 109 | - To evaluate a trained network: 110 | 111 | ```Shell 112 | python eval_dcn.py --cfg ./configs/EFGRNet_vgg_coco_dcn.yaml --weights ./eval_weights 113 | ``` 114 | 115 | ## Models 116 | 117 | * COCO [EFGRNet_VGG320](https://drive.google.com/open?id=1-_x9e4kX3ZJBKzfTKloslJxK2qO8bfkO); [BaiduYun Driver](https://pan.baidu.com/s/1ZPiibo-PnoTJl5HjAl63Pg&shfl=sharepset) 118 | * COCO [EFGRNet_VGG512](https://drive.google.com/open?id=1OVRiYRAyJiErUYsOXPaE12XEXtAV4ZrD); [BaiduYun Driver](https://pan.baidu.com/s/1YvXhhIXdziDV9q3wj9mLRg&shfl=sharepset) 119 | 120 | 121 | ## Citation 122 | Please cite our paper in your publications if it helps your research: 123 | 124 | @article{Jing2019EFGR, 125 | title = {Enriched Feature Guided Refinement Network for Object Detection}, 126 | author = {Jing Nie, Rao Muhammad Anwer, Hisham Cholakkal, Fahad Shahbaz Khan, Yanwei Pang, Ling Shao}, 127 | booktitle = {ICCV}, 128 | year = {2019} 129 | } -------------------------------------------------------------------------------- /dcn/modules/deform_conv.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn.modules.utils import _pair 6 | 7 | from ..functions.deform_conv import deform_conv, modulated_deform_conv 8 | 9 | 10 | class DeformConv(nn.Module): 11 | 12 | def __init__(self, 13 | in_channels, 14 | out_channels, 15 | kernel_size, 16 | stride=1, 17 | padding=0, 18 | dilation=1, 19 | groups=1, 20 | deformable_groups=1, 21 | bias=False): 22 | assert not bias 23 | super(DeformConv, self).__init__() 24 | 25 | assert in_channels % groups == 0, \ 26 | 'in_channels {} cannot be divisible by groups {}'.format( 27 | in_channels, groups) 28 | assert out_channels % groups == 0, \ 29 | 'out_channels {} cannot be divisible by groups {}'.format( 30 | out_channels, groups) 31 | self.in_channels = in_channels 32 | self.out_channels = out_channels 33 | self.kernel_size = _pair(kernel_size) 34 | self.stride = _pair(stride) 35 | self.padding = _pair(padding) 36 | self.dilation = _pair(dilation) 37 | self.groups = groups 38 | self.deformable_groups = deformable_groups 39 | 40 | self.weight = nn.Parameter( 41 | torch.Tensor(out_channels, in_channels // self.groups, 42 | *self.kernel_size)) 43 | 44 | self.reset_parameters() 45 | 46 | def reset_parameters(self): 47 | n = self.in_channels 48 | for k in self.kernel_size: 49 | n *= k 50 | stdv = 1. 
/ math.sqrt(n) 51 | self.weight.data.uniform_(-stdv, stdv) 52 | 53 | def forward(self, input, offset): 54 | return deform_conv(input, offset, self.weight, self.stride, 55 | self.padding, self.dilation, self.groups, 56 | self.deformable_groups) 57 | 58 | 59 | class ModulatedDeformConv(nn.Module): 60 | 61 | def __init__(self, 62 | in_channels, 63 | out_channels, 64 | kernel_size, 65 | stride=1, 66 | padding=0, 67 | dilation=1, 68 | groups=1, 69 | deformable_groups=1, 70 | bias=True): 71 | super(ModulatedDeformConv, self).__init__() 72 | self.in_channels = in_channels 73 | self.out_channels = out_channels 74 | self.kernel_size = _pair(kernel_size) 75 | self.stride = stride 76 | self.padding = padding 77 | self.dilation = dilation 78 | self.groups = groups 79 | self.deformable_groups = deformable_groups 80 | self.with_bias = bias 81 | 82 | self.weight = nn.Parameter( 83 | torch.Tensor(out_channels, in_channels // groups, 84 | *self.kernel_size)) 85 | if bias: 86 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 87 | else: 88 | self.register_parameter('bias', None) 89 | self.reset_parameters() 90 | 91 | def reset_parameters(self): 92 | n = self.in_channels 93 | for k in self.kernel_size: 94 | n *= k 95 | stdv = 1. / math.sqrt(n) 96 | self.weight.data.uniform_(-stdv, stdv) 97 | if self.bias is not None: 98 | self.bias.data.zero_() 99 | 100 | def forward(self, input, offset, mask): 101 | return modulated_deform_conv( 102 | input, offset, mask, self.weight, self.bias, self.stride, 103 | self.padding, self.dilation, self.groups, self.deformable_groups) 104 | 105 | 106 | class ModulatedDeformConvPack(ModulatedDeformConv): 107 | 108 | def __init__(self, 109 | in_channels, 110 | out_channels, 111 | kernel_size, 112 | stride=1, 113 | padding=0, 114 | dilation=1, 115 | groups=1, 116 | deformable_groups=1, 117 | bias=True): 118 | super(ModulatedDeformConvPack, self).__init__( 119 | in_channels, out_channels, kernel_size, stride, padding, dilation, 120 | groups, deformable_groups, bias) 121 | 122 | self.conv_offset_mask = nn.Conv2d( 123 | self.in_channels // self.groups, 124 | self.deformable_groups * 3 * self.kernel_size[0] * 125 | self.kernel_size[1], 126 | kernel_size=self.kernel_size, 127 | stride=_pair(self.stride), 128 | padding=_pair(self.padding), 129 | bias=True) 130 | self.init_offset() 131 | 132 | def init_offset(self): 133 | self.conv_offset_mask.weight.data.zero_() 134 | self.conv_offset_mask.bias.data.zero_() 135 | 136 | def forward(self, input): 137 | out = self.conv_offset_mask(input) 138 | o1, o2, mask = torch.chunk(out, 3, dim=1) 139 | offset = torch.cat((o1, o2), dim=1) 140 | mask = torch.sigmoid(mask) 141 | return modulated_deform_conv( 142 | input, offset, mask, self.weight, self.bias, self.stride, 143 | self.padding, self.dilation, self.groups, self.deformable_groups) 144 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | 
if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 
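For a default installation this typically resolves to {'home': '/usr/local/cuda', 'nvcc': '/usr/local/cuda/bin/nvcc', 'include': '/usr/local/cuda/include', 'lib64': '/usr/local/cuda/lib64'}.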
34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', 44 | os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError( 47 | 'The nvcc binary could not be ' 48 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME' 49 | ) 50 | home = os.path.dirname(os.path.dirname(nvcc)) 51 | 52 | cudaconfig = { 53 | 'home': home, 54 | 'nvcc': nvcc, 55 | 'include': pjoin(home, 'include'), 56 | 'lib64': pjoin(home, 'lib64') 57 | } 58 | for k, v in cudaconfig.items(): 59 | if not os.path.exists(v): 60 | raise EnvironmentError( 61 | 'The CUDA %s path could not be located in %s' % (k, v)) 62 | 63 | return cudaconfig 64 | 65 | 66 | CUDA = locate_cuda() 67 | 68 | # Obtain the numpy include directory. This logic works across numpy versions. 69 | try: 70 | numpy_include = np.get_include() 71 | except AttributeError: 72 | numpy_include = np.get_numpy_include() 73 | 74 | 75 | def customize_compiler_for_nvcc(self): 76 | """inject deep into distutils to customize how the dispatch 77 | to gcc/nvcc works. 78 | 79 | If you subclass UnixCCompiler, it's not trivial to get your subclass 80 | injected in, and still have the right customizations (i.e. 81 | distutils.sysconfig.customize_compiler) run on it. So instead of going 82 | the OO route, I have this. Note, it's kindof like a wierd functional 83 | subclassing going on.""" 84 | 85 | # tell the compiler it can processes .cu 86 | self.src_extensions.append('.cu') 87 | 88 | # save references to the default compiler_so and _comple methods 89 | default_compiler_so = self.compiler_so 90 | super = self._compile 91 | 92 | # now redefine the _compile method. This gets executed for each 93 | # object but distutils doesn't have the ability to change compilers 94 | # based on source extension: we add it. 
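# extra_postargs arrives here as the per-compiler dict passed via
# extra_compile_args in ext_modules below, i.e. {'gcc': [...], 'nvcc': [...]}.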
95 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 96 | print(extra_postargs) 97 | if os.path.splitext(src)[1] == '.cu': 98 | # use the cuda for .cu files 99 | self.set_executable('compiler_so', CUDA['nvcc']) 100 | # use only a subset of the extra_postargs, which are 1-1 translated 101 | # from the extra_compile_args in the Extension class 102 | postargs = extra_postargs['nvcc'] 103 | else: 104 | postargs = extra_postargs['gcc'] 105 | 106 | super(obj, src, ext, cc_args, postargs, pp_opts) 107 | # reset the default compiler_so, which we might have changed for cuda 108 | self.compiler_so = default_compiler_so 109 | 110 | # inject our redefined _compile method into the class 111 | self._compile = _compile 112 | 113 | 114 | # run the customize_compiler 115 | class custom_build_ext(build_ext): 116 | def build_extensions(self): 117 | customize_compiler_for_nvcc(self.compiler) 118 | build_ext.build_extensions(self) 119 | 120 | 121 | ext_modules = [ 122 | Extension( 123 | "nms.cpu_nms", ["nms/cpu_nms.pyx"], 124 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 125 | include_dirs=[numpy_include]), 126 | Extension( 127 | 'nms.gpu_nms', 128 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 129 | library_dirs=[CUDA['lib64']], 130 | libraries=['cudart'], 131 | language='c++', 132 | runtime_library_dirs=[CUDA['lib64']], 133 | # this syntax is specific to this build system 134 | # we're only going to use certain compiler args with nvcc and not with gcc 135 | # the implementation of this trick is in customize_compiler() below 136 | extra_compile_args={ 137 | 'gcc': ["-Wno-unused-function"], 138 | 'nvcc': [ 139 | '-arch=sm_61', '--ptxas-options=-v', '-c', 140 | '--compiler-options', "'-fPIC'" 141 | ] 142 | }, 143 | include_dirs=[numpy_include, CUDA['include']]) 144 | ] 145 | 146 | setup( 147 | name='mot_utils', 148 | ext_modules=ext_modules, 149 | # inject our custom trigger 150 | cmdclass={'build_ext': custom_build_ext}, 151 | ) 152 | -------------------------------------------------------------------------------- /utils/get_class_map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | import os.path as osp 5 | 6 | 7 | def check_size(submit_file): 8 | max_size = 60 * 1024 * 1024 9 | if osp.getsize(submit_file) > max_size: 10 | raise ( 11 | IOError, 12 | "File size exceeds the specified maximum size, which is 60M for the server." 
13 | ) 14 | 15 | 16 | def parse_submission(submit_file): 17 | with open(submit_file, 'r') as f: 18 | lines = f.readlines() 19 | submit_dict = dict() 20 | final_dict = dict() 21 | splitlines = [x.strip().split(' ') for x in lines] 22 | for idx, val in enumerate(splitlines): 23 | cls = str(int(float(val[1]))) 24 | if cls not in submit_dict: 25 | submit_dict[cls] = list() 26 | final_dict[cls] = dict() 27 | submit_dict[cls].append( 28 | [val[0], val[2], val[3], val[4], val[5], val[6]]) 29 | for k, v in submit_dict.items(): 30 | image_ids = [x[0] for x in v] 31 | confidence = np.array([float(x[1]) for x in v]) 32 | BB = np.array([[float(z) for z in x[2:]] for x in v]) 33 | sorted_ind = np.argsort(-confidence) 34 | sorted_scores = np.sort(-confidence) 35 | BB = BB[sorted_ind, :] 36 | image_ids = [image_ids[x] for x in sorted_ind] 37 | final_dict[k]["image_ids"] = image_ids 38 | final_dict[k]["BB"] = np.array(BB) 39 | return final_dict 40 | 41 | 42 | def parse_gt_annotation(gt_file): 43 | with open(gt_file, 'r') as f: 44 | lines = f.readlines() 45 | info = [x.strip().split() for x in lines] 46 | gt = {} 47 | for item in info: 48 | img_id = item[0] 49 | obj_struct = {} 50 | obj_struct['class'] = item[1] 51 | obj_struct['bbox'] = [ 52 | int(item[2]), 53 | int(item[3]), 54 | int(item[4]), 55 | int(item[5]) 56 | ] 57 | if img_id not in gt: 58 | gt[img_id] = list() 59 | gt[img_id].append(obj_struct) 60 | return gt 61 | 62 | 63 | def get_class_recs(recs, classname): 64 | npos = 0 65 | class_recs = {} 66 | for key in recs.keys(): 67 | R = [obj for obj in recs[key] if obj['class'] == classname] 68 | bbox = np.array([x['bbox'] for x in R]) 69 | det = [False] * len(R) 70 | npos += len(R) 71 | class_recs[key] = {'bbox': bbox, 'det': det} 72 | return class_recs, npos 73 | 74 | 75 | def compute_ap(rec, prec): 76 | mrec = np.concatenate(([0.], rec, [1.])) 77 | mpre = np.concatenate(([0.], prec, [0.])) 78 | for i in range(mpre.size - 1, 0, -1): 79 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 80 | i = np.where(mrec[1:] != mrec[:-1])[0] 81 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 82 | return ap 83 | 84 | 85 | def eval(submit_file, gt_file, ovthresh, classname): 86 | recs = parse_gt_annotation(gt_file) 87 | submit_result = parse_submission(submit_file) 88 | # get one class result 89 | class_recs, npos = get_class_recs(recs, classname) 90 | image_ids = submit_result[classname]["image_ids"] 91 | BB = submit_result[classname]["BB"] 92 | nd = len(image_ids) 93 | tp = np.zeros(nd) 94 | fp = np.zeros(nd) 95 | for d in range(nd): 96 | if image_ids[d] not in recs.keys(): 97 | raise KeyError( 98 | "Can not find image {} in the groundtruth file, did you submit the result file for the right dataset?" 99 | .format(image_ids[d])) 100 | for d in range(nd): 101 | R = class_recs[image_ids[d]] 102 | bb = BB[d, :].astype(float) 103 | ovmax = -np.inf 104 | BBGT = R['bbox'].astype(float) 105 | if BBGT.size > 0: 106 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 107 | iymin = np.maximum(BBGT[:, 1], bb[1]) 108 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 109 | iymax = np.minimum(BBGT[:, 3], bb[3]) 110 | iw = np.maximum(ixmax - ixmin + 1., 0.) 111 | ih = np.maximum(iymax - iymin + 1., 0.) 112 | inters = iw * ih 113 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 114 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 115 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 116 | overlaps = inters / uni 117 | ovmax = np.max(overlaps) 118 | jmax = np.argmax(overlaps) 119 | if ovmax > ovthresh: 120 | if not R['det'][jmax]: 121 | tp[d] = 1. 
122 | R['det'][jmax] = 1 123 | else: 124 | fp[d] = 1. 125 | else: 126 | fp[d] = 1. 127 | fp = np.cumsum(fp) 128 | tp = np.cumsum(tp) 129 | rec = tp / float(npos) 130 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 131 | ap = compute_ap(rec, prec) 132 | return ap 133 | 134 | 135 | def result_eval(submit_file, gt, class_list): 136 | ove_aap = [] 137 | for ove in np.arange(0.5, 1.0, 0.05): 138 | cls_aap = [] 139 | for cls in class_list: 140 | ap = eval(submit_file, gt, ove, cls) 141 | cls_aap.append(ap) 142 | cls_mAP = np.average(cls_aap) 143 | print("thresh", round(ove, 3), "map", round(cls_mAP * 100, 3)) 144 | ove_aap.append(cls_mAP) 145 | mAP = np.average(ove_aap) * 100 146 | return round(mAP, 3) 147 | 148 | 149 | if __name__ == '__main__': 150 | ''' 151 | submit_file: image_id, class, score, xmin, ymin, xmax, ymax 152 | gt_file: image_id, class, xmin, ymin, xmax, ymax 153 | ''' 154 | class_list = [] 155 | for i in range(1, 61): 156 | class_list.append(str(i)) 157 | submit_file = "./results/fpn_dcn_result.csv" 158 | gt_file = "./results/val_label.txt" 159 | check_size(submit_file) 160 | mAP = result_eval(submit_file, gt_file, class_list) 161 | out = {'Average AP': str(round(mAP, 3))} 162 | print(out) -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.autograd import Variable 6 | from utils.box_utils import match, log_sum_exp 7 | from .focal_loss_softmax import FocalLossSoftmax 8 | from .focal_loss_sigmoid import FocalLossSigmoid 9 | 10 | GPU = False 11 | if torch.cuda.is_available(): 12 | GPU = True 13 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 14 | 15 | 16 | class MultiBoxLoss(nn.Module): 17 | """SSD Weighted Loss Function 18 | Compute Targets: 19 | 1) Produce Confidence Target Indices by matching ground truth boxes 20 | with (default) 'priorboxes' that have jaccard index > threshold parameter 21 | (default threshold: 0.5). 22 | 2) Produce localization target by 'encoding' variance into offsets of ground 23 | truth boxes and their matched 'priorboxes'. 24 | 3) Hard negative mining to filter the excessive number of negative examples 25 | that comes with using a large number of default bounding boxes. 26 | (default negative:positive ratio 3:1) 27 | Objective Loss: 28 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 29 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 30 | weighted by α which is set to 1 by cross val. 31 | Args: 32 | c: class confidences, 33 | l: predicted boxes, 34 | g: ground truth boxes 35 | N: number of matched default boxes 36 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
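The hard negative mining performed in the forward pass below relies on a double-argsort rank trick that is easy to misread. A toy, self-contained sketch of just that selection step (the numbers are made up and the snippet is illustrative, not part of this module):

import torch

loss_hard = torch.tensor([[0.2, 1.5, 0.1, 0.9, 0.05, 0.7]])      # per-prior conf loss, one image
pos = torch.tensor([[False, False, True, False, False, False]])  # prior 2 matched a ground-truth box
loss_hard[pos] = 0                                  # positives are excluded from mining

_, loss_idx = loss_hard.sort(1, descending=True)    # priors ordered by loss
_, idx_rank = loss_idx.sort(1)                      # rank of each prior in that ordering

num_pos = pos.long().sum(1, keepdim=True)                  # tensor([[1]])
num_neg = torch.clamp(3 * num_pos, max=pos.size(1) - 1)    # 3:1 negative:positive ratio
neg = idx_rank < num_neg.expand_as(idx_rank)

print(neg)   # tensor([[False,  True, False,  True, False,  True]]) -> priors 1, 3, 5 kept as negatives

Sorting once gives the order of priors by loss; sorting the resulting indices again gives each prior's rank in that order, so `idx_rank < num_neg` keeps exactly the `num_neg` hardest negatives per image.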
37 | """ 38 | 39 | def __init__(self, cfg): 40 | super(MultiBoxLoss, self).__init__() 41 | self.cfg = cfg 42 | self.size = cfg.MODEL.SIZE 43 | if self.size == '300': 44 | size_cfg = cfg.SMALL 45 | else: 46 | size_cfg = cfg.BIG 47 | self.variance = size_cfg.VARIANCE 48 | self.num_classes = cfg.MODEL.NUM_CLASSES 49 | self.threshold = cfg.TRAIN.OVERLAP 50 | self.OHEM = cfg.TRAIN.OHEM 51 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 52 | self.variance = size_cfg.VARIANCE 53 | if cfg.TRAIN.FOCAL_LOSS: 54 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 55 | self.focaloss = FocalLossSoftmax( 56 | self.num_classes, gamma=2, size_average=False) 57 | else: 58 | self.focaloss = FocalLossSigmoid() 59 | 60 | def forward(self, predictions, targets): 61 | """Multibox Loss 62 | Args: 63 | predictions (tuple): A tuple containing loc preds, conf preds, 64 | and prior boxes from SSD net. 65 | conf shape: torch.size(batch_size,num_priors,num_classes) 66 | loc shape: torch.size(batch_size,num_priors,4) 67 | priors shape: torch.size(num_priors,4) 68 | 69 | ground_truth (tensor): Ground truth boxes and labels for a batch, 70 | shape: [batch_size,num_objs,5] (last idx is the label). 71 | """ 72 | loc_data, conf_data, priors = predictions 73 | num = loc_data.size(0) 74 | priors = priors[:loc_data.size(1), :] 75 | num_priors = (priors.size(0)) 76 | num_classes = self.num_classes 77 | loc_t = torch.Tensor(num, num_priors, 4) 78 | conf_t = torch.LongTensor(num, num_priors) 79 | for idx in range(num): 80 | truths = targets[idx][:, :-1].data 81 | labels = targets[idx][:, -1].data 82 | if self.num_classes == 2: 83 | labels = labels > 0 84 | defaults = priors.data 85 | match(self.threshold, truths, defaults, self.variance, labels, 86 | loc_t, conf_t, idx) 87 | loc_t = loc_t.cuda() 88 | conf_t = conf_t.cuda() 89 | 90 | pos = conf_t > 0 91 | num_pos = pos.sum(1, keepdim=True) 92 | 93 | if self.OHEM: 94 | # Compute max conf across batch for hard negative mining 95 | batch_conf = conf_data.view(-1, self.num_classes) 96 | 97 | loss_hard = log_sum_exp(batch_conf) - batch_conf.gather( 98 | 1, conf_t.view(-1, 1)) 99 | # Hard Negative Mining 100 | loss_hard[pos.view(-1, 1)] = 0 # filter out pos boxes for now 101 | loss_hard = loss_hard.view(num, -1) 102 | _, loss_idx = loss_hard.sort(1, descending=True) 103 | _, idx_rank = loss_idx.sort(1) 104 | num_pos = pos.long().sum(1, keepdim=True) 105 | if num_pos.data.sum() > 0: 106 | num_neg = torch.clamp( 107 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 108 | else: 109 | fake_num_pos = torch.ones(32, 1).long() * 15 110 | num_neg = torch.clamp( 111 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 112 | neg = idx_rank < num_neg.expand_as(idx_rank) 113 | 114 | # Confidence Loss Including Positive and Negative Examples 115 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 116 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 117 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 118 | -1, self.num_classes) 119 | targets_weighted = conf_t[(pos + neg).gt(0)] 120 | loss_c = F.cross_entropy( 121 | conf_p, targets_weighted, size_average=False) 122 | else: 123 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 124 | # Localization Loss (Smooth L1) 125 | # Shape: [batch,num_priors,4] 126 | if num_pos.data.sum() > 0: 127 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 128 | loc_p = loc_data[pos_idx].view(-1, 4) 129 | loc_t = loc_t[pos_idx].view(-1, 4) 130 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 131 | N = num_pos.data.sum() 132 | else: 133 | 
loss_l = torch.zeros(1) 134 | N = 1.0 135 | loss_l /= float(N) 136 | loss_c /= float(N) 137 | return loss_l, loss_c 138 | -------------------------------------------------------------------------------- /dcn/modules/deform_pool.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from ..functions.deform_pool import deform_roi_pooling 4 | 5 | 6 | class DeformRoIPooling(nn.Module): 7 | 8 | def __init__(self, 9 | spatial_scale, 10 | out_size, 11 | out_channels, 12 | no_trans, 13 | group_size=1, 14 | part_size=None, 15 | sample_per_part=4, 16 | trans_std=.0): 17 | super(DeformRoIPooling, self).__init__() 18 | self.spatial_scale = spatial_scale 19 | self.out_size = out_size 20 | self.out_channels = out_channels 21 | self.no_trans = no_trans 22 | self.group_size = group_size 23 | self.part_size = out_size if part_size is None else part_size 24 | self.sample_per_part = sample_per_part 25 | self.trans_std = trans_std 26 | 27 | def forward(self, data, rois, offset): 28 | if self.no_trans: 29 | offset = data.new_empty(0) 30 | return deform_roi_pooling( 31 | data, rois, offset, self.spatial_scale, self.out_size, 32 | self.out_channels, self.no_trans, self.group_size, self.part_size, 33 | self.sample_per_part, self.trans_std) 34 | 35 | 36 | class DeformRoIPoolingPack(DeformRoIPooling): 37 | 38 | def __init__(self, 39 | spatial_scale, 40 | out_size, 41 | out_channels, 42 | no_trans, 43 | group_size=1, 44 | part_size=None, 45 | sample_per_part=4, 46 | trans_std=.0, 47 | deform_fc_channels=1024): 48 | super(DeformRoIPoolingPack, 49 | self).__init__(spatial_scale, out_size, out_channels, no_trans, 50 | group_size, part_size, sample_per_part, trans_std) 51 | 52 | self.deform_fc_channels = deform_fc_channels 53 | 54 | if not no_trans: 55 | self.offset_fc = nn.Sequential( 56 | nn.Linear(self.out_size * self.out_size * self.out_channels, 57 | self.deform_fc_channels), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(self.deform_fc_channels, self.deform_fc_channels), 60 | nn.ReLU(inplace=True), 61 | nn.Linear(self.deform_fc_channels, 62 | self.out_size * self.out_size * 2)) 63 | self.offset_fc[-1].weight.data.zero_() 64 | self.offset_fc[-1].bias.data.zero_() 65 | 66 | def forward(self, data, rois): 67 | assert data.size(1) == self.out_channels 68 | if self.no_trans: 69 | offset = data.new_empty(0) 70 | return deform_roi_pooling( 71 | data, rois, offset, self.spatial_scale, self.out_size, 72 | self.out_channels, self.no_trans, self.group_size, 73 | self.part_size, self.sample_per_part, self.trans_std) 74 | else: 75 | n = rois.shape[0] 76 | offset = data.new_empty(0) 77 | x = deform_roi_pooling(data, rois, offset, self.spatial_scale, 78 | self.out_size, self.out_channels, True, 79 | self.group_size, self.part_size, 80 | self.sample_per_part, self.trans_std) 81 | offset = self.offset_fc(x.view(n, -1)) 82 | offset = offset.view(n, 2, self.out_size, self.out_size) 83 | return deform_roi_pooling( 84 | data, rois, offset, self.spatial_scale, self.out_size, 85 | self.out_channels, self.no_trans, self.group_size, 86 | self.part_size, self.sample_per_part, self.trans_std) 87 | 88 | 89 | class ModulatedDeformRoIPoolingPack(DeformRoIPooling): 90 | 91 | def __init__(self, 92 | spatial_scale, 93 | out_size, 94 | out_channels, 95 | no_trans, 96 | group_size=1, 97 | part_size=None, 98 | sample_per_part=4, 99 | trans_std=.0, 100 | deform_fc_channels=1024): 101 | super(ModulatedDeformRoIPoolingPack, self).__init__( 102 | spatial_scale, out_size, out_channels, no_trans, 
group_size, 103 | part_size, sample_per_part, trans_std) 104 | 105 | self.deform_fc_channels = deform_fc_channels 106 | 107 | if not no_trans: 108 | self.offset_fc = nn.Sequential( 109 | nn.Linear(self.out_size * self.out_size * self.out_channels, 110 | self.deform_fc_channels), 111 | nn.ReLU(inplace=True), 112 | nn.Linear(self.deform_fc_channels, self.deform_fc_channels), 113 | nn.ReLU(inplace=True), 114 | nn.Linear(self.deform_fc_channels, 115 | self.out_size * self.out_size * 2)) 116 | self.offset_fc[-1].weight.data.zero_() 117 | self.offset_fc[-1].bias.data.zero_() 118 | self.mask_fc = nn.Sequential( 119 | nn.Linear(self.out_size * self.out_size * self.out_channels, 120 | self.deform_fc_channels), 121 | nn.ReLU(inplace=True), 122 | nn.Linear(self.deform_fc_channels, 123 | self.out_size * self.out_size * 1), 124 | nn.Sigmoid()) 125 | self.mask_fc[2].weight.data.zero_() 126 | self.mask_fc[2].bias.data.zero_() 127 | 128 | def forward(self, data, rois): 129 | assert data.size(1) == self.out_channels 130 | if self.no_trans: 131 | offset = data.new_empty(0) 132 | return deform_roi_pooling( 133 | data, rois, offset, self.spatial_scale, self.out_size, 134 | self.out_channels, self.no_trans, self.group_size, 135 | self.part_size, self.sample_per_part, self.trans_std) 136 | else: 137 | n = rois.shape[0] 138 | offset = data.new_empty(0) 139 | x = deform_roi_pooling(data, rois, offset, self.spatial_scale, 140 | self.out_size, self.out_channels, True, 141 | self.group_size, self.part_size, 142 | self.sample_per_part, self.trans_std) 143 | offset = self.offset_fc(x.view(n, -1)) 144 | offset = offset.view(n, 2, self.out_size, self.out_size) 145 | mask = self.mask_fc(x.view(n, -1)) 146 | mask = mask.view(n, 1, self.out_size, self.out_size) 147 | return deform_roi_pooling( 148 | data, rois, offset, self.spatial_scale, self.out_size, 149 | self.out_channels, self.no_trans, self.group_size, 150 | self.part_size, self.sample_per_part, self.trans_std) * mask 151 | -------------------------------------------------------------------------------- /dcn/src/deform_pool_cuda.cpp: -------------------------------------------------------------------------------- 1 | // author: Charles Shang 2 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 3 | 4 | // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob /mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | void DeformablePSROIPoolForward(const at::Tensor data, 12 | const at::Tensor bbox, 13 | const at::Tensor trans, 14 | at::Tensor out, 15 | at::Tensor top_count, 16 | const int batch, 17 | const int channels, 18 | const int height, 19 | const int width, 20 | const int num_bbox, 21 | const int channels_trans, 22 | const int no_trans, 23 | const float spatial_scale, 24 | const int output_dim, 25 | const int group_size, 26 | const int pooled_size, 27 | const int part_size, 28 | const int sample_per_part, 29 | const float trans_std); 30 | 31 | void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, 32 | const at::Tensor data, 33 | const at::Tensor bbox, 34 | const at::Tensor trans, 35 | const at::Tensor top_count, 36 | at::Tensor in_grad, 37 | at::Tensor trans_grad, 38 | const int batch, 39 | const int channels, 40 | const int height, 41 | const int width, 42 | const int num_bbox, 43 | const int channels_trans, 44 | const int no_trans, 45 | const float spatial_scale, 46 | const int output_dim, 47 | const int group_size, 48 | const int 
pooled_size, 49 | const int part_size, 50 | const int sample_per_part, 51 | const float trans_std); 52 | 53 | void deform_psroi_pooling_cuda_forward(at::Tensor input, at::Tensor bbox, 54 | at::Tensor trans, 55 | at::Tensor out, at::Tensor top_count, 56 | const int no_trans, 57 | const float spatial_scale, 58 | const int output_dim, 59 | const int group_size, 60 | const int pooled_size, 61 | const int part_size, 62 | const int sample_per_part, 63 | const float trans_std) 64 | { 65 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 66 | 67 | const int batch = input.size(0); 68 | const int channels = input.size(1); 69 | const int height = input.size(2); 70 | const int width = input.size(3); 71 | const int channels_trans = no_trans ? 2 : trans.size(1); 72 | 73 | const int num_bbox = bbox.size(0); 74 | if (num_bbox != out.size(0)) 75 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 76 | out.size(0), num_bbox); 77 | 78 | DeformablePSROIPoolForward(input, bbox, trans, out, top_count, 79 | batch, channels, height, width, 80 | num_bbox, 81 | channels_trans, 82 | no_trans, 83 | spatial_scale, 84 | output_dim, 85 | group_size, 86 | pooled_size, 87 | part_size, 88 | sample_per_part, 89 | trans_std); 90 | } 91 | 92 | void deform_psroi_pooling_cuda_backward(at::Tensor out_grad, 93 | at::Tensor input, at::Tensor bbox, 94 | at::Tensor trans, at::Tensor top_count, 95 | at::Tensor input_grad, at::Tensor trans_grad, 96 | const int no_trans, 97 | const float spatial_scale, 98 | const int output_dim, 99 | const int group_size, 100 | const int pooled_size, 101 | const int part_size, 102 | const int sample_per_part, 103 | const float trans_std) 104 | { 105 | AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); 106 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 107 | 108 | const int batch = input.size(0); 109 | const int channels = input.size(1); 110 | const int height = input.size(2); 111 | const int width = input.size(3); 112 | const int channels_trans = no_trans ? 
2 : trans.size(1); 113 | 114 | const int num_bbox = bbox.size(0); 115 | if (num_bbox != out_grad.size(0)) 116 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 117 | out_grad.size(0), num_bbox); 118 | 119 | DeformablePSROIPoolBackwardAcc(out_grad, 120 | input, 121 | bbox, 122 | trans, 123 | top_count, 124 | input_grad, 125 | trans_grad, 126 | batch, channels, height, width, num_bbox, 127 | channels_trans, 128 | no_trans, 129 | spatial_scale, 130 | output_dim, 131 | group_size, 132 | pooled_size, 133 | part_size, 134 | sample_per_part, 135 | trans_std); 136 | } 137 | 138 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 139 | { 140 | m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, 141 | "deform psroi pooling forward(CUDA)"); 142 | m.def("deform_psroi_pooling_cuda_backward", &deform_psroi_pooling_cuda_backward, 143 | "deform psroi pooling backward(CUDA)"); 144 | } -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | def conv3x3(in_planes, out_planes, stride=1): 31 | "3x3 convolution with padding" 32 | return nn.Conv2d( 33 | in_planes, 34 | out_planes, 35 | kernel_size=3, 36 | stride=stride, 37 | padding=1, 38 | bias=False) 39 | 40 | 41 | class BasicBlock(nn.Module): 42 | expansion = 1 43 | 44 | def __init__(self, inplanes, planes, stride=1, downsample=None): 45 | super(BasicBlock, self).__init__() 46 | self.conv1 = conv3x3(inplanes, planes, stride) 47 | self.bn1 = nn.BatchNorm2d(planes) 48 | self.relu = nn.ReLU(inplace=True) 49 | self.conv2 = conv3x3(planes, planes) 50 | self.bn2 = nn.BatchNorm2d(planes) 51 | self.downsample = downsample 52 | self.stride = stride 53 | 54 | def forward(self, x): 55 | residual = x 56 | 57 | out = self.conv1(x) 58 | out = self.bn1(out) 59 | out = self.relu(out) 60 | 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | 64 | if self.downsample is not None: 65 | residual = self.downsample(x) 66 | out += residual 67 | out = self.relu(out) 68 | 69 | return out 70 | 71 | 72 | class Bottleneck(nn.Module): 73 | expansion = 4 74 | 75 | def __init__(self, inplanes, planes, stride=1, downsample=None): 76 | super(Bottleneck, self).__init__() 77 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 78 | self.bn1 = nn.BatchNorm2d(planes) 79 | self.conv2 = nn.Conv2d( 80 | planes, 81 | planes, 
82 | kernel_size=3, 83 | stride=stride, 84 | padding=1, 85 | bias=False) 86 | self.bn2 = nn.BatchNorm2d(planes) 87 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 88 | self.bn3 = nn.BatchNorm2d(planes * 4) 89 | self.relu = nn.ReLU(inplace=True) 90 | self.downsample = downsample 91 | self.stride = stride 92 | 93 | def forward(self, x): 94 | residual = x 95 | 96 | out = self.conv1(x) 97 | out = self.bn1(out) 98 | out = self.relu(out) 99 | 100 | out = self.conv2(out) 101 | out = self.bn2(out) 102 | out = self.relu(out) 103 | 104 | out = self.conv3(out) 105 | out = self.bn3(out) 106 | 107 | if self.downsample is not None: 108 | residual = self.downsample(x) 109 | 110 | out += residual 111 | out = self.relu(out) 112 | 113 | return out 114 | 115 | 116 | class SSDResnet(nn.Module): 117 | def __init__(self, block, num_blocks, size): 118 | super(SSDResnet, self).__init__() 119 | self.inplanes = 64 120 | 121 | self.conv1 = nn.Conv2d( 122 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 123 | self.bn1 = nn.BatchNorm2d(64) 124 | 125 | # Bottom-up layers 126 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 127 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 128 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 129 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 130 | self.inchannel = block.expansion * 512 131 | self.extras = nn.ModuleList(add_extras(str(size), self.inchannel)) 132 | self.smooth1 = nn.Conv2d( 133 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 134 | self._init_modules() 135 | 136 | def _make_layer(self, block, planes, blocks, stride=1): 137 | downsample = None 138 | if stride != 1 or self.inplanes != planes * block.expansion: 139 | downsample = nn.Sequential( 140 | nn.Conv2d( 141 | self.inplanes, 142 | planes * block.expansion, 143 | kernel_size=1, 144 | stride=stride, 145 | bias=False), 146 | nn.BatchNorm2d(planes * block.expansion), 147 | ) 148 | 149 | layers = [] 150 | layers.append(block(self.inplanes, planes, stride, downsample)) 151 | self.inplanes = planes * block.expansion 152 | for i in range(1, blocks): 153 | layers.append(block(self.inplanes, planes)) 154 | 155 | return nn.Sequential(*layers) 156 | 157 | def _init_modules(self): 158 | self.extras.apply(weights_init) 159 | self.smooth1.apply(weights_init) 160 | 161 | def forward(self, x): 162 | # Bottom-up 163 | c1 = F.relu(self.bn1(self.conv1(x))) 164 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 165 | c2 = self.layer1(c1) 166 | c3 = self.layer2(c2) 167 | c4 = self.layer3(c3) 168 | c5 = self.layer4(c4) 169 | x = c5 170 | c5_ = self.smooth1(c5) 171 | sources = [c3, c4, c5_] 172 | for k, v in enumerate(self.extras): 173 | x = F.relu(v(x), inplace=True) 174 | if k % 2 == 1: 175 | sources.append(x) 176 | return sources 177 | 178 | 179 | def SSDResnet18(size, channel_size='48'): 180 | return SSDResnet(BasicBlock, [2, 2, 2, 2], size) 181 | 182 | 183 | def SSDResnet34(size, channel_size='48'): 184 | return SSDResnet(BasicBlock, [3, 4, 6, 3], size) 185 | 186 | 187 | def SSDResnet50(size, channel_size='48'): 188 | return SSDResnet(Bottleneck, [3, 4, 6, 3], size) 189 | 190 | 191 | def SSDResnet101(size, channel_size='48'): 192 | return SSDResnet(Bottleneck, [3, 4, 23, 3], size) 193 | 194 | 195 | def SSDResnet152(size, channel_size='48'): 196 | return SSDResnet(Bottleneck, [3, 8, 36, 3], size) 197 | 198 | 199 | if __name__ == "__main__": 200 | import os 201 | os.environ["CUDA_VISIBLE_DEVICES"] = 
"1" 202 | model3 = SSDResnet18(size=300) 203 | with torch.no_grad(): 204 | model3.eval() 205 | x = torch.randn(1, 3, 300, 300) 206 | model3.cuda() 207 | model3(x.cuda()) 208 | import time 209 | st = time.time() 210 | for i in range(1): 211 | model3(x.cuda()) 212 | print(time.time() - st) 213 | # print(model3(x)) 214 | -------------------------------------------------------------------------------- /models/model_builder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from layers import * 9 | import os 10 | from models.model_helper import weights_init 11 | import importlib 12 | from layers.functions.prior_layer import PriorLayer 13 | 14 | 15 | def get_func(func_name): 16 | """Helper to return a function object by name. func_name must identify a 17 | function in this module or the path to a function relative to the base 18 | 'modeling' module. 19 | """ 20 | if func_name == '': 21 | return None 22 | try: 23 | parts = func_name.split('.') 24 | # Refers to a function in this module 25 | if len(parts) == 1: 26 | return globals()[parts[0]] 27 | # Otherwise, assume we're referencing a module under modeling 28 | module_name = 'models.' + '.'.join(parts[:-1]) 29 | module = importlib.import_module(module_name) 30 | return getattr(module, parts[-1]) 31 | except Exception: 32 | print('Failed to find function: %s', func_name) 33 | raise 34 | 35 | 36 | class SSD(nn.Module): 37 | """Single Shot Multibox Architecture 38 | The network is composed of a base VGG network followed by the 39 | added multibox conv layers. Each multibox layer branches into 40 | 1) conv2d for class conf scores 41 | 2) conv2d for localization predictions 42 | 3) associated priorbox layer to produce default bounding 43 | boxes specific to the layer's feature map size. 44 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
45 | 46 | Args: 47 | phase: (string) Can be "test" or "train" 48 | base: VGG16 layers for input, size of either 300 or 500 49 | extras: extra layers that feed to multibox loc and conf layers 50 | head: "multibox head" consists of loc and conf conv layers 51 | """ 52 | 53 | def _init_modules(self): 54 | self.arm_loc.apply(weights_init) 55 | self.arm_conf.apply(weights_init) 56 | if self.cfg.MODEL.REFINE: 57 | self.odm_loc.apply(weights_init) 58 | self.odm_conf.apply(weights_init) 59 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 60 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 61 | print("load pretrain model {}".format( 62 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 63 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 64 | self.extractor.vgg.load_state_dict(weights) 65 | else: 66 | self.extractor.load_state_dict(weights, strict=False) 67 | 68 | def __init__(self, cfg): 69 | super(SSD, self).__init__() 70 | self.cfg = cfg 71 | self.size = cfg.MODEL.SIZE 72 | if self.size == '300': 73 | size_cfg = cfg.SMALL 74 | else: 75 | size_cfg = cfg.BIG 76 | self.num_classes = cfg.MODEL.NUM_CLASSES 77 | self.prior_layer = PriorLayer(cfg) 78 | self.priorbox = PriorBox(cfg) 79 | self.priors = self.priorbox.forward() 80 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 81 | cfg.TRAIN.CHANNEL_SIZE) 82 | if cfg.MODEL.REFINE: 83 | self.odm_channels = size_cfg.ODM_CHANNELS 84 | self.arm_num_classes = 2 85 | self.odm_loc = nn.ModuleList() 86 | self.odm_conf = nn.ModuleList() 87 | self.arm_loc = nn.ModuleList() 88 | self.arm_conf = nn.ModuleList() 89 | self.arm_channels = size_cfg.ARM_CHANNELS 90 | self.num_anchors = size_cfg.NUM_ANCHORS 91 | self.input_fixed = size_cfg.INPUT_FIXED 92 | self.arm_loc = nn.ModuleList() 93 | self.arm_conf = nn.ModuleList() 94 | 95 | for i in range(len(self.arm_channels)): 96 | if cfg.MODEL.REFINE: 97 | self.arm_loc += [ 98 | nn.Conv2d( 99 | self.arm_channels[i], 100 | self.num_anchors[i] * 4, 101 | kernel_size=3, 102 | padding=1) 103 | ] 104 | self.arm_conf += [ 105 | nn.Conv2d( 106 | self.arm_channels[i], 107 | self.num_anchors[i] * self.arm_num_classes, 108 | kernel_size=3, 109 | padding=1) 110 | ] 111 | 112 | self.odm_loc += [ 113 | nn.Conv2d( 114 | self.odm_channels[i], 115 | self.num_anchors[i] * 4, 116 | kernel_size=3, 117 | padding=1) 118 | ] 119 | self.odm_conf += [ 120 | nn.Conv2d( 121 | self.odm_channels[i], 122 | self.num_anchors[i] * self.num_classes, 123 | kernel_size=3, 124 | padding=1) 125 | ] 126 | else: 127 | self.arm_loc += [ 128 | nn.Conv2d( 129 | self.arm_channels[i], 130 | self.num_anchors[i] * 4, 131 | kernel_size=3, 132 | padding=1) 133 | ] 134 | self.arm_conf += [ 135 | nn.Conv2d( 136 | self.arm_channels[i], 137 | self.num_anchors[i] * self.num_classes, 138 | kernel_size=3, 139 | padding=1) 140 | ] 141 | if cfg.TRAIN.TRAIN_ON: 142 | self._init_modules() 143 | 144 | def forward(self, x): 145 | 146 | arm_loc = list() 147 | arm_conf = list() 148 | if self.cfg.MODEL.REFINE: 149 | odm_loc = list() 150 | odm_conf = list() 151 | arm_xs, odm_xs = self.extractor(x) 152 | for (x, l, c) in zip(odm_xs, self.odm_loc, self.odm_conf): 153 | # for (x) in (odm_xs): 154 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 155 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 156 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 157 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 158 | else: 159 | arm_xs, fbb, att, mm = self.extractor(x) 160 | img_wh = (x.size(3), x.size(2)) 161 | feature_maps_wh = [(t.size(3), 
t.size(2)) for t in arm_xs] 162 | for (x, l, c) in zip(arm_xs, self.arm_loc, self.arm_conf): 163 | arm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 164 | arm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 165 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 166 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 167 | if self.cfg.MODEL.REFINE: 168 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 169 | arm_conf.view( 170 | arm_conf.size(0), -1, self.arm_num_classes), 171 | odm_loc.view(odm_loc.size(0), -1, 4), 172 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 173 | self.priors if self.input_fixed else self.prior_layer( 174 | img_wh, feature_maps_wh)) 175 | else: 176 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 177 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 178 | self.priors if self.input_fixed else self.prior_layer( 179 | img_wh, feature_maps_wh)) 180 | 181 | return output, arm_xs, fbb, att, mm 182 | 183 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from torch.autograd import Variable 9 | from utils.box_utils import match, log_sum_exp, refine_match 10 | from layers.modules import WeightSoftmaxLoss, WeightSmoothL1Loss 11 | GPU = False 12 | if torch.cuda.is_available(): 13 | GPU = True 14 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 15 | 16 | 17 | class RefineMultiBoxLoss(nn.Module): 18 | """SSD Weighted Loss Function 19 | Compute Targets: 20 | 1) Produce Confidence Target Indices by matching ground truth boxes 21 | with (default) 'priorboxes' that have jaccard index > threshold parameter 22 | (default threshold: 0.5). 23 | 2) Produce localization target by 'encoding' variance into offsets of ground 24 | truth boxes and their matched 'priorboxes'. 25 | 3) Hard negative mining to filter the excessive number of negative examples 26 | that comes with using a large number of default bounding boxes. 27 | (default negative:positive ratio 3:1) 28 | Objective Loss: 29 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 30 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 31 | weighted by α which is set to 1 by cross val. 32 | Args: 33 | c: class confidences, 34 | l: predicted boxes, 35 | g: ground truth boxes 36 | N: number of matched default boxes 37 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
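One refinement-specific detail in the forward pass below: when `use_arm` and `filter_object` are both set, priors whose ARM objectness probability is at or below `cfg.MODEL.OBJECT_SCORE` are removed from the positive set before mining. A toy sketch of that masking, with made-up logits (illustrative only, not this module's tensors):

import torch
import torch.nn.functional as F

object_score = 0.01                              # plays the role of cfg.MODEL.OBJECT_SCORE
arm_conf_data = torch.tensor([[[4.0, -4.0],      # prior 0: ARM is confident it is background
                               [-3.0, 3.0],      # prior 1: ARM is confident it is an object
                               [0.0, 0.0]]])     # prior 2: undecided
conf_t = torch.tensor([[1, 2, 0]])               # matched ODM labels (0 = background)

P = F.softmax(arm_conf_data, 2)                  # per-prior (background, object) probabilities
object_score_index = P[:, :, 1] <= object_score
pos = conf_t > 0
pos[object_score_index] = 0                      # drop positives the ARM has already rejected

print(pos)   # tensor([[False,  True, False]]) -> prior 0 is discarded despite its ODM label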
38 | """ 39 | 40 | def __init__(self, cfg, num_classes): 41 | super(RefineMultiBoxLoss, self).__init__() 42 | self.cfg = cfg 43 | self.size = cfg.MODEL.SIZE 44 | if self.size == '300': 45 | size_cfg = cfg.SMALL 46 | else: 47 | size_cfg = cfg.BIG 48 | self.variance = size_cfg.VARIANCE 49 | self.num_classes = num_classes 50 | self.threshold = cfg.TRAIN.OVERLAP 51 | self.OHEM = cfg.TRAIN.OHEM 52 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 53 | self.object_score = cfg.MODEL.OBJECT_SCORE 54 | self.variance = size_cfg.VARIANCE 55 | if cfg.TRAIN.FOCAL_LOSS: 56 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 57 | self.focaloss = FocalLossSoftmax( 58 | self.num_classes, gamma=2, size_average=False) 59 | else: 60 | self.focaloss = FocalLossSigmoid() 61 | 62 | def forward(self, 63 | predictions, 64 | targets, 65 | use_arm=False, 66 | filter_object=False, 67 | debug=False): 68 | """Multibox Loss 69 | Args: 70 | predictions (tuple): A tuple containing loc preds, conf preds, 71 | and prior boxes from SSD net. 72 | conf shape: torch.size(batch_size,num_priors,num_classes) 73 | loc shape: torch.size(batch_size,num_priors,4) 74 | priors shape: torch.size(num_priors,4) 75 | 76 | ground_truth (tensor): Ground truth boxes and labels for a batch, 77 | shape: [batch_size,num_objs,5] (last idx is the label). 78 | """ 79 | # arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 80 | if use_arm: 81 | arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 82 | else: 83 | loc_data, conf_data, _, _, priors = predictions 84 | num = loc_data.size(0) 85 | priors = priors[:loc_data.size(1), :] 86 | num_priors = (priors.size(0)) 87 | num_classes = self.num_classes 88 | 89 | # match priors (default boxes) and ground truth boxes 90 | loc_t = torch.Tensor(num, num_priors, 4) 91 | conf_t = torch.LongTensor(num, num_priors) 92 | defaults = priors.data 93 | for idx in range(num): 94 | truths = targets[idx][:, :-1].data 95 | labels = targets[idx][:, -1].data 96 | if self.num_classes == 2: 97 | labels = labels > 0 98 | if use_arm: 99 | bbox_weight = refine_match( 100 | self.threshold, 101 | truths, 102 | defaults, 103 | self.variance, 104 | labels, 105 | loc_t, 106 | conf_t, 107 | idx, 108 | arm_loc_data[idx].data, 109 | use_weight=False) 110 | else: 111 | match(self.threshold, truths, defaults, self.variance, labels, 112 | loc_t, conf_t, idx) 113 | 114 | loc_t = loc_t.cuda() 115 | conf_t = conf_t.cuda() 116 | # wrap targets 117 | loc_t = Variable(loc_t, requires_grad=False) 118 | conf_t = Variable(conf_t, requires_grad=False) 119 | 120 | if use_arm and filter_object: 121 | P = F.softmax(arm_conf_data, 2) 122 | arm_conf_data_temp = P[:, :, 1] 123 | object_score_index = arm_conf_data_temp <= self.object_score 124 | pos = conf_t > 0 125 | pos[object_score_index.detach()] = 0 126 | else: 127 | pos = conf_t > 0 128 | num_pos = pos.sum(1, keepdim=True) 129 | if debug: 130 | if use_arm: 131 | print("odm pos num: ", str(loc_t.size(0)), str(loc_t.size(1))) 132 | else: 133 | print("arm pos num", str(loc_t.size(0)), str(loc_t.size(1))) 134 | 135 | if self.OHEM: 136 | # Compute max conf across batch for hard negative mining 137 | batch_conf = conf_data.view(-1, self.num_classes) 138 | 139 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather( 140 | 1, conf_t.view(-1, 1)) 141 | 142 | # Hard Negative Mining 143 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 144 | loss_c = loss_c.view(num, -1) 145 | _, loss_idx = loss_c.sort(1, descending=True) 146 | _, idx_rank = loss_idx.sort(1) 147 | num_pos = 
pos.long().sum(1, keepdim=True) 148 | 149 | if num_pos.data.sum() > 0: 150 | num_neg = torch.clamp( 151 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 152 | else: 153 | fake_num_pos = torch.ones(32, 1).long() * 15 154 | num_neg = torch.clamp( 155 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 156 | neg = idx_rank < num_neg.expand_as(idx_rank) 157 | 158 | # Confidence Loss Including Positive and Negative Examples 159 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 160 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 161 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 162 | -1, self.num_classes) 163 | 164 | targets_weighted = conf_t[(pos + neg).gt(0)] 165 | loss_c = F.cross_entropy( 166 | conf_p, targets_weighted, size_average=False) 167 | else: 168 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 169 | 170 | # Localization Loss (Smooth L1) 171 | # Shape: [batch,num_priors,4] 172 | if num_pos.data.sum() > 0: 173 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 174 | loc_p = loc_data[pos_idx].view(-1, 4) 175 | loc_t = loc_t[pos_idx].view(-1, 4) 176 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 177 | N = num_pos.data.sum() 178 | else: 179 | loss_l = torch.zeros(1) 180 | N = 1.0 181 | 182 | loss_l /= float(N) 183 | loss_c /= float(N) 184 | return loss_l, loss_c 185 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss_seperate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from torch.autograd import Variable 9 | from utils.box_utils import match, log_sum_exp, refine_match 10 | from layers.modules import WeightSoftmaxLoss, WeightSmoothL1Loss 11 | GPU = False 12 | if torch.cuda.is_available(): 13 | GPU = True 14 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 15 | 16 | 17 | class RefineMultiBoxLoss(nn.Module): 18 | """SSD Weighted Loss Function 19 | Compute Targets: 20 | 1) Produce Confidence Target Indices by matching ground truth boxes 21 | with (default) 'priorboxes' that have jaccard index > threshold parameter 22 | (default threshold: 0.5). 23 | 2) Produce localization target by 'encoding' variance into offsets of ground 24 | truth boxes and their matched 'priorboxes'. 25 | 3) Hard negative mining to filter the excessive number of negative examples 26 | that comes with using a large number of default bounding boxes. 27 | (default negative:positive ratio 3:1) 28 | Objective Loss: 29 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 30 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 31 | weighted by α which is set to 1 by cross val. 32 | Args: 33 | c: class confidences, 34 | l: predicted boxes, 35 | g: ground truth boxes 36 | N: number of matched default boxes 37 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
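When `use_arm` is true, `refine_match` below matches ground truth against priors that have already been shifted by the ARM's predicted offsets (it receives `arm_loc_data[idx]`). The actual helper lives in `utils.box_utils`, which is not listed here; the standard SSD-style decode with variances that this kind of anchor refinement builds on can be sketched generically as follows (a generic sketch under that assumption, not this repo's implementation):

import torch


def decode(loc, priors, variances):
    # Standard SSD decode kept in center-size form (cx, cy, w, h); `variances`
    # is the commonly used [0.1, 0.2] pair. Generic sketch, not utils/box_utils.py.
    cxcy = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    wh = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
    return torch.cat([cxcy, wh], 1)


priors = torch.tensor([[0.5, 0.5, 0.2, 0.2]])    # one anchor covering the image centre
arm_loc = torch.tensor([[0.5, -0.5, 0.1, 0.1]])  # made-up ARM regression output
print(decode(arm_loc, priors, [0.1, 0.2]))       # refined anchor that the ODM targets are matched against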
38 | """ 39 | 40 | def __init__(self, cfg, num_classes): 41 | super(RefineMultiBoxLoss, self).__init__() 42 | self.cfg = cfg 43 | self.size = cfg.MODEL.SIZE 44 | if self.size == '300': 45 | size_cfg = cfg.SMALL 46 | else: 47 | size_cfg = cfg.BIG 48 | self.variance = size_cfg.VARIANCE 49 | self.num_classes = num_classes 50 | self.threshold = cfg.TRAIN.OVERLAP 51 | self.OHEM = cfg.TRAIN.OHEM 52 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 53 | self.object_score = cfg.MODEL.OBJECT_SCORE 54 | self.variance = size_cfg.VARIANCE 55 | if cfg.TRAIN.FOCAL_LOSS: 56 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 57 | self.focaloss = FocalLossSoftmax( 58 | self.num_classes, gamma=2, size_average=False) 59 | else: 60 | self.focaloss = FocalLossSigmoid() 61 | 62 | def forward(self, 63 | predictions, 64 | targets, 65 | use_arm=False, 66 | filter_object=False, 67 | debug=False): 68 | """Multibox Loss 69 | Args: 70 | predictions (tuple): A tuple containing loc preds, conf preds, 71 | and prior boxes from SSD net. 72 | conf shape: torch.size(batch_size,num_priors,num_classes) 73 | loc shape: torch.size(batch_size,num_priors,4) 74 | priors shape: torch.size(num_priors,4) 75 | 76 | ground_truth (tensor): Ground truth boxes and labels for a batch, 77 | shape: [batch_size,num_objs,5] (last idx is the label). 78 | """ 79 | # arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 80 | if use_arm: 81 | arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 82 | else: 83 | loc_data, conf_data, _, _, priors = predictions 84 | num = loc_data.size(0) 85 | priors = priors[:loc_data.size(1), :] 86 | num_priors = (priors.size(0)) 87 | num_classes = self.num_classes 88 | 89 | 90 | 91 | # match priors (default boxes) and ground truth boxes 92 | loc_t = torch.Tensor(num, num_priors, 4) 93 | conf_t = torch.LongTensor(num, num_priors) 94 | defaults = priors.data 95 | for idx in range(num): 96 | truths = targets[idx][:, :-1].data 97 | labels = targets[idx][:, -1].data 98 | if self.num_classes == 2: 99 | labels = labels > 0 100 | if use_arm: 101 | bbox_weight = refine_match( 102 | self.threshold, 103 | truths, 104 | defaults, 105 | self.variance, 106 | labels, 107 | loc_t, 108 | conf_t, 109 | idx, 110 | arm_loc_data[idx].data, 111 | use_weight=False) 112 | else: 113 | match(self.threshold, truths, defaults, self.variance, labels, 114 | loc_t, conf_t, idx) 115 | 116 | loc_t = loc_t.cuda() 117 | conf_t = conf_t.cuda() 118 | # wrap targets 119 | loc_t = Variable(loc_t, requires_grad=False) 120 | conf_t = Variable(conf_t, requires_grad=False) 121 | 122 | if use_arm and filter_object: 123 | P = F.softmax(arm_conf_data, 2) 124 | arm_conf_data_temp = P[:, :, 1] 125 | object_score_index = arm_conf_data_temp <= self.object_score 126 | pos = conf_t > 0 127 | pos[object_score_index.detach()] = 0 128 | else: 129 | pos = conf_t > 0 130 | num_pos = pos.sum(1, keepdim=True) 131 | if debug: 132 | if use_arm: 133 | print("odm pos num: ", str(loc_t.size(0)), str(loc_t.size(1))) 134 | else: 135 | print("arm pos num", str(loc_t.size(0)), str(loc_t.size(1))) 136 | 137 | if self.OHEM: 138 | # Compute max conf across batch for hard negative mining 139 | batch_conf = conf_data.view(-1, self.num_classes) 140 | 141 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather( 142 | 1, conf_t.view(-1, 1)) 143 | 144 | # Hard Negative Mining 145 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 146 | loss_c = loss_c.view(num, -1) 147 | _, loss_idx = loss_c.sort(1, descending=True) 148 | _, idx_rank = loss_idx.sort(1) 149 | 
num_pos = pos.long().sum(1, keepdim=True) 150 | 151 | if num_pos.data.sum() > 0: 152 | num_neg = torch.clamp( 153 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 154 | else: 155 | fake_num_pos = torch.ones(32, 1).long() * 15 156 | num_neg = torch.clamp( 157 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 158 | neg = idx_rank < num_neg.expand_as(idx_rank) 159 | 160 | # Confidence Loss Including Positive and Negative Examples 161 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 162 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 163 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 164 | -1, self.num_classes) 165 | 166 | targets_weighted = conf_t[(pos + neg).gt(0)] 167 | loss_c = F.cross_entropy( 168 | conf_p, targets_weighted, size_average=False) 169 | else: 170 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 171 | 172 | # Localization Loss (Smooth L1) 173 | # Shape: [batch,num_priors,4] 174 | if num_pos.data.sum() > 0: 175 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 176 | loc_p = loc_data[pos_idx].view(-1, 4) 177 | loc_t = loc_t[pos_idx].view(-1, 4) 178 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 179 | N = num_pos.data.sum() 180 | else: 181 | loss_l = torch.zeros(1) 182 | N = 1.0 183 | 184 | loss_l /= float(N) 185 | loss_c /= float(N) 186 | return loss_l, loss_c 187 | -------------------------------------------------------------------------------- /dcn/functions/deform_conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from torch.nn.modules.utils import _pair 4 | 5 | from .. import deform_conv_cuda 6 | 7 | 8 | class DeformConvFunction(Function): 9 | 10 | @staticmethod 11 | def forward(ctx, 12 | input, 13 | offset, 14 | weight, 15 | stride=1, 16 | padding=0, 17 | dilation=1, 18 | groups=1, 19 | deformable_groups=1, 20 | im2col_step=64): 21 | if input is not None and input.dim() != 4: 22 | raise ValueError( 23 | "Expected 4D tensor as input, got {}D tensor instead.".format( 24 | input.dim())) 25 | ctx.stride = _pair(stride) 26 | ctx.padding = _pair(padding) 27 | ctx.dilation = _pair(dilation) 28 | ctx.groups = groups 29 | ctx.deformable_groups = deformable_groups 30 | ctx.im2col_step = im2col_step 31 | 32 | ctx.save_for_backward(input, offset, weight) 33 | 34 | output = input.new_empty( 35 | DeformConvFunction._output_size(input, weight, ctx.padding, 36 | ctx.dilation, ctx.stride)) 37 | 38 | ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones 39 | 40 | if not input.is_cuda: 41 | raise NotImplementedError 42 | else: 43 | cur_im2col_step = min(ctx.im2col_step, input.shape[0]) 44 | assert (input.shape[0] % 45 | cur_im2col_step) == 0, 'im2col step must divide batchsize' 46 | deform_conv_cuda.deform_conv_forward_cuda( 47 | input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], 48 | weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], 49 | ctx.padding[1], ctx.padding[0], ctx.dilation[1], 50 | ctx.dilation[0], ctx.groups, ctx.deformable_groups, 51 | cur_im2col_step) 52 | return output 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, offset, weight = ctx.saved_tensors 57 | 58 | grad_input = grad_offset = grad_weight = None 59 | 60 | if not grad_output.is_cuda: 61 | raise NotImplementedError 62 | else: 63 | cur_im2col_step = min(ctx.im2col_step, input.shape[0]) 64 | assert (input.shape[0] % 65 | cur_im2col_step) == 0, 'im2col step must divide batchsize' 66 | 67 | if 
ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: 68 | grad_input = torch.zeros_like(input) 69 | grad_offset = torch.zeros_like(offset) 70 | deform_conv_cuda.deform_conv_backward_input_cuda( 71 | input, offset, grad_output, grad_input, 72 | grad_offset, weight, ctx.bufs_[0], weight.size(3), 73 | weight.size(2), ctx.stride[1], ctx.stride[0], 74 | ctx.padding[1], ctx.padding[0], ctx.dilation[1], 75 | ctx.dilation[0], ctx.groups, ctx.deformable_groups, 76 | cur_im2col_step) 77 | 78 | if ctx.needs_input_grad[2]: 79 | grad_weight = torch.zeros_like(weight) 80 | deform_conv_cuda.deform_conv_backward_parameters_cuda( 81 | input, offset, grad_output, 82 | grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), 83 | weight.size(2), ctx.stride[1], ctx.stride[0], 84 | ctx.padding[1], ctx.padding[0], ctx.dilation[1], 85 | ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, 86 | cur_im2col_step) 87 | 88 | return (grad_input, grad_offset, grad_weight, None, None, None, None, 89 | None) 90 | 91 | @staticmethod 92 | def _output_size(input, weight, padding, dilation, stride): 93 | channels = weight.size(0) 94 | output_size = (input.size(0), channels) 95 | for d in range(input.dim() - 2): 96 | in_size = input.size(d + 2) 97 | pad = padding[d] 98 | kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 99 | stride_ = stride[d] 100 | output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) 101 | if not all(map(lambda s: s > 0, output_size)): 102 | raise ValueError( 103 | "convolution input is too small (output would be {})".format( 104 | 'x'.join(map(str, output_size)))) 105 | return output_size 106 | 107 | 108 | class ModulatedDeformConvFunction(Function): 109 | 110 | @staticmethod 111 | def forward(ctx, 112 | input, 113 | offset, 114 | mask, 115 | weight, 116 | bias=None, 117 | stride=1, 118 | padding=0, 119 | dilation=1, 120 | groups=1, 121 | deformable_groups=1): 122 | ctx.stride = stride 123 | ctx.padding = padding 124 | ctx.dilation = dilation 125 | ctx.groups = groups 126 | ctx.deformable_groups = deformable_groups 127 | ctx.with_bias = bias is not None 128 | if not ctx.with_bias: 129 | bias = input.new_empty(1) # fake tensor 130 | if not input.is_cuda: 131 | raise NotImplementedError 132 | if weight.requires_grad or mask.requires_grad or offset.requires_grad \ 133 | or input.requires_grad: 134 | ctx.save_for_backward(input, offset, mask, weight, bias) 135 | output = input.new_empty( 136 | ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) 137 | ctx._bufs = [input.new_empty(0), input.new_empty(0)] 138 | deform_conv_cuda.modulated_deform_conv_cuda_forward( 139 | input, weight, bias, ctx._bufs[0], offset, mask, output, 140 | ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, 141 | ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, 142 | ctx.groups, ctx.deformable_groups, ctx.with_bias) 143 | return output 144 | 145 | @staticmethod 146 | def backward(ctx, grad_output): 147 | if not grad_output.is_cuda: 148 | raise NotImplementedError 149 | input, offset, mask, weight, bias = ctx.saved_tensors 150 | grad_input = torch.zeros_like(input) 151 | grad_offset = torch.zeros_like(offset) 152 | grad_mask = torch.zeros_like(mask) 153 | grad_weight = torch.zeros_like(weight) 154 | grad_bias = torch.zeros_like(bias) 155 | deform_conv_cuda.modulated_deform_conv_cuda_backward( 156 | input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], 157 | grad_input, grad_weight, grad_bias, grad_offset, grad_mask, 158 | grad_output, weight.shape[2], weight.shape[3], ctx.stride, 
159 | ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, 160 | ctx.groups, ctx.deformable_groups, ctx.with_bias) 161 | if not ctx.with_bias: 162 | grad_bias = None 163 | 164 | return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, 165 | None, None, None, None, None) 166 | 167 | @staticmethod 168 | def _infer_shape(ctx, input, weight): 169 | n = input.size(0) 170 | channels_out = weight.size(0) 171 | height, width = input.shape[2:4] 172 | kernel_h, kernel_w = weight.shape[2:4] 173 | height_out = (height + 2 * ctx.padding - 174 | (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 175 | width_out = (width + 2 * ctx.padding - 176 | (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 177 | return n, channels_out, height_out, width_out 178 | 179 | 180 | deform_conv = DeformConvFunction.apply 181 | modulated_deform_conv = ModulatedDeformConvFunction.apply 182 | -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | cdef inline np.float32_t abs(np.float32_t a, np.float32_t b): 18 | return a - b if a >= b else b - a 19 | 20 | def get_iou_weights(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold, float init_weight): 21 | 22 | cdef: 23 | int num = ious.shape[0] 24 | # np.ndarray[np.float32_t, ndim=1] out = np.zeros(num, dtype=np.float) 25 | int idx 26 | float iou 27 | float weight 28 | 29 | for idx, iou in enumerate(ious): 30 | weight = init_weight 31 | if iou > 0.0: 32 | if iou > threshold + 0.1: 33 | weight += 1.0 34 | elif iou < threshold - 0.1: 35 | weight += 1.0 36 | else: 37 | weight += 0.0 38 | ious[idx] = weight 39 | return ious 40 | 41 | def get_mask(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold): 42 | cdef: 43 | int num = ious.shape[0] 44 | int idx = 0 45 | float distance 46 | float iou 47 | np.ndarray[np.int64_t, ndim=1] out = np.zeros((num), dtype=np.int64) 48 | for idx, iou in enumerate(ious): 49 | # if iou >= threshold: 50 | # distance = iou - threshold 51 | # if distance < 0.1: 52 | # out[idx] = 0 53 | # elif distance < 0.2: 54 | # out[idx] = 1 55 | # else: 56 | # out[idx] = 2 57 | # else: 58 | # distance = threshold - iou 59 | # if distance < 0.1: 60 | # out[idx] = 2 61 | # elif distance < 0.2: 62 | # out[idx] = 1 63 | # else: 64 | # out[idx] = 0 65 | distance = abs(iou, threshold) 66 | if distance < 0.1: 67 | # out[:,2] = 1 68 | out[idx] = 2 69 | elif distance < 0.2: 70 | # out[:,1] = 1 71 | out[idx] = 1 72 | else: 73 | # out[:,0] = 0 74 | out[idx] = 0 75 | return out 76 | 77 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 78 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 79 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 80 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 81 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 82 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 83 | 84 | cdef 
np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 85 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 86 | 87 | cdef int ndets = dets.shape[0] 88 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 89 | np.zeros((ndets), dtype=np.int) 90 | 91 | # nominal indices 92 | cdef int _i, _j 93 | # sorted indices 94 | cdef int i, j 95 | # temp variables for box i's (the box currently under consideration) 96 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 97 | # variables for computing overlap with box j (lower scoring box) 98 | cdef np.float32_t xx1, yy1, xx2, yy2 99 | cdef np.float32_t w, h 100 | cdef np.float32_t inter, ovr 101 | 102 | keep = [] 103 | for _i in range(ndets): 104 | i = order[_i] 105 | if suppressed[i] == 1: 106 | continue 107 | keep.append(i) 108 | ix1 = x1[i] 109 | iy1 = y1[i] 110 | ix2 = x2[i] 111 | iy2 = y2[i] 112 | iarea = areas[i] 113 | for _j in range(_i + 1, ndets): 114 | j = order[_j] 115 | if suppressed[j] == 1: 116 | continue 117 | xx1 = max(ix1, x1[j]) 118 | yy1 = max(iy1, y1[j]) 119 | xx2 = min(ix2, x2[j]) 120 | yy2 = min(iy2, y2[j]) 121 | w = max(0.0, xx2 - xx1 + 1) 122 | h = max(0.0, yy2 - yy1 + 1) 123 | inter = w * h 124 | ovr = inter / (iarea + areas[j] - inter) 125 | if ovr >= thresh: 126 | suppressed[j] = 1 127 | 128 | return keep 129 | 130 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 131 | cdef unsigned int N = boxes.shape[0] 132 | cdef float iw, ih, box_area 133 | cdef float ua 134 | cdef int pos = 0 135 | cdef float maxscore = 0 136 | cdef int maxpos = 0 137 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 138 | 139 | for i in range(N): 140 | maxscore = boxes[i, 4] 141 | maxpos = i 142 | 143 | tx1 = boxes[i,0] 144 | ty1 = boxes[i,1] 145 | tx2 = boxes[i,2] 146 | ty2 = boxes[i,3] 147 | ts = boxes[i,4] 148 | 149 | pos = i + 1 150 | # get max box 151 | while pos < N: 152 | if maxscore < boxes[pos, 4]: 153 | maxscore = boxes[pos, 4] 154 | maxpos = pos 155 | pos = pos + 1 156 | 157 | # add max box as a detection 158 | boxes[i,0] = boxes[maxpos,0] 159 | boxes[i,1] = boxes[maxpos,1] 160 | boxes[i,2] = boxes[maxpos,2] 161 | boxes[i,3] = boxes[maxpos,3] 162 | boxes[i,4] = boxes[maxpos,4] 163 | 164 | # swap ith box with position of max box 165 | boxes[maxpos,0] = tx1 166 | boxes[maxpos,1] = ty1 167 | boxes[maxpos,2] = tx2 168 | boxes[maxpos,3] = ty2 169 | boxes[maxpos,4] = ts 170 | 171 | tx1 = boxes[i,0] 172 | ty1 = boxes[i,1] 173 | tx2 = boxes[i,2] 174 | ty2 = boxes[i,3] 175 | ts = boxes[i,4] 176 | 177 | pos = i + 1 178 | # NMS iterations, note that N changes if detection boxes fall below threshold 179 | while pos < N: 180 | x1 = boxes[pos, 0] 181 | y1 = boxes[pos, 1] 182 | x2 = boxes[pos, 2] 183 | y2 = boxes[pos, 3] 184 | s = boxes[pos, 4] 185 | 186 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 187 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 188 | if iw > 0: 189 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 190 | if ih > 0: 191 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 192 | ov = iw * ih / ua #iou between max box and detection box 193 | 194 | if method == 1: # linear 195 | if ov > Nt: 196 | weight = 1 - ov 197 | else: 198 | weight = 1 199 | elif method == 2: # gaussian 200 | weight = np.exp(-(ov * ov)/sigma) 201 | else: # original NMS 202 | if ov > Nt: 203 | weight = 0 204 | else: 205 | weight = 1 206 | 207 | boxes[pos, 4] = weight*boxes[pos, 4] 208 | 209 | # if box score falls below threshold, discard the box by swapping with last 
box 210 | # update N 211 | if boxes[pos, 4] < threshold: 212 | boxes[pos,0] = boxes[N-1, 0] 213 | boxes[pos,1] = boxes[N-1, 1] 214 | boxes[pos,2] = boxes[N-1, 2] 215 | boxes[pos,3] = boxes[N-1, 3] 216 | boxes[pos,4] = boxes[N-1, 4] 217 | N = N - 1 218 | pos = pos - 1 219 | 220 | pos = pos + 1 221 | 222 | keep = [i for i in range(N)] 223 | return keep 224 | -------------------------------------------------------------------------------- /eval_dcn.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, RefineMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file 16 | import numpy as np 17 | import time 18 | import os 19 | import sys 20 | import pickle 21 | import datetime 22 | # from models.model_builder import SSD 23 | from models.model_builder_vgg import SSD 24 | 25 | # from models.model_builder_resnet import SSD 26 | 27 | 28 | def arg_parse(): 29 | parser = argparse.ArgumentParser( 30 | description='Single Shot MultiBox Detection') 31 | parser.add_argument( 32 | '--weights', 33 | # default='/media/jnie/Storage/iccv_weights/efrgnet_vgg_epoch_320.pth', 34 | default='/media/jnie/Storage/iccv_weights/efrgnet_vgg_epoch_512.pth', 35 | type=str, 36 | help='Trained state_dict file path to open') 37 | parser.add_argument( 38 | '--cfg', 39 | dest='cfg_file', 40 | required=True, 41 | help='Config file for training (and optionally testing)') 42 | parser.add_argument( 43 | '--save_folder', 44 | default='eval/', 45 | type=str, 46 | help='File path to save results') 47 | parser.add_argument( 48 | '--num_workers', 49 | default=8, 50 | type=int, 51 | help='Number of workers used in dataloading') 52 | parser.add_argument( 53 | '--retest', default=False, type=bool, help='test cache results') 54 | args = parser.parse_args() 55 | return args 56 | 57 | 58 | def eval_net(val_dataset, 59 | val_loader, 60 | net, 61 | detector, 62 | cfg, 63 | transform, 64 | max_per_image=300, 65 | thresh=0.01, 66 | batch_size=1): 67 | net.eval() 68 | num_images = len(val_dataset) 69 | num_classes = cfg.MODEL.NUM_CLASSES 70 | eval_save_folder = "./eval/" 71 | if not os.path.exists(eval_save_folder): 72 | os.mkdir(eval_save_folder) 73 | all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] 74 | det_file = os.path.join(eval_save_folder, 'detections.pkl') 75 | 76 | if args.retest: 77 | f = open(det_file, 'rb') 78 | all_boxes = pickle.load(f) 79 | print('Evaluating detections') 80 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 81 | return 82 | 83 | # img_idexes = val_dataset.image_indexes #coco 84 | img_idexes = val_dataset.ids #voc 85 | total_times = 0 86 | network_times = 0 87 | total_nms_times = 0 88 | total_forward_times = 0 89 | 90 | for idx, (imgs, _, img_info) in enumerate(val_loader): 91 | with torch.no_grad(): 92 | 93 | x = imgs 94 | x = x.cuda() 95 | torch.cuda.synchronize() 96 | t1 = time.time() 97 | output= net(x) 98 | 99 | torch.cuda.synchronize() 100 | t4 = time.time() 101 | boxes, scores = detector.forward(output) 102 | 103 
| # idx = np.where(scores == scores[:,:,1:].max()) 104 | torch.cuda.synchronize() 105 | t2 = time.time() 106 | 107 | for k in range(boxes.size(0)): 108 | i = idx * batch_size + k 109 | boxes_ = boxes[k] 110 | scores_ = scores[k] 111 | boxes_ = boxes_.cpu().numpy() 112 | scores_ = scores_.cpu().numpy() 113 | img_wh = img_info[k] 114 | scale = np.array([img_wh[0], img_wh[1], img_wh[0], img_wh[1]]) 115 | boxes_ *= scale 116 | for j in range(1, num_classes): 117 | inds = np.where(scores_[:, j] > thresh)[0] 118 | if len(inds) == 0: 119 | all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 120 | continue 121 | c_bboxes = boxes_[inds] 122 | c_scores = scores_[inds, j] 123 | c_dets = np.hstack((c_bboxes, 124 | c_scores[:, np.newaxis])).astype( 125 | np.float32, copy=False) 126 | keep = nms(c_dets, cfg.TEST.NMS_OVERLAP, force_cpu=False) 127 | # keep = soft_nms(c_dets, Nt=0.45, method=2) 128 | keep = keep[:50] 129 | c_dets = c_dets[keep, :] 130 | all_boxes[j][i] = c_dets 131 | 132 | torch.cuda.synchronize() 133 | t3 = time.time() 134 | detect_time = t2 - t4 135 | nms_time = t3 - t2 136 | forward_time = t4 - t1 137 | if idx % 10 == 0: 138 | print('im_detect: {:d}/{:d} {:.6f}s {:.6f}s {:.3f}s'.format( 139 | i + 1, num_images, forward_time, detect_time, nms_time)) 140 | network_times += (t4 - t1) 141 | total_times += (t3 - t1) 142 | total_nms_times += nms_time 143 | total_forward_times += (t2-t1) 144 | 145 | print("detect time: ", time.time() - st) 146 | print("net time: ", network_times/5000.0) 147 | print("avg time: ", total_times/5000.0) 148 | print("nms time: ", total_nms_times/5000.0) 149 | print("forward time: ", total_forward_times/5000.0) 150 | 151 | with open(det_file, 'wb') as f: 152 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 153 | print('Evaluating detections') 154 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 155 | 156 | 157 | def main(): 158 | global args 159 | args = arg_parse() 160 | cfg_from_file(args.cfg_file) 161 | bgr_means = cfg.TRAIN.BGR_MEAN 162 | dataset_name = cfg.DATASETS.DATA_TYPE 163 | batch_size = cfg.TEST.BATCH_SIZE 164 | num_workers = args.num_workers 165 | if cfg.DATASETS.DATA_TYPE == 'VOC': 166 | trainvalDataset = VOCDetection 167 | top_k = 200 168 | else: 169 | trainvalDataset = COCODetection 170 | top_k = 300 171 | dataroot = cfg.DATASETS.DATAROOT 172 | if cfg.MODEL.SIZE == '300': 173 | size_cfg = cfg.SMALL 174 | else: 175 | size_cfg = cfg.BIG 176 | 177 | valSet = cfg.DATASETS.VAL_TYPE 178 | num_classes = cfg.MODEL.NUM_CLASSES 179 | save_folder = args.save_folder 180 | if not os.path.exists(save_folder): 181 | os.mkdir(save_folder) 182 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 183 | cfg.TRAIN.TRAIN_ON = False 184 | net = SSD(cfg) 185 | 186 | checkpoint = torch.load(args.weights) 187 | state_dict = checkpoint['model'] 188 | from collections import OrderedDict 189 | new_state_dict = OrderedDict() 190 | for k, v in state_dict.items(): 191 | head = k[:7] 192 | if head == 'module.': 193 | name = k[7:] # remove `module.` 194 | else: 195 | name = k 196 | new_state_dict[name] = v 197 | net.load_state_dict(new_state_dict) 198 | detector = Detect(cfg) 199 | ValTransform = BaseTransform(size_cfg.IMG_WH, bgr_means, (2, 0, 1)) 200 | 201 | val_dataset = trainvalDataset(dataroot, valSet, ValTransform) 202 | val_loader = data.DataLoader( 203 | val_dataset, 204 | batch_size, 205 | shuffle=False, 206 | num_workers=num_workers, 207 | collate_fn=detection_collate) 208 | top_k = 300 209 | thresh = cfg.TEST.CONFIDENCE_THRESH 210 | eval_net( 211 | 
val_dataset, 212 | val_loader, 213 | net, 214 | detector, 215 | cfg, 216 | ValTransform, 217 | top_k, 218 | thresh=thresh, 219 | batch_size=batch_size) 220 | 221 | 222 | if __name__ == '__main__': 223 | st = time.time() 224 | main() 225 | print("final time", time.time() - st) 226 | -------------------------------------------------------------------------------- /data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | import matplotlib 13 | matplotlib.use('Agg') 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def parse_rec(filename): 18 | """ Parse a PASCAL VOC xml file """ 19 | tree = ET.parse(filename) 20 | objects = [] 21 | for obj in tree.findall('object'): 22 | obj_struct = {} 23 | obj_struct['name'] = obj.find('name').text 24 | obj_struct['pose'] = obj.find('pose').text 25 | obj_struct['truncated'] = int(obj.find('truncated').text) 26 | obj_struct['difficult'] = int(obj.find('difficult').text) 27 | bbox = obj.find('bndbox') 28 | obj_struct['bbox'] = [ 29 | int(bbox.find('xmin').text), 30 | int(bbox.find('ymin').text), 31 | int(bbox.find('xmax').text), 32 | int(bbox.find('ymax').text) 33 | ] 34 | objects.append(obj_struct) 35 | 36 | return objects 37 | 38 | 39 | def voc_ap(rec, prec, use_07_metric=False): 40 | """ ap = voc_ap(rec, prec, [use_07_metric]) 41 | Compute VOC AP given precision and recall. 42 | If use_07_metric is true, uses the 43 | VOC 07 11 point method (default:False). 44 | """ 45 | if use_07_metric: 46 | # 11 point metric 47 | ap = 0. 48 | for t in np.arange(0., 1.1, 0.1): 49 | if np.sum(rec >= t) == 0: 50 | p = 0 51 | else: 52 | p = np.max(prec[rec >= t]) 53 | ap = ap + p / 11. 54 | else: 55 | # correct AP calculation 56 | # first append sentinel values at the end 57 | mrec = np.concatenate(([0.], rec, [1.])) 58 | mpre = np.concatenate(([0.], prec, [0.])) 59 | 60 | # compute the precision envelope 61 | for i in range(mpre.size - 1, 0, -1): 62 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 63 | 64 | # to calculate area under PR curve, look for points 65 | # where X axis (recall) changes value 66 | i = np.where(mrec[1:] != mrec[:-1])[0] 67 | 68 | # and sum (\Delta recall) * prec 69 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 70 | return ap 71 | 72 | 73 | def voc_eval(detpath, 74 | annopath, 75 | imagesetfile, 76 | classname, 77 | cachedir, 78 | ovthresh=0.5, 79 | use_07_metric=False): 80 | """rec, prec, ap = voc_eval(detpath, 81 | annopath, 82 | imagesetfile, 83 | classname, 84 | [ovthresh], 85 | [use_07_metric]) 86 | 87 | Top level function that does the PASCAL VOC evaluation. 88 | 89 | detpath: Path to detections 90 | detpath.format(classname) should produce the detection results file. 91 | annopath: Path to annotations 92 | annopath.format(imagename) should be the xml annotations file. 93 | imagesetfile: Text file containing the list of images, one image per line. 
94 | classname: Category name (duh) 95 | cachedir: Directory for caching the annotations 96 | [ovthresh]: Overlap threshold (default = 0.5) 97 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 98 | (default False) 99 | """ 100 | # assumes detections are in detpath.format(classname) 101 | # assumes annotations are in annopath.format(imagename) 102 | # assumes imagesetfile is a text file with each line an image name 103 | # cachedir caches the annotations in a pickle file 104 | 105 | # first load gt 106 | if not os.path.isdir(cachedir): 107 | os.mkdir(cachedir) 108 | cachefile = os.path.join(cachedir, 'annots.pkl') 109 | # read list of images 110 | with open(imagesetfile, 'r') as f: 111 | lines = f.readlines() 112 | imagenames = [x.strip() for x in lines] 113 | 114 | if not os.path.isfile(cachefile): 115 | # load annots 116 | recs = {} 117 | for i, imagename in enumerate(imagenames): 118 | recs[imagename] = parse_rec(annopath.format(imagename)) 119 | if i % 100 == 0: 120 | print('Reading annotation for {:d}/{:d}'.format( 121 | i + 1, len(imagenames))) 122 | # save 123 | print('Saving cached annotations to {:s}'.format(cachefile)) 124 | with open(cachefile, 'wb') as f: 125 | pickle.dump(recs, f) 126 | else: 127 | # load 128 | with open(cachefile, 'rb') as f: 129 | recs = pickle.load(f) 130 | 131 | # extract gt objects for this class 132 | class_recs = {} 133 | npos = 0 134 | for imagename in imagenames: 135 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 136 | bbox = np.array([x['bbox'] for x in R]) 137 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 138 | det = [False] * len(R) 139 | npos = npos + sum(~difficult) 140 | class_recs[imagename] = { 141 | 'bbox': bbox, 142 | 'difficult': difficult, 143 | 'det': det 144 | } 145 | 146 | # read dets 147 | detfile = detpath.format(classname) 148 | with open(detfile, 'r') as f: 149 | lines = f.readlines() 150 | 151 | splitlines = [x.strip().split(' ') for x in lines] 152 | image_ids = [x[0] for x in splitlines] 153 | confidence = np.array([float(x[1]) for x in splitlines]) 154 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 155 | # sort by confidence 156 | sorted_ind = np.argsort(-confidence) 157 | sorted_scores = np.sort(-confidence) 158 | BB = BB[sorted_ind, :] 159 | image_ids = [image_ids[x] for x in sorted_ind] 160 | 161 | # go down dets and mark TPs and FPs 162 | nd = len(image_ids) 163 | tp = np.zeros(nd) 164 | fp = np.zeros(nd) 165 | for d in range(nd): 166 | R = class_recs[image_ids[d]] 167 | bb = BB[d, :].astype(float) 168 | ovmax = -np.inf 169 | BBGT = R['bbox'].astype(float) 170 | 171 | if BBGT.size > 0: 172 | # compute overlaps 173 | # intersection 174 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 175 | iymin = np.maximum(BBGT[:, 1], bb[1]) 176 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 177 | iymax = np.minimum(BBGT[:, 3], bb[3]) 178 | iw = np.maximum(ixmax - ixmin + 1., 0.) 179 | ih = np.maximum(iymax - iymin + 1., 0.) 180 | inters = iw * ih 181 | 182 | # union 183 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 184 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 185 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 186 | 187 | overlaps = inters / uni 188 | ovmax = np.max(overlaps) 189 | jmax = np.argmax(overlaps) 190 | 191 | if ovmax > ovthresh: 192 | if not R['difficult'][jmax]: 193 | if not R['det'][jmax]: 194 | tp[d] = 1. 195 | R['det'][jmax] = 1 196 | else: 197 | fp[d] = 1. 198 | else: 199 | fp[d] = 1. 
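    # (Editorial sketch, not part of the original file.) With tp/fp filled in by the
    # greedy matching loop above, the cumulative sums below turn the confidence-ranked
    # detections into a precision/recall curve. For a hypothetical class with
    # npos = 2 ground-truth boxes and tp = [1, 0, 1], fp = [0, 1, 0]:
    #   cumsum(tp) = [1, 1, 2], cumsum(fp) = [0, 1, 1]
    #   rec  = [0.5, 0.5, 1.0]
    #   prec = [1.0, 0.5, 0.667]
    # voc_ap() then integrates precision over recall, using either the 11-point
    # VOC07 rule or the exact area under the precision envelope, depending on
    # use_07_metric.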
200 | 201 | # compute precision recall 202 | fp = np.cumsum(fp) 203 | tp = np.cumsum(tp) 204 | rec = tp / float(npos) 205 | # avoid divide by zero in case the first detection matches a difficult 206 | # ground truth 207 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 208 | # if classname == 'person': 209 | final_rec = round(rec[-1], 4) 210 | final_prec = round(prec[-1], 4) 211 | plt_save_path = os.path.join(".", "eval", "pr") 212 | if not os.path.exists(plt_save_path): 213 | os.makedirs(plt_save_path) 214 | plt.plot(rec, prec, 'r') 215 | pr_curl = os.path.join( 216 | plt_save_path, '{}_{}_{}pr.jpg'.format(classname, str(final_prec), 217 | str(final_rec))) 218 | plt.savefig(pr_curl) 219 | plt.close() 220 | ap = voc_ap(rec, prec, use_07_metric) 221 | 222 | return rec, prec, ap 223 | -------------------------------------------------------------------------------- /models/vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import weights_init 10 | 11 | class BasicConv(nn.Module): 12 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 13 | super(BasicConv, self).__init__() 14 | self.out_channels = out_planes 15 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 16 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 17 | self.relu = nn.ReLU(inplace=True) if relu else None 18 | 19 | def forward(self, x): 20 | x = self.conv(x) 21 | if self.bn is not None: 22 | x = self.bn(x) 23 | if self.relu is not None: 24 | x = self.relu(x) 25 | return x 26 | 27 | class BasicBlock(nn.Module): 28 | 29 | def __init__(self, in_planes, out_planes, stride=1): 30 | super(BasicBlock, self).__init__() 31 | self.out_channels = out_planes 32 | inter_planes = in_planes // 4 33 | self.single_branch = nn.Sequential( 34 | BasicConv(in_planes, inter_planes, kernel_size=(3, 3), stride=stride, padding=(1, 1)), 35 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=2, dilation=2), 36 | BasicConv(inter_planes, out_planes, kernel_size=(3, 3), stride=1, padding=(1, 1)) 37 | ) 38 | 39 | def forward(self, x): 40 | out = self.single_branch(x) 41 | return out 42 | 43 | class L2Norm(nn.Module): 44 | def __init__(self, n_channels, scale): 45 | super(L2Norm, self).__init__() 46 | self.n_channels = n_channels 47 | self.gamma = scale or None 48 | self.eps = 1e-10 49 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 50 | self.reset_parameters() 51 | 52 | def reset_parameters(self): 53 | init.constant_(self.weight, self.gamma) 54 | 55 | def forward(self, x): 56 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 57 | x = x / norm 58 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 59 | x) * x 60 | return out 61 | 62 | 63 | # This function is derived from torchvision VGG make_layers() 64 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 65 | 66 | 67 | def vgg(cfg, i, batch_norm=False): 68 | layers = [] 69 | in_channels = i 70 | for v in cfg: 71 | if v == 'M': 72 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 73 | elif v == 'C': 74 | layers += 
[nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 75 | else: 76 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 77 | if batch_norm: 78 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 79 | else: 80 | layers += [conv2d, nn.ReLU(inplace=True)] 81 | in_channels = v 82 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 83 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 84 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 85 | layers += [ 86 | pool5, conv6, 87 | nn.ReLU(inplace=True), conv7, 88 | nn.ReLU(inplace=True) 89 | ] 90 | return layers 91 | 92 | 93 | extras_cfg = { 94 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 95 | '512': [ 96 | 256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128, 'S', 97 | 256 98 | ], 99 | } 100 | 101 | base = { 102 | '300': [ 103 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 104 | 512, 512, 512 105 | ], 106 | '512': [ 107 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 108 | 512, 512, 512 109 | ], 110 | } 111 | 112 | 113 | def add_extras(cfg, i, batch_norm=False): 114 | # Extra layers added to VGG for feature scaling 115 | layers = [] 116 | in_channels = i 117 | flag = False 118 | for k, v in enumerate(cfg): 119 | if in_channels != 'S': 120 | if v == 'S': 121 | layers += [ 122 | nn.Conv2d( 123 | in_channels, 124 | cfg[k + 1], 125 | kernel_size=(1, 3)[flag], 126 | stride=2, 127 | padding=1) 128 | ] 129 | else: 130 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 131 | flag = not flag 132 | in_channels = v 133 | return layers 134 | 135 | def add_extras_bn(cfg, i, batch_norm=False): 136 | # Extra layers added to VGG for feature scaling 137 | layers = [] 138 | in_channels = i 139 | flag = False 140 | for k, v in enumerate(cfg): 141 | if in_channels != 'S': 142 | if v == 'S': 143 | layers += [ 144 | BasicConv( 145 | in_channels, 146 | cfg[k + 1], 147 | kernel_size=(1, 3)[flag], 148 | stride=2, 149 | padding=1) 150 | ] 151 | else: 152 | layers += [BasicConv(in_channels, v, kernel_size=(1, 3)[flag])] 153 | flag = not flag 154 | in_channels = v 155 | return layers 156 | 157 | 158 | class VGG16Extractor(nn.Module): 159 | def __init__(self, size): 160 | super(VGG16Extractor, self).__init__() 161 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 162 | self.L2Norm = L2Norm(512, 20) 163 | self.extras = nn.ModuleList(add_extras(extras_cfg[str(size)], 1024)) 164 | # self.extras_bn = nn.ModuleList(add_extras_bn(extras_cfg[str(size)], 1024)) 165 | self._init_modules() 166 | 167 | def _init_modules(self): 168 | self.extras.apply(weights_init) 169 | # self.extras_bn.apply(weights_init) 170 | self.vgg.apply(weights_init) 171 | 172 | def forward(self, x): 173 | """Applies network layers and ops on input image(s) x. 174 | 175 | Args: 176 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 177 | 178 | Return: 179 | Depending on phase: 180 | test: 181 | Variable(tensor) of output class label predictions, 182 | confidence score, and corresponding location predictions for 183 | each object detected. 
Shape: [batch,topk,7] 184 | 185 | train: 186 | list of concat outputs from: 187 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 188 | 2: localization layers, Shape: [batch,num_priors*4] 189 | 3: priorbox layers, Shape: [2,num_priors*4] 190 | """ 191 | sources = list() 192 | 193 | # apply vgg up to conv4_3 relu 194 | for k in range(23): 195 | x = self.vgg[k](x) 196 | 197 | s = self.L2Norm(x) 198 | sources.append(s) 199 | 200 | # apply vgg up to fc7 201 | for k in range(23, len(self.vgg)): 202 | x = self.vgg[k](x) 203 | sources.append(x) 204 | 205 | # apply extra layers and cache source layer outputs 206 | for k, v in enumerate(self.extras): 207 | x = F.relu(v(x), inplace=True) 208 | if k % 2 == 1: 209 | sources.append(x) 210 | 211 | # for k, v in enumerate(self.extras_bn): 212 | # x = v(x) 213 | # if k % 2 == 1: 214 | # sources.append(x) 215 | return sources 216 | 217 | 218 | def SSDVgg(size, channel_size='48'): 219 | return VGG16Extractor(size) 220 | 221 | 222 | if __name__ == "__main__": 223 | import os 224 | os.environ["CUDA_VISIBLE_DEVICES"] = "3" 225 | with torch.no_grad(): 226 | model3 = VGG16Extractor(300) 227 | model3.eval() 228 | x = torch.randn(16, 3, 300, 300) 229 | model3.cuda() 230 | model3(x.cuda()) 231 | import time 232 | st = time.time() 233 | for i in range(1000): 234 | model3(x.cuda()) 235 | print(time.time() - st) 236 | -------------------------------------------------------------------------------- /utils/convert_darknet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | # 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | from data.config import voc_config, coco_config 10 | from model.yolo import Yolov3 11 | from model.darknet53 import Darknet53 12 | import argparse 13 | import os 14 | 15 | 16 | def copy_weights(bn, conv, ptr, weights, use_bn=True): 17 | if use_bn: 18 | num_bn_biases = bn.bias.numel() 19 | 20 | #Load the weights 21 | bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 22 | ptr += num_bn_biases 23 | 24 | bn_weights = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 25 | ptr += num_bn_biases 26 | 27 | bn_running_mean = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 28 | ptr += num_bn_biases 29 | 30 | bn_running_var = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 31 | ptr += num_bn_biases 32 | 33 | #Cast the loaded weights into dims of model weights. 
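        # (Editorial note.) Darknet serialises a batch-normalised conv block as four
        # flat float32 vectors of length C (biases, scales, running means, running
        # variances), followed by the convolution kernel itself, which is why `ptr`
        # advances by `num_bn_biases` four times above before the conv weights are
        # copied further below.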
34 | bn_biases = bn_biases.view_as(bn.bias.data) 35 | bn_weights = bn_weights.view_as(bn.weight.data) 36 | bn_running_mean = bn_running_mean.view_as(bn.running_mean) 37 | bn_running_var = bn_running_var.view_as(bn.running_var) 38 | 39 | #Copy the data to model 40 | bn.bias.data.copy_(bn_biases) 41 | bn.weight.data.copy_(bn_weights) 42 | bn.running_mean.copy_(bn_running_mean) 43 | bn.running_var.copy_(bn_running_var) 44 | else: 45 | #Number of biases 46 | num_biases = conv.bias.numel() 47 | 48 | #Load the weights 49 | conv_biases = torch.from_numpy(weights[ptr:ptr + num_biases]) 50 | ptr = ptr + num_biases 51 | 52 | #reshape the loaded weights according to the dims of the model weights 53 | conv_biases = conv_biases.view_as(conv.bias.data) 54 | 55 | #Finally copy the data 56 | conv.bias.data.copy_(conv_biases) 57 | 58 | #Let us load the weights for the Convolutional layers 59 | num_weights = conv.weight.numel() 60 | conv_weights = torch.from_numpy(weights[ptr:ptr + num_weights]) 61 | ptr = ptr + num_weights 62 | 63 | conv_weights = conv_weights.view_as(conv.weight.data) 64 | conv.weight.data.copy_(conv_weights) 65 | return ptr 66 | 67 | 68 | def load_weights_darknet53(weightfile, yolov3): 69 | fp = open(weightfile, "rb") 70 | #The first 5 values are header information 71 | # 1. Major version number 72 | # 2. Minor Version Number 73 | # 3. Subversion number 74 | # 4. IMages seen 75 | header = np.fromfile(fp, dtype=np.int32, count=5) 76 | weights = np.fromfile(fp, dtype=np.float32) 77 | print(len(weights)) 78 | ptr = 0 79 | first_conv = yolov3.conv 80 | bn = first_conv.bn 81 | conv = first_conv.conv 82 | # first conv copy 83 | ptr = copy_weights(bn, conv, ptr, weights) 84 | 85 | layers = [ 86 | yolov3.layer1, yolov3.layer2, yolov3.layer3, yolov3.layer4, 87 | yolov3.layer5 88 | ] 89 | for layer in layers: 90 | for i in range(len(layer)): 91 | if i == 0: 92 | bn = layer[i].bn 93 | conv = layer[i].conv 94 | ptr = copy_weights(bn, conv, ptr, weights) 95 | else: 96 | bn = layer[i].conv1.bn 97 | conv = layer[i].conv1.conv 98 | ptr = copy_weights(bn, conv, ptr, weights) 99 | bn = layer[i].conv2.bn 100 | conv = layer[i].conv2.conv 101 | ptr = copy_weights(bn, conv, ptr, weights) 102 | print(ptr) 103 | fp.close() 104 | 105 | 106 | def load_weights(weightfile, yolov3, version): 107 | if version == "voc" or version == "coco": 108 | load_weights_yolov3(weightfile, yolov3) 109 | elif version == "darknet53": 110 | load_weights_darknet53(weightfile, yolov3) 111 | 112 | 113 | def load_weights_yolov3(weightfile, yolov3): 114 | fp = open(weightfile, "rb") 115 | #The first 5 values are header information 116 | # 1. Major version number 117 | # 2. Minor Version Number 118 | # 3. Subversion number 119 | # 4, 5. 
IMages seen 120 | header = np.fromfile(fp, dtype=np.int32, count=5) 121 | weights = np.fromfile(fp, dtype=np.float32) 122 | print(len(weights)) 123 | ptr = 0 124 | extractor = yolov3.extractor 125 | first_conv = extractor.conv 126 | bn = first_conv.bn 127 | conv = first_conv.conv 128 | # first conv copy 129 | ptr = copy_weights(bn, conv, ptr, weights) 130 | 131 | layers = [ 132 | extractor.layer1, extractor.layer2, extractor.layer3, extractor.layer4, 133 | extractor.layer5 134 | ] 135 | for layer in layers: 136 | for i in range(len(layer)): 137 | if i == 0: 138 | bn = layer[i].bn 139 | conv = layer[i].conv 140 | ptr = copy_weights(bn, conv, ptr, weights) 141 | else: 142 | bn = layer[i].conv1.bn 143 | conv = layer[i].conv1.conv 144 | ptr = copy_weights(bn, conv, ptr, weights) 145 | bn = layer[i].conv2.bn 146 | conv = layer[i].conv2.conv 147 | ptr = copy_weights(bn, conv, ptr, weights) 148 | predict_conv_list1 = yolov3.predict_conv_list1 149 | smooth_conv1 = yolov3.smooth_conv1 150 | predict_conv_list2 = yolov3.predict_conv_list2 151 | smooth_conv2 = yolov3.smooth_conv2 152 | predict_conv_list3 = yolov3.predict_conv_list3 153 | for i in range(len(predict_conv_list1)): 154 | if i == 6: 155 | bn = 0 156 | conv = predict_conv_list1[i] 157 | ptr = copy_weights(bn, conv, ptr, weights, use_bn=False) 158 | else: 159 | bn = predict_conv_list1[i].bn 160 | conv = predict_conv_list1[i].conv 161 | ptr = copy_weights(bn, conv, ptr, weights) 162 | bn = smooth_conv1.bn 163 | conv = smooth_conv1.conv 164 | ptr = copy_weights(bn, conv, ptr, weights) 165 | for i in range(len(predict_conv_list2)): 166 | if i == 6: 167 | bn = 0 168 | conv = predict_conv_list2[i] 169 | ptr = copy_weights(bn, conv, ptr, weights, use_bn=False) 170 | else: 171 | bn = predict_conv_list2[i].bn 172 | conv = predict_conv_list2[i].conv 173 | ptr = copy_weights(bn, conv, ptr, weights) 174 | bn = smooth_conv2.bn 175 | conv = smooth_conv2.conv 176 | ptr = copy_weights(bn, conv, ptr, weights) 177 | 178 | for i in range(len(predict_conv_list3)): 179 | if i == 6: 180 | bn = 0 181 | conv = predict_conv_list3[i] 182 | ptr = copy_weights(bn, conv, ptr, weights, use_bn=False) 183 | else: 184 | bn = predict_conv_list3[i].bn 185 | conv = predict_conv_list3[i].conv 186 | ptr = copy_weights(bn, conv, ptr, weights) 187 | print(ptr) 188 | fp.close() 189 | 190 | 191 | def arg_parse(): 192 | """ 193 | Parse arguments to the train module 194 | """ 195 | parser = argparse.ArgumentParser(description='Yolov3 pytorch Training') 196 | parser.add_argument('--input_wh', default=(416, 416), help='input size.') 197 | parser.add_argument( 198 | '--version', 199 | '--version', 200 | default='darknet53', 201 | help='voc, coco, darknet53') 202 | parser.add_argument( 203 | '--weights', 204 | default='./weights/darknet53.conv.74', 205 | help='pretrained base model') 206 | parser.add_argument( 207 | '--save_name', 208 | default='./weights/convert_yolov3_coco.pth', 209 | help='save name') 210 | 211 | return parser.parse_args() 212 | 213 | 214 | def load_weights_darknet19(weightfile, darknet19): 215 | fp = open(weightfile, "rb") 216 | #The first 4 values are header information 217 | # 1. Major version number 218 | # 2. Minor Version Number 219 | # 3. Subversion number 220 | # 4. 
IMages seen 221 | header = np.fromfile(fp, dtype=np.int32, count=4) 222 | weights = np.fromfile(fp, dtype=np.float32) 223 | ptr = 0 224 | first_conv = darknet19.conv 225 | bn = first_conv.bn 226 | conv = first_conv.conv 227 | # first conv copy 228 | ptr = copy_weights(bn, conv, ptr, weights) 229 | layers = [ 230 | darknet19.layer1, darknet19.layer2, darknet19.layer3, darknet19.layer4, 231 | darknet19.layer5 232 | ] 233 | for layer in layers: 234 | for i in range(len(layer)): 235 | if i == 0: 236 | pass 237 | else: 238 | bn = layer[i].bn 239 | conv = layer[i].conv 240 | ptr = copy_weights(bn, conv, ptr, weights) 241 | fp.close() 242 | 243 | 244 | if __name__ == '__main__': 245 | args = arg_parse() 246 | weightfile = args.weights 247 | input_wh = args.input_wh 248 | version = args.version 249 | save_name = args.save_name 250 | if version == "voc": 251 | cfg = voc_config 252 | yolov3 = Yolov3("train", input_wh, cfg["anchors"], cfg["anchors_mask"], 253 | cfg["num_classes"]) 254 | elif version == "coco": 255 | cfg = coco_config 256 | yolov3 = Yolov3("train", input_wh, cfg["anchors"], cfg["anchors_mask"], 257 | cfg["num_classes"]) 258 | elif version == "darknet53": 259 | cfg = voc_config 260 | num_blocks = [1, 2, 8, 8, 4] 261 | yolov3 = Darknet53(num_blocks) 262 | else: 263 | print("Unkown version !!!") 264 | import sys 265 | sys.exit() 266 | 267 | load_weights(weightfile, yolov3, version) 268 | # name = "convert_yolo_" + version + ".pth" 269 | # save_path = os.path.join("./weights", name) 270 | torch.save(darknet53.state_dict(), save_name) 271 | -------------------------------------------------------------------------------- /models/model_builder_vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from layers import * 8 | import os 9 | from models.model_helper import weights_init 10 | import importlib 11 | from layers.functions.prior_layer import PriorLayer 12 | # from dcn.modules.deform_conv import DeformConv, ModulatedDeformConv 13 | from mmdet.ops import DeformConv, ModulatedDeformConv 14 | 15 | def get_func(func_name): 16 | """Helper to return a function object by name. func_name must identify a 17 | function in this module or the path to a function relative to the base 18 | 'modeling' module. 19 | """ 20 | if func_name == '': 21 | return None 22 | try: 23 | parts = func_name.split('.') 24 | # Refers to a function in this module 25 | if len(parts) == 1: 26 | return globals()[parts[0]] 27 | # Otherwise, assume we're referencing a module under modeling 28 | module_name = 'models.' 
+ '.'.join(parts[:-1]) 29 | module = importlib.import_module(module_name) 30 | return getattr(module, parts[-1]) 31 | except Exception: 32 | print('Failed to find function: %s', func_name) 33 | raise 34 | 35 | class BasicConv(nn.Module): 36 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 37 | super(BasicConv, self).__init__() 38 | self.out_channels = out_planes 39 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 40 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 41 | self.relu = nn.ReLU(inplace=True) if relu else None 42 | 43 | def forward(self, x): 44 | x = self.conv(x) 45 | if self.bn is not None: 46 | x = self.bn(x) 47 | if self.relu is not None: 48 | x = self.relu(x) 49 | return x 50 | 51 | 52 | 53 | def add_dcn_dilas(): 54 | planes = [512,1024,256,256] 55 | deformable_groups = 1 56 | conv_layers = [] 57 | for i in range(4): 58 | conv_layers += [DeformConv( 59 | planes[i], 60 | 256, 61 | kernel_size=3, 62 | stride=1, 63 | padding=5-i, 64 | dilation=5-i, 65 | deformable_groups=deformable_groups, 66 | bias=False)] 67 | return conv_layers 68 | 69 | def BN_layers(): 70 | bn_layers =[] 71 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 72 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 73 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 74 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 75 | 76 | return bn_layers 77 | 78 | class SSD(nn.Module): 79 | """Single Shot Multibox Architecture 80 | The network is composed of a base VGG network followed by the 81 | added multibox conv layers. Each multibox layer branches into 82 | 1) conv2d for class conf scores 83 | 2) conv2d for localization predictions 84 | 3) associated priorbox layer to produce default bounding 85 | boxes specific to the layer's feature map size. 86 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
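    (Summary added for clarity; the wording is the editor's, not the original authors'.)
    When cfg.MODEL.REFINE is enabled, this builder follows a RefineDet-style two-step
    pipeline: the anchor refinement heads (arm_loc / arm_conf) predict coarse box
    offsets and objectness on the backbone features, the predicted (x, y) offsets are
    converted into deformable-convolution offsets (loc_offset_conv feeding dcn_convs),
    an objectness-derived attention map re-weights the ODM features, and the
    odm_loc / odm_conf heads produce the final localisation and classification outputs.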
87 | 88 | Args: 89 | phase: (string) Can be "test" or "train" 90 | base: VGG16 layers for input, size of either 300 or 500 91 | extras: extra layers that feed to multibox loc and conf layers 92 | head: "multibox head" consists of loc and conf conv layers 93 | """ 94 | 95 | def _init_modules(self): 96 | self.arm_loc.apply(weights_init) 97 | self.arm_conf.apply(weights_init) 98 | if self.cfg.MODEL.REFINE: 99 | self.odm_loc.apply(weights_init) 100 | self.odm_conf.apply(weights_init) 101 | 102 | self.loc_offset_conv.apply(weights_init) 103 | # self.offsets.apply(weights_init) 104 | self.dcn_convs.apply(weights_init) 105 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 106 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 107 | print("load pretrain model {}".format( 108 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 109 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 110 | self.extractor.vgg.load_state_dict(weights) 111 | else: 112 | self.extractor.load_state_dict(weights, strict=False) 113 | 114 | def __init__(self, cfg): 115 | super(SSD, self).__init__() 116 | self.cfg = cfg 117 | self.size = cfg.MODEL.SIZE 118 | if self.size == '300': 119 | size_cfg = cfg.SMALL 120 | else: 121 | size_cfg = cfg.BIG 122 | self.num_classes = cfg.MODEL.NUM_CLASSES 123 | self.prior_layer = PriorLayer(cfg) 124 | self.priorbox = PriorBox(cfg) 125 | self.priors = self.priorbox.forward() 126 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 127 | cfg.TRAIN.CHANNEL_SIZE) 128 | if cfg.MODEL.REFINE: 129 | self.odm_channels = size_cfg.ODM_CHANNELS 130 | self.arm_num_classes = 2 131 | self.odm_loc = nn.ModuleList() 132 | self.odm_conf = nn.ModuleList() 133 | 134 | 135 | self.loc_offset_conv = nn.ModuleList() 136 | self.dcn_convs = nn.ModuleList(add_dcn_dilas()) 137 | self.bn_layers = nn.ModuleList(BN_layers()) 138 | 139 | self.arm_loc = nn.ModuleList() 140 | self.arm_conf = nn.ModuleList() 141 | self.arm_channels = size_cfg.ARM_CHANNELS 142 | self.num_anchors = size_cfg.NUM_ANCHORS 143 | self.input_fixed = size_cfg.INPUT_FIXED 144 | self.arm_loc = nn.ModuleList() 145 | self.arm_conf = nn.ModuleList() 146 | 147 | for i in range(len(self.arm_channels)): 148 | if cfg.MODEL.REFINE: 149 | self.arm_loc += [ 150 | nn.Conv2d( 151 | self.arm_channels[i], 152 | self.num_anchors[i] * 4, 153 | kernel_size=3, 154 | padding=1) 155 | ] 156 | self.arm_conf += [ 157 | nn.Conv2d( 158 | self.arm_channels[i], 159 | self.num_anchors[i] * self.arm_num_classes, 160 | kernel_size=3, 161 | padding=1) 162 | ] 163 | 164 | self.loc_offset_conv +=[BasicConv(self.num_anchors[i] * 2, 18, kernel_size=1)] 165 | self.odm_loc += [ 166 | nn.Conv2d( 167 | self.odm_channels[i], 168 | self.num_anchors[i] * 4, 169 | kernel_size=3, 170 | padding=1) 171 | ] 172 | self.odm_conf += [ 173 | nn.Conv2d( 174 | self.odm_channels[i], 175 | self.num_anchors[i] * self.num_classes, 176 | kernel_size=3, 177 | padding=1) 178 | ] 179 | else: 180 | self.arm_loc += [ 181 | nn.Conv2d( 182 | self.arm_channels[i], 183 | self.num_anchors[i] * 4, 184 | kernel_size=3, 185 | padding=1) 186 | ] 187 | self.arm_conf += [ 188 | nn.Conv2d( 189 | self.arm_channels[i], 190 | self.num_anchors[i] * self.num_classes, 191 | kernel_size=3, 192 | padding=1) 193 | ] 194 | if cfg.TRAIN.TRAIN_ON: 195 | self._init_modules() 196 | 197 | def forward(self, input): 198 | 199 | arm_loc = list() 200 | arm_conf = list() 201 | if self.cfg.MODEL.REFINE: 202 | odm_loc = list() 203 | odm_conf = list() 204 | conf = list() 205 | odm_xs_n = list() 206 | arm_loc_list = list() 207 | arm_xs, odm_xs = 
self.extractor(input) 208 | 209 | for (x, l, c) in zip(arm_xs, self.arm_loc, self.arm_conf): 210 | arm_loc_conv = l(x) 211 | cc = c(x) 212 | conf.append(cc) 213 | arm_loc_list.append(torch.cat([arm_loc_conv[:,0::4,:,:], arm_loc_conv[:,1::4,:,:]], 1)) 214 | arm_loc.append(arm_loc_conv.permute(0, 2, 3, 1).contiguous()) 215 | arm_conf.append(cc.permute(0, 2, 3, 1).contiguous()) 216 | 217 | 218 | for (conf_fea, odm_xs_fea) in zip(conf, odm_xs): 219 | conf_obj = conf_fea[:, 1::2, :, :] 220 | conf_max, _ = torch.max(conf_obj, dim=1, keepdim=True) 221 | conf_attention = conf_max.sigmoid() 222 | odm_xs_fea_n = odm_xs_fea * conf_attention + odm_xs_fea 223 | odm_xs_n.append(odm_xs_fea_n) 224 | 225 | offset_0 = self.loc_offset_conv[0](arm_loc_list[0]) 226 | d0 = F.relu(self.bn_layers[0](self.dcn_convs[0](odm_xs_n[0], offset_0)), inplace=True) 227 | 228 | offset_1 = self.loc_offset_conv[1](arm_loc_list[1]) 229 | d1 = F.relu(self.bn_layers[1](self.dcn_convs[1](odm_xs_n[1], offset_1)), inplace=True) 230 | 231 | offset_2 = self.loc_offset_conv[2](arm_loc_list[2]) 232 | d2 = F.relu(self.bn_layers[2](self.dcn_convs[2](odm_xs_n[2], offset_2)), inplace=True) 233 | 234 | offset_3 = self.loc_offset_conv[3](arm_loc_list[3]) 235 | d3 = F.relu(self.bn_layers[3](self.dcn_convs[3](odm_xs_n[3], offset_3)), inplace=True) 236 | odm_xs_new = [d0,d1,d2,d3] 237 | 238 | 239 | for (x, l, c) in zip(odm_xs_new, self.odm_loc, self.odm_conf): 240 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 241 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 242 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 243 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 244 | else: 245 | arm_xs = self.extractor(input) 246 | img_wh = (input.size(3), input.size(2)) 247 | feature_maps_wh = [(t.size(3), t.size(2)) for t in arm_xs] 248 | 249 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 250 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 251 | if self.cfg.MODEL.REFINE: 252 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 253 | arm_conf.view( 254 | arm_conf.size(0), -1, self.arm_num_classes), 255 | odm_loc.view(odm_loc.size(0), -1, 4), 256 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 257 | self.priors if self.input_fixed else self.prior_layer( 258 | img_wh, feature_maps_wh)) 259 | else: 260 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 261 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 262 | self.priors if self.input_fixed else self.prior_layer( 263 | img_wh, feature_maps_wh)) 264 | return output 265 | 266 | -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 
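At a glance (editorial summary): the training-time `preproc` transform chains a
random IoU-constrained crop, photometric distortion, mean-padded expansion,
horizontal mirroring, and a final resize + mean subtraction + HWC->CHW transpose,
while `BaseTransform` applies only the deterministic resize/normalise step used at
test time. Illustrative usage (the numeric values are examples only, not taken from
the repo's configs):

    transform = preproc(resize_wh=(512, 512), rgb_means=(104, 117, 123), p=0.6)
    img_t, targets_t = transform(cv2_image, np.array([[48., 20., 271., 350., 8.]]))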
3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | 7 | TODO: implement data_augment for training 8 | 9 | Ellis Brown, Max deGroot 10 | """ 11 | 12 | import torch 13 | from torchvision import transforms 14 | import cv2 15 | import numpy as np 16 | import random 17 | import math 18 | from utils.box_utils import matrix_iou 19 | 20 | 21 | def _crop(image, boxes, labels): 22 | height, width, _ = image.shape 23 | 24 | if len(boxes) == 0: 25 | return image, boxes, labels 26 | 27 | while True: 28 | mode = random.choice(( 29 | None, 30 | (0.1, None), 31 | (0.3, None), 32 | (0.5, None), 33 | (0.7, None), 34 | (0.9, None), 35 | (None, None), 36 | )) 37 | 38 | if mode is None: 39 | return image, boxes, labels 40 | 41 | min_iou, max_iou = mode 42 | if min_iou is None: 43 | min_iou = float('-inf') 44 | if max_iou is None: 45 | max_iou = float('inf') 46 | 47 | for _ in range(50): 48 | scale = random.uniform(0.3, 1.) 49 | min_ratio = max(0.5, scale * scale) 50 | max_ratio = min(2, 1. / scale / scale) 51 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 52 | w = int(scale * ratio * width) 53 | h = int((scale / ratio) * height) 54 | 55 | l = random.randrange(width - w) 56 | t = random.randrange(height - h) 57 | roi = np.array((l, t, l + w, t + h)) 58 | 59 | iou = matrix_iou(boxes, roi[np.newaxis]) 60 | 61 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 62 | continue 63 | 64 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 65 | 66 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 67 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 68 | .all(axis=1) 69 | boxes_t = boxes[mask].copy() 70 | labels_t = labels[mask].copy() 71 | if len(boxes_t) == 0: 72 | continue 73 | 74 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 75 | boxes_t[:, :2] -= roi[:2] 76 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 77 | boxes_t[:, 2:] -= roi[:2] 78 | 79 | return image_t, boxes_t, labels_t 80 | 81 | 82 | def _distort(image): 83 | def _convert(image, alpha=1, beta=0): 84 | tmp = image.astype(float) * alpha + beta 85 | tmp[tmp < 0] = 0 86 | tmp[tmp > 255] = 255 87 | image[:] = tmp 88 | 89 | image = image.copy() 90 | 91 | if random.randrange(2): 92 | _convert(image, beta=random.uniform(-32, 32)) 93 | 94 | if random.randrange(2): 95 | _convert(image, alpha=random.uniform(0.5, 1.5)) 96 | 97 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 98 | 99 | if random.randrange(2): 100 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 101 | tmp %= 180 102 | image[:, :, 0] = tmp 103 | 104 | if random.randrange(2): 105 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 106 | 107 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 108 | 109 | return image 110 | 111 | 112 | def _expand(image, boxes, fill, p): 113 | if random.random() > p: 114 | return image, boxes 115 | 116 | height, width, depth = image.shape 117 | for _ in range(50): 118 | scale = random.uniform(1, 4) 119 | 120 | min_ratio = max(0.5, 1. 
/ scale / scale) 121 | max_ratio = min(2, scale * scale) 122 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 123 | ws = scale * ratio 124 | hs = scale / ratio 125 | if ws < 1 or hs < 1: 126 | continue 127 | w = int(ws * width) 128 | h = int(hs * height) 129 | 130 | left = random.randint(0, w - width) 131 | top = random.randint(0, h - height) 132 | 133 | boxes_t = boxes.copy() 134 | boxes_t[:, :2] += (left, top) 135 | boxes_t[:, 2:] += (left, top) 136 | 137 | expand_image = np.empty((h, w, depth), dtype=image.dtype) 138 | expand_image[:, :] = fill 139 | expand_image[top:top + height, left:left + width] = image 140 | image = expand_image 141 | 142 | return image, boxes_t 143 | 144 | 145 | def _mirror(image, boxes): 146 | _, width, _ = image.shape 147 | if random.randrange(2): 148 | image = image[:, ::-1] 149 | boxes = boxes.copy() 150 | boxes[:, 0::2] = width - boxes[:, 2::-2] 151 | return image, boxes 152 | 153 | 154 | def preproc_for_test(image, resize_wh, mean): 155 | interp_methods = [ 156 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, 157 | cv2.INTER_LANCZOS4 158 | ] 159 | interp_method = interp_methods[random.randrange(5)] 160 | # interp_method = interp_methods[0] 161 | image = cv2.resize( 162 | image, (resize_wh[0], resize_wh[1]), interpolation=interp_method) 163 | image = image.astype(np.float32) 164 | image -= mean 165 | # to rgb 166 | # image = image[:, :, (2, 1, 0)] 167 | return image.transpose(2, 0, 1) 168 | 169 | 170 | class preproc(object): 171 | def __init__(self, resize_wh, rgb_means, p): 172 | self.means = rgb_means 173 | self.resize_wh = resize_wh 174 | self.p = p 175 | 176 | def __call__(self, image, targets): 177 | boxes = targets[:, :-1].copy() 178 | labels = targets[:, -1].copy() 179 | if len(boxes) == 0: 180 | #boxes = np.empty((0, 4)) 181 | targets = np.zeros((1, 5)) 182 | image = preproc_for_test(image, self.resize_wh, self.means) 183 | return torch.from_numpy(image), targets 184 | 185 | image_o = image.copy() 186 | targets_o = targets.copy() 187 | height_o, width_o, _ = image_o.shape 188 | boxes_o = targets_o[:, :-1] 189 | labels_o = targets_o[:, -1] 190 | boxes_o[:, 0::2] /= width_o 191 | boxes_o[:, 1::2] /= height_o 192 | labels_o = np.expand_dims(labels_o, 1) 193 | targets_o = np.hstack((boxes_o, labels_o)) 194 | 195 | image_t, boxes, labels = _crop(image, boxes, labels) 196 | image_t = _distort(image_t) 197 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 198 | image_t, boxes = _mirror(image_t, boxes) 199 | #image_t, boxes = _mirror(image, boxes) 200 | 201 | height, width, _ = image_t.shape 202 | image_t = preproc_for_test(image_t, self.resize_wh, self.means) 203 | boxes = boxes.copy() 204 | boxes[:, 0::2] /= width 205 | boxes[:, 1::2] /= height 206 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 207 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 
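        # (Editorial comment.) Boxes are now in normalised [0, 1] coordinates, so the
        # mask below keeps only boxes whose width and height both exceed 1% of the
        # image; anything smaller is treated as degenerate after cropping/expansion
        # and is dropped together with its label.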
208 | mask_b = np.minimum(b_w, b_h) > 0.01 209 | boxes_t = boxes[mask_b] 210 | labels_t = labels[mask_b].copy() 211 | 212 | if len(boxes_t) == 0: 213 | image = preproc_for_test(image_o, self.resize_wh, self.means) 214 | return torch.from_numpy(image), targets_o 215 | 216 | labels_t = np.expand_dims(labels_t, 1) 217 | targets_t = np.hstack((boxes_t, labels_t)) 218 | 219 | return torch.from_numpy(image_t), targets_t 220 | 221 | 222 | class BaseTransform_img(object): 223 | """Defines the transformations that should be applied to test PIL image 224 | for input into the network 225 | 226 | dimension -> tensorize -> color adj 227 | 228 | Arguments: 229 | resize (int): input dimension to SSD 230 | rgb_means ((int,int,int)): average RGB of the dataset 231 | (104,117,123) 232 | swap ((int,int,int)): final order of channels 233 | Returns: 234 | transform (transform) : callable transform to be applied to test/val 235 | data 236 | """ 237 | 238 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 239 | self.means = rgb_means 240 | self.resize_wh = resize_wh 241 | self.swap = swap 242 | 243 | # assume input is cv2 img for now 244 | def __call__(self, img, target=None): 245 | 246 | interp_methods = [ 247 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 248 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 249 | ] 250 | interp_method = interp_methods[0] 251 | img = cv2.resize( 252 | np.array(img), (int(self.resize_wh), int(self.resize_wh)), 253 | interpolation=interp_method).astype(np.float32) 254 | img -= self.means 255 | img = img.transpose(self.swap) 256 | return torch.from_numpy(img) 257 | 258 | 259 | class BaseTransform_ration(object): 260 | """Defines the transformations that should be applied to test PIL image 261 | for input into the network 262 | 263 | dimension -> tensorize -> color adj 264 | 265 | Arguments: 266 | resize (int): input dimension to SSD 267 | rgb_means ((int,int,int)): average RGB of the dataset 268 | (104,117,123) 269 | swap ((int,int,int)): final order of channels 270 | Returns: 271 | transform (transform) : callable transform to be applied to test/val 272 | data 273 | """ 274 | 275 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 276 | self.means = rgb_means 277 | self.resize_wh = resize_wh 278 | self.swap = swap 279 | 280 | # assume input is cv2 img for now 281 | def __call__(self, img, target=None): 282 | 283 | interp_methods = [ 284 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 285 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 286 | ] 287 | interp_method = interp_methods[0] 288 | img = cv2.resize( 289 | np.array(img), None, None, fx=self.resize_wh[0], fy = self.resize_wh[1], 290 | interpolation=interp_method).astype(np.float32) 291 | img -= self.means 292 | img = img.transpose(self.swap) 293 | return torch.from_numpy(img) 294 | 295 | 296 | class BaseTransform(object): 297 | """Defines the transformations that should be applied to test PIL image 298 | for input into the network 299 | 300 | dimension -> tensorize -> color adj 301 | 302 | Arguments: 303 | resize (int): input dimension to SSD 304 | rgb_means ((int,int,int)): average RGB of the dataset 305 | (104,117,123) 306 | swap ((int,int,int)): final order of channels 307 | Returns: 308 | transform (transform) : callable transform to be applied to test/val 309 | data 310 | """ 311 | 312 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 313 | self.means = rgb_means 314 | self.resize_wh = resize_wh 315 | self.swap = swap 316 | 317 | # assume input is cv2 img for now 318 | def __call__(self, img, 
target=None): 319 | 320 | interp_methods = [ 321 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 322 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 323 | ] 324 | interp_method = interp_methods[0] 325 | img = cv2.resize( 326 | np.array(img), (self.resize_wh[0], self.resize_wh[1]), 327 | interpolation=interp_method).astype(np.float32) 328 | img -= self.means 329 | img = img.transpose(self.swap) 330 | return torch.from_numpy(img), target 331 | # return torch.from_numpy(img), target -------------------------------------------------------------------------------- /models/model_builder_resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from layers import * 8 | import os 9 | from models.model_helper import weights_init 10 | import importlib 11 | from layers.functions.prior_layer import PriorLayer 12 | from dcn.modules.deform_conv import DeformConv, ModulatedDeformConv 13 | 14 | def get_func(func_name): 15 | """Helper to return a function object by name. func_name must identify a 16 | function in this module or the path to a function relative to the base 17 | 'modeling' module. 18 | """ 19 | if func_name == '': 20 | return None 21 | try: 22 | parts = func_name.split('.') 23 | # Refers to a function in this module 24 | if len(parts) == 1: 25 | return globals()[parts[0]] 26 | # Otherwise, assume we're referencing a module under modeling 27 | module_name = 'models.' + '.'.join(parts[:-1]) 28 | module = importlib.import_module(module_name) 29 | return getattr(module, parts[-1]) 30 | except Exception: 31 | print('Failed to find function: %s', func_name) 32 | raise 33 | 34 | class BasicConv(nn.Module): 35 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 36 | super(BasicConv, self).__init__() 37 | self.out_channels = out_planes 38 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 39 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 40 | self.relu = nn.ReLU(inplace=True) if relu else None 41 | 42 | def forward(self, x): 43 | x = self.conv(x) 44 | if self.bn is not None: 45 | x = self.bn(x) 46 | if self.relu is not None: 47 | x = self.relu(x) 48 | return x 49 | 50 | class Basic2Conv(nn.Module): 51 | 52 | def __init__(self, in_planes, out_planes): 53 | super(Basic2Conv, self).__init__() 54 | self.branch1 = BasicConv(in_planes, out_planes, kernel_size=1) 55 | self.branch2 = BasicConv(out_planes, out_planes, kernel_size=1) 56 | 57 | def forward(self, x): 58 | x1 = self.branch1(x) 59 | x2 = self.branch2(x1) 60 | 61 | return x2 62 | 63 | def add_dcn_dilas(): 64 | 65 | planes = [512,1024,512,256] 66 | deformable_groups = 1 67 | conv_layers = [] 68 | for i in range(4): 69 | conv_layers += [DeformConv( 70 | planes[i], 71 | 256, 72 | kernel_size=3, 73 | stride=1, 74 | padding=5-i, 75 | dilation=5-i, 76 | deformable_groups=deformable_groups, 77 | bias=False)] 78 | return conv_layers 79 | 80 | def BN_layers(): 81 | bn_layers =[] 82 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 83 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 84 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 85 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, 
momentum=0.01, affine=True)] 86 | 87 | return bn_layers 88 | 89 | class SSD(nn.Module): 90 | """Single Shot Multibox Architecture 91 | The network is composed of a base VGG network followed by the 92 | added multibox conv layers. Each multibox layer branches into 93 | 1) conv2d for class conf scores 94 | 2) conv2d for localization predictions 95 | 3) associated priorbox layer to produce default bounding 96 | boxes specific to the layer's feature map size. 97 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 98 | 99 | Args: 100 | phase: (string) Can be "test" or "train" 101 | base: VGG16 layers for input, size of either 300 or 500 102 | extras: extra layers that feed to multibox loc and conf layers 103 | head: "multibox head" consists of loc and conf conv layers 104 | """ 105 | 106 | def _init_modules(self): 107 | self.arm_loc.apply(weights_init) 108 | self.arm_conf.apply(weights_init) 109 | if self.cfg.MODEL.REFINE: 110 | self.odm_loc.apply(weights_init) 111 | self.odm_conf.apply(weights_init) 112 | 113 | self.loc_offset_conv.apply(weights_init) 114 | self.dcn_convs.apply(weights_init) 115 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 116 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 117 | print("load pretrain model {}".format( 118 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 119 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 120 | self.extractor.vgg.load_state_dict(weights) 121 | else: 122 | self.extractor.load_state_dict(weights, strict=False) 123 | 124 | def __init__(self, cfg): 125 | super(SSD, self).__init__() 126 | self.cfg = cfg 127 | self.size = cfg.MODEL.SIZE 128 | if self.size == '300': 129 | size_cfg = cfg.SMALL 130 | else: 131 | size_cfg = cfg.BIG 132 | self.num_classes = cfg.MODEL.NUM_CLASSES 133 | self.prior_layer = PriorLayer(cfg) 134 | self.priorbox = PriorBox(cfg) 135 | self.priors = self.priorbox.forward() 136 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 137 | cfg.TRAIN.CHANNEL_SIZE) 138 | if cfg.MODEL.REFINE: 139 | self.odm_channels = size_cfg.ODM_CHANNELS 140 | self.arm_num_classes = 2 141 | self.odm_loc = nn.ModuleList() 142 | self.odm_conf = nn.ModuleList() 143 | 144 | self.loc_offset_conv = nn.ModuleList() 145 | self.dcn_convs = nn.ModuleList(add_dcn_dilas()) 146 | self.bn_layers = nn.ModuleList(BN_layers()) 147 | 148 | self.arm_loc = nn.ModuleList() 149 | self.arm_conf = nn.ModuleList() 150 | self.arm_channels = size_cfg.ARM_CHANNELS 151 | self.num_anchors = size_cfg.NUM_ANCHORS 152 | self.input_fixed = size_cfg.INPUT_FIXED 153 | self.arm_loc = nn.ModuleList() 154 | self.arm_conf = nn.ModuleList() 155 | 156 | for i in range(len(self.arm_channels)): 157 | if cfg.MODEL.REFINE: 158 | self.arm_loc += [ 159 | nn.Conv2d( 160 | self.arm_channels[i], 161 | self.num_anchors[i] * 4, 162 | kernel_size=3, 163 | padding=1) 164 | ] 165 | self.arm_conf += [ 166 | nn.Conv2d( 167 | self.arm_channels[i], 168 | self.num_anchors[i] * self.arm_num_classes, 169 | kernel_size=3, 170 | padding=1) 171 | ] 172 | 173 | self.loc_offset_conv +=[BasicConv(self.num_anchors[i] * 2, 18, kernel_size=1)] 174 | self.odm_loc += [nn.Sequential(Basic2Conv(self.odm_channels[i], 512), 175 | nn.Conv2d(512, self.num_anchors[i] * 4, kernel_size=3, padding=1)) 176 | ] 177 | self.odm_conf += [ 178 | nn.Sequential(Basic2Conv(self.odm_channels[i], 512), 179 | nn.Conv2d(512, self.num_anchors[i] * self.num_classes, kernel_size=3, padding=1)) 180 | ] 181 | else: 182 | self.arm_loc += [ 183 | nn.Conv2d( 184 | self.arm_channels[i], 185 | self.num_anchors[i] * 4, 186 | kernel_size=3, 187 | 
padding=1) 188 | ] 189 | self.arm_conf += [ 190 | nn.Conv2d( 191 | self.arm_channels[i], 192 | self.num_anchors[i] * self.num_classes, 193 | kernel_size=3, 194 | padding=1) 195 | ] 196 | if cfg.TRAIN.TRAIN_ON: 197 | self._init_modules() 198 | 199 | def forward(self, input): 200 | 201 | arm_loc = list() 202 | arm_conf = list() 203 | if self.cfg.MODEL.REFINE: 204 | odm_loc = list() 205 | odm_conf = list() 206 | conf = list() 207 | odm_xs_n = list() 208 | arm_loc_list = list() 209 | arm_xs, odm_xs = self.extractor(input) 210 | 211 | 212 | for (x, l, c) in zip(arm_xs, self.arm_loc, self.arm_conf): 213 | arm_loc_conv = l(x) 214 | cc = c(x) 215 | conf.append(cc) 216 | arm_loc_list.append(torch.cat([arm_loc_conv[:,0::4,:,:], arm_loc_conv[:,1::4,:,:]], 1)) 217 | arm_loc.append(arm_loc_conv.permute(0, 2, 3, 1).contiguous()) 218 | arm_conf.append(cc.permute(0, 2, 3, 1).contiguous()) 219 | 220 | 221 | for (conf_fea, odm_xs_fea) in zip(conf, odm_xs): 222 | conf_obj = conf_fea[:, 1::2, :, :] 223 | conf_max, _ = torch.max(conf_obj, dim=1, keepdim=True) 224 | conf_attention = conf_max.sigmoid() 225 | odm_xs_fea_n = odm_xs_fea * conf_attention + odm_xs_fea 226 | odm_xs_n.append(odm_xs_fea_n) 227 | 228 | offset_0 = self.loc_offset_conv[0](arm_loc_list[0]) 229 | d0 = F.relu(self.bn_layers[0](self.dcn_convs[0](odm_xs_n[0], offset_0)), inplace=True) 230 | 231 | offset_1 = self.loc_offset_conv[1](arm_loc_list[1]) 232 | d1 = F.relu(self.bn_layers[1](self.dcn_convs[1](odm_xs_n[1], offset_1)), inplace=True) 233 | 234 | offset_2 = self.loc_offset_conv[2](arm_loc_list[2]) 235 | d2 = F.relu(self.bn_layers[2](self.dcn_convs[2](odm_xs_n[2], offset_2)), inplace=True) 236 | 237 | offset_3 = self.loc_offset_conv[3](arm_loc_list[3]) 238 | d3 = F.relu(self.bn_layers[3](self.dcn_convs[3](odm_xs_n[3], offset_3)), inplace=True) 239 | odm_xs_new = [d0,d1,d2,d3] 240 | 241 | for (x, l, c) in zip(odm_xs_new, self.odm_loc, self.odm_conf): 242 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 243 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 244 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 245 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 246 | else: 247 | arm_xs = self.extractor(input) 248 | img_wh = (input.size(3), input.size(2)) 249 | feature_maps_wh = [(t.size(3), t.size(2)) for t in arm_xs] 250 | 251 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 252 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 253 | if self.cfg.MODEL.REFINE: 254 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 255 | arm_conf.view( 256 | arm_conf.size(0), -1, self.arm_num_classes), 257 | odm_loc.view(odm_loc.size(0), -1, 4), 258 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 259 | self.priors if self.input_fixed else self.prior_layer( 260 | img_wh, feature_maps_wh)) 261 | else: 262 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 263 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 264 | self.priors if self.input_fixed else self.prior_layer( 265 | img_wh, feature_maps_wh)) 266 | return output 267 | -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import os.path 11 | import pickle 12 | 
import sys 13 | import torch 14 | import torch.utils.data as data 15 | from PIL import Image, ImageDraw, ImageFont 16 | import cv2 17 | import numpy as np 18 | from .voc_eval import voc_eval 19 | if sys.version_info[0] == 2: 20 | import xml.etree.cElementTree as ET 21 | else: 22 | import xml.etree.ElementTree as ET 23 | 24 | VOC_CLASSES = ( 25 | '__background__', # always index 0 26 | 'aeroplane', 27 | 'bicycle', 28 | 'bird', 29 | 'boat', 30 | 'bottle', 31 | 'bus', 32 | 'car', 33 | 'cat', 34 | 'chair', 35 | 'cow', 36 | 'diningtable', 37 | 'dog', 38 | 'horse', 39 | 'motorbike', 40 | 'person', 41 | 'pottedplant', 42 | 'sheep', 43 | 'sofa', 44 | 'train', 45 | 'tvmonitor') 46 | 47 | # for making bounding boxes pretty 48 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 49 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 50 | 51 | 52 | class AnnotationTransform(object): 53 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 54 | Initialized with a dictionary lookup of classnames to indexes 55 | 56 | Arguments: 57 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 58 | (default: alphabetic indexing of VOC's 20 classes) 59 | keep_difficult (bool, optional): keep difficult instances or not 60 | (default: False) 61 | height (int): height 62 | width (int): width 63 | """ 64 | 65 | def __init__(self, class_to_ind=None, keep_difficult=False): 66 | self.class_to_ind = class_to_ind or dict( 67 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 68 | self.keep_difficult = keep_difficult 69 | 70 | def __call__(self, target, width, height): 71 | """ 72 | Arguments: 73 | target (annotation) : the target annotation to be made usable 74 | will be an ET.Element 75 | Returns: 76 | a numpy array of bounding boxes [xmin, ymin, xmax, ymax, label_ind] 77 | """ 78 | res = np.empty((0, 5)) 79 | for obj in target.iter('object'): 80 | difficult = int(obj.find('difficult').text) == 1 81 | if not self.keep_difficult and difficult: 82 | continue 83 | name = obj.find('name').text.lower().strip() 84 | bbox = obj.find('bndbox') 85 | 86 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 87 | bndbox = [] 88 | for i, pt in enumerate(pts): 89 | cur_pt = int(bbox.find(pt).text) - 1 90 | # scale height or width 91 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 92 | bndbox.append(cur_pt) 93 | label_idx = self.class_to_ind[name] 94 | bndbox.append(label_idx) 95 | # res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 96 | res = np.vstack((res, bndbox)) 97 | # img_id = target.find('filename').text[:-4] 98 | if len(res) == 0: 99 | res = np.vstack((res, [0, 0, 0, 0, 0])) 100 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 101 | 102 | 103 | class VOCDetection(data.Dataset): 104 | """VOC Detection Dataset Object 105 | 106 | input is image, target is annotation 107 | 108 | Arguments: 109 | root (string): filepath to VOCdevkit folder. 110 | image_set (string): imageset to use (eg.
'train', 'val', 'test') 111 | transform (callable, optional): transformation to perform on the 112 | input image 113 | target_transform (callable, optional): transformation to perform on the 114 | target `annotation` 115 | (eg: take in caption string, return tensor of word indices) 116 | dataset_name (string, optional): which dataset to load 117 | (default: 'VOC2007') 118 | """ 119 | 120 | def __init__(self, 121 | root, 122 | image_sets, 123 | transform=None, 124 | dataset_name='VOC0712'): 125 | self.root = root 126 | self.image_set = image_sets 127 | self.transform = transform 128 | self.target_transform = AnnotationTransform() 129 | self.name = dataset_name 130 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 131 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 132 | self.ids = list() 133 | for (year, name) in image_sets: 134 | self._year = year 135 | rootpath = os.path.join(self.root, 'VOC' + year) 136 | # rootpath = os.path.join(self.root, dataset_name + year) 137 | for line in open( 138 | os.path.join(rootpath, 'ImageSets', 'Main', 139 | name + '.txt')): 140 | self.ids.append((rootpath, line.strip())) 141 | 142 | def __getitem__(self, index): 143 | im, gt, img_info = self.pull_item(index) 144 | return im, gt, img_info 145 | 146 | def __len__(self): 147 | return len(self.ids) 148 | 149 | def pull_item(self, index): 150 | img_id = self.ids[index] 151 | 152 | if self.name != 'test': 153 | target = ET.parse(self._annopath % img_id).getroot() 154 | else: 155 | target = np.zeros((1, 5)) 156 | img = cv2.imread(self._imgpath % img_id) 157 | im_h, im_w, channels = img.shape 158 | img_info = [im_w, im_h] 159 | if self.target_transform is not None: 160 | target = self.target_transform(target, im_w, im_h) 161 | 162 | if self.name != 'test': 163 | if self.transform is not None: 164 | img, target = self.transform(img, target) 165 | else: 166 | if self.transform is not None: 167 | img = self.transform(img) 168 | 169 | return img, target, img_info 170 | 171 | def pull_image(self, index): 172 | '''Returns the original image object at index as an OpenCV BGR ndarray 173 | 174 | Note: not using self.__getitem__(), as any transformations passed in 175 | could mess up this functionality. 176 | 177 | Argument: 178 | index (int): index of img to show 179 | Return: 180 | cv2 img (BGR numpy array) 181 | ''' 182 | img_id = self.ids[index] 183 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 184 | 185 | def pull_anno(self, index): 186 | '''Returns the original annotation of image at index 187 | 188 | Note: not using self.__getitem__(), as any transformations passed in 189 | could mess up this functionality. 190 | 191 | Argument: 192 | index (int): index of img to get annotation of 193 | Return: 194 | list: [img_id, [(label, bbox coords),...]] 195 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 196 | ''' 197 | img_id = self.ids[index] 198 | anno = ET.parse(self._annopath % img_id).getroot() 199 | gt = self.target_transform(anno, 1, 1) 200 | return img_id[1], gt 201 | 202 | def pull_tensor(self, index): 203 | '''Returns the original image at an index in tensor form 204 | 205 | Note: not using self.__getitem__(), as any transformations passed in 206 | could mess up this functionality. 207 | 208 | Argument: 209 | index (int): index of img to show 210 | Return: 211 | tensorized version of img with a leading batch dimension added 212 | ''' 213 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 214 | 215 | def evaluate_detections(self, all_boxes, output_dir=None): 216 | """ 217 | all_boxes is a list of length number-of-classes.
218 | Each list element is a list of length number-of-images. 219 | Each of those list elements is either an empty list [] 220 | or a numpy array of detections. 221 | 222 | all_boxes[class][image] = [] or np.array of shape #dets x 5 223 | """ 224 | self._write_voc_results_file(all_boxes) 225 | self._do_python_eval(output_dir) 226 | 227 | def _get_voc_results_file_template(self): 228 | filename = 'comp3_det_test' + '_{:s}.txt' 229 | filedir = os.path.join(self.root, 'results', 'VOC' + self._year, 230 | 'Main') 231 | if not os.path.exists(filedir): 232 | os.makedirs(filedir) 233 | path = os.path.join(filedir, filename) 234 | return path 235 | 236 | def _write_voc_results_file(self, all_boxes): 237 | for cls_ind, cls in enumerate(VOC_CLASSES): 238 | if cls == '__background__': 239 | continue 240 | print('Writing {} VOC results file'.format(cls)) 241 | filename = self._get_voc_results_file_template().format(cls) 242 | # print(filename) 243 | with open(filename, 'wt') as f: 244 | for im_ind, index in enumerate(self.ids): 245 | index = index[1] 246 | dets = all_boxes[cls_ind][im_ind] 247 | if len(dets) == 0: 248 | continue 249 | for k in range(dets.shape[0]): 250 | f.write( 251 | '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.format( 252 | index, dets[k, -1], dets[k, 0] + 1, 253 | dets[k, 1] + 1, dets[k, 2] + 1, 254 | dets[k, 3] + 1)) 255 | 256 | def _do_python_eval(self, output_dir='output'): 257 | rootpath = os.path.join(self.root, 'VOC' + self._year) 258 | name = self.image_set[0][1] 259 | annopath = os.path.join(rootpath, 'Annotations', '{:s}.xml') 260 | imagesetfile = os.path.join(rootpath, 'ImageSets', 'Main', 261 | name + '.txt') 262 | cachedir = os.path.join(self.root, 'annotations_cache') 263 | aps = [] 264 | # The PASCAL VOC metric changed in 2010 265 | use_07_metric = int(self._year) < 2010 266 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 267 | if output_dir is not None and not os.path.isdir(output_dir): 268 | os.mkdir(output_dir) 269 | for i, cls in enumerate(VOC_CLASSES): 270 | if cls == '__background__': 271 | continue 272 | 273 | filename = self._get_voc_results_file_template().format(cls) 274 | rec, prec, ap = voc_eval( 275 | filename, 276 | annopath, 277 | imagesetfile, 278 | cls, 279 | cachedir, 280 | ovthresh=0.5, 281 | use_07_metric=use_07_metric) 282 | aps += [ap] 283 | print('AP for {} = {:.4f}'.format(cls, ap)) 284 | if output_dir is not None: 285 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 286 | 'wb') as f: 287 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 288 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 289 | print('~~~~~~~~') 290 | print('Results:') 291 | for ap in aps: 292 | print('{:.3f}'.format(ap)) 293 | print('{:.3f}'.format(np.mean(aps))) 294 | print('~~~~~~~~') 295 | print('') 296 | print('--------------------------------------------------------------') 297 | print('Results computed with the **unofficial** Python eval code.') 298 | print('Results should be very close to the official MATLAB eval code.') 299 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 300 | print('-- Thanks, The Management') 301 | print('--------------------------------------------------------------') 302 | 303 | 304 | def detection_collate(batch): 305 | """Custom collate fn for dealing with batches of images that have a different 306 | number of associated object annotations (bounding boxes).
307 | 308 | Arguments: 309 | batch: (tuple) A tuple of tensor images and lists of annotations 310 | 311 | Return: 312 | A tuple containing: 313 | 1) (tensor) batch of images stacked on their 0 dim 314 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 315 | """ 316 | targets = [] 317 | imgs = [] 318 | img_info = [] 319 | for sample in batch: 320 | imgs.append(sample[0]) 321 | targets.append(torch.FloatTensor(sample[1])) 322 | img_info.append(torch.FloatTensor(sample[2])) 323 | return torch.stack(imgs, 0), targets, img_info 324 | -------------------------------------------------------------------------------- /configs/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | from utils.collections2 import AttrDict 7 | import six 8 | import yaml 9 | import torch 10 | import torch.nn as nn 11 | from torch.nn import init 12 | import numpy as np 13 | import copy 14 | from ast import literal_eval 15 | 16 | __C = AttrDict() 17 | cfg = __C 18 | 19 | __C.MODEL = AttrDict() 20 | 21 | __C.MODEL.NUM_CLASSES = -1 22 | __C.MODEL.TYPE = '' 23 | __C.MODEL.SIZE = '300' 24 | __C.MODEL.CONV_BODY = '' 25 | __C.MODEL.REFINE = False 26 | __C.MODEL.LOAD_PRETRAINED_WEIGHTS = False 27 | __C.MODEL.PRETRAIN_WEIGHTS = '' 28 | __C.MODEL.OBJECT_SCORE = 0.01 29 | 30 | __C.TRAIN = AttrDict() 31 | __C.TRAIN.OVERLAP = 0.5 32 | __C.TRAIN.OHEM = True 33 | __C.TRAIN.NEG_RATIO = 3 34 | __C.TRAIN.FOCAL_LOSS = False 35 | __C.TRAIN.FOCAL_LOSS_TYPE = 'SOFTMAX' 36 | __C.TRAIN.BGR_MEAN = [104, 117, 123] 37 | __C.TRAIN.BATCH_SIZE = 1 38 | __C.TRAIN.CHANNEL_SIZE = '48' 39 | __C.TRAIN.WARMUP = True 40 | __C.TRAIN.WARMUP_EPOCH = 2 41 | __C.TRAIN.DEVICE_IDS = [0] 42 | __C.TRAIN.TRAIN_ON = True 43 | 44 | __C.SMALL = AttrDict() 45 | 46 | __C.SMALL.FEATURE_MAPS = [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 47 | __C.SMALL.FEATURE_MAPS_SMALL = [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 48 | __C.SMALL.FEATURE_MAPS_LARGE = [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 49 | 50 | 51 | __C.SMALL.ARM_CHANNELS = [512, 1024, 512, 256, 256, 256] 52 | __C.SMALL.ODM_CHANNELS = [256, 256, 256, 256] 53 | __C.SMALL.NUM_ANCHORS = [4, 6, 6, 6, 4, 4] 54 | __C.SMALL.STEPS = [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], 55 | [300, 300]] 56 | 57 | __C.SMALL.STEPS_SMALL = [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], 58 | [300, 300]] 59 | 60 | __C.SMALL.STEPS_LARGE = [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], 61 | [300, 300]] 62 | 63 | 64 | 65 | __C.SMALL.MIN_SIZES = [30, 60, 111, 162, 213, 264] 66 | __C.SMALL.MIN_SIZES_SMALL = [30, 60, 111, 162, 213, 264] 67 | __C.SMALL.MIN_SIZES_LARGE = [30, 60, 111, 162, 213, 264] 68 | 69 | __C.SMALL.MAX_SIZES = [60, 111, 162, 213, 264, 315] 70 | __C.SMALL.MAX_SIZES_SMALL = [60, 111, 162, 213, 264, 315] 71 | __C.SMALL.MAX_SIZES_LARGE = [60, 111, 162, 213, 264, 315] 72 | 73 | __C.SMALL.ASPECT_RATIOS = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 74 | [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 75 | __C.SMALL.ASPECT_RATIOS_SMALL = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 76 | [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 77 | __C.SMALL.ASPECT_RATIOS_LARGE = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 78 | [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 79 | 80 | __C.SMALL.VARIANCE = [0.1, 0.2] 81 | __C.SMALL.CLIP = True 82 | __C.SMALL.IMG_WH = [300, 
300] 83 | __C.SMALL.INPUT_FIXED = True 84 | __C.SMALL.USE_MAX_SIZE = True 85 | 86 | __C.BIG = AttrDict() 87 | __C.BIG.FEATURE_MAPS = [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], 88 | [1, 1]] 89 | __C.BIG.ARM_CHANNELS = [512, 1024, 512, 256, 256, 256, 256] 90 | __C.BIG.ODM_CHANNELS = [256, 256, 256, 256] 91 | __C.BIG.NUM_ANCHORS = [4, 6, 6, 6, 6, 4, 4] 92 | __C.BIG.STEPS = [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], 93 | [512, 512]] 94 | __C.BIG.MIN_SIZES = [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 95 | __C.BIG.MAX_SIZES = [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 96 | __C.BIG.ASPECT_RATIOS = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 97 | [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], 98 | [2, 0.5]] 99 | __C.BIG.VARIANCE = [0.1, 0.2] 100 | __C.BIG.CLIP = True 101 | __C.BIG.IMG_WH = [512, 512] 102 | __C.BIG.INPUT_FIXED = True 103 | __C.BIG.USE_MAX_SIZE = True 104 | 105 | __C.SOLVER = AttrDict() 106 | 107 | __C.SOLVER.WEIGHT_DECAY = 0.0005 108 | __C.SOLVER.BASE_LR = 0.001 109 | __C.SOLVER.GAMMA = 0.1 110 | __C.SOLVER.MOMENTUM = 0.9 111 | __C.SOLVER.EPOCH_STEPS = [] 112 | __C.SOLVER.LR = [] 113 | __C.SOLVER.END_EPOCH = 1 114 | __C.SOLVER.START_EPOCH = 0 115 | 116 | __C.DATASETS = AttrDict() 117 | 118 | VOCROOT = 'data/datasets/VOCdevkit0712/' 119 | COCOROOT = 'data/datasets/coco2015' 120 | UAVROOT = '/raid/jing/data/dataset/UAVdevkit2017/' 121 | DOTAROOT = '/raid/jing/data/dataset/DOTAdevkit2019/' 122 | IIAI_SATROOT = '/raid/flx/data/jing_detection/data/IIAI_SATdevkit2019/' 123 | Objects365_TinyROOT = '/raid/jing/data/dataset/Objects365/' 124 | 125 | __C.DATASETS.TRAIN_TYPE = [] 126 | __C.DATASETS.VAL_TYPE = [] 127 | __C.DATASETS.DATAROOT = VOCROOT 128 | __C.DATASETS.DATA_TYPE = '' 129 | 130 | __C.DATASETS.SETS = AttrDict() 131 | __C.DATASETS.SETS.VOC = [['0712', '0712_trainval']] 132 | __C.DATASETS.SETS.VOC0712PLUS = [['0712', '0712_trainval_test']] 133 | __C.DATASETS.SETS.VOC0712 = [['2012', '2012_trainval']] 134 | __C.DATASETS.SETS.VOC2007 = [['0712', "2007_test"]] 135 | __C.DATASETS.SETS.COCO = [['2014', 'train'], ['2014', 'valminusminival']] 136 | __C.DATASETS.SETS.COCOval = [['2014', 'minival']] 137 | __C.DATASETS.SETS.UAV = [['2017', '2017_trainval']] 138 | __C.DATASETS.SETS.UAVval = [['2017', 'test']] 139 | __C.DATASETS.SETS.DOTA = [['2019', 'train']] 140 | __C.DATASETS.SETS.IIAI_SAT = [['2019', 'train']] 141 | __C.DATASETS.SETS.IIAI_SATval = [['2019', 'val']] 142 | __C.DATASETS.SETS.DOTAval = [['2019', 'val']] 143 | __C.DATASETS.SETS.Objects365_Tiny = [['train']] 144 | __C.DATASETS.SETS.Objects365_Tinyval = [['val']] 145 | __C.DATASETS.SETS.VOCROOT = VOCROOT 146 | __C.DATASETS.SETS.COCOROOT = COCOROOT 147 | __C.DATASETS.SETS.UAVROOT = UAVROOT 148 | __C.DATASETS.SETS.DOTAROOT = DOTAROOT 149 | __C.DATASETS.SETS.IIAI_SATROOT = IIAI_SATROOT 150 | __C.DATASETS.SETS.Objects365_TinyROOT = Objects365_TinyROOT 151 | 152 | __C.TEST = AttrDict() 153 | __C.TEST.INPUT_WH = [300, 300] 154 | __C.TEST.CONFIDENCE_THRESH = 0.01 155 | __C.TEST.NMS_TYPE = 'NMS' 156 | __C.TEST.NMS_OVERLAP = 0.45 157 | __C.TEST.BATCH_SIZE = 16 158 | 159 | VOC_CLASSES = ( 160 | '__background__', # always index 0 161 | 'aeroplane', 162 | 'bicycle', 163 | 'bird', 164 | 'boat', 165 | 'bottle', 166 | 'bus', 167 | 'car', 168 | 'cat', 169 | 'chair', 170 | 'cow', 171 | 'diningtable', 172 | 'dog', 173 | 'horse', 174 | 'motorbike', 175 | 'person', 176 | 'pottedplant', 177 | 'sheep', 178 | 'sofa', 179 | 'train', 180 | 'tvmonitor') 181 | 182 | UAV_CLASSES = ( 183 | 
'__background__', # always index 0 184 | 'car') 185 | 186 | DOTA_CLASSES = ( 187 | '__background__', # always index 0 188 | 'plane', 189 | 'ship', 190 | 'storge-tank', 191 | 'baseball-diamond', 192 | 'tennis-court', 193 | 'baskeball-court', 194 | 'ground-track-field', 195 | 'harbor', 196 | 'bridge', 197 | 'large-vehicle', 198 | 'small-vehicle', 199 | 'helicopter', 200 | 'roundabout', 201 | 'soccer-ball-field', 202 | 'swimming-pool', 203 | 'container-crane' 204 | ) 205 | 206 | COCO_CLASSES = ('__background__', 'person', 'bicycle', 'car', 'motorbike', 207 | 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 208 | 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 209 | 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 210 | 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 211 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 212 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 213 | 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 214 | 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 215 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 216 | 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 217 | 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 218 | 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 219 | 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 220 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 221 | 222 | Objects365_Tiny_CLASSES = ('__background__', 'pomelo', 'pig', 'race car', 'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 223 | 'hair drier', 'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', 224 | 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', 'shark', 'steak', 'poker card', 225 | 'binoculars', 'llama', 'radish', 'noodles', 'mop', 'yak', 'crab', 'microscope', 'barbell', 'bread/bun', 226 | 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'mangosteen', 'seal', 'comb', 'eraser', 'pitaya', 227 | 'scallop', 'pencil case', 'saw', 'table tennis paddle', 'okra', 'starfish', 'eagle', 'monkey', 'durian', 228 | 'rabbit', 'game board', 'french horn', 'ambulance', 'hoverboard', 'asparagus', 'pasta', 'target', 229 | 'hotair balloon', 'chainsaw', 'lobster', 'iron', 'flashlight') 230 | 231 | 232 | 233 | def merge_cfg_from_file(cfg_filename): 234 | """Load a yaml config file and merge it into the global config.""" 235 | with open(cfg_filename, 'r') as f: 236 | yaml_cfg = AttrDict(yaml.safe_load(f)) 237 | _merge_a_into_b(yaml_cfg, __C) 238 | 239 | 240 | cfg_from_file = merge_cfg_from_file 241 | 242 | 243 | def merge_cfg_from_cfg(cfg_other): 244 | """Merge `cfg_other` into the global config.""" 245 | _merge_a_into_b(cfg_other, __C) 246 | 247 | 248 | def _merge_a_into_b(a, b, stack=None): 249 | """Merge config dictionary a into config dictionary b, clobbering the 250 | options in b whenever they are also specified in a. 251 | """ 252 | assert isinstance(a, AttrDict), 'Argument `a` must be an AttrDict' 253 | assert isinstance(b, AttrDict), 'Argument `b` must be an AttrDict' 254 | 255 | for k, v_ in a.items(): 256 | full_key = '.'.join(stack) + '.'
+ k if stack is not None else k 257 | # a must specify keys that are in b 258 | if k not in b: 259 | raise KeyError('Non-existent config key: {}'.format(full_key)) 260 | 261 | v = copy.deepcopy(v_) 262 | v = _decode_cfg_value(v) 263 | v = _check_and_coerce_cfg_value_type(v, b[k], k, full_key) 264 | 265 | # Recursively merge dicts 266 | if isinstance(v, AttrDict): 267 | try: 268 | stack_push = [k] if stack is None else stack + [k] 269 | _merge_a_into_b(v, b[k], stack=stack_push) 270 | except BaseException: 271 | raise 272 | else: 273 | b[k] = v 274 | 275 | 276 | def _decode_cfg_value(v): 277 | """Decodes a raw config value (e.g., from a yaml config file or command 278 | line argument) into a Python object. 279 | """ 280 | # Configs parsed from raw yaml will contain dictionary keys that need to be 281 | # converted to AttrDict objects 282 | if isinstance(v, dict): 283 | return AttrDict(v) 284 | # All remaining processing is only applied to strings 285 | if not isinstance(v, six.string_types): 286 | return v 287 | # Try to interpret `v` as a: 288 | # string, number, tuple, list, dict, boolean, or None 289 | try: 290 | v = literal_eval(v) 291 | # The following two excepts allow v to pass through when it represents a 292 | # string. 293 | # 294 | # Longer explanation: 295 | # The type of v is always a string (before calling literal_eval), but 296 | # sometimes it *represents* a string and other times a data structure, like 297 | # a list. In the case that v represents a string, what we got back from the 298 | # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is 299 | # ok with '"foo"', but will raise a ValueError if given 'foo'. In other 300 | # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval 301 | # will raise a SyntaxError. 302 | except ValueError: 303 | pass 304 | except SyntaxError: 305 | pass 306 | return v 307 | 308 | 309 | def _check_and_coerce_cfg_value_type(value_a, value_b, key, full_key): 310 | """Checks that `value_a`, which is intended to replace `value_b`, is of the 311 | right type. The type is correct if it matches exactly or is one of a few 312 | cases in which the type can be easily coerced. 313 | """ 314 | # The types must match (with some exceptions) 315 | type_b = type(value_b) 316 | type_a = type(value_a) 317 | if type_a is type_b: 318 | return value_a 319 | 320 | # Exceptions: numpy arrays, strings, tuple<->list 321 | if isinstance(value_b, np.ndarray): 322 | value_a = np.array(value_a, dtype=value_b.dtype) 323 | elif isinstance(value_b, six.string_types): 324 | value_a = str(value_a) 325 | elif isinstance(value_a, tuple) and isinstance(value_b, list): 326 | value_a = list(value_a) 327 | elif isinstance(value_a, list) and isinstance(value_b, tuple): 328 | value_a = tuple(value_a) 329 | else: 330 | raise ValueError( 331 | 'Type mismatch ({} vs. {}) with values ({} vs. {}) for config ' 332 | 'key: {}'.format(type_b, type_a, value_b, value_a, full_key)) 333 | return value_a 334 | --------------------------------------------------------------------------------
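For orientation, the sketch below shows how the files above are typically wired together: an experiment YAML is merged into the global cfg, a VOCDetection dataset is built, and batches are formed with detection_collate. This is a minimal illustration, not the repo's training entry point; the YAML filename, the image-set list, and the to_tensor_transform stand-in (the real augmentation lives in data/data_augment.py and may use a different box convention) are assumptions chosen to match the defaults in config.py.

import cv2
import numpy as np
import torch
import torch.utils.data as data

from configs.config import cfg, merge_cfg_from_file
from data.voc0712 import VOCDetection, detection_collate

# Merge an experiment YAML into the global AttrDict config.
merge_cfg_from_file('configs/EFGRNet_vgg_coco_dcn.yaml')


def to_tensor_transform(img, target):
    # Stand-in for the augmentation pipeline in data/data_augment.py:
    # resize to the fixed input size, subtract the BGR mean, rescale the
    # boxes accordingly and return a CHW float tensor plus the adjusted
    # [xmin, ymin, xmax, ymax, label] rows.
    h, w = img.shape[:2]
    out_w, out_h = cfg.SMALL.IMG_WH
    img = cv2.resize(img, (out_w, out_h)).astype(np.float32)
    img -= np.array(cfg.TRAIN.BGR_MEAN, dtype=np.float32)
    target = target.copy()
    target[:, 0:4:2] *= out_w / float(w)
    target[:, 1:4:2] *= out_h / float(h)
    return torch.from_numpy(img).permute(2, 0, 1), target


dataset = VOCDetection(cfg.DATASETS.DATAROOT,         # VOCdevkit root
                       cfg.DATASETS.SETS.VOC,         # [['0712', '0712_trainval']]
                       transform=to_tensor_transform)

# detection_collate stacks the images but keeps one target tensor per image,
# since every image carries a different number of boxes.
loader = data.DataLoader(dataset,
                         batch_size=cfg.TRAIN.BATCH_SIZE,
                         shuffle=True,
                         collate_fn=detection_collate)

images, targets, img_info = next(iter(loader))
# images: (B, 3, H, W) float tensor; targets: list of (num_objects, 5) tensors.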