├── utils ├── __init__.py ├── nms │ ├── __init__.py │ ├── gpu_nms.hpp │ ├── py_cpu_nms.py │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── cpu_nms.pyx ├── averageMeter.py ├── timer.py ├── nms_wrapper.py ├── collections2.py ├── build.py ├── get_class_map.py └── convert_darknet.py ├── dcn ├── modules │ ├── __init__.py │ ├── deform_conv.py │ └── deform_pool.py ├── functions │ ├── __init__.py │ ├── deform_pool.py │ └── deform_conv.py ├── __init__.py ├── setup.py └── src │ └── deform_pool_cuda.cpp ├── layers ├── __init__.py ├── functions │ ├── __init__.py │ ├── prior_layer.py │ ├── prior_box.py │ └── detection.py └── modules │ ├── __init__.py │ ├── focal_loss_sigmoid.py │ ├── weight_smooth_l1_loss.py │ ├── weight_softmax_loss.py │ ├── focal_loss_softmax.py │ ├── multibox_loss.py │ ├── refine_multibox_loss.py │ └── refine_multibox_loss_seperate.py ├── .gitignore ├── data ├── __init__.py ├── scripts │ ├── VOC2012.sh │ └── VOC2007.sh ├── voc_eval.py ├── data_augment.py └── voc0712.py ├── compile.sh ├── make.sh ├── configs ├── EFGRNet_vgg_coco_dcn_512.yaml ├── EFGRNet_vgg_coco_dcn.yaml └── config.py ├── README.md ├── models ├── resnet.py ├── model_builder.py ├── vgg.py ├── model_builder_vgg.py └── model_builder_resnet.py └── eval_dcn.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dcn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dcn/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .idea/ 3 | *.so 4 | /eval/ 5 | /utils/nms/cpu_nms.so 6 | /utils/nms/gpu_nms.so 7 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .voc0712 import VOCDetection, detection_collate 3 | from .coco import * 4 | from .data_augment import * 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | PYTHON=${PYTHON:-"python"} 2 | 3 | echo "Building dcn..." 
4 | cd ./dcn 5 | if [ -d "build" ]; then 6 | rm -r build 7 | fi 8 | $PYTHON setup.py build_ext --inplace 9 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | # from .refine_prior_box import RefinePriorBox 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | # if you use anaconda3 maybe you need add this 8 | # change code like https://github.com/rbgirshick/py-faster-rcnn/issues/706 9 | mv nms/cpu_nms.cpython-36m-x86_64-linux-gnu.so nms/cpu_nms.so 10 | mv nms/gpu_nms.cpython-36m-x86_64-linux-gnu.so nms/gpu_nms.so 11 | cd .. 12 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .weight_smooth_l1_loss import WeightSmoothL1Loss 2 | from .weight_softmax_loss import WeightSoftmaxLoss 3 | from .multibox_loss import MultiBoxLoss 4 | from .refine_multibox_loss import RefineMultiBoxLoss 5 | from .focal_loss_sigmoid import FocalLossSigmoid 6 | from .focal_loss_softmax import FocalLossSoftmax 7 | 8 | 9 | 10 | __all__ = ['MultiBoxLoss', 'WeightSoftmaxLoss', ] 11 | -------------------------------------------------------------------------------- /utils/averageMeter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | 4 | def __init__(self): 5 | self.reset() 6 | 7 | def reset(self): 8 | self.val = 0 9 | self.avg = 0 10 | self.sum = 0 11 | self.count = 0 12 | 13 | def update(self, val, n=1): 14 | self.val = val 15 | self.sum += val * n 16 | self.count += n 17 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /dcn/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions.deform_conv import deform_conv, modulated_deform_conv 2 | from .functions.deform_pool import deform_roi_pooling 3 | from .modules.deform_conv import (DeformConv, ModulatedDeformConv, 4 | ModulatedDeformConvPack) 5 | from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, 6 | ModulatedDeformRoIPoolingPack) 7 | 8 | __all__ = [ 9 | 'DeformConv', 'DeformRoIPooling', 'DeformRoIPoolingPack', 10 | 'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv', 11 | 'ModulatedDeformConvPack', 'deform_conv', 12 | 'modulated_deform_conv', 'deform_roi_pooling' 13 | ] 14 | -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 
20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /dcn/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | nvcc_ARCH = ['-arch=sm_52'] 5 | nvcc_ARCH += ["-gencode=arch=compute_61,code=\"compute_61\""] 6 | # nvcc_ARCH += ["-gencode=arch=compute_75,code=\"sm_75\""] 7 | # nvcc_ARCH += ["-gencode=arch=compute_70,code=\"sm_70\""] 8 | nvcc_ARCH += ["-gencode=arch=compute_61,code=\"sm_61\""] 9 | nvcc_ARCH += ["-gencode=arch=compute_52,code=\"sm_52\""] 10 | extra_compile_args = { 11 | 'cxx': ['-Wno-unused-function', '-Wno-write-strings'], 12 | 'nvcc': nvcc_ARCH,} 13 | 14 | setup( 15 | name='deform_conv', 16 | ext_modules=[ 17 | CUDAExtension('deform_conv_cuda', [ 18 | 'src/deform_conv_cuda.cpp', 19 | 'src/deform_conv_cuda_kernel.cu', 20 | ], 21 | extra_compile_args=extra_compile_args, 22 | ), 23 | CUDAExtension('deform_pool_cuda', [ 24 | 'src/deform_pool_cuda.cpp', 'src/deform_pool_cuda_kernel.cu' 25 | ]), 26 | ], 27 | cmdclass={'build_ext': BuildExtension}) 28 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 
36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /layers/modules/focal_loss_sigmoid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSigmoid(nn.Module): 11 | ''' 12 | sigmoid version focal loss 13 | ''' 14 | 15 | def __init__(self, alpha=0.25, gamma=2, size_average=False): 16 | super(FocalLossSigmoid, self).__init__() 17 | self.alpha = alpha 18 | self.gamma = gamma 19 | self.size_average = size_average 20 | 21 | def forward(self, inputs, targets): 22 | N = inputs.size(0) 23 | C = inputs.size(1) 24 | P = torch.sigmoid(inputs) 25 | alpha_mask = self.alpha * targets 26 | loss_pos = -1. * torch.pow( 27 | 1 - P, self.gamma) * torch.log(P) * targets * alpha_mask 28 | loss_neg = -1. * torch.pow(1 - P, self.gamma) * torch.log(1 - P) * ( 29 | 1 - targets) * (1 - alpha_mask) 30 | batch_loss = loss_neg + loss_pos 31 | if self.size_average: 32 | loss = batch_loss.mean() 33 | else: 34 | loss = batch_loss.sum() 35 | return loss 36 | -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef 
np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | 14 | def __init__(self): 15 | self.total_time = 0. 16 | self.calls = 0 17 | self.start_time = 0. 18 | self.diff = 0. 19 | self.average_time = 0. 20 | 21 | def tic(self): 22 | # using time.time instead of time.clock because time time.clock 23 | # does not normalize for multithreading 24 | self.start_time = time.time() 25 | 26 | def toc(self, average=True): 27 | self.diff = time.time() - self.start_time 28 | self.total_time += self.diff 29 | self.calls += 1 30 | self.average_time = self.total_time / self.calls 31 | if average: 32 | return self.average_time 33 | else: 34 | return self.diff 35 | 36 | def clear(self): 37 | self.total_time = 0. 38 | self.calls = 0 39 | self.start_time = 0. 40 | self.diff = 0. 41 | self.average_time = 0. 42 | -------------------------------------------------------------------------------- /utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | # def nms(dets, thresh, force_cpu=False): 12 | # """Dispatch to either CPU or GPU NMS implementations.""" 13 | 14 | # if dets.shape[0] == 0: 15 | # return [] 16 | # if cfg.USE_GPU_NMS and not force_cpu: 17 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 18 | # else: 19 | # return cpu_nms(dets, thresh) 20 | 21 | 22 | def nms(dets, thresh, force_cpu=False): 23 | """Dispatch to either CPU or GPU NMS implementations.""" 24 | 25 | if dets.shape[0] == 0: 26 | return [] 27 | if force_cpu: 28 | #return cpu_soft_nms(dets, thresh, method = 0) 29 | return cpu_nms(dets, thresh) 30 | return gpu_nms(dets, thresh) 31 | 32 | 33 | def soft_nms(dets, Nt=0.3, sigma=0.5, thresh=0.001, method=1): 34 | """Dispatch to either CPU or GPU NMS implementations.""" 35 | 36 | if dets.shape[0] == 0: 37 | return [] 38 | return cpu_soft_nms(dets, sigma, Nt, thresh, method) -------------------------------------------------------------------------------- /layers/modules/weight_smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | 
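# The forward pass below applies the standard Smooth L1 (Huber) penalty to the
# four box offsets -- 0.5 * x**2 where |x| < 1 and |x| - 0.5 otherwise -- and scales
# each prior's contribution by a caller-supplied weight. A minimal usage sketch
# (variable names here are illustrative, not taken from this repo):
#   criterion = WeightSmoothL1Loss(class_num=81, size_average=False)
#   loss_l = criterion(loc_pred, loc_target, weights)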
class WeightSmoothL1Loss(nn.Module): 11 | def __init__(self, class_num, size_average=False): 12 | super(WeightSmoothL1Loss, self).__init__() 13 | self.class_num = class_num 14 | self.size_average = size_average 15 | 16 | def forward(self, inputs, targets, weights): 17 | N = inputs.size(0) 18 | loc_num = inputs.size(1) 19 | abs_out = torch.abs(inputs - targets) 20 | 21 | if inputs.is_cuda and not weights.is_cuda: 22 | weights = weights.cuda() 23 | 24 | weights = weights.view(-1, 1) 25 | 26 | weights = torch.cat((weights, weights, weights, weights), 1) 27 | mask_big = abs_out >= 1. 28 | mask_small = abs_out < 1. 29 | loss_big = weights[mask_big] * (abs_out[mask_big] - 0.5) 30 | loss_small = weights[mask_small] * 0.5 * torch.pow( 31 | abs_out[mask_small], 2) 32 | loss_sum = loss_big.sum() + loss_small.sum() 33 | 34 | if self.size_average: 35 | loss = loss_sum / N * loc_num 36 | else: 37 | loss = loss_sum 38 | return loss 39 | -------------------------------------------------------------------------------- /layers/modules/weight_softmax_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSoftmaxLoss(nn.Module): 11 | def __init__(self, class_num, gamma=2, size_average=True): 12 | super(WeightSoftmaxLoss, self).__init__() 13 | # if isinstance(weights, Variable): 14 | # self.weights = weights 15 | # else: 16 | # self.weights = Variable(weights) 17 | 18 | self.class_num = class_num 19 | self.gamma = gamma 20 | self.size_average = size_average 21 | 22 | def forward(self, inputs, targets, weights): 23 | N = inputs.size(0) 24 | C = inputs.size(1) 25 | P = F.softmax(inputs) 26 | 27 | class_mask = inputs.data.new(N, C).fill_(0) 28 | class_mask = Variable(class_mask) 29 | ids = targets.view(-1, 1) 30 | class_mask.scatter_(1, ids.data, 1.) 31 | if inputs.is_cuda and not weights.is_cuda: 32 | weights = weights.cuda() 33 | probs = (P * class_mask).sum(1).view(-1, 1) 34 | 35 | log_p = probs.log() 36 | weights = weights.view(-1, 1) 37 | batch_loss = -weights * log_p 38 | 39 | if self.size_average: 40 | loss = batch_loss.mean() 41 | else: 42 | loss = batch_loss.sum() 43 | return loss -------------------------------------------------------------------------------- /layers/modules/focal_loss_softmax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSoftmax(nn.Module): 11 | ''' 12 | softmax version focal loss 13 | ''' 14 | 15 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 16 | super(FocalLossSoftmax, self).__init__() 17 | if alpha is None: 18 | self.alpha = Variable(torch.ones(class_num, 1)) 19 | else: 20 | if isinstance(alpha, Variable): 21 | self.alpha = alpha 22 | else: 23 | self.alpha = Variable(alpha) 24 | self.gamma = gamma 25 | self.class_num = class_num 26 | self.size_average = size_average 27 | 28 | def forward(self, inputs, targets): 29 | N = inputs.size(0) 30 | C = inputs.size(1) 31 | P = F.softmax(inputs) 32 | 33 | class_mask = inputs.data.new(N, C).fill_(0) 34 | class_mask = Variable(class_mask) 35 | ids = targets.view(-1, 1) 36 | class_mask.scatter_(1, ids.data, 1.) 
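# class_mask is now a one-hot encoding of the target labels, so the sum below
# picks out p_t, the predicted probability of the true class, which feeds the
# focal term -alpha * (1 - p_t)**gamma * log(p_t).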
37 | 38 | if inputs.is_cuda and not self.alpha.is_cuda: 39 | self.alpha = self.alpha.cuda() 40 | alpha = self.alpha[ids.data.view(-1)] 41 | probs = (P * class_mask).sum(1).view(-1, 1) 42 | log_p = probs.log() 43 | batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p 44 | 45 | if self.size_average: 46 | loss = batch_loss.mean() 47 | else: 48 | loss = batch_loss.sum() 49 | return loss -------------------------------------------------------------------------------- /configs/EFGRNet_vgg_coco_dcn_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '512' 4 | REFINE: True 5 | CONV_BODY: efrgnet_vgg.refine_vgg 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 5 18 | TRAIN_ON: True 19 | 20 | BIG: 21 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 22 | ARM_CHANNELS: [512, 1024, 256, 256] 23 | ODM_CHANNELS: [256, 256, 256, 256] 24 | NUM_ANCHORS: [3, 3, 3, 3] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 26 | MIN_SIZES: [30, 64, 128, 256] 27 | MAX_SIZES: [64, 128, 256, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 29 | CLIP: True 30 | IMG_WH: [512, 512] 31 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 32 | USE_MAX_SIZE: False 33 | 34 | SOLVER: 35 | WEIGHT_DECAY: 0.0005 36 | BASE_LR: 0.002 37 | GAMMA: 0.1 38 | MOMENTUM: 0.9 39 | # EPOCH_STEPS: [0, 90, 120, 140] 40 | LR: [0.004, 0.002, 0.0004, 0.00004, 0.000004] 41 | EPOCH_STEPS: [90, 110, 130, 150, 160] 42 | END_EPOCH: 160 43 | START_EPOCH: 0 44 | 45 | DATASETS: 46 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 47 | VAL_TYPE: [['2014', 'minival']] 48 | # VAL_TYPE: [['2015', 'test-dev']] 49 | DATAROOT: '/media/jnie/Storage/ubuntu/DataSets/coco/' 50 | DATA_TYPE: 'COCO' 51 | SETS: 52 | VOC: [['2007', 'trainval'], ['2012', 'trainval']] 53 | VOC0712PLUS: [['2007', 'trainval'], ['2012', 'trainval'],['2007', 'test']] 54 | VOC0712: [['2012', '2012_trainval']] 55 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 56 | VOC2007: [['2007', 'test']] 57 | COCOval: [['2014', 'minival']] 58 | VOCROOT: 'raid/jing/data/VOCdevkit/' 59 | COCOROOT: 'data/coco' 60 | 61 | TEST: 62 | INPUT_WH: [512, 512] 63 | CONFIDENCE_THRESH: 0.01 64 | NMS_OVERLAP: 0.45 65 | BATCH_SIZE: 1 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /configs/EFGRNet_vgg_coco_dcn.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | # CONV_BODY: efrgnet_vgg_reduce.refine_vgg 6 | CONV_BODY: efrgnet_vgg.refine_vgg 7 | NUM_CLASSES: 81 8 | LOAD_PRETRAINED_WEIGHTS: False 9 | PRETRAIN_WEIGHTS: './weights/vgg16_reducedfc.pth' 10 | 11 | TRAIN: 12 | OVERLAP: 0.5 13 | BGR_MEAN: [104, 117, 123] 14 | BATCH_SIZE: 32 15 | OHEM: True 16 | NEG_RATIO: 3 17 | WARMUP: True 18 | WARMUP_EPOCH: 5 19 | TRAIN_ON: True 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 
0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | SOLVER: 37 | WEIGHT_DECAY: 0.0005 38 | BASE_LR: 0.002 39 | GAMMA: 0.1 40 | MOMENTUM: 0.9 41 | # EPOCH_STEPS: [0, 90, 120, 140] 42 | LR: [0.004, 0.002, 0.0004, 0.00004, 0.000004] 43 | EPOCH_STEPS: [90, 110, 130, 150, 160] 44 | END_EPOCH: 160 45 | START_EPOCH: 0 46 | 47 | DATASETS: 48 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 49 | VAL_TYPE: [['2014', 'minival']] 50 | DATAROOT: '/media/jnie/Storage/ubuntu/DataSets/coco/' 51 | DATA_TYPE: 'COCO' 52 | SETS: 53 | VOC: [['2007', 'trainval'], ['2012', 'trainval']] 54 | VOC0712PLUS: [['2007', 'trainval'], ['2012', 'trainval'],['2007', 'test']] 55 | VOC0712: [['2012', '2012_trainval']] 56 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 57 | VOC2007: [['2007', 'test']] 58 | COCOval: [['2014', 'minival']] 59 | VOCROOT: 'raid/jing/data/VOCdevkit/' 60 | COCOROOT: 'data/coco' 61 | 62 | TEST: 63 | INPUT_WH: [320, 320] 64 | CONFIDENCE_THRESH: 0.01 65 | NMS_OVERLAP: 0.45 66 | BATCH_SIZE: 1 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /utils/collections2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | """A simple attribute dictionary used for representing configuration options.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | 22 | 23 | class AttrDict(dict): 24 | 25 | IMMUTABLE = '__immutable__' 26 | 27 | def __init__(self, *args, **kwargs): 28 | super(AttrDict, self).__init__(*args, **kwargs) 29 | self.__dict__[AttrDict.IMMUTABLE] = False 30 | 31 | def __getattr__(self, name): 32 | if name in self.__dict__: 33 | return self.__dict__[name] 34 | elif name in self: 35 | return self[name] 36 | else: 37 | raise AttributeError(name) 38 | 39 | def __setattr__(self, name, value): 40 | if not self.__dict__[AttrDict.IMMUTABLE]: 41 | if name in self.__dict__: 42 | self.__dict__[name] = value 43 | else: 44 | self[name] = value 45 | else: 46 | raise AttributeError( 47 | 'Attempted to set "{}" to "{}", but AttrDict is immutable'. 48 | format(name, value)) 49 | 50 | def immutable(self, is_immutable): 51 | """Set immutability to is_immutable and recursively apply the setting 52 | to all nested AttrDicts. 
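The flag is propagated both to AttrDicts stored as attributes (in __dict__) and to those stored as ordinary dict items.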
53 | """ 54 | self.__dict__[AttrDict.IMMUTABLE] = is_immutable 55 | # Recursively set immutable state 56 | for v in self.__dict__.values(): 57 | if isinstance(v, AttrDict): 58 | v.immutable(is_immutable) 59 | for v in self.values(): 60 | if isinstance(v, AttrDict): 61 | v.immutable(is_immutable) 62 | 63 | def is_immutable(self): 64 | return self.__dict__[AttrDict.IMMUTABLE] 65 | -------------------------------------------------------------------------------- /dcn/functions/deform_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | 4 | from .. import deform_pool_cuda 5 | 6 | 7 | class DeformRoIPoolingFunction(Function): 8 | 9 | @staticmethod 10 | def forward(ctx, 11 | data, 12 | rois, 13 | offset, 14 | spatial_scale, 15 | out_size, 16 | out_channels, 17 | no_trans, 18 | group_size=1, 19 | part_size=None, 20 | sample_per_part=4, 21 | trans_std=.0): 22 | ctx.spatial_scale = spatial_scale 23 | ctx.out_size = out_size 24 | ctx.out_channels = out_channels 25 | ctx.no_trans = no_trans 26 | ctx.group_size = group_size 27 | ctx.part_size = out_size if part_size is None else part_size 28 | ctx.sample_per_part = sample_per_part 29 | ctx.trans_std = trans_std 30 | 31 | assert 0.0 <= ctx.trans_std <= 1.0 32 | if not data.is_cuda: 33 | raise NotImplementedError 34 | 35 | n = rois.shape[0] 36 | output = data.new_empty(n, out_channels, out_size, out_size) 37 | output_count = data.new_empty(n, out_channels, out_size, out_size) 38 | deform_pool_cuda.deform_psroi_pooling_cuda_forward( 39 | data, rois, offset, output, output_count, ctx.no_trans, 40 | ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, 41 | ctx.part_size, ctx.sample_per_part, ctx.trans_std) 42 | 43 | if data.requires_grad or rois.requires_grad or offset.requires_grad: 44 | ctx.save_for_backward(data, rois, offset) 45 | ctx.output_count = output_count 46 | 47 | return output 48 | 49 | @staticmethod 50 | def backward(ctx, grad_output): 51 | if not grad_output.is_cuda: 52 | raise NotImplementedError 53 | 54 | data, rois, offset = ctx.saved_tensors 55 | output_count = ctx.output_count 56 | grad_input = torch.zeros_like(data) 57 | grad_rois = None 58 | grad_offset = torch.zeros_like(offset) 59 | 60 | deform_pool_cuda.deform_psroi_pooling_cuda_backward( 61 | grad_output, data, rois, offset, output_count, grad_input, 62 | grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, 63 | ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, 64 | ctx.trans_std) 65 | return (grad_input, grad_rois, grad_offset, None, None, None, None, 66 | None, None, None, None) 67 | 68 | 69 | deform_roi_pooling = DeformRoIPoolingFunction.apply 70 | -------------------------------------------------------------------------------- /layers/functions/prior_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from math import ceil 4 | import torch.nn as nn 5 | from itertools import product as product 6 | 7 | 8 | class PriorLayer(nn.Module): 9 | def __init__(self, cfg): 10 | super(PriorLayer, self).__init__() 11 | self.size = cfg.MODEL.SIZE 12 | if self.size == '300': 13 | size_cfg = cfg.SMALL 14 | else: 15 | size_cfg = cfg.BIG 16 | self.img_wh = size_cfg.IMG_WH 17 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 18 | self.feature_maps = size_cfg.FEATURE_MAPS 19 | self.variance = size_cfg.VARIANCE or [0.1] 20 | self.min_sizes = size_cfg.MIN_SIZES 21 | 
self.use_max_sizes = size_cfg.USE_MAX_SIZE 22 | if self.use_max_sizes: 23 | self.max_sizes = size_cfg.MAX_SIZES 24 | self.steps = size_cfg.STEPS 25 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 26 | self.clip = size_cfg.CLIP 27 | for v in self.variance: 28 | if v <= 0: 29 | raise ValueError('Variances must be greater than 0') 30 | 31 | def forward(self, img_wh, feature_maps_wh): 32 | self.img_wh = img_wh 33 | self.feature_maps_wh = feature_maps_wh 34 | mean = [] 35 | for k, f in enumerate(self.feature_maps_wh): 36 | grid_h, grid_w = f[1], f[0] 37 | for i in range(grid_h): 38 | for j in range(grid_w): 39 | f_k_h = self.img_wh[1] / self.steps[k][1] 40 | f_k_w = self.img_wh[0] / self.steps[k][0] 41 | # unit center x,y 42 | cx = (j + 0.5) / f_k_w 43 | cy = (i + 0.5) / f_k_h 44 | 45 | # aspect_ratio: 1 46 | # rel size: min_size 47 | s_k_h = self.min_sizes[k] / self.img_wh[1] 48 | s_k_w = self.min_sizes[k] / self.img_wh[0] 49 | mean += [cx, cy, s_k_w, s_k_h] 50 | 51 | # aspect_ratio: 1 52 | # rel size: sqrt(s_k * s_(k+1)) 53 | if self.use_max_sizes: 54 | s_k_prime_w = sqrt( 55 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 56 | s_k_prime_h = sqrt( 57 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 58 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 59 | 60 | for ar in self.aspect_ratios[k]: 61 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 62 | 63 | output = torch.Tensor(mean).view(-1, 4) 64 | if self.clip: 65 | output.clamp_(max=1, min=0) 66 | return output 67 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | 5 | 6 | class PriorBox(object): 7 | """Compute priorbox coordinates in center-offset form for each source 8 | feature map. 9 | Note: 10 | This 'layer' has changed between versions of the original SSD 11 | paper, so we include both versions, but note v2 is the most tested and most 12 | recent version of the paper. 
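For example, with the SMALL config in EFGRNet_vgg_coco_dcn.yaml above (320x320 input, step [8, 8] and min_size 30 for the first 40x40 map), cell (i, j) produces a prior centred at ((j + 0.5) / 40, (i + 0.5) / 40) with relative size 30 / 320 = 0.09375, and each aspect ratio ar adds a prior of width 0.09375 * sqrt(ar) and height 0.09375 / sqrt(ar).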
13 | 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(PriorBox, self).__init__() 18 | self.size = cfg.MODEL.SIZE 19 | if self.size == '300': 20 | size_cfg = cfg.SMALL 21 | else: 22 | size_cfg = cfg.BIG 23 | self.img_wh = size_cfg.IMG_WH 24 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 25 | self.feature_maps = size_cfg.FEATURE_MAPS 26 | self.variance = size_cfg.VARIANCE or [0.1] 27 | self.min_sizes = size_cfg.MIN_SIZES 28 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 29 | if self.use_max_sizes: 30 | self.max_sizes = size_cfg.MAX_SIZES 31 | self.steps = size_cfg.STEPS 32 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 33 | self.clip = size_cfg.CLIP 34 | for v in self.variance: 35 | if v <= 0: 36 | raise ValueError('Variances must be greater than 0') 37 | 38 | def forward(self): 39 | mean = [] 40 | for k, f in enumerate(self.feature_maps): 41 | grid_h, grid_w = f[1], f[0] 42 | for i in range(grid_h): 43 | for j in range(grid_w): 44 | f_k_h = self.img_wh[1] / self.steps[k][1] 45 | f_k_w = self.img_wh[0] / self.steps[k][0] 46 | # unit center x,y 47 | cx = (j + 0.5) / f_k_w 48 | cy = (i + 0.5) / f_k_h 49 | 50 | # aspect_ratio: 1 51 | # rel size: min_size 52 | s_k_h = self.min_sizes[k] / self.img_wh[1] 53 | s_k_w = self.min_sizes[k] / self.img_wh[0] 54 | mean += [cx, cy, s_k_w, s_k_h] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | if self.use_max_sizes: 59 | s_k_prime_w = sqrt( 60 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 61 | s_k_prime_h = sqrt( 62 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 63 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 64 | 65 | for ar in self.aspect_ratios[k]: 66 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 67 | 68 | # back to torch land 69 | output = torch.Tensor(mean).view(-1, 4) 70 | if self.clip: 71 | output.clamp_(max=1, min=0) 72 | # print(output.size()) 73 | return output 74 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from torch.autograd import Function 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from utils.box_utils import decode, center_size 8 | import time 9 | 10 | 11 | class Detect(Function): 12 | """At test time, Detect is the final layer of SSD. Decode location preds, 13 | apply non-maximum suppression to location predictions based on conf 14 | scores and threshold to a top_k number of output predictions for both 15 | confidence score and locations. 16 | """ 17 | 18 | def __init__(self, cfg): 19 | self.cfg = cfg 20 | self.num_classes = cfg.MODEL.NUM_CLASSES 21 | #self.thresh = thresh 22 | self.size = cfg.MODEL.SIZE 23 | if self.size == '300': 24 | size_cfg = cfg.SMALL 25 | else: 26 | size_cfg = cfg.BIG 27 | # Parameters used in nms. 
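# variance rescales the predicted offsets when decoding boxes; object_score is the
# ARM objectness threshold below which a prior is treated as background.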
28 | self.variance = size_cfg.VARIANCE 29 | self.object_score = cfg.MODEL.OBJECT_SCORE 30 | 31 | def forward(self, predictions): 32 | """ 33 | Args: 34 | loc_data: (tensor) Loc preds from loc layers 35 | Shape: [batch,num_priors*4] 36 | conf_data: (tensor) Shape: Conf preds from conf layers 37 | Shape: [batch*num_priors,num_classes] 38 | prior_data: (tensor) Prior boxes and variances from priorbox layers 39 | Shape: [1,num_priors,4] 40 | """ 41 | # loc, conf, priors = predictions 42 | if self.cfg.MODEL.REFINE: 43 | # start_time = time.time() 44 | arm_loc, arm_conf, loc, conf, priors = predictions 45 | arm_conf = F.softmax(arm_conf.view(-1, 2), 1) 46 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 47 | arm_loc_data = arm_loc.data 48 | arm_conf_data = arm_conf.data 49 | arm_object_conf = arm_conf_data[:, 1:] 50 | no_object_index = arm_object_conf <= self.object_score 51 | conf.data[no_object_index.expand_as(conf.data)] = 0 52 | # time1 = time.time() - start_time 53 | # print('prediction_time_first:', time1) 54 | else: 55 | loc, conf, priors = predictions 56 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 57 | 58 | # start_time2 = time.time() 59 | loc_data = loc.data 60 | conf_data = conf.data 61 | # prior_data = priors.data 62 | prior_data = priors[:loc_data.size(1), :] 63 | 64 | num = loc_data.size(0) # batch size 65 | 66 | self.num_priors = prior_data.size(0) 67 | # time2 = time.time() - start_time2 68 | # print('prepare_time:', time2) 69 | 70 | # start_time3 = time.time() 71 | self.boxes = torch.zeros(num, self.num_priors, 4) 72 | self.scores = torch.zeros(num, self.num_priors, self.num_classes) 73 | conf_preds = conf_data.view(num, self.num_priors, self.num_classes) 74 | batch_prior = prior_data.view(-1, self.num_priors, 4).expand( 75 | (num, self.num_priors, 4)) 76 | batch_prior = batch_prior.contiguous().view(-1, 4) 77 | # time3 = time.time() - start_time3 78 | # print('prepare_time2:', time3) 79 | 80 | # start_time4 = time.time() 81 | if self.cfg.MODEL.REFINE: 82 | default = decode( 83 | arm_loc_data.view(-1, 4), batch_prior, self.variance) 84 | default = center_size(default) 85 | decoded_boxes = decode( 86 | loc_data.view(-1, 4), default, self.variance) 87 | else: 88 | decoded_boxes = decode( 89 | loc_data.view(-1, 4), batch_prior, self.variance) 90 | 91 | self.scores = conf_preds.view(num, self.num_priors, self.num_classes) 92 | self.boxes = decoded_boxes.view(num, self.num_priors, 4) 93 | 94 | # time4 = time.time() - start_time4 95 | # print('prediction_time2:', time4) 96 | return self.boxes, self.scores -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Enriched Feature Guided Refinement Network for Object Detection 2 | 3 | By Jing Nie1†, Rao Muhammad Anwer†, Hisham Cholakkal, Fahad Shahbaz Khan, Yanwei Pang1‡, Ling Shao \ 4 | † denotes equal contribution,‡ Corresponding author 5 | 6 | 7 | ### Introduction 8 | We propose a single-stage detection framework that 9 | jointly tackles the problem of multi-scale object detection and class imbalance. 10 | Rather than designing deeper networks, we introduce a simple yet effective feature enrichment scheme to produce multi-scale contextual features. 11 | We further introduce a cascaded refinement scheme which first instills multi-scale contextual features into the prediction layers of the single-stage detector 12 | in order to enrich their discriminative power for multi-scale detection. 
Second, the cascaded refinement scheme counters the class imbalance problem by refining the 13 | anchors and enriched features to improve classification and regression. 14 | 15 | ## Installation 16 | - Clone this repository. This repository is mainly based on [SSD_Pytorch](https://github.com/yqyao/SSD_Pytorch.git) 17 | 18 | ```Shell 19 | EFGR_ROOT=/path/to/clone/EFGR 20 | git clone https://github.com/Ranchentx/EFGRNet.git $EFGR_ROOT 21 | ``` 22 | 23 | - The code was tested on Ubuntu 16.04, with [Anaconda](https://www.anaconda.com/download) Python 3.6 and [PyTorch](http://pytorch.org/) v0.4.1. 24 | NVIDIA GPUs are needed for testing. After installing Anaconda, create a new conda environment, activate it and install PyTorch 0.4.1. 25 | 26 | ```Shell 27 | conda create -n EFGRNet python=3.6 28 | source activate EFGRNet 29 | conda install pytorch=0.4.1 torchvision -c pytorch 30 | ``` 31 | 32 | 33 | - Install OpenCV and the COCO API. 34 | ```Shell 35 | pip install opencv-python 36 | pip install pycocotools 37 | ``` 38 | 39 | - Compile NMS: 40 | 41 | ```Shell 42 | cd $EFGR_ROOT/ 43 | ./make.sh 44 | ``` 45 | 46 | - Compile DCN: 47 | 48 | ```Shell 49 | ./compile.sh 50 | ``` 51 | 52 | 53 | ## Download 54 | To evaluate the performance reported in the paper, the Pascal VOC and COCO datasets as well as our trained models need to be downloaded. 55 | 56 | 57 | ### VOC Dataset 58 | - Directly download the images and annotations from the [VOC website](http://host.robots.ox.ac.uk/pascal/VOC/) and put them into $EFGR_ROOT/data/VOCdevkit/. 59 | - Create the `VOCdevkit` folder and make the data folder (or create symlinks) like: 60 | 61 | ~~~ 62 | ${$EFGR_ROOT} 63 | |-- data 64 | `-- |-- VOCdevkit 65 | `-- |-- VOC2007 66 | | |-- annotations 67 | | |-- ImageSets 68 | | |-- JPEGImages 69 | |-- VOC2012 70 | | |-- annotations 71 | | |-- ImageSets 72 | | |-- JPEGImages 73 | |-- results 74 | ~~~ 75 | 76 | ### COCO Dataset 77 | - Download the images and annotation files from the [COCO website](http://cocodataset.org/#download). 78 | - Place the data (or create symlinks) to make the data folder like: 79 | 80 | ~~~ 81 | ${$EFGR_ROOT} 82 | |-- data 83 | `-- |-- coco 84 | `-- |-- annotations 85 | | |-- instances_train2014.json 86 | | |-- instances_val2014.json 87 | | |-- image_info_test-dev2015.json 88 | `-- images 89 | | |-- train2014 90 | | |-- val2014 91 | | |-- test2015 92 | `-- cache 93 | ~~~ 94 | 95 | ## Training 96 | 97 | 98 | 99 | ```Shell 100 | python train_coco.py --cfg ./configs/EFGRNet_vgg_coco_dcn.yaml 101 | ``` 102 | 103 | 104 | ## Testing 105 | 106 | - Note: 107 | All testing configs are in EFGRNet_vgg_coco_dcn.yaml; you can change them as needed.
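For example, the `TEST` block of that file controls the test-time resolution and NMS behaviour; the values below are the ones shipped in EFGRNet_vgg_coco_dcn.yaml, and the inline notes give the usual meaning of each field:

```yaml
TEST:
  INPUT_WH: [320, 320]      # test image width/height
  CONFIDENCE_THRESH: 0.01   # minimum score for a detection to be kept
  NMS_OVERLAP: 0.45         # IoU threshold used by NMS
  BATCH_SIZE: 1
```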
108 | 109 | - To evaluate a trained network: 110 | 111 | ```Shell 112 | python eval_dcn.py --cfg ./configs/EFGRNet_vgg_coco_dcn.yaml --weights ./eval_weights 113 | ``` 114 | 115 | ## Models 116 | 117 | * COCO [EFGRNet_VGG320](https://drive.google.com/open?id=1-_x9e4kX3ZJBKzfTKloslJxK2qO8bfkO); [BaiduYun Driver](https://pan.baidu.com/s/1ZPiibo-PnoTJl5HjAl63Pg&shfl=sharepset) 118 | * COCO [EFGRNet_VGG512](https://drive.google.com/open?id=1OVRiYRAyJiErUYsOXPaE12XEXtAV4ZrD); [BaiduYun Driver](https://pan.baidu.com/s/1YvXhhIXdziDV9q3wj9mLRg&shfl=sharepset) 119 | 120 | 121 | ## Citation 122 | Please cite our paper in your publications if it helps your research: 123 | 124 | @article{Jing2019EFGR, 125 | title = {Enriched Feature Guided Refinement Network for Object Detection}, 126 | author = {Jing Nie, Rao Muhammad Anwer, Hisham Cholakkal, Fahad Shahbaz Khan, Yanwei Pang, Ling Shao}, 127 | booktitle = {ICCV}, 128 | year = {2019} 129 | } -------------------------------------------------------------------------------- /dcn/modules/deform_conv.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn.modules.utils import _pair 6 | 7 | from ..functions.deform_conv import deform_conv, modulated_deform_conv 8 | 9 | 10 | class DeformConv(nn.Module): 11 | 12 | def __init__(self, 13 | in_channels, 14 | out_channels, 15 | kernel_size, 16 | stride=1, 17 | padding=0, 18 | dilation=1, 19 | groups=1, 20 | deformable_groups=1, 21 | bias=False): 22 | assert not bias 23 | super(DeformConv, self).__init__() 24 | 25 | assert in_channels % groups == 0, \ 26 | 'in_channels {} cannot be divisible by groups {}'.format( 27 | in_channels, groups) 28 | assert out_channels % groups == 0, \ 29 | 'out_channels {} cannot be divisible by groups {}'.format( 30 | out_channels, groups) 31 | self.in_channels = in_channels 32 | self.out_channels = out_channels 33 | self.kernel_size = _pair(kernel_size) 34 | self.stride = _pair(stride) 35 | self.padding = _pair(padding) 36 | self.dilation = _pair(dilation) 37 | self.groups = groups 38 | self.deformable_groups = deformable_groups 39 | 40 | self.weight = nn.Parameter( 41 | torch.Tensor(out_channels, in_channels // self.groups, 42 | *self.kernel_size)) 43 | 44 | self.reset_parameters() 45 | 46 | def reset_parameters(self): 47 | n = self.in_channels 48 | for k in self.kernel_size: 49 | n *= k 50 | stdv = 1. 
/ math.sqrt(n) 51 | self.weight.data.uniform_(-stdv, stdv) 52 | 53 | def forward(self, input, offset): 54 | return deform_conv(input, offset, self.weight, self.stride, 55 | self.padding, self.dilation, self.groups, 56 | self.deformable_groups) 57 | 58 | 59 | class ModulatedDeformConv(nn.Module): 60 | 61 | def __init__(self, 62 | in_channels, 63 | out_channels, 64 | kernel_size, 65 | stride=1, 66 | padding=0, 67 | dilation=1, 68 | groups=1, 69 | deformable_groups=1, 70 | bias=True): 71 | super(ModulatedDeformConv, self).__init__() 72 | self.in_channels = in_channels 73 | self.out_channels = out_channels 74 | self.kernel_size = _pair(kernel_size) 75 | self.stride = stride 76 | self.padding = padding 77 | self.dilation = dilation 78 | self.groups = groups 79 | self.deformable_groups = deformable_groups 80 | self.with_bias = bias 81 | 82 | self.weight = nn.Parameter( 83 | torch.Tensor(out_channels, in_channels // groups, 84 | *self.kernel_size)) 85 | if bias: 86 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 87 | else: 88 | self.register_parameter('bias', None) 89 | self.reset_parameters() 90 | 91 | def reset_parameters(self): 92 | n = self.in_channels 93 | for k in self.kernel_size: 94 | n *= k 95 | stdv = 1. / math.sqrt(n) 96 | self.weight.data.uniform_(-stdv, stdv) 97 | if self.bias is not None: 98 | self.bias.data.zero_() 99 | 100 | def forward(self, input, offset, mask): 101 | return modulated_deform_conv( 102 | input, offset, mask, self.weight, self.bias, self.stride, 103 | self.padding, self.dilation, self.groups, self.deformable_groups) 104 | 105 | 106 | class ModulatedDeformConvPack(ModulatedDeformConv): 107 | 108 | def __init__(self, 109 | in_channels, 110 | out_channels, 111 | kernel_size, 112 | stride=1, 113 | padding=0, 114 | dilation=1, 115 | groups=1, 116 | deformable_groups=1, 117 | bias=True): 118 | super(ModulatedDeformConvPack, self).__init__( 119 | in_channels, out_channels, kernel_size, stride, padding, dilation, 120 | groups, deformable_groups, bias) 121 | 122 | self.conv_offset_mask = nn.Conv2d( 123 | self.in_channels // self.groups, 124 | self.deformable_groups * 3 * self.kernel_size[0] * 125 | self.kernel_size[1], 126 | kernel_size=self.kernel_size, 127 | stride=_pair(self.stride), 128 | padding=_pair(self.padding), 129 | bias=True) 130 | self.init_offset() 131 | 132 | def init_offset(self): 133 | self.conv_offset_mask.weight.data.zero_() 134 | self.conv_offset_mask.bias.data.zero_() 135 | 136 | def forward(self, input): 137 | out = self.conv_offset_mask(input) 138 | o1, o2, mask = torch.chunk(out, 3, dim=1) 139 | offset = torch.cat((o1, o2), dim=1) 140 | mask = torch.sigmoid(mask) 141 | return modulated_deform_conv( 142 | input, offset, mask, self.weight, self.bias, self.stride, 143 | self.padding, self.dilation, self.groups, self.deformable_groups) 144 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | 
if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 
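For a default installation this typically resolves to {'home': '/usr/local/cuda', 'nvcc': '/usr/local/cuda/bin/nvcc', 'include': '/usr/local/cuda/include', 'lib64': '/usr/local/cuda/lib64'}.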
34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', 44 | os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError( 47 | 'The nvcc binary could not be ' 48 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME' 49 | ) 50 | home = os.path.dirname(os.path.dirname(nvcc)) 51 | 52 | cudaconfig = { 53 | 'home': home, 54 | 'nvcc': nvcc, 55 | 'include': pjoin(home, 'include'), 56 | 'lib64': pjoin(home, 'lib64') 57 | } 58 | for k, v in cudaconfig.items(): 59 | if not os.path.exists(v): 60 | raise EnvironmentError( 61 | 'The CUDA %s path could not be located in %s' % (k, v)) 62 | 63 | return cudaconfig 64 | 65 | 66 | CUDA = locate_cuda() 67 | 68 | # Obtain the numpy include directory. This logic works across numpy versions. 69 | try: 70 | numpy_include = np.get_include() 71 | except AttributeError: 72 | numpy_include = np.get_numpy_include() 73 | 74 | 75 | def customize_compiler_for_nvcc(self): 76 | """inject deep into distutils to customize how the dispatch 77 | to gcc/nvcc works. 78 | 79 | If you subclass UnixCCompiler, it's not trivial to get your subclass 80 | injected in, and still have the right customizations (i.e. 81 | distutils.sysconfig.customize_compiler) run on it. So instead of going 82 | the OO route, I have this. Note, it's kindof like a wierd functional 83 | subclassing going on.""" 84 | 85 | # tell the compiler it can processes .cu 86 | self.src_extensions.append('.cu') 87 | 88 | # save references to the default compiler_so and _comple methods 89 | default_compiler_so = self.compiler_so 90 | super = self._compile 91 | 92 | # now redefine the _compile method. This gets executed for each 93 | # object but distutils doesn't have the ability to change compilers 94 | # based on source extension: we add it. 
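# extra_postargs arrives here as the per-compiler dict passed via
# extra_compile_args in ext_modules below, i.e. {'gcc': [...], 'nvcc': [...]}.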
95 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 96 | print(extra_postargs) 97 | if os.path.splitext(src)[1] == '.cu': 98 | # use the cuda for .cu files 99 | self.set_executable('compiler_so', CUDA['nvcc']) 100 | # use only a subset of the extra_postargs, which are 1-1 translated 101 | # from the extra_compile_args in the Extension class 102 | postargs = extra_postargs['nvcc'] 103 | else: 104 | postargs = extra_postargs['gcc'] 105 | 106 | super(obj, src, ext, cc_args, postargs, pp_opts) 107 | # reset the default compiler_so, which we might have changed for cuda 108 | self.compiler_so = default_compiler_so 109 | 110 | # inject our redefined _compile method into the class 111 | self._compile = _compile 112 | 113 | 114 | # run the customize_compiler 115 | class custom_build_ext(build_ext): 116 | def build_extensions(self): 117 | customize_compiler_for_nvcc(self.compiler) 118 | build_ext.build_extensions(self) 119 | 120 | 121 | ext_modules = [ 122 | Extension( 123 | "nms.cpu_nms", ["nms/cpu_nms.pyx"], 124 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 125 | include_dirs=[numpy_include]), 126 | Extension( 127 | 'nms.gpu_nms', 128 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 129 | library_dirs=[CUDA['lib64']], 130 | libraries=['cudart'], 131 | language='c++', 132 | runtime_library_dirs=[CUDA['lib64']], 133 | # this syntax is specific to this build system 134 | # we're only going to use certain compiler args with nvcc and not with gcc 135 | # the implementation of this trick is in customize_compiler() below 136 | extra_compile_args={ 137 | 'gcc': ["-Wno-unused-function"], 138 | 'nvcc': [ 139 | '-arch=sm_61', '--ptxas-options=-v', '-c', 140 | '--compiler-options', "'-fPIC'" 141 | ] 142 | }, 143 | include_dirs=[numpy_include, CUDA['include']]) 144 | ] 145 | 146 | setup( 147 | name='mot_utils', 148 | ext_modules=ext_modules, 149 | # inject our custom trigger 150 | cmdclass={'build_ext': custom_build_ext}, 151 | ) 152 | -------------------------------------------------------------------------------- /utils/get_class_map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | import os.path as osp 5 | 6 | 7 | def check_size(submit_file): 8 | max_size = 60 * 1024 * 1024 9 | if osp.getsize(submit_file) > max_size: 10 | raise ( 11 | IOError, 12 | "File size exceeds the specified maximum size, which is 60M for the server." 
13 | ) 14 | 15 | 16 | def parse_submission(submit_file): 17 | with open(submit_file, 'r') as f: 18 | lines = f.readlines() 19 | submit_dict = dict() 20 | final_dict = dict() 21 | splitlines = [x.strip().split(' ') for x in lines] 22 | for idx, val in enumerate(splitlines): 23 | cls = str(int(float(val[1]))) 24 | if cls not in submit_dict: 25 | submit_dict[cls] = list() 26 | final_dict[cls] = dict() 27 | submit_dict[cls].append( 28 | [val[0], val[2], val[3], val[4], val[5], val[6]]) 29 | for k, v in submit_dict.items(): 30 | image_ids = [x[0] for x in v] 31 | confidence = np.array([float(x[1]) for x in v]) 32 | BB = np.array([[float(z) for z in x[2:]] for x in v]) 33 | sorted_ind = np.argsort(-confidence) 34 | sorted_scores = np.sort(-confidence) 35 | BB = BB[sorted_ind, :] 36 | image_ids = [image_ids[x] for x in sorted_ind] 37 | final_dict[k]["image_ids"] = image_ids 38 | final_dict[k]["BB"] = np.array(BB) 39 | return final_dict 40 | 41 | 42 | def parse_gt_annotation(gt_file): 43 | with open(gt_file, 'r') as f: 44 | lines = f.readlines() 45 | info = [x.strip().split() for x in lines] 46 | gt = {} 47 | for item in info: 48 | img_id = item[0] 49 | obj_struct = {} 50 | obj_struct['class'] = item[1] 51 | obj_struct['bbox'] = [ 52 | int(item[2]), 53 | int(item[3]), 54 | int(item[4]), 55 | int(item[5]) 56 | ] 57 | if img_id not in gt: 58 | gt[img_id] = list() 59 | gt[img_id].append(obj_struct) 60 | return gt 61 | 62 | 63 | def get_class_recs(recs, classname): 64 | npos = 0 65 | class_recs = {} 66 | for key in recs.keys(): 67 | R = [obj for obj in recs[key] if obj['class'] == classname] 68 | bbox = np.array([x['bbox'] for x in R]) 69 | det = [False] * len(R) 70 | npos += len(R) 71 | class_recs[key] = {'bbox': bbox, 'det': det} 72 | return class_recs, npos 73 | 74 | 75 | def compute_ap(rec, prec): 76 | mrec = np.concatenate(([0.], rec, [1.])) 77 | mpre = np.concatenate(([0.], prec, [0.])) 78 | for i in range(mpre.size - 1, 0, -1): 79 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 80 | i = np.where(mrec[1:] != mrec[:-1])[0] 81 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 82 | return ap 83 | 84 | 85 | def eval(submit_file, gt_file, ovthresh, classname): 86 | recs = parse_gt_annotation(gt_file) 87 | submit_result = parse_submission(submit_file) 88 | # get one class result 89 | class_recs, npos = get_class_recs(recs, classname) 90 | image_ids = submit_result[classname]["image_ids"] 91 | BB = submit_result[classname]["BB"] 92 | nd = len(image_ids) 93 | tp = np.zeros(nd) 94 | fp = np.zeros(nd) 95 | for d in range(nd): 96 | if image_ids[d] not in recs.keys(): 97 | raise KeyError( 98 | "Can not find image {} in the groundtruth file, did you submit the result file for the right dataset?" 99 | .format(image_ids[d])) 100 | for d in range(nd): 101 | R = class_recs[image_ids[d]] 102 | bb = BB[d, :].astype(float) 103 | ovmax = -np.inf 104 | BBGT = R['bbox'].astype(float) 105 | if BBGT.size > 0: 106 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 107 | iymin = np.maximum(BBGT[:, 1], bb[1]) 108 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 109 | iymax = np.minimum(BBGT[:, 3], bb[3]) 110 | iw = np.maximum(ixmax - ixmin + 1., 0.) 111 | ih = np.maximum(iymax - iymin + 1., 0.) 112 | inters = iw * ih 113 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 114 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 115 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 116 | overlaps = inters / uni 117 | ovmax = np.max(overlaps) 118 | jmax = np.argmax(overlaps) 119 | if ovmax > ovthresh: 120 | if not R['det'][jmax]: 121 | tp[d] = 1. 
122 | R['det'][jmax] = 1 123 | else: 124 | fp[d] = 1. 125 | else: 126 | fp[d] = 1. 127 | fp = np.cumsum(fp) 128 | tp = np.cumsum(tp) 129 | rec = tp / float(npos) 130 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 131 | ap = compute_ap(rec, prec) 132 | return ap 133 | 134 | 135 | def result_eval(submit_file, gt, class_list): 136 | ove_aap = [] 137 | for ove in np.arange(0.5, 1.0, 0.05): 138 | cls_aap = [] 139 | for cls in class_list: 140 | ap = eval(submit_file, gt, ove, cls) 141 | cls_aap.append(ap) 142 | cls_mAP = np.average(cls_aap) 143 | print("thresh", round(ove, 3), "map", round(cls_mAP * 100, 3)) 144 | ove_aap.append(cls_mAP) 145 | mAP = np.average(ove_aap) * 100 146 | return round(mAP, 3) 147 | 148 | 149 | if __name__ == '__main__': 150 | ''' 151 | submit_file: image_id, class, score, xmin, ymin, xmax, ymax 152 | gt_file: image_id, class, xmin, ymin, xmax, ymax 153 | ''' 154 | class_list = [] 155 | for i in range(1, 61): 156 | class_list.append(str(i)) 157 | submit_file = "./results/fpn_dcn_result.csv" 158 | gt_file = "./results/val_label.txt" 159 | check_size(submit_file) 160 | mAP = result_eval(submit_file, gt_file, class_list) 161 | out = {'Average AP': str(round(mAP, 3))} 162 | print(out) -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.autograd import Variable 6 | from utils.box_utils import match, log_sum_exp 7 | from .focal_loss_softmax import FocalLossSoftmax 8 | from .focal_loss_sigmoid import FocalLossSigmoid 9 | 10 | GPU = False 11 | if torch.cuda.is_available(): 12 | GPU = True 13 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 14 | 15 | 16 | class MultiBoxLoss(nn.Module): 17 | """SSD Weighted Loss Function 18 | Compute Targets: 19 | 1) Produce Confidence Target Indices by matching ground truth boxes 20 | with (default) 'priorboxes' that have jaccard index > threshold parameter 21 | (default threshold: 0.5). 22 | 2) Produce localization target by 'encoding' variance into offsets of ground 23 | truth boxes and their matched 'priorboxes'. 24 | 3) Hard negative mining to filter the excessive number of negative examples 25 | that comes with using a large number of default bounding boxes. 26 | (default negative:positive ratio 3:1) 27 | Objective Loss: 28 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 29 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 30 | weighted by α which is set to 1 by cross val. 31 | Args: 32 | c: class confidences, 33 | l: predicted boxes, 34 | g: ground truth boxes 35 | N: number of matched default boxes 36 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
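The hard negative mining performed in the forward pass below relies on a double-argsort rank trick that is easy to misread. A toy, self-contained sketch of just that selection step (the numbers are made up and the snippet is illustrative, not part of this module):

import torch

loss_hard = torch.tensor([[0.2, 1.5, 0.1, 0.9, 0.05, 0.7]])      # per-prior conf loss, one image
pos = torch.tensor([[False, False, True, False, False, False]])  # prior 2 matched a ground-truth box
loss_hard[pos] = 0                                  # positives are excluded from mining

_, loss_idx = loss_hard.sort(1, descending=True)    # priors ordered by loss
_, idx_rank = loss_idx.sort(1)                      # rank of each prior in that ordering

num_pos = pos.long().sum(1, keepdim=True)                  # tensor([[1]])
num_neg = torch.clamp(3 * num_pos, max=pos.size(1) - 1)    # 3:1 negative:positive ratio
neg = idx_rank < num_neg.expand_as(idx_rank)

print(neg)   # tensor([[False,  True, False,  True, False,  True]]) -> priors 1, 3, 5 kept as negatives

Sorting once gives the order of priors by loss; sorting the resulting indices again gives each prior's rank in that order, so `idx_rank < num_neg` keeps exactly the `num_neg` hardest negatives per image.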
37 | """ 38 | 39 | def __init__(self, cfg): 40 | super(MultiBoxLoss, self).__init__() 41 | self.cfg = cfg 42 | self.size = cfg.MODEL.SIZE 43 | if self.size == '300': 44 | size_cfg = cfg.SMALL 45 | else: 46 | size_cfg = cfg.BIG 47 | self.variance = size_cfg.VARIANCE 48 | self.num_classes = cfg.MODEL.NUM_CLASSES 49 | self.threshold = cfg.TRAIN.OVERLAP 50 | self.OHEM = cfg.TRAIN.OHEM 51 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 52 | self.variance = size_cfg.VARIANCE 53 | if cfg.TRAIN.FOCAL_LOSS: 54 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 55 | self.focaloss = FocalLossSoftmax( 56 | self.num_classes, gamma=2, size_average=False) 57 | else: 58 | self.focaloss = FocalLossSigmoid() 59 | 60 | def forward(self, predictions, targets): 61 | """Multibox Loss 62 | Args: 63 | predictions (tuple): A tuple containing loc preds, conf preds, 64 | and prior boxes from SSD net. 65 | conf shape: torch.size(batch_size,num_priors,num_classes) 66 | loc shape: torch.size(batch_size,num_priors,4) 67 | priors shape: torch.size(num_priors,4) 68 | 69 | ground_truth (tensor): Ground truth boxes and labels for a batch, 70 | shape: [batch_size,num_objs,5] (last idx is the label). 71 | """ 72 | loc_data, conf_data, priors = predictions 73 | num = loc_data.size(0) 74 | priors = priors[:loc_data.size(1), :] 75 | num_priors = (priors.size(0)) 76 | num_classes = self.num_classes 77 | loc_t = torch.Tensor(num, num_priors, 4) 78 | conf_t = torch.LongTensor(num, num_priors) 79 | for idx in range(num): 80 | truths = targets[idx][:, :-1].data 81 | labels = targets[idx][:, -1].data 82 | if self.num_classes == 2: 83 | labels = labels > 0 84 | defaults = priors.data 85 | match(self.threshold, truths, defaults, self.variance, labels, 86 | loc_t, conf_t, idx) 87 | loc_t = loc_t.cuda() 88 | conf_t = conf_t.cuda() 89 | 90 | pos = conf_t > 0 91 | num_pos = pos.sum(1, keepdim=True) 92 | 93 | if self.OHEM: 94 | # Compute max conf across batch for hard negative mining 95 | batch_conf = conf_data.view(-1, self.num_classes) 96 | 97 | loss_hard = log_sum_exp(batch_conf) - batch_conf.gather( 98 | 1, conf_t.view(-1, 1)) 99 | # Hard Negative Mining 100 | loss_hard[pos.view(-1, 1)] = 0 # filter out pos boxes for now 101 | loss_hard = loss_hard.view(num, -1) 102 | _, loss_idx = loss_hard.sort(1, descending=True) 103 | _, idx_rank = loss_idx.sort(1) 104 | num_pos = pos.long().sum(1, keepdim=True) 105 | if num_pos.data.sum() > 0: 106 | num_neg = torch.clamp( 107 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 108 | else: 109 | fake_num_pos = torch.ones(32, 1).long() * 15 110 | num_neg = torch.clamp( 111 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 112 | neg = idx_rank < num_neg.expand_as(idx_rank) 113 | 114 | # Confidence Loss Including Positive and Negative Examples 115 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 116 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 117 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 118 | -1, self.num_classes) 119 | targets_weighted = conf_t[(pos + neg).gt(0)] 120 | loss_c = F.cross_entropy( 121 | conf_p, targets_weighted, size_average=False) 122 | else: 123 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 124 | # Localization Loss (Smooth L1) 125 | # Shape: [batch,num_priors,4] 126 | if num_pos.data.sum() > 0: 127 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 128 | loc_p = loc_data[pos_idx].view(-1, 4) 129 | loc_t = loc_t[pos_idx].view(-1, 4) 130 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 131 | N = num_pos.data.sum() 132 | else: 133 | 
loss_l = torch.zeros(1) 134 | N = 1.0 135 | loss_l /= float(N) 136 | loss_c /= float(N) 137 | return loss_l, loss_c 138 | -------------------------------------------------------------------------------- /dcn/modules/deform_pool.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from ..functions.deform_pool import deform_roi_pooling 4 | 5 | 6 | class DeformRoIPooling(nn.Module): 7 | 8 | def __init__(self, 9 | spatial_scale, 10 | out_size, 11 | out_channels, 12 | no_trans, 13 | group_size=1, 14 | part_size=None, 15 | sample_per_part=4, 16 | trans_std=.0): 17 | super(DeformRoIPooling, self).__init__() 18 | self.spatial_scale = spatial_scale 19 | self.out_size = out_size 20 | self.out_channels = out_channels 21 | self.no_trans = no_trans 22 | self.group_size = group_size 23 | self.part_size = out_size if part_size is None else part_size 24 | self.sample_per_part = sample_per_part 25 | self.trans_std = trans_std 26 | 27 | def forward(self, data, rois, offset): 28 | if self.no_trans: 29 | offset = data.new_empty(0) 30 | return deform_roi_pooling( 31 | data, rois, offset, self.spatial_scale, self.out_size, 32 | self.out_channels, self.no_trans, self.group_size, self.part_size, 33 | self.sample_per_part, self.trans_std) 34 | 35 | 36 | class DeformRoIPoolingPack(DeformRoIPooling): 37 | 38 | def __init__(self, 39 | spatial_scale, 40 | out_size, 41 | out_channels, 42 | no_trans, 43 | group_size=1, 44 | part_size=None, 45 | sample_per_part=4, 46 | trans_std=.0, 47 | deform_fc_channels=1024): 48 | super(DeformRoIPoolingPack, 49 | self).__init__(spatial_scale, out_size, out_channels, no_trans, 50 | group_size, part_size, sample_per_part, trans_std) 51 | 52 | self.deform_fc_channels = deform_fc_channels 53 | 54 | if not no_trans: 55 | self.offset_fc = nn.Sequential( 56 | nn.Linear(self.out_size * self.out_size * self.out_channels, 57 | self.deform_fc_channels), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(self.deform_fc_channels, self.deform_fc_channels), 60 | nn.ReLU(inplace=True), 61 | nn.Linear(self.deform_fc_channels, 62 | self.out_size * self.out_size * 2)) 63 | self.offset_fc[-1].weight.data.zero_() 64 | self.offset_fc[-1].bias.data.zero_() 65 | 66 | def forward(self, data, rois): 67 | assert data.size(1) == self.out_channels 68 | if self.no_trans: 69 | offset = data.new_empty(0) 70 | return deform_roi_pooling( 71 | data, rois, offset, self.spatial_scale, self.out_size, 72 | self.out_channels, self.no_trans, self.group_size, 73 | self.part_size, self.sample_per_part, self.trans_std) 74 | else: 75 | n = rois.shape[0] 76 | offset = data.new_empty(0) 77 | x = deform_roi_pooling(data, rois, offset, self.spatial_scale, 78 | self.out_size, self.out_channels, True, 79 | self.group_size, self.part_size, 80 | self.sample_per_part, self.trans_std) 81 | offset = self.offset_fc(x.view(n, -1)) 82 | offset = offset.view(n, 2, self.out_size, self.out_size) 83 | return deform_roi_pooling( 84 | data, rois, offset, self.spatial_scale, self.out_size, 85 | self.out_channels, self.no_trans, self.group_size, 86 | self.part_size, self.sample_per_part, self.trans_std) 87 | 88 | 89 | class ModulatedDeformRoIPoolingPack(DeformRoIPooling): 90 | 91 | def __init__(self, 92 | spatial_scale, 93 | out_size, 94 | out_channels, 95 | no_trans, 96 | group_size=1, 97 | part_size=None, 98 | sample_per_part=4, 99 | trans_std=.0, 100 | deform_fc_channels=1024): 101 | super(ModulatedDeformRoIPoolingPack, self).__init__( 102 | spatial_scale, out_size, out_channels, no_trans, 
group_size, 103 | part_size, sample_per_part, trans_std) 104 | 105 | self.deform_fc_channels = deform_fc_channels 106 | 107 | if not no_trans: 108 | self.offset_fc = nn.Sequential( 109 | nn.Linear(self.out_size * self.out_size * self.out_channels, 110 | self.deform_fc_channels), 111 | nn.ReLU(inplace=True), 112 | nn.Linear(self.deform_fc_channels, self.deform_fc_channels), 113 | nn.ReLU(inplace=True), 114 | nn.Linear(self.deform_fc_channels, 115 | self.out_size * self.out_size * 2)) 116 | self.offset_fc[-1].weight.data.zero_() 117 | self.offset_fc[-1].bias.data.zero_() 118 | self.mask_fc = nn.Sequential( 119 | nn.Linear(self.out_size * self.out_size * self.out_channels, 120 | self.deform_fc_channels), 121 | nn.ReLU(inplace=True), 122 | nn.Linear(self.deform_fc_channels, 123 | self.out_size * self.out_size * 1), 124 | nn.Sigmoid()) 125 | self.mask_fc[2].weight.data.zero_() 126 | self.mask_fc[2].bias.data.zero_() 127 | 128 | def forward(self, data, rois): 129 | assert data.size(1) == self.out_channels 130 | if self.no_trans: 131 | offset = data.new_empty(0) 132 | return deform_roi_pooling( 133 | data, rois, offset, self.spatial_scale, self.out_size, 134 | self.out_channels, self.no_trans, self.group_size, 135 | self.part_size, self.sample_per_part, self.trans_std) 136 | else: 137 | n = rois.shape[0] 138 | offset = data.new_empty(0) 139 | x = deform_roi_pooling(data, rois, offset, self.spatial_scale, 140 | self.out_size, self.out_channels, True, 141 | self.group_size, self.part_size, 142 | self.sample_per_part, self.trans_std) 143 | offset = self.offset_fc(x.view(n, -1)) 144 | offset = offset.view(n, 2, self.out_size, self.out_size) 145 | mask = self.mask_fc(x.view(n, -1)) 146 | mask = mask.view(n, 1, self.out_size, self.out_size) 147 | return deform_roi_pooling( 148 | data, rois, offset, self.spatial_scale, self.out_size, 149 | self.out_channels, self.no_trans, self.group_size, 150 | self.part_size, self.sample_per_part, self.trans_std) * mask 151 | -------------------------------------------------------------------------------- /dcn/src/deform_pool_cuda.cpp: -------------------------------------------------------------------------------- 1 | // author: Charles Shang 2 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 3 | 4 | // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob /mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | void DeformablePSROIPoolForward(const at::Tensor data, 12 | const at::Tensor bbox, 13 | const at::Tensor trans, 14 | at::Tensor out, 15 | at::Tensor top_count, 16 | const int batch, 17 | const int channels, 18 | const int height, 19 | const int width, 20 | const int num_bbox, 21 | const int channels_trans, 22 | const int no_trans, 23 | const float spatial_scale, 24 | const int output_dim, 25 | const int group_size, 26 | const int pooled_size, 27 | const int part_size, 28 | const int sample_per_part, 29 | const float trans_std); 30 | 31 | void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, 32 | const at::Tensor data, 33 | const at::Tensor bbox, 34 | const at::Tensor trans, 35 | const at::Tensor top_count, 36 | at::Tensor in_grad, 37 | at::Tensor trans_grad, 38 | const int batch, 39 | const int channels, 40 | const int height, 41 | const int width, 42 | const int num_bbox, 43 | const int channels_trans, 44 | const int no_trans, 45 | const float spatial_scale, 46 | const int output_dim, 47 | const int group_size, 48 | const int 
pooled_size, 49 | const int part_size, 50 | const int sample_per_part, 51 | const float trans_std); 52 | 53 | void deform_psroi_pooling_cuda_forward(at::Tensor input, at::Tensor bbox, 54 | at::Tensor trans, 55 | at::Tensor out, at::Tensor top_count, 56 | const int no_trans, 57 | const float spatial_scale, 58 | const int output_dim, 59 | const int group_size, 60 | const int pooled_size, 61 | const int part_size, 62 | const int sample_per_part, 63 | const float trans_std) 64 | { 65 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 66 | 67 | const int batch = input.size(0); 68 | const int channels = input.size(1); 69 | const int height = input.size(2); 70 | const int width = input.size(3); 71 | const int channels_trans = no_trans ? 2 : trans.size(1); 72 | 73 | const int num_bbox = bbox.size(0); 74 | if (num_bbox != out.size(0)) 75 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 76 | out.size(0), num_bbox); 77 | 78 | DeformablePSROIPoolForward(input, bbox, trans, out, top_count, 79 | batch, channels, height, width, 80 | num_bbox, 81 | channels_trans, 82 | no_trans, 83 | spatial_scale, 84 | output_dim, 85 | group_size, 86 | pooled_size, 87 | part_size, 88 | sample_per_part, 89 | trans_std); 90 | } 91 | 92 | void deform_psroi_pooling_cuda_backward(at::Tensor out_grad, 93 | at::Tensor input, at::Tensor bbox, 94 | at::Tensor trans, at::Tensor top_count, 95 | at::Tensor input_grad, at::Tensor trans_grad, 96 | const int no_trans, 97 | const float spatial_scale, 98 | const int output_dim, 99 | const int group_size, 100 | const int pooled_size, 101 | const int part_size, 102 | const int sample_per_part, 103 | const float trans_std) 104 | { 105 | AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); 106 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 107 | 108 | const int batch = input.size(0); 109 | const int channels = input.size(1); 110 | const int height = input.size(2); 111 | const int width = input.size(3); 112 | const int channels_trans = no_trans ? 
2 : trans.size(1); 113 | 114 | const int num_bbox = bbox.size(0); 115 | if (num_bbox != out_grad.size(0)) 116 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 117 | out_grad.size(0), num_bbox); 118 | 119 | DeformablePSROIPoolBackwardAcc(out_grad, 120 | input, 121 | bbox, 122 | trans, 123 | top_count, 124 | input_grad, 125 | trans_grad, 126 | batch, channels, height, width, num_bbox, 127 | channels_trans, 128 | no_trans, 129 | spatial_scale, 130 | output_dim, 131 | group_size, 132 | pooled_size, 133 | part_size, 134 | sample_per_part, 135 | trans_std); 136 | } 137 | 138 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 139 | { 140 | m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, 141 | "deform psroi pooling forward(CUDA)"); 142 | m.def("deform_psroi_pooling_cuda_backward", &deform_psroi_pooling_cuda_backward, 143 | "deform psroi pooling backward(CUDA)"); 144 | } -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | def conv3x3(in_planes, out_planes, stride=1): 31 | "3x3 convolution with padding" 32 | return nn.Conv2d( 33 | in_planes, 34 | out_planes, 35 | kernel_size=3, 36 | stride=stride, 37 | padding=1, 38 | bias=False) 39 | 40 | 41 | class BasicBlock(nn.Module): 42 | expansion = 1 43 | 44 | def __init__(self, inplanes, planes, stride=1, downsample=None): 45 | super(BasicBlock, self).__init__() 46 | self.conv1 = conv3x3(inplanes, planes, stride) 47 | self.bn1 = nn.BatchNorm2d(planes) 48 | self.relu = nn.ReLU(inplace=True) 49 | self.conv2 = conv3x3(planes, planes) 50 | self.bn2 = nn.BatchNorm2d(planes) 51 | self.downsample = downsample 52 | self.stride = stride 53 | 54 | def forward(self, x): 55 | residual = x 56 | 57 | out = self.conv1(x) 58 | out = self.bn1(out) 59 | out = self.relu(out) 60 | 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | 64 | if self.downsample is not None: 65 | residual = self.downsample(x) 66 | out += residual 67 | out = self.relu(out) 68 | 69 | return out 70 | 71 | 72 | class Bottleneck(nn.Module): 73 | expansion = 4 74 | 75 | def __init__(self, inplanes, planes, stride=1, downsample=None): 76 | super(Bottleneck, self).__init__() 77 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 78 | self.bn1 = nn.BatchNorm2d(planes) 79 | self.conv2 = nn.Conv2d( 80 | planes, 81 | planes, 
82 | kernel_size=3, 83 | stride=stride, 84 | padding=1, 85 | bias=False) 86 | self.bn2 = nn.BatchNorm2d(planes) 87 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 88 | self.bn3 = nn.BatchNorm2d(planes * 4) 89 | self.relu = nn.ReLU(inplace=True) 90 | self.downsample = downsample 91 | self.stride = stride 92 | 93 | def forward(self, x): 94 | residual = x 95 | 96 | out = self.conv1(x) 97 | out = self.bn1(out) 98 | out = self.relu(out) 99 | 100 | out = self.conv2(out) 101 | out = self.bn2(out) 102 | out = self.relu(out) 103 | 104 | out = self.conv3(out) 105 | out = self.bn3(out) 106 | 107 | if self.downsample is not None: 108 | residual = self.downsample(x) 109 | 110 | out += residual 111 | out = self.relu(out) 112 | 113 | return out 114 | 115 | 116 | class SSDResnet(nn.Module): 117 | def __init__(self, block, num_blocks, size): 118 | super(SSDResnet, self).__init__() 119 | self.inplanes = 64 120 | 121 | self.conv1 = nn.Conv2d( 122 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 123 | self.bn1 = nn.BatchNorm2d(64) 124 | 125 | # Bottom-up layers 126 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 127 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 128 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 129 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 130 | self.inchannel = block.expansion * 512 131 | self.extras = nn.ModuleList(add_extras(str(size), self.inchannel)) 132 | self.smooth1 = nn.Conv2d( 133 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 134 | self._init_modules() 135 | 136 | def _make_layer(self, block, planes, blocks, stride=1): 137 | downsample = None 138 | if stride != 1 or self.inplanes != planes * block.expansion: 139 | downsample = nn.Sequential( 140 | nn.Conv2d( 141 | self.inplanes, 142 | planes * block.expansion, 143 | kernel_size=1, 144 | stride=stride, 145 | bias=False), 146 | nn.BatchNorm2d(planes * block.expansion), 147 | ) 148 | 149 | layers = [] 150 | layers.append(block(self.inplanes, planes, stride, downsample)) 151 | self.inplanes = planes * block.expansion 152 | for i in range(1, blocks): 153 | layers.append(block(self.inplanes, planes)) 154 | 155 | return nn.Sequential(*layers) 156 | 157 | def _init_modules(self): 158 | self.extras.apply(weights_init) 159 | self.smooth1.apply(weights_init) 160 | 161 | def forward(self, x): 162 | # Bottom-up 163 | c1 = F.relu(self.bn1(self.conv1(x))) 164 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 165 | c2 = self.layer1(c1) 166 | c3 = self.layer2(c2) 167 | c4 = self.layer3(c3) 168 | c5 = self.layer4(c4) 169 | x = c5 170 | c5_ = self.smooth1(c5) 171 | sources = [c3, c4, c5_] 172 | for k, v in enumerate(self.extras): 173 | x = F.relu(v(x), inplace=True) 174 | if k % 2 == 1: 175 | sources.append(x) 176 | return sources 177 | 178 | 179 | def SSDResnet18(size, channel_size='48'): 180 | return SSDResnet(BasicBlock, [2, 2, 2, 2], size) 181 | 182 | 183 | def SSDResnet34(size, channel_size='48'): 184 | return SSDResnet(BasicBlock, [3, 4, 6, 3], size) 185 | 186 | 187 | def SSDResnet50(size, channel_size='48'): 188 | return SSDResnet(Bottleneck, [3, 4, 6, 3], size) 189 | 190 | 191 | def SSDResnet101(size, channel_size='48'): 192 | return SSDResnet(Bottleneck, [3, 4, 23, 3], size) 193 | 194 | 195 | def SSDResnet152(size, channel_size='48'): 196 | return SSDResnet(Bottleneck, [3, 8, 36, 3], size) 197 | 198 | 199 | if __name__ == "__main__": 200 | import os 201 | os.environ["CUDA_VISIBLE_DEVICES"] = 
"1" 202 | model3 = SSDResnet18(size=300) 203 | with torch.no_grad(): 204 | model3.eval() 205 | x = torch.randn(1, 3, 300, 300) 206 | model3.cuda() 207 | model3(x.cuda()) 208 | import time 209 | st = time.time() 210 | for i in range(1): 211 | model3(x.cuda()) 212 | print(time.time() - st) 213 | # print(model3(x)) 214 | -------------------------------------------------------------------------------- /models/model_builder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from layers import * 9 | import os 10 | from models.model_helper import weights_init 11 | import importlib 12 | from layers.functions.prior_layer import PriorLayer 13 | 14 | 15 | def get_func(func_name): 16 | """Helper to return a function object by name. func_name must identify a 17 | function in this module or the path to a function relative to the base 18 | 'modeling' module. 19 | """ 20 | if func_name == '': 21 | return None 22 | try: 23 | parts = func_name.split('.') 24 | # Refers to a function in this module 25 | if len(parts) == 1: 26 | return globals()[parts[0]] 27 | # Otherwise, assume we're referencing a module under modeling 28 | module_name = 'models.' + '.'.join(parts[:-1]) 29 | module = importlib.import_module(module_name) 30 | return getattr(module, parts[-1]) 31 | except Exception: 32 | print('Failed to find function: %s', func_name) 33 | raise 34 | 35 | 36 | class SSD(nn.Module): 37 | """Single Shot Multibox Architecture 38 | The network is composed of a base VGG network followed by the 39 | added multibox conv layers. Each multibox layer branches into 40 | 1) conv2d for class conf scores 41 | 2) conv2d for localization predictions 42 | 3) associated priorbox layer to produce default bounding 43 | boxes specific to the layer's feature map size. 44 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
45 | 46 | Args: 47 | phase: (string) Can be "test" or "train" 48 | base: VGG16 layers for input, size of either 300 or 500 49 | extras: extra layers that feed to multibox loc and conf layers 50 | head: "multibox head" consists of loc and conf conv layers 51 | """ 52 | 53 | def _init_modules(self): 54 | self.arm_loc.apply(weights_init) 55 | self.arm_conf.apply(weights_init) 56 | if self.cfg.MODEL.REFINE: 57 | self.odm_loc.apply(weights_init) 58 | self.odm_conf.apply(weights_init) 59 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 60 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 61 | print("load pretrain model {}".format( 62 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 63 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 64 | self.extractor.vgg.load_state_dict(weights) 65 | else: 66 | self.extractor.load_state_dict(weights, strict=False) 67 | 68 | def __init__(self, cfg): 69 | super(SSD, self).__init__() 70 | self.cfg = cfg 71 | self.size = cfg.MODEL.SIZE 72 | if self.size == '300': 73 | size_cfg = cfg.SMALL 74 | else: 75 | size_cfg = cfg.BIG 76 | self.num_classes = cfg.MODEL.NUM_CLASSES 77 | self.prior_layer = PriorLayer(cfg) 78 | self.priorbox = PriorBox(cfg) 79 | self.priors = self.priorbox.forward() 80 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 81 | cfg.TRAIN.CHANNEL_SIZE) 82 | if cfg.MODEL.REFINE: 83 | self.odm_channels = size_cfg.ODM_CHANNELS 84 | self.arm_num_classes = 2 85 | self.odm_loc = nn.ModuleList() 86 | self.odm_conf = nn.ModuleList() 87 | self.arm_loc = nn.ModuleList() 88 | self.arm_conf = nn.ModuleList() 89 | self.arm_channels = size_cfg.ARM_CHANNELS 90 | self.num_anchors = size_cfg.NUM_ANCHORS 91 | self.input_fixed = size_cfg.INPUT_FIXED 92 | self.arm_loc = nn.ModuleList() 93 | self.arm_conf = nn.ModuleList() 94 | 95 | for i in range(len(self.arm_channels)): 96 | if cfg.MODEL.REFINE: 97 | self.arm_loc += [ 98 | nn.Conv2d( 99 | self.arm_channels[i], 100 | self.num_anchors[i] * 4, 101 | kernel_size=3, 102 | padding=1) 103 | ] 104 | self.arm_conf += [ 105 | nn.Conv2d( 106 | self.arm_channels[i], 107 | self.num_anchors[i] * self.arm_num_classes, 108 | kernel_size=3, 109 | padding=1) 110 | ] 111 | 112 | self.odm_loc += [ 113 | nn.Conv2d( 114 | self.odm_channels[i], 115 | self.num_anchors[i] * 4, 116 | kernel_size=3, 117 | padding=1) 118 | ] 119 | self.odm_conf += [ 120 | nn.Conv2d( 121 | self.odm_channels[i], 122 | self.num_anchors[i] * self.num_classes, 123 | kernel_size=3, 124 | padding=1) 125 | ] 126 | else: 127 | self.arm_loc += [ 128 | nn.Conv2d( 129 | self.arm_channels[i], 130 | self.num_anchors[i] * 4, 131 | kernel_size=3, 132 | padding=1) 133 | ] 134 | self.arm_conf += [ 135 | nn.Conv2d( 136 | self.arm_channels[i], 137 | self.num_anchors[i] * self.num_classes, 138 | kernel_size=3, 139 | padding=1) 140 | ] 141 | if cfg.TRAIN.TRAIN_ON: 142 | self._init_modules() 143 | 144 | def forward(self, x): 145 | 146 | arm_loc = list() 147 | arm_conf = list() 148 | if self.cfg.MODEL.REFINE: 149 | odm_loc = list() 150 | odm_conf = list() 151 | arm_xs, odm_xs = self.extractor(x) 152 | for (x, l, c) in zip(odm_xs, self.odm_loc, self.odm_conf): 153 | # for (x) in (odm_xs): 154 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 155 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 156 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 157 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 158 | else: 159 | arm_xs, fbb, att, mm = self.extractor(x) 160 | img_wh = (x.size(3), x.size(2)) 161 | feature_maps_wh = [(t.size(3), 
t.size(2)) for t in arm_xs] 162 | for (x, l, c) in zip(arm_xs, self.arm_loc, self.arm_conf): 163 | arm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 164 | arm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 165 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 166 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 167 | if self.cfg.MODEL.REFINE: 168 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 169 | arm_conf.view( 170 | arm_conf.size(0), -1, self.arm_num_classes), 171 | odm_loc.view(odm_loc.size(0), -1, 4), 172 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 173 | self.priors if self.input_fixed else self.prior_layer( 174 | img_wh, feature_maps_wh)) 175 | else: 176 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 177 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 178 | self.priors if self.input_fixed else self.prior_layer( 179 | img_wh, feature_maps_wh)) 180 | 181 | return output, arm_xs, fbb, att, mm 182 | 183 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from torch.autograd import Variable 9 | from utils.box_utils import match, log_sum_exp, refine_match 10 | from layers.modules import WeightSoftmaxLoss, WeightSmoothL1Loss 11 | GPU = False 12 | if torch.cuda.is_available(): 13 | GPU = True 14 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 15 | 16 | 17 | class RefineMultiBoxLoss(nn.Module): 18 | """SSD Weighted Loss Function 19 | Compute Targets: 20 | 1) Produce Confidence Target Indices by matching ground truth boxes 21 | with (default) 'priorboxes' that have jaccard index > threshold parameter 22 | (default threshold: 0.5). 23 | 2) Produce localization target by 'encoding' variance into offsets of ground 24 | truth boxes and their matched 'priorboxes'. 25 | 3) Hard negative mining to filter the excessive number of negative examples 26 | that comes with using a large number of default bounding boxes. 27 | (default negative:positive ratio 3:1) 28 | Objective Loss: 29 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 30 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 31 | weighted by α which is set to 1 by cross val. 32 | Args: 33 | c: class confidences, 34 | l: predicted boxes, 35 | g: ground truth boxes 36 | N: number of matched default boxes 37 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
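One refinement-specific detail in the forward pass below: when `use_arm` and `filter_object` are both set, priors whose ARM objectness probability is at or below `cfg.MODEL.OBJECT_SCORE` are removed from the positive set before mining. A toy sketch of that masking, with made-up logits (illustrative only, not this module's tensors):

import torch
import torch.nn.functional as F

object_score = 0.01                              # plays the role of cfg.MODEL.OBJECT_SCORE
arm_conf_data = torch.tensor([[[4.0, -4.0],      # prior 0: ARM is confident it is background
                               [-3.0, 3.0],      # prior 1: ARM is confident it is an object
                               [0.0, 0.0]]])     # prior 2: undecided
conf_t = torch.tensor([[1, 2, 0]])               # matched ODM labels (0 = background)

P = F.softmax(arm_conf_data, 2)                  # per-prior (background, object) probabilities
object_score_index = P[:, :, 1] <= object_score
pos = conf_t > 0
pos[object_score_index] = 0                      # drop positives the ARM has already rejected

print(pos)   # tensor([[False,  True, False]]) -> prior 0 is discarded despite its ODM label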
38 | """ 39 | 40 | def __init__(self, cfg, num_classes): 41 | super(RefineMultiBoxLoss, self).__init__() 42 | self.cfg = cfg 43 | self.size = cfg.MODEL.SIZE 44 | if self.size == '300': 45 | size_cfg = cfg.SMALL 46 | else: 47 | size_cfg = cfg.BIG 48 | self.variance = size_cfg.VARIANCE 49 | self.num_classes = num_classes 50 | self.threshold = cfg.TRAIN.OVERLAP 51 | self.OHEM = cfg.TRAIN.OHEM 52 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 53 | self.object_score = cfg.MODEL.OBJECT_SCORE 54 | self.variance = size_cfg.VARIANCE 55 | if cfg.TRAIN.FOCAL_LOSS: 56 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 57 | self.focaloss = FocalLossSoftmax( 58 | self.num_classes, gamma=2, size_average=False) 59 | else: 60 | self.focaloss = FocalLossSigmoid() 61 | 62 | def forward(self, 63 | predictions, 64 | targets, 65 | use_arm=False, 66 | filter_object=False, 67 | debug=False): 68 | """Multibox Loss 69 | Args: 70 | predictions (tuple): A tuple containing loc preds, conf preds, 71 | and prior boxes from SSD net. 72 | conf shape: torch.size(batch_size,num_priors,num_classes) 73 | loc shape: torch.size(batch_size,num_priors,4) 74 | priors shape: torch.size(num_priors,4) 75 | 76 | ground_truth (tensor): Ground truth boxes and labels for a batch, 77 | shape: [batch_size,num_objs,5] (last idx is the label). 78 | """ 79 | # arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 80 | if use_arm: 81 | arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 82 | else: 83 | loc_data, conf_data, _, _, priors = predictions 84 | num = loc_data.size(0) 85 | priors = priors[:loc_data.size(1), :] 86 | num_priors = (priors.size(0)) 87 | num_classes = self.num_classes 88 | 89 | # match priors (default boxes) and ground truth boxes 90 | loc_t = torch.Tensor(num, num_priors, 4) 91 | conf_t = torch.LongTensor(num, num_priors) 92 | defaults = priors.data 93 | for idx in range(num): 94 | truths = targets[idx][:, :-1].data 95 | labels = targets[idx][:, -1].data 96 | if self.num_classes == 2: 97 | labels = labels > 0 98 | if use_arm: 99 | bbox_weight = refine_match( 100 | self.threshold, 101 | truths, 102 | defaults, 103 | self.variance, 104 | labels, 105 | loc_t, 106 | conf_t, 107 | idx, 108 | arm_loc_data[idx].data, 109 | use_weight=False) 110 | else: 111 | match(self.threshold, truths, defaults, self.variance, labels, 112 | loc_t, conf_t, idx) 113 | 114 | loc_t = loc_t.cuda() 115 | conf_t = conf_t.cuda() 116 | # wrap targets 117 | loc_t = Variable(loc_t, requires_grad=False) 118 | conf_t = Variable(conf_t, requires_grad=False) 119 | 120 | if use_arm and filter_object: 121 | P = F.softmax(arm_conf_data, 2) 122 | arm_conf_data_temp = P[:, :, 1] 123 | object_score_index = arm_conf_data_temp <= self.object_score 124 | pos = conf_t > 0 125 | pos[object_score_index.detach()] = 0 126 | else: 127 | pos = conf_t > 0 128 | num_pos = pos.sum(1, keepdim=True) 129 | if debug: 130 | if use_arm: 131 | print("odm pos num: ", str(loc_t.size(0)), str(loc_t.size(1))) 132 | else: 133 | print("arm pos num", str(loc_t.size(0)), str(loc_t.size(1))) 134 | 135 | if self.OHEM: 136 | # Compute max conf across batch for hard negative mining 137 | batch_conf = conf_data.view(-1, self.num_classes) 138 | 139 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather( 140 | 1, conf_t.view(-1, 1)) 141 | 142 | # Hard Negative Mining 143 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 144 | loss_c = loss_c.view(num, -1) 145 | _, loss_idx = loss_c.sort(1, descending=True) 146 | _, idx_rank = loss_idx.sort(1) 147 | num_pos = 
pos.long().sum(1, keepdim=True) 148 | 149 | if num_pos.data.sum() > 0: 150 | num_neg = torch.clamp( 151 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 152 | else: 153 | fake_num_pos = torch.ones(32, 1).long() * 15 154 | num_neg = torch.clamp( 155 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 156 | neg = idx_rank < num_neg.expand_as(idx_rank) 157 | 158 | # Confidence Loss Including Positive and Negative Examples 159 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 160 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 161 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 162 | -1, self.num_classes) 163 | 164 | targets_weighted = conf_t[(pos + neg).gt(0)] 165 | loss_c = F.cross_entropy( 166 | conf_p, targets_weighted, size_average=False) 167 | else: 168 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 169 | 170 | # Localization Loss (Smooth L1) 171 | # Shape: [batch,num_priors,4] 172 | if num_pos.data.sum() > 0: 173 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 174 | loc_p = loc_data[pos_idx].view(-1, 4) 175 | loc_t = loc_t[pos_idx].view(-1, 4) 176 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 177 | N = num_pos.data.sum() 178 | else: 179 | loss_l = torch.zeros(1) 180 | N = 1.0 181 | 182 | loss_l /= float(N) 183 | loss_c /= float(N) 184 | return loss_l, loss_c 185 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss_seperate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from torch.autograd import Variable 9 | from utils.box_utils import match, log_sum_exp, refine_match 10 | from layers.modules import WeightSoftmaxLoss, WeightSmoothL1Loss 11 | GPU = False 12 | if torch.cuda.is_available(): 13 | GPU = True 14 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 15 | 16 | 17 | class RefineMultiBoxLoss(nn.Module): 18 | """SSD Weighted Loss Function 19 | Compute Targets: 20 | 1) Produce Confidence Target Indices by matching ground truth boxes 21 | with (default) 'priorboxes' that have jaccard index > threshold parameter 22 | (default threshold: 0.5). 23 | 2) Produce localization target by 'encoding' variance into offsets of ground 24 | truth boxes and their matched 'priorboxes'. 25 | 3) Hard negative mining to filter the excessive number of negative examples 26 | that comes with using a large number of default bounding boxes. 27 | (default negative:positive ratio 3:1) 28 | Objective Loss: 29 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 30 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 31 | weighted by α which is set to 1 by cross val. 32 | Args: 33 | c: class confidences, 34 | l: predicted boxes, 35 | g: ground truth boxes 36 | N: number of matched default boxes 37 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
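When `use_arm` is true, `refine_match` below matches ground truth against priors that have already been shifted by the ARM's predicted offsets (it receives `arm_loc_data[idx]`). The actual helper lives in `utils.box_utils`, which is not listed here; the standard SSD-style decode with variances that this kind of anchor refinement builds on can be sketched generically as follows (a generic sketch under that assumption, not this repo's implementation):

import torch


def decode(loc, priors, variances):
    # Standard SSD decode kept in center-size form (cx, cy, w, h); `variances`
    # is the commonly used [0.1, 0.2] pair. Generic sketch, not utils/box_utils.py.
    cxcy = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    wh = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
    return torch.cat([cxcy, wh], 1)


priors = torch.tensor([[0.5, 0.5, 0.2, 0.2]])    # one anchor covering the image centre
arm_loc = torch.tensor([[0.5, -0.5, 0.1, 0.1]])  # made-up ARM regression output
print(decode(arm_loc, priors, [0.1, 0.2]))       # refined anchor that the ODM targets are matched against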
38 | """ 39 | 40 | def __init__(self, cfg, num_classes): 41 | super(RefineMultiBoxLoss, self).__init__() 42 | self.cfg = cfg 43 | self.size = cfg.MODEL.SIZE 44 | if self.size == '300': 45 | size_cfg = cfg.SMALL 46 | else: 47 | size_cfg = cfg.BIG 48 | self.variance = size_cfg.VARIANCE 49 | self.num_classes = num_classes 50 | self.threshold = cfg.TRAIN.OVERLAP 51 | self.OHEM = cfg.TRAIN.OHEM 52 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 53 | self.object_score = cfg.MODEL.OBJECT_SCORE 54 | self.variance = size_cfg.VARIANCE 55 | if cfg.TRAIN.FOCAL_LOSS: 56 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 57 | self.focaloss = FocalLossSoftmax( 58 | self.num_classes, gamma=2, size_average=False) 59 | else: 60 | self.focaloss = FocalLossSigmoid() 61 | 62 | def forward(self, 63 | predictions, 64 | targets, 65 | use_arm=False, 66 | filter_object=False, 67 | debug=False): 68 | """Multibox Loss 69 | Args: 70 | predictions (tuple): A tuple containing loc preds, conf preds, 71 | and prior boxes from SSD net. 72 | conf shape: torch.size(batch_size,num_priors,num_classes) 73 | loc shape: torch.size(batch_size,num_priors,4) 74 | priors shape: torch.size(num_priors,4) 75 | 76 | ground_truth (tensor): Ground truth boxes and labels for a batch, 77 | shape: [batch_size,num_objs,5] (last idx is the label). 78 | """ 79 | # arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 80 | if use_arm: 81 | arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 82 | else: 83 | loc_data, conf_data, _, _, priors = predictions 84 | num = loc_data.size(0) 85 | priors = priors[:loc_data.size(1), :] 86 | num_priors = (priors.size(0)) 87 | num_classes = self.num_classes 88 | 89 | 90 | 91 | # match priors (default boxes) and ground truth boxes 92 | loc_t = torch.Tensor(num, num_priors, 4) 93 | conf_t = torch.LongTensor(num, num_priors) 94 | defaults = priors.data 95 | for idx in range(num): 96 | truths = targets[idx][:, :-1].data 97 | labels = targets[idx][:, -1].data 98 | if self.num_classes == 2: 99 | labels = labels > 0 100 | if use_arm: 101 | bbox_weight = refine_match( 102 | self.threshold, 103 | truths, 104 | defaults, 105 | self.variance, 106 | labels, 107 | loc_t, 108 | conf_t, 109 | idx, 110 | arm_loc_data[idx].data, 111 | use_weight=False) 112 | else: 113 | match(self.threshold, truths, defaults, self.variance, labels, 114 | loc_t, conf_t, idx) 115 | 116 | loc_t = loc_t.cuda() 117 | conf_t = conf_t.cuda() 118 | # wrap targets 119 | loc_t = Variable(loc_t, requires_grad=False) 120 | conf_t = Variable(conf_t, requires_grad=False) 121 | 122 | if use_arm and filter_object: 123 | P = F.softmax(arm_conf_data, 2) 124 | arm_conf_data_temp = P[:, :, 1] 125 | object_score_index = arm_conf_data_temp <= self.object_score 126 | pos = conf_t > 0 127 | pos[object_score_index.detach()] = 0 128 | else: 129 | pos = conf_t > 0 130 | num_pos = pos.sum(1, keepdim=True) 131 | if debug: 132 | if use_arm: 133 | print("odm pos num: ", str(loc_t.size(0)), str(loc_t.size(1))) 134 | else: 135 | print("arm pos num", str(loc_t.size(0)), str(loc_t.size(1))) 136 | 137 | if self.OHEM: 138 | # Compute max conf across batch for hard negative mining 139 | batch_conf = conf_data.view(-1, self.num_classes) 140 | 141 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather( 142 | 1, conf_t.view(-1, 1)) 143 | 144 | # Hard Negative Mining 145 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 146 | loss_c = loss_c.view(num, -1) 147 | _, loss_idx = loss_c.sort(1, descending=True) 148 | _, idx_rank = loss_idx.sort(1) 149 | 
num_pos = pos.long().sum(1, keepdim=True) 150 | 151 | if num_pos.data.sum() > 0: 152 | num_neg = torch.clamp( 153 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 154 | else: 155 | fake_num_pos = torch.ones(32, 1).long() * 15 156 | num_neg = torch.clamp( 157 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 158 | neg = idx_rank < num_neg.expand_as(idx_rank) 159 | 160 | # Confidence Loss Including Positive and Negative Examples 161 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 162 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 163 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 164 | -1, self.num_classes) 165 | 166 | targets_weighted = conf_t[(pos + neg).gt(0)] 167 | loss_c = F.cross_entropy( 168 | conf_p, targets_weighted, size_average=False) 169 | else: 170 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 171 | 172 | # Localization Loss (Smooth L1) 173 | # Shape: [batch,num_priors,4] 174 | if num_pos.data.sum() > 0: 175 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 176 | loc_p = loc_data[pos_idx].view(-1, 4) 177 | loc_t = loc_t[pos_idx].view(-1, 4) 178 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 179 | N = num_pos.data.sum() 180 | else: 181 | loss_l = torch.zeros(1) 182 | N = 1.0 183 | 184 | loss_l /= float(N) 185 | loss_c /= float(N) 186 | return loss_l, loss_c 187 | -------------------------------------------------------------------------------- /dcn/functions/deform_conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from torch.nn.modules.utils import _pair 4 | 5 | from .. import deform_conv_cuda 6 | 7 | 8 | class DeformConvFunction(Function): 9 | 10 | @staticmethod 11 | def forward(ctx, 12 | input, 13 | offset, 14 | weight, 15 | stride=1, 16 | padding=0, 17 | dilation=1, 18 | groups=1, 19 | deformable_groups=1, 20 | im2col_step=64): 21 | if input is not None and input.dim() != 4: 22 | raise ValueError( 23 | "Expected 4D tensor as input, got {}D tensor instead.".format( 24 | input.dim())) 25 | ctx.stride = _pair(stride) 26 | ctx.padding = _pair(padding) 27 | ctx.dilation = _pair(dilation) 28 | ctx.groups = groups 29 | ctx.deformable_groups = deformable_groups 30 | ctx.im2col_step = im2col_step 31 | 32 | ctx.save_for_backward(input, offset, weight) 33 | 34 | output = input.new_empty( 35 | DeformConvFunction._output_size(input, weight, ctx.padding, 36 | ctx.dilation, ctx.stride)) 37 | 38 | ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones 39 | 40 | if not input.is_cuda: 41 | raise NotImplementedError 42 | else: 43 | cur_im2col_step = min(ctx.im2col_step, input.shape[0]) 44 | assert (input.shape[0] % 45 | cur_im2col_step) == 0, 'im2col step must divide batchsize' 46 | deform_conv_cuda.deform_conv_forward_cuda( 47 | input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], 48 | weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], 49 | ctx.padding[1], ctx.padding[0], ctx.dilation[1], 50 | ctx.dilation[0], ctx.groups, ctx.deformable_groups, 51 | cur_im2col_step) 52 | return output 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, offset, weight = ctx.saved_tensors 57 | 58 | grad_input = grad_offset = grad_weight = None 59 | 60 | if not grad_output.is_cuda: 61 | raise NotImplementedError 62 | else: 63 | cur_im2col_step = min(ctx.im2col_step, input.shape[0]) 64 | assert (input.shape[0] % 65 | cur_im2col_step) == 0, 'im2col step must divide batchsize' 66 | 67 | if 
ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: 68 | grad_input = torch.zeros_like(input) 69 | grad_offset = torch.zeros_like(offset) 70 | deform_conv_cuda.deform_conv_backward_input_cuda( 71 | input, offset, grad_output, grad_input, 72 | grad_offset, weight, ctx.bufs_[0], weight.size(3), 73 | weight.size(2), ctx.stride[1], ctx.stride[0], 74 | ctx.padding[1], ctx.padding[0], ctx.dilation[1], 75 | ctx.dilation[0], ctx.groups, ctx.deformable_groups, 76 | cur_im2col_step) 77 | 78 | if ctx.needs_input_grad[2]: 79 | grad_weight = torch.zeros_like(weight) 80 | deform_conv_cuda.deform_conv_backward_parameters_cuda( 81 | input, offset, grad_output, 82 | grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), 83 | weight.size(2), ctx.stride[1], ctx.stride[0], 84 | ctx.padding[1], ctx.padding[0], ctx.dilation[1], 85 | ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, 86 | cur_im2col_step) 87 | 88 | return (grad_input, grad_offset, grad_weight, None, None, None, None, 89 | None) 90 | 91 | @staticmethod 92 | def _output_size(input, weight, padding, dilation, stride): 93 | channels = weight.size(0) 94 | output_size = (input.size(0), channels) 95 | for d in range(input.dim() - 2): 96 | in_size = input.size(d + 2) 97 | pad = padding[d] 98 | kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 99 | stride_ = stride[d] 100 | output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) 101 | if not all(map(lambda s: s > 0, output_size)): 102 | raise ValueError( 103 | "convolution input is too small (output would be {})".format( 104 | 'x'.join(map(str, output_size)))) 105 | return output_size 106 | 107 | 108 | class ModulatedDeformConvFunction(Function): 109 | 110 | @staticmethod 111 | def forward(ctx, 112 | input, 113 | offset, 114 | mask, 115 | weight, 116 | bias=None, 117 | stride=1, 118 | padding=0, 119 | dilation=1, 120 | groups=1, 121 | deformable_groups=1): 122 | ctx.stride = stride 123 | ctx.padding = padding 124 | ctx.dilation = dilation 125 | ctx.groups = groups 126 | ctx.deformable_groups = deformable_groups 127 | ctx.with_bias = bias is not None 128 | if not ctx.with_bias: 129 | bias = input.new_empty(1) # fake tensor 130 | if not input.is_cuda: 131 | raise NotImplementedError 132 | if weight.requires_grad or mask.requires_grad or offset.requires_grad \ 133 | or input.requires_grad: 134 | ctx.save_for_backward(input, offset, mask, weight, bias) 135 | output = input.new_empty( 136 | ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) 137 | ctx._bufs = [input.new_empty(0), input.new_empty(0)] 138 | deform_conv_cuda.modulated_deform_conv_cuda_forward( 139 | input, weight, bias, ctx._bufs[0], offset, mask, output, 140 | ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, 141 | ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, 142 | ctx.groups, ctx.deformable_groups, ctx.with_bias) 143 | return output 144 | 145 | @staticmethod 146 | def backward(ctx, grad_output): 147 | if not grad_output.is_cuda: 148 | raise NotImplementedError 149 | input, offset, mask, weight, bias = ctx.saved_tensors 150 | grad_input = torch.zeros_like(input) 151 | grad_offset = torch.zeros_like(offset) 152 | grad_mask = torch.zeros_like(mask) 153 | grad_weight = torch.zeros_like(weight) 154 | grad_bias = torch.zeros_like(bias) 155 | deform_conv_cuda.modulated_deform_conv_cuda_backward( 156 | input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], 157 | grad_input, grad_weight, grad_bias, grad_offset, grad_mask, 158 | grad_output, weight.shape[2], weight.shape[3], ctx.stride, 
159 | ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, 160 | ctx.groups, ctx.deformable_groups, ctx.with_bias) 161 | if not ctx.with_bias: 162 | grad_bias = None 163 | 164 | return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, 165 | None, None, None, None, None) 166 | 167 | @staticmethod 168 | def _infer_shape(ctx, input, weight): 169 | n = input.size(0) 170 | channels_out = weight.size(0) 171 | height, width = input.shape[2:4] 172 | kernel_h, kernel_w = weight.shape[2:4] 173 | height_out = (height + 2 * ctx.padding - 174 | (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 175 | width_out = (width + 2 * ctx.padding - 176 | (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 177 | return n, channels_out, height_out, width_out 178 | 179 | 180 | deform_conv = DeformConvFunction.apply 181 | modulated_deform_conv = ModulatedDeformConvFunction.apply 182 | -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | cdef inline np.float32_t abs(np.float32_t a, np.float32_t b): 18 | return a - b if a >= b else b - a 19 | 20 | def get_iou_weights(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold, float init_weight): 21 | 22 | cdef: 23 | int num = ious.shape[0] 24 | # np.ndarray[np.float32_t, ndim=1] out = np.zeros(num, dtype=np.float) 25 | int idx 26 | float iou 27 | float weight 28 | 29 | for idx, iou in enumerate(ious): 30 | weight = init_weight 31 | if iou > 0.0: 32 | if iou > threshold + 0.1: 33 | weight += 1.0 34 | elif iou < threshold - 0.1: 35 | weight += 1.0 36 | else: 37 | weight += 0.0 38 | ious[idx] = weight 39 | return ious 40 | 41 | def get_mask(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold): 42 | cdef: 43 | int num = ious.shape[0] 44 | int idx = 0 45 | float distance 46 | float iou 47 | np.ndarray[np.int64_t, ndim=1] out = np.zeros((num), dtype=np.int64) 48 | for idx, iou in enumerate(ious): 49 | # if iou >= threshold: 50 | # distance = iou - threshold 51 | # if distance < 0.1: 52 | # out[idx] = 0 53 | # elif distance < 0.2: 54 | # out[idx] = 1 55 | # else: 56 | # out[idx] = 2 57 | # else: 58 | # distance = threshold - iou 59 | # if distance < 0.1: 60 | # out[idx] = 2 61 | # elif distance < 0.2: 62 | # out[idx] = 1 63 | # else: 64 | # out[idx] = 0 65 | distance = abs(iou, threshold) 66 | if distance < 0.1: 67 | # out[:,2] = 1 68 | out[idx] = 2 69 | elif distance < 0.2: 70 | # out[:,1] = 1 71 | out[idx] = 1 72 | else: 73 | # out[:,0] = 0 74 | out[idx] = 0 75 | return out 76 | 77 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 78 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 79 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 80 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 81 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 82 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 83 | 84 | cdef 
np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 85 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 86 | 87 | cdef int ndets = dets.shape[0] 88 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 89 | np.zeros((ndets), dtype=np.int) 90 | 91 | # nominal indices 92 | cdef int _i, _j 93 | # sorted indices 94 | cdef int i, j 95 | # temp variables for box i's (the box currently under consideration) 96 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 97 | # variables for computing overlap with box j (lower scoring box) 98 | cdef np.float32_t xx1, yy1, xx2, yy2 99 | cdef np.float32_t w, h 100 | cdef np.float32_t inter, ovr 101 | 102 | keep = [] 103 | for _i in range(ndets): 104 | i = order[_i] 105 | if suppressed[i] == 1: 106 | continue 107 | keep.append(i) 108 | ix1 = x1[i] 109 | iy1 = y1[i] 110 | ix2 = x2[i] 111 | iy2 = y2[i] 112 | iarea = areas[i] 113 | for _j in range(_i + 1, ndets): 114 | j = order[_j] 115 | if suppressed[j] == 1: 116 | continue 117 | xx1 = max(ix1, x1[j]) 118 | yy1 = max(iy1, y1[j]) 119 | xx2 = min(ix2, x2[j]) 120 | yy2 = min(iy2, y2[j]) 121 | w = max(0.0, xx2 - xx1 + 1) 122 | h = max(0.0, yy2 - yy1 + 1) 123 | inter = w * h 124 | ovr = inter / (iarea + areas[j] - inter) 125 | if ovr >= thresh: 126 | suppressed[j] = 1 127 | 128 | return keep 129 | 130 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 131 | cdef unsigned int N = boxes.shape[0] 132 | cdef float iw, ih, box_area 133 | cdef float ua 134 | cdef int pos = 0 135 | cdef float maxscore = 0 136 | cdef int maxpos = 0 137 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 138 | 139 | for i in range(N): 140 | maxscore = boxes[i, 4] 141 | maxpos = i 142 | 143 | tx1 = boxes[i,0] 144 | ty1 = boxes[i,1] 145 | tx2 = boxes[i,2] 146 | ty2 = boxes[i,3] 147 | ts = boxes[i,4] 148 | 149 | pos = i + 1 150 | # get max box 151 | while pos < N: 152 | if maxscore < boxes[pos, 4]: 153 | maxscore = boxes[pos, 4] 154 | maxpos = pos 155 | pos = pos + 1 156 | 157 | # add max box as a detection 158 | boxes[i,0] = boxes[maxpos,0] 159 | boxes[i,1] = boxes[maxpos,1] 160 | boxes[i,2] = boxes[maxpos,2] 161 | boxes[i,3] = boxes[maxpos,3] 162 | boxes[i,4] = boxes[maxpos,4] 163 | 164 | # swap ith box with position of max box 165 | boxes[maxpos,0] = tx1 166 | boxes[maxpos,1] = ty1 167 | boxes[maxpos,2] = tx2 168 | boxes[maxpos,3] = ty2 169 | boxes[maxpos,4] = ts 170 | 171 | tx1 = boxes[i,0] 172 | ty1 = boxes[i,1] 173 | tx2 = boxes[i,2] 174 | ty2 = boxes[i,3] 175 | ts = boxes[i,4] 176 | 177 | pos = i + 1 178 | # NMS iterations, note that N changes if detection boxes fall below threshold 179 | while pos < N: 180 | x1 = boxes[pos, 0] 181 | y1 = boxes[pos, 1] 182 | x2 = boxes[pos, 2] 183 | y2 = boxes[pos, 3] 184 | s = boxes[pos, 4] 185 | 186 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 187 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 188 | if iw > 0: 189 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 190 | if ih > 0: 191 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 192 | ov = iw * ih / ua #iou between max box and detection box 193 | 194 | if method == 1: # linear 195 | if ov > Nt: 196 | weight = 1 - ov 197 | else: 198 | weight = 1 199 | elif method == 2: # gaussian 200 | weight = np.exp(-(ov * ov)/sigma) 201 | else: # original NMS 202 | if ov > Nt: 203 | weight = 0 204 | else: 205 | weight = 1 206 | 207 | boxes[pos, 4] = weight*boxes[pos, 4] 208 | 209 | # if box score falls below threshold, discard the box by swapping with last 
box 210 | # update N 211 | if boxes[pos, 4] < threshold: 212 | boxes[pos,0] = boxes[N-1, 0] 213 | boxes[pos,1] = boxes[N-1, 1] 214 | boxes[pos,2] = boxes[N-1, 2] 215 | boxes[pos,3] = boxes[N-1, 3] 216 | boxes[pos,4] = boxes[N-1, 4] 217 | N = N - 1 218 | pos = pos - 1 219 | 220 | pos = pos + 1 221 | 222 | keep = [i for i in range(N)] 223 | return keep 224 | -------------------------------------------------------------------------------- /eval_dcn.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, RefineMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file 16 | import numpy as np 17 | import time 18 | import os 19 | import sys 20 | import pickle 21 | import datetime 22 | # from models.model_builder import SSD 23 | from models.model_builder_vgg import SSD 24 | 25 | # from models.model_builder_resnet import SSD 26 | 27 | 28 | def arg_parse(): 29 | parser = argparse.ArgumentParser( 30 | description='Single Shot MultiBox Detection') 31 | parser.add_argument( 32 | '--weights', 33 | # default='/media/jnie/Storage/iccv_weights/efrgnet_vgg_epoch_320.pth', 34 | default='/media/jnie/Storage/iccv_weights/efrgnet_vgg_epoch_512.pth', 35 | type=str, 36 | help='Trained state_dict file path to open') 37 | parser.add_argument( 38 | '--cfg', 39 | dest='cfg_file', 40 | required=True, 41 | help='Config file for training (and optionally testing)') 42 | parser.add_argument( 43 | '--save_folder', 44 | default='eval/', 45 | type=str, 46 | help='File path to save results') 47 | parser.add_argument( 48 | '--num_workers', 49 | default=8, 50 | type=int, 51 | help='Number of workers used in dataloading') 52 | parser.add_argument( 53 | '--retest', default=False, type=bool, help='test cache results') 54 | args = parser.parse_args() 55 | return args 56 | 57 | 58 | def eval_net(val_dataset, 59 | val_loader, 60 | net, 61 | detector, 62 | cfg, 63 | transform, 64 | max_per_image=300, 65 | thresh=0.01, 66 | batch_size=1): 67 | net.eval() 68 | num_images = len(val_dataset) 69 | num_classes = cfg.MODEL.NUM_CLASSES 70 | eval_save_folder = "./eval/" 71 | if not os.path.exists(eval_save_folder): 72 | os.mkdir(eval_save_folder) 73 | all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] 74 | det_file = os.path.join(eval_save_folder, 'detections.pkl') 75 | 76 | if args.retest: 77 | f = open(det_file, 'rb') 78 | all_boxes = pickle.load(f) 79 | print('Evaluating detections') 80 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 81 | return 82 | 83 | # img_idexes = val_dataset.image_indexes #coco 84 | img_idexes = val_dataset.ids #voc 85 | total_times = 0 86 | network_times = 0 87 | total_nms_times = 0 88 | total_forward_times = 0 89 | 90 | for idx, (imgs, _, img_info) in enumerate(val_loader): 91 | with torch.no_grad(): 92 | 93 | x = imgs 94 | x = x.cuda() 95 | torch.cuda.synchronize() 96 | t1 = time.time() 97 | output= net(x) 98 | 99 | torch.cuda.synchronize() 100 | t4 = time.time() 101 | boxes, scores = detector.forward(output) 102 | 103 
| # idx = np.where(scores == scores[:,:,1:].max()) 104 | torch.cuda.synchronize() 105 | t2 = time.time() 106 | 107 | for k in range(boxes.size(0)): 108 | i = idx * batch_size + k 109 | boxes_ = boxes[k] 110 | scores_ = scores[k] 111 | boxes_ = boxes_.cpu().numpy() 112 | scores_ = scores_.cpu().numpy() 113 | img_wh = img_info[k] 114 | scale = np.array([img_wh[0], img_wh[1], img_wh[0], img_wh[1]]) 115 | boxes_ *= scale 116 | for j in range(1, num_classes): 117 | inds = np.where(scores_[:, j] > thresh)[0] 118 | if len(inds) == 0: 119 | all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 120 | continue 121 | c_bboxes = boxes_[inds] 122 | c_scores = scores_[inds, j] 123 | c_dets = np.hstack((c_bboxes, 124 | c_scores[:, np.newaxis])).astype( 125 | np.float32, copy=False) 126 | keep = nms(c_dets, cfg.TEST.NMS_OVERLAP, force_cpu=False) 127 | # keep = soft_nms(c_dets, Nt=0.45, method=2) 128 | keep = keep[:50] 129 | c_dets = c_dets[keep, :] 130 | all_boxes[j][i] = c_dets 131 | 132 | torch.cuda.synchronize() 133 | t3 = time.time() 134 | detect_time = t2 - t4 135 | nms_time = t3 - t2 136 | forward_time = t4 - t1 137 | if idx % 10 == 0: 138 | print('im_detect: {:d}/{:d} {:.6f}s {:.6f}s {:.3f}s'.format( 139 | i + 1, num_images, forward_time, detect_time, nms_time)) 140 | network_times += (t4 - t1) 141 | total_times += (t3 - t1) 142 | total_nms_times += nms_time 143 | total_forward_times += (t2-t1) 144 | 145 | print("detect time: ", time.time() - st) 146 | print("net time: ", network_times/5000.0) 147 | print("avg time: ", total_times/5000.0) 148 | print("nms time: ", total_nms_times/5000.0) 149 | print("forward time: ", total_forward_times/5000.0) 150 | 151 | with open(det_file, 'wb') as f: 152 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 153 | print('Evaluating detections') 154 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 155 | 156 | 157 | def main(): 158 | global args 159 | args = arg_parse() 160 | cfg_from_file(args.cfg_file) 161 | bgr_means = cfg.TRAIN.BGR_MEAN 162 | dataset_name = cfg.DATASETS.DATA_TYPE 163 | batch_size = cfg.TEST.BATCH_SIZE 164 | num_workers = args.num_workers 165 | if cfg.DATASETS.DATA_TYPE == 'VOC': 166 | trainvalDataset = VOCDetection 167 | top_k = 200 168 | else: 169 | trainvalDataset = COCODetection 170 | top_k = 300 171 | dataroot = cfg.DATASETS.DATAROOT 172 | if cfg.MODEL.SIZE == '300': 173 | size_cfg = cfg.SMALL 174 | else: 175 | size_cfg = cfg.BIG 176 | 177 | valSet = cfg.DATASETS.VAL_TYPE 178 | num_classes = cfg.MODEL.NUM_CLASSES 179 | save_folder = args.save_folder 180 | if not os.path.exists(save_folder): 181 | os.mkdir(save_folder) 182 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 183 | cfg.TRAIN.TRAIN_ON = False 184 | net = SSD(cfg) 185 | 186 | checkpoint = torch.load(args.weights) 187 | state_dict = checkpoint['model'] 188 | from collections import OrderedDict 189 | new_state_dict = OrderedDict() 190 | for k, v in state_dict.items(): 191 | head = k[:7] 192 | if head == 'module.': 193 | name = k[7:] # remove `module.` 194 | else: 195 | name = k 196 | new_state_dict[name] = v 197 | net.load_state_dict(new_state_dict) 198 | detector = Detect(cfg) 199 | ValTransform = BaseTransform(size_cfg.IMG_WH, bgr_means, (2, 0, 1)) 200 | 201 | val_dataset = trainvalDataset(dataroot, valSet, ValTransform) 202 | val_loader = data.DataLoader( 203 | val_dataset, 204 | batch_size, 205 | shuffle=False, 206 | num_workers=num_workers, 207 | collate_fn=detection_collate) 208 | top_k = 300 209 | thresh = cfg.TEST.CONFIDENCE_THRESH 210 | eval_net( 211 | 
val_dataset, 212 | val_loader, 213 | net, 214 | detector, 215 | cfg, 216 | ValTransform, 217 | top_k, 218 | thresh=thresh, 219 | batch_size=batch_size) 220 | 221 | 222 | if __name__ == '__main__': 223 | st = time.time() 224 | main() 225 | print("final time", time.time() - st) 226 | -------------------------------------------------------------------------------- /data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | import matplotlib 13 | matplotlib.use('Agg') 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def parse_rec(filename): 18 | """ Parse a PASCAL VOC xml file """ 19 | tree = ET.parse(filename) 20 | objects = [] 21 | for obj in tree.findall('object'): 22 | obj_struct = {} 23 | obj_struct['name'] = obj.find('name').text 24 | obj_struct['pose'] = obj.find('pose').text 25 | obj_struct['truncated'] = int(obj.find('truncated').text) 26 | obj_struct['difficult'] = int(obj.find('difficult').text) 27 | bbox = obj.find('bndbox') 28 | obj_struct['bbox'] = [ 29 | int(bbox.find('xmin').text), 30 | int(bbox.find('ymin').text), 31 | int(bbox.find('xmax').text), 32 | int(bbox.find('ymax').text) 33 | ] 34 | objects.append(obj_struct) 35 | 36 | return objects 37 | 38 | 39 | def voc_ap(rec, prec, use_07_metric=False): 40 | """ ap = voc_ap(rec, prec, [use_07_metric]) 41 | Compute VOC AP given precision and recall. 42 | If use_07_metric is true, uses the 43 | VOC 07 11 point method (default:False). 44 | """ 45 | if use_07_metric: 46 | # 11 point metric 47 | ap = 0. 48 | for t in np.arange(0., 1.1, 0.1): 49 | if np.sum(rec >= t) == 0: 50 | p = 0 51 | else: 52 | p = np.max(prec[rec >= t]) 53 | ap = ap + p / 11. 54 | else: 55 | # correct AP calculation 56 | # first append sentinel values at the end 57 | mrec = np.concatenate(([0.], rec, [1.])) 58 | mpre = np.concatenate(([0.], prec, [0.])) 59 | 60 | # compute the precision envelope 61 | for i in range(mpre.size - 1, 0, -1): 62 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 63 | 64 | # to calculate area under PR curve, look for points 65 | # where X axis (recall) changes value 66 | i = np.where(mrec[1:] != mrec[:-1])[0] 67 | 68 | # and sum (\Delta recall) * prec 69 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 70 | return ap 71 | 72 | 73 | def voc_eval(detpath, 74 | annopath, 75 | imagesetfile, 76 | classname, 77 | cachedir, 78 | ovthresh=0.5, 79 | use_07_metric=False): 80 | """rec, prec, ap = voc_eval(detpath, 81 | annopath, 82 | imagesetfile, 83 | classname, 84 | [ovthresh], 85 | [use_07_metric]) 86 | 87 | Top level function that does the PASCAL VOC evaluation. 88 | 89 | detpath: Path to detections 90 | detpath.format(classname) should produce the detection results file. 91 | annopath: Path to annotations 92 | annopath.format(imagename) should be the xml annotations file. 93 | imagesetfile: Text file containing the list of images, one image per line. 
94 | classname: Category name (duh) 95 | cachedir: Directory for caching the annotations 96 | [ovthresh]: Overlap threshold (default = 0.5) 97 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 98 | (default False) 99 | """ 100 | # assumes detections are in detpath.format(classname) 101 | # assumes annotations are in annopath.format(imagename) 102 | # assumes imagesetfile is a text file with each line an image name 103 | # cachedir caches the annotations in a pickle file 104 | 105 | # first load gt 106 | if not os.path.isdir(cachedir): 107 | os.mkdir(cachedir) 108 | cachefile = os.path.join(cachedir, 'annots.pkl') 109 | # read list of images 110 | with open(imagesetfile, 'r') as f: 111 | lines = f.readlines() 112 | imagenames = [x.strip() for x in lines] 113 | 114 | if not os.path.isfile(cachefile): 115 | # load annots 116 | recs = {} 117 | for i, imagename in enumerate(imagenames): 118 | recs[imagename] = parse_rec(annopath.format(imagename)) 119 | if i % 100 == 0: 120 | print('Reading annotation for {:d}/{:d}'.format( 121 | i + 1, len(imagenames))) 122 | # save 123 | print('Saving cached annotations to {:s}'.format(cachefile)) 124 | with open(cachefile, 'wb') as f: 125 | pickle.dump(recs, f) 126 | else: 127 | # load 128 | with open(cachefile, 'rb') as f: 129 | recs = pickle.load(f) 130 | 131 | # extract gt objects for this class 132 | class_recs = {} 133 | npos = 0 134 | for imagename in imagenames: 135 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 136 | bbox = np.array([x['bbox'] for x in R]) 137 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 138 | det = [False] * len(R) 139 | npos = npos + sum(~difficult) 140 | class_recs[imagename] = { 141 | 'bbox': bbox, 142 | 'difficult': difficult, 143 | 'det': det 144 | } 145 | 146 | # read dets 147 | detfile = detpath.format(classname) 148 | with open(detfile, 'r') as f: 149 | lines = f.readlines() 150 | 151 | splitlines = [x.strip().split(' ') for x in lines] 152 | image_ids = [x[0] for x in splitlines] 153 | confidence = np.array([float(x[1]) for x in splitlines]) 154 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 155 | # sort by confidence 156 | sorted_ind = np.argsort(-confidence) 157 | sorted_scores = np.sort(-confidence) 158 | BB = BB[sorted_ind, :] 159 | image_ids = [image_ids[x] for x in sorted_ind] 160 | 161 | # go down dets and mark TPs and FPs 162 | nd = len(image_ids) 163 | tp = np.zeros(nd) 164 | fp = np.zeros(nd) 165 | for d in range(nd): 166 | R = class_recs[image_ids[d]] 167 | bb = BB[d, :].astype(float) 168 | ovmax = -np.inf 169 | BBGT = R['bbox'].astype(float) 170 | 171 | if BBGT.size > 0: 172 | # compute overlaps 173 | # intersection 174 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 175 | iymin = np.maximum(BBGT[:, 1], bb[1]) 176 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 177 | iymax = np.minimum(BBGT[:, 3], bb[3]) 178 | iw = np.maximum(ixmax - ixmin + 1., 0.) 179 | ih = np.maximum(iymax - iymin + 1., 0.) 180 | inters = iw * ih 181 | 182 | # union 183 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 184 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 185 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 186 | 187 | overlaps = inters / uni 188 | ovmax = np.max(overlaps) 189 | jmax = np.argmax(overlaps) 190 | 191 | if ovmax > ovthresh: 192 | if not R['difficult'][jmax]: 193 | if not R['det'][jmax]: 194 | tp[d] = 1. 195 | R['det'][jmax] = 1 196 | else: 197 | fp[d] = 1. 198 | else: 199 | fp[d] = 1. 
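    # (Editorial sketch, not part of the original file.) With tp/fp filled in by the
    # greedy matching loop above, the cumulative sums below turn the confidence-ranked
    # detections into a precision/recall curve. For a hypothetical class with
    # npos = 2 ground-truth boxes and tp = [1, 0, 1], fp = [0, 1, 0]:
    #   cumsum(tp) = [1, 1, 2], cumsum(fp) = [0, 1, 1]
    #   rec  = [0.5, 0.5, 1.0]
    #   prec = [1.0, 0.5, 0.667]
    # voc_ap() then integrates precision over recall, using either the 11-point
    # VOC07 rule or the exact area under the precision envelope, depending on
    # use_07_metric.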
200 | 201 | # compute precision recall 202 | fp = np.cumsum(fp) 203 | tp = np.cumsum(tp) 204 | rec = tp / float(npos) 205 | # avoid divide by zero in case the first detection matches a difficult 206 | # ground truth 207 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 208 | # if classname == 'person': 209 | final_rec = round(rec[-1], 4) 210 | final_prec = round(prec[-1], 4) 211 | plt_save_path = os.path.join(".", "eval", "pr") 212 | if not os.path.exists(plt_save_path): 213 | os.makedirs(plt_save_path) 214 | plt.plot(rec, prec, 'r') 215 | pr_curl = os.path.join( 216 | plt_save_path, '{}_{}_{}pr.jpg'.format(classname, str(final_prec), 217 | str(final_rec))) 218 | plt.savefig(pr_curl) 219 | plt.close() 220 | ap = voc_ap(rec, prec, use_07_metric) 221 | 222 | return rec, prec, ap 223 | -------------------------------------------------------------------------------- /models/vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import weights_init 10 | 11 | class BasicConv(nn.Module): 12 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 13 | super(BasicConv, self).__init__() 14 | self.out_channels = out_planes 15 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 16 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 17 | self.relu = nn.ReLU(inplace=True) if relu else None 18 | 19 | def forward(self, x): 20 | x = self.conv(x) 21 | if self.bn is not None: 22 | x = self.bn(x) 23 | if self.relu is not None: 24 | x = self.relu(x) 25 | return x 26 | 27 | class BasicBlock(nn.Module): 28 | 29 | def __init__(self, in_planes, out_planes, stride=1): 30 | super(BasicBlock, self).__init__() 31 | self.out_channels = out_planes 32 | inter_planes = in_planes // 4 33 | self.single_branch = nn.Sequential( 34 | BasicConv(in_planes, inter_planes, kernel_size=(3, 3), stride=stride, padding=(1, 1)), 35 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=2, dilation=2), 36 | BasicConv(inter_planes, out_planes, kernel_size=(3, 3), stride=1, padding=(1, 1)) 37 | ) 38 | 39 | def forward(self, x): 40 | out = self.single_branch(x) 41 | return out 42 | 43 | class L2Norm(nn.Module): 44 | def __init__(self, n_channels, scale): 45 | super(L2Norm, self).__init__() 46 | self.n_channels = n_channels 47 | self.gamma = scale or None 48 | self.eps = 1e-10 49 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 50 | self.reset_parameters() 51 | 52 | def reset_parameters(self): 53 | init.constant_(self.weight, self.gamma) 54 | 55 | def forward(self, x): 56 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 57 | x = x / norm 58 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 59 | x) * x 60 | return out 61 | 62 | 63 | # This function is derived from torchvision VGG make_layers() 64 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 65 | 66 | 67 | def vgg(cfg, i, batch_norm=False): 68 | layers = [] 69 | in_channels = i 70 | for v in cfg: 71 | if v == 'M': 72 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 73 | elif v == 'C': 74 | layers += 
[nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 75 | else: 76 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 77 | if batch_norm: 78 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 79 | else: 80 | layers += [conv2d, nn.ReLU(inplace=True)] 81 | in_channels = v 82 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 83 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 84 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 85 | layers += [ 86 | pool5, conv6, 87 | nn.ReLU(inplace=True), conv7, 88 | nn.ReLU(inplace=True) 89 | ] 90 | return layers 91 | 92 | 93 | extras_cfg = { 94 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 95 | '512': [ 96 | 256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128, 'S', 97 | 256 98 | ], 99 | } 100 | 101 | base = { 102 | '300': [ 103 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 104 | 512, 512, 512 105 | ], 106 | '512': [ 107 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 108 | 512, 512, 512 109 | ], 110 | } 111 | 112 | 113 | def add_extras(cfg, i, batch_norm=False): 114 | # Extra layers added to VGG for feature scaling 115 | layers = [] 116 | in_channels = i 117 | flag = False 118 | for k, v in enumerate(cfg): 119 | if in_channels != 'S': 120 | if v == 'S': 121 | layers += [ 122 | nn.Conv2d( 123 | in_channels, 124 | cfg[k + 1], 125 | kernel_size=(1, 3)[flag], 126 | stride=2, 127 | padding=1) 128 | ] 129 | else: 130 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 131 | flag = not flag 132 | in_channels = v 133 | return layers 134 | 135 | def add_extras_bn(cfg, i, batch_norm=False): 136 | # Extra layers added to VGG for feature scaling 137 | layers = [] 138 | in_channels = i 139 | flag = False 140 | for k, v in enumerate(cfg): 141 | if in_channels != 'S': 142 | if v == 'S': 143 | layers += [ 144 | BasicConv( 145 | in_channels, 146 | cfg[k + 1], 147 | kernel_size=(1, 3)[flag], 148 | stride=2, 149 | padding=1) 150 | ] 151 | else: 152 | layers += [BasicConv(in_channels, v, kernel_size=(1, 3)[flag])] 153 | flag = not flag 154 | in_channels = v 155 | return layers 156 | 157 | 158 | class VGG16Extractor(nn.Module): 159 | def __init__(self, size): 160 | super(VGG16Extractor, self).__init__() 161 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 162 | self.L2Norm = L2Norm(512, 20) 163 | self.extras = nn.ModuleList(add_extras(extras_cfg[str(size)], 1024)) 164 | # self.extras_bn = nn.ModuleList(add_extras_bn(extras_cfg[str(size)], 1024)) 165 | self._init_modules() 166 | 167 | def _init_modules(self): 168 | self.extras.apply(weights_init) 169 | # self.extras_bn.apply(weights_init) 170 | self.vgg.apply(weights_init) 171 | 172 | def forward(self, x): 173 | """Applies network layers and ops on input image(s) x. 174 | 175 | Args: 176 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 177 | 178 | Return: 179 | Depending on phase: 180 | test: 181 | Variable(tensor) of output class label predictions, 182 | confidence score, and corresponding location predictions for 183 | each object detected. 
Shape: [batch,topk,7] 184 | 185 | train: 186 | list of concat outputs from: 187 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 188 | 2: localization layers, Shape: [batch,num_priors*4] 189 | 3: priorbox layers, Shape: [2,num_priors*4] 190 | """ 191 | sources = list() 192 | 193 | # apply vgg up to conv4_3 relu 194 | for k in range(23): 195 | x = self.vgg[k](x) 196 | 197 | s = self.L2Norm(x) 198 | sources.append(s) 199 | 200 | # apply vgg up to fc7 201 | for k in range(23, len(self.vgg)): 202 | x = self.vgg[k](x) 203 | sources.append(x) 204 | 205 | # apply extra layers and cache source layer outputs 206 | for k, v in enumerate(self.extras): 207 | x = F.relu(v(x), inplace=True) 208 | if k % 2 == 1: 209 | sources.append(x) 210 | 211 | # for k, v in enumerate(self.extras_bn): 212 | # x = v(x) 213 | # if k % 2 == 1: 214 | # sources.append(x) 215 | return sources 216 | 217 | 218 | def SSDVgg(size, channel_size='48'): 219 | return VGG16Extractor(size) 220 | 221 | 222 | if __name__ == "__main__": 223 | import os 224 | os.environ["CUDA_VISIBLE_DEVICES"] = "3" 225 | with torch.no_grad(): 226 | model3 = VGG16Extractor(300) 227 | model3.eval() 228 | x = torch.randn(16, 3, 300, 300) 229 | model3.cuda() 230 | model3(x.cuda()) 231 | import time 232 | st = time.time() 233 | for i in range(1000): 234 | model3(x.cuda()) 235 | print(time.time() - st) 236 | -------------------------------------------------------------------------------- /utils/convert_darknet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | # 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | from data.config import voc_config, coco_config 10 | from model.yolo import Yolov3 11 | from model.darknet53 import Darknet53 12 | import argparse 13 | import os 14 | 15 | 16 | def copy_weights(bn, conv, ptr, weights, use_bn=True): 17 | if use_bn: 18 | num_bn_biases = bn.bias.numel() 19 | 20 | #Load the weights 21 | bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 22 | ptr += num_bn_biases 23 | 24 | bn_weights = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 25 | ptr += num_bn_biases 26 | 27 | bn_running_mean = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 28 | ptr += num_bn_biases 29 | 30 | bn_running_var = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 31 | ptr += num_bn_biases 32 | 33 | #Cast the loaded weights into dims of model weights. 
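        # (Editorial note.) Darknet serialises a batch-normalised conv block as four
        # flat float32 vectors of length C (biases, scales, running means, running
        # variances), followed by the convolution kernel itself, which is why `ptr`
        # advances by `num_bn_biases` four times above before the conv weights are
        # copied further below.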
34 | bn_biases = bn_biases.view_as(bn.bias.data) 35 | bn_weights = bn_weights.view_as(bn.weight.data) 36 | bn_running_mean = bn_running_mean.view_as(bn.running_mean) 37 | bn_running_var = bn_running_var.view_as(bn.running_var) 38 | 39 | #Copy the data to model 40 | bn.bias.data.copy_(bn_biases) 41 | bn.weight.data.copy_(bn_weights) 42 | bn.running_mean.copy_(bn_running_mean) 43 | bn.running_var.copy_(bn_running_var) 44 | else: 45 | #Number of biases 46 | num_biases = conv.bias.numel() 47 | 48 | #Load the weights 49 | conv_biases = torch.from_numpy(weights[ptr:ptr + num_biases]) 50 | ptr = ptr + num_biases 51 | 52 | #reshape the loaded weights according to the dims of the model weights 53 | conv_biases = conv_biases.view_as(conv.bias.data) 54 | 55 | #Finally copy the data 56 | conv.bias.data.copy_(conv_biases) 57 | 58 | #Let us load the weights for the Convolutional layers 59 | num_weights = conv.weight.numel() 60 | conv_weights = torch.from_numpy(weights[ptr:ptr + num_weights]) 61 | ptr = ptr + num_weights 62 | 63 | conv_weights = conv_weights.view_as(conv.weight.data) 64 | conv.weight.data.copy_(conv_weights) 65 | return ptr 66 | 67 | 68 | def load_weights_darknet53(weightfile, yolov3): 69 | fp = open(weightfile, "rb") 70 | #The first 5 values are header information 71 | # 1. Major version number 72 | # 2. Minor Version Number 73 | # 3. Subversion number 74 | # 4. IMages seen 75 | header = np.fromfile(fp, dtype=np.int32, count=5) 76 | weights = np.fromfile(fp, dtype=np.float32) 77 | print(len(weights)) 78 | ptr = 0 79 | first_conv = yolov3.conv 80 | bn = first_conv.bn 81 | conv = first_conv.conv 82 | # first conv copy 83 | ptr = copy_weights(bn, conv, ptr, weights) 84 | 85 | layers = [ 86 | yolov3.layer1, yolov3.layer2, yolov3.layer3, yolov3.layer4, 87 | yolov3.layer5 88 | ] 89 | for layer in layers: 90 | for i in range(len(layer)): 91 | if i == 0: 92 | bn = layer[i].bn 93 | conv = layer[i].conv 94 | ptr = copy_weights(bn, conv, ptr, weights) 95 | else: 96 | bn = layer[i].conv1.bn 97 | conv = layer[i].conv1.conv 98 | ptr = copy_weights(bn, conv, ptr, weights) 99 | bn = layer[i].conv2.bn 100 | conv = layer[i].conv2.conv 101 | ptr = copy_weights(bn, conv, ptr, weights) 102 | print(ptr) 103 | fp.close() 104 | 105 | 106 | def load_weights(weightfile, yolov3, version): 107 | if version == "voc" or version == "coco": 108 | load_weights_yolov3(weightfile, yolov3) 109 | elif version == "darknet53": 110 | load_weights_darknet53(weightfile, yolov3) 111 | 112 | 113 | def load_weights_yolov3(weightfile, yolov3): 114 | fp = open(weightfile, "rb") 115 | #The first 5 values are header information 116 | # 1. Major version number 117 | # 2. Minor Version Number 118 | # 3. Subversion number 119 | # 4, 5. 
IMages seen 120 | header = np.fromfile(fp, dtype=np.int32, count=5) 121 | weights = np.fromfile(fp, dtype=np.float32) 122 | print(len(weights)) 123 | ptr = 0 124 | extractor = yolov3.extractor 125 | first_conv = extractor.conv 126 | bn = first_conv.bn 127 | conv = first_conv.conv 128 | # first conv copy 129 | ptr = copy_weights(bn, conv, ptr, weights) 130 | 131 | layers = [ 132 | extractor.layer1, extractor.layer2, extractor.layer3, extractor.layer4, 133 | extractor.layer5 134 | ] 135 | for layer in layers: 136 | for i in range(len(layer)): 137 | if i == 0: 138 | bn = layer[i].bn 139 | conv = layer[i].conv 140 | ptr = copy_weights(bn, conv, ptr, weights) 141 | else: 142 | bn = layer[i].conv1.bn 143 | conv = layer[i].conv1.conv 144 | ptr = copy_weights(bn, conv, ptr, weights) 145 | bn = layer[i].conv2.bn 146 | conv = layer[i].conv2.conv 147 | ptr = copy_weights(bn, conv, ptr, weights) 148 | predict_conv_list1 = yolov3.predict_conv_list1 149 | smooth_conv1 = yolov3.smooth_conv1 150 | predict_conv_list2 = yolov3.predict_conv_list2 151 | smooth_conv2 = yolov3.smooth_conv2 152 | predict_conv_list3 = yolov3.predict_conv_list3 153 | for i in range(len(predict_conv_list1)): 154 | if i == 6: 155 | bn = 0 156 | conv = predict_conv_list1[i] 157 | ptr = copy_weights(bn, conv, ptr, weights, use_bn=False) 158 | else: 159 | bn = predict_conv_list1[i].bn 160 | conv = predict_conv_list1[i].conv 161 | ptr = copy_weights(bn, conv, ptr, weights) 162 | bn = smooth_conv1.bn 163 | conv = smooth_conv1.conv 164 | ptr = copy_weights(bn, conv, ptr, weights) 165 | for i in range(len(predict_conv_list2)): 166 | if i == 6: 167 | bn = 0 168 | conv = predict_conv_list2[i] 169 | ptr = copy_weights(bn, conv, ptr, weights, use_bn=False) 170 | else: 171 | bn = predict_conv_list2[i].bn 172 | conv = predict_conv_list2[i].conv 173 | ptr = copy_weights(bn, conv, ptr, weights) 174 | bn = smooth_conv2.bn 175 | conv = smooth_conv2.conv 176 | ptr = copy_weights(bn, conv, ptr, weights) 177 | 178 | for i in range(len(predict_conv_list3)): 179 | if i == 6: 180 | bn = 0 181 | conv = predict_conv_list3[i] 182 | ptr = copy_weights(bn, conv, ptr, weights, use_bn=False) 183 | else: 184 | bn = predict_conv_list3[i].bn 185 | conv = predict_conv_list3[i].conv 186 | ptr = copy_weights(bn, conv, ptr, weights) 187 | print(ptr) 188 | fp.close() 189 | 190 | 191 | def arg_parse(): 192 | """ 193 | Parse arguments to the train module 194 | """ 195 | parser = argparse.ArgumentParser(description='Yolov3 pytorch Training') 196 | parser.add_argument('--input_wh', default=(416, 416), help='input size.') 197 | parser.add_argument( 198 | '--version', 199 | '--version', 200 | default='darknet53', 201 | help='voc, coco, darknet53') 202 | parser.add_argument( 203 | '--weights', 204 | default='./weights/darknet53.conv.74', 205 | help='pretrained base model') 206 | parser.add_argument( 207 | '--save_name', 208 | default='./weights/convert_yolov3_coco.pth', 209 | help='save name') 210 | 211 | return parser.parse_args() 212 | 213 | 214 | def load_weights_darknet19(weightfile, darknet19): 215 | fp = open(weightfile, "rb") 216 | #The first 4 values are header information 217 | # 1. Major version number 218 | # 2. Minor Version Number 219 | # 3. Subversion number 220 | # 4. 
IMages seen 221 | header = np.fromfile(fp, dtype=np.int32, count=4) 222 | weights = np.fromfile(fp, dtype=np.float32) 223 | ptr = 0 224 | first_conv = darknet19.conv 225 | bn = first_conv.bn 226 | conv = first_conv.conv 227 | # first conv copy 228 | ptr = copy_weights(bn, conv, ptr, weights) 229 | layers = [ 230 | darknet19.layer1, darknet19.layer2, darknet19.layer3, darknet19.layer4, 231 | darknet19.layer5 232 | ] 233 | for layer in layers: 234 | for i in range(len(layer)): 235 | if i == 0: 236 | pass 237 | else: 238 | bn = layer[i].bn 239 | conv = layer[i].conv 240 | ptr = copy_weights(bn, conv, ptr, weights) 241 | fp.close() 242 | 243 | 244 | if __name__ == '__main__': 245 | args = arg_parse() 246 | weightfile = args.weights 247 | input_wh = args.input_wh 248 | version = args.version 249 | save_name = args.save_name 250 | if version == "voc": 251 | cfg = voc_config 252 | yolov3 = Yolov3("train", input_wh, cfg["anchors"], cfg["anchors_mask"], 253 | cfg["num_classes"]) 254 | elif version == "coco": 255 | cfg = coco_config 256 | yolov3 = Yolov3("train", input_wh, cfg["anchors"], cfg["anchors_mask"], 257 | cfg["num_classes"]) 258 | elif version == "darknet53": 259 | cfg = voc_config 260 | num_blocks = [1, 2, 8, 8, 4] 261 | yolov3 = Darknet53(num_blocks) 262 | else: 263 | print("Unkown version !!!") 264 | import sys 265 | sys.exit() 266 | 267 | load_weights(weightfile, yolov3, version) 268 | # name = "convert_yolo_" + version + ".pth" 269 | # save_path = os.path.join("./weights", name) 270 | torch.save(darknet53.state_dict(), save_name) 271 | -------------------------------------------------------------------------------- /models/model_builder_vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from layers import * 8 | import os 9 | from models.model_helper import weights_init 10 | import importlib 11 | from layers.functions.prior_layer import PriorLayer 12 | # from dcn.modules.deform_conv import DeformConv, ModulatedDeformConv 13 | from mmdet.ops import DeformConv, ModulatedDeformConv 14 | 15 | def get_func(func_name): 16 | """Helper to return a function object by name. func_name must identify a 17 | function in this module or the path to a function relative to the base 18 | 'modeling' module. 19 | """ 20 | if func_name == '': 21 | return None 22 | try: 23 | parts = func_name.split('.') 24 | # Refers to a function in this module 25 | if len(parts) == 1: 26 | return globals()[parts[0]] 27 | # Otherwise, assume we're referencing a module under modeling 28 | module_name = 'models.' 
+ '.'.join(parts[:-1]) 29 | module = importlib.import_module(module_name) 30 | return getattr(module, parts[-1]) 31 | except Exception: 32 | print('Failed to find function: %s', func_name) 33 | raise 34 | 35 | class BasicConv(nn.Module): 36 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 37 | super(BasicConv, self).__init__() 38 | self.out_channels = out_planes 39 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 40 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 41 | self.relu = nn.ReLU(inplace=True) if relu else None 42 | 43 | def forward(self, x): 44 | x = self.conv(x) 45 | if self.bn is not None: 46 | x = self.bn(x) 47 | if self.relu is not None: 48 | x = self.relu(x) 49 | return x 50 | 51 | 52 | 53 | def add_dcn_dilas(): 54 | planes = [512,1024,256,256] 55 | deformable_groups = 1 56 | conv_layers = [] 57 | for i in range(4): 58 | conv_layers += [DeformConv( 59 | planes[i], 60 | 256, 61 | kernel_size=3, 62 | stride=1, 63 | padding=5-i, 64 | dilation=5-i, 65 | deformable_groups=deformable_groups, 66 | bias=False)] 67 | return conv_layers 68 | 69 | def BN_layers(): 70 | bn_layers =[] 71 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 72 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 73 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 74 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 75 | 76 | return bn_layers 77 | 78 | class SSD(nn.Module): 79 | """Single Shot Multibox Architecture 80 | The network is composed of a base VGG network followed by the 81 | added multibox conv layers. Each multibox layer branches into 82 | 1) conv2d for class conf scores 83 | 2) conv2d for localization predictions 84 | 3) associated priorbox layer to produce default bounding 85 | boxes specific to the layer's feature map size. 86 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
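    (Summary added for clarity; the wording is the editor's, not the original authors'.)
    When cfg.MODEL.REFINE is enabled, this builder follows a RefineDet-style two-step
    pipeline: the anchor refinement heads (arm_loc / arm_conf) predict coarse box
    offsets and objectness on the backbone features, the predicted (x, y) offsets are
    converted into deformable-convolution offsets (loc_offset_conv feeding dcn_convs),
    an objectness-derived attention map re-weights the ODM features, and the
    odm_loc / odm_conf heads produce the final localisation and classification outputs.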
87 | 88 | Args: 89 | phase: (string) Can be "test" or "train" 90 | base: VGG16 layers for input, size of either 300 or 500 91 | extras: extra layers that feed to multibox loc and conf layers 92 | head: "multibox head" consists of loc and conf conv layers 93 | """ 94 | 95 | def _init_modules(self): 96 | self.arm_loc.apply(weights_init) 97 | self.arm_conf.apply(weights_init) 98 | if self.cfg.MODEL.REFINE: 99 | self.odm_loc.apply(weights_init) 100 | self.odm_conf.apply(weights_init) 101 | 102 | self.loc_offset_conv.apply(weights_init) 103 | # self.offsets.apply(weights_init) 104 | self.dcn_convs.apply(weights_init) 105 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 106 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 107 | print("load pretrain model {}".format( 108 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 109 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 110 | self.extractor.vgg.load_state_dict(weights) 111 | else: 112 | self.extractor.load_state_dict(weights, strict=False) 113 | 114 | def __init__(self, cfg): 115 | super(SSD, self).__init__() 116 | self.cfg = cfg 117 | self.size = cfg.MODEL.SIZE 118 | if self.size == '300': 119 | size_cfg = cfg.SMALL 120 | else: 121 | size_cfg = cfg.BIG 122 | self.num_classes = cfg.MODEL.NUM_CLASSES 123 | self.prior_layer = PriorLayer(cfg) 124 | self.priorbox = PriorBox(cfg) 125 | self.priors = self.priorbox.forward() 126 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 127 | cfg.TRAIN.CHANNEL_SIZE) 128 | if cfg.MODEL.REFINE: 129 | self.odm_channels = size_cfg.ODM_CHANNELS 130 | self.arm_num_classes = 2 131 | self.odm_loc = nn.ModuleList() 132 | self.odm_conf = nn.ModuleList() 133 | 134 | 135 | self.loc_offset_conv = nn.ModuleList() 136 | self.dcn_convs = nn.ModuleList(add_dcn_dilas()) 137 | self.bn_layers = nn.ModuleList(BN_layers()) 138 | 139 | self.arm_loc = nn.ModuleList() 140 | self.arm_conf = nn.ModuleList() 141 | self.arm_channels = size_cfg.ARM_CHANNELS 142 | self.num_anchors = size_cfg.NUM_ANCHORS 143 | self.input_fixed = size_cfg.INPUT_FIXED 144 | self.arm_loc = nn.ModuleList() 145 | self.arm_conf = nn.ModuleList() 146 | 147 | for i in range(len(self.arm_channels)): 148 | if cfg.MODEL.REFINE: 149 | self.arm_loc += [ 150 | nn.Conv2d( 151 | self.arm_channels[i], 152 | self.num_anchors[i] * 4, 153 | kernel_size=3, 154 | padding=1) 155 | ] 156 | self.arm_conf += [ 157 | nn.Conv2d( 158 | self.arm_channels[i], 159 | self.num_anchors[i] * self.arm_num_classes, 160 | kernel_size=3, 161 | padding=1) 162 | ] 163 | 164 | self.loc_offset_conv +=[BasicConv(self.num_anchors[i] * 2, 18, kernel_size=1)] 165 | self.odm_loc += [ 166 | nn.Conv2d( 167 | self.odm_channels[i], 168 | self.num_anchors[i] * 4, 169 | kernel_size=3, 170 | padding=1) 171 | ] 172 | self.odm_conf += [ 173 | nn.Conv2d( 174 | self.odm_channels[i], 175 | self.num_anchors[i] * self.num_classes, 176 | kernel_size=3, 177 | padding=1) 178 | ] 179 | else: 180 | self.arm_loc += [ 181 | nn.Conv2d( 182 | self.arm_channels[i], 183 | self.num_anchors[i] * 4, 184 | kernel_size=3, 185 | padding=1) 186 | ] 187 | self.arm_conf += [ 188 | nn.Conv2d( 189 | self.arm_channels[i], 190 | self.num_anchors[i] * self.num_classes, 191 | kernel_size=3, 192 | padding=1) 193 | ] 194 | if cfg.TRAIN.TRAIN_ON: 195 | self._init_modules() 196 | 197 | def forward(self, input): 198 | 199 | arm_loc = list() 200 | arm_conf = list() 201 | if self.cfg.MODEL.REFINE: 202 | odm_loc = list() 203 | odm_conf = list() 204 | conf = list() 205 | odm_xs_n = list() 206 | arm_loc_list = list() 207 | arm_xs, odm_xs = 
self.extractor(input) 208 | 209 | for (x, l, c) in zip(arm_xs, self.arm_loc, self.arm_conf): 210 | arm_loc_conv = l(x) 211 | cc = c(x) 212 | conf.append(cc) 213 | arm_loc_list.append(torch.cat([arm_loc_conv[:,0::4,:,:], arm_loc_conv[:,1::4,:,:]], 1)) 214 | arm_loc.append(arm_loc_conv.permute(0, 2, 3, 1).contiguous()) 215 | arm_conf.append(cc.permute(0, 2, 3, 1).contiguous()) 216 | 217 | 218 | for (conf_fea, odm_xs_fea) in zip(conf, odm_xs): 219 | conf_obj = conf_fea[:, 1::2, :, :] 220 | conf_max, _ = torch.max(conf_obj, dim=1, keepdim=True) 221 | conf_attention = conf_max.sigmoid() 222 | odm_xs_fea_n = odm_xs_fea * conf_attention + odm_xs_fea 223 | odm_xs_n.append(odm_xs_fea_n) 224 | 225 | offset_0 = self.loc_offset_conv[0](arm_loc_list[0]) 226 | d0 = F.relu(self.bn_layers[0](self.dcn_convs[0](odm_xs_n[0], offset_0)), inplace=True) 227 | 228 | offset_1 = self.loc_offset_conv[1](arm_loc_list[1]) 229 | d1 = F.relu(self.bn_layers[1](self.dcn_convs[1](odm_xs_n[1], offset_1)), inplace=True) 230 | 231 | offset_2 = self.loc_offset_conv[2](arm_loc_list[2]) 232 | d2 = F.relu(self.bn_layers[2](self.dcn_convs[2](odm_xs_n[2], offset_2)), inplace=True) 233 | 234 | offset_3 = self.loc_offset_conv[3](arm_loc_list[3]) 235 | d3 = F.relu(self.bn_layers[3](self.dcn_convs[3](odm_xs_n[3], offset_3)), inplace=True) 236 | odm_xs_new = [d0,d1,d2,d3] 237 | 238 | 239 | for (x, l, c) in zip(odm_xs_new, self.odm_loc, self.odm_conf): 240 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 241 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 242 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 243 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 244 | else: 245 | arm_xs = self.extractor(input) 246 | img_wh = (input.size(3), input.size(2)) 247 | feature_maps_wh = [(t.size(3), t.size(2)) for t in arm_xs] 248 | 249 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 250 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 251 | if self.cfg.MODEL.REFINE: 252 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 253 | arm_conf.view( 254 | arm_conf.size(0), -1, self.arm_num_classes), 255 | odm_loc.view(odm_loc.size(0), -1, 4), 256 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 257 | self.priors if self.input_fixed else self.prior_layer( 258 | img_wh, feature_maps_wh)) 259 | else: 260 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 261 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 262 | self.priors if self.input_fixed else self.prior_layer( 263 | img_wh, feature_maps_wh)) 264 | return output 265 | 266 | -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 
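At a glance (editorial summary): the training-time `preproc` transform chains a
random IoU-constrained crop, photometric distortion, mean-padded expansion,
horizontal mirroring, and a final resize + mean subtraction + HWC->CHW transpose,
while `BaseTransform` applies only the deterministic resize/normalise step used at
test time. Illustrative usage (the numeric values are examples only, not taken from
the repo's configs):

    transform = preproc(resize_wh=(512, 512), rgb_means=(104, 117, 123), p=0.6)
    img_t, targets_t = transform(cv2_image, np.array([[48., 20., 271., 350., 8.]]))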
3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | 7 | TODO: implement data_augment for training 8 | 9 | Ellis Brown, Max deGroot 10 | """ 11 | 12 | import torch 13 | from torchvision import transforms 14 | import cv2 15 | import numpy as np 16 | import random 17 | import math 18 | from utils.box_utils import matrix_iou 19 | 20 | 21 | def _crop(image, boxes, labels): 22 | height, width, _ = image.shape 23 | 24 | if len(boxes) == 0: 25 | return image, boxes, labels 26 | 27 | while True: 28 | mode = random.choice(( 29 | None, 30 | (0.1, None), 31 | (0.3, None), 32 | (0.5, None), 33 | (0.7, None), 34 | (0.9, None), 35 | (None, None), 36 | )) 37 | 38 | if mode is None: 39 | return image, boxes, labels 40 | 41 | min_iou, max_iou = mode 42 | if min_iou is None: 43 | min_iou = float('-inf') 44 | if max_iou is None: 45 | max_iou = float('inf') 46 | 47 | for _ in range(50): 48 | scale = random.uniform(0.3, 1.) 49 | min_ratio = max(0.5, scale * scale) 50 | max_ratio = min(2, 1. / scale / scale) 51 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 52 | w = int(scale * ratio * width) 53 | h = int((scale / ratio) * height) 54 | 55 | l = random.randrange(width - w) 56 | t = random.randrange(height - h) 57 | roi = np.array((l, t, l + w, t + h)) 58 | 59 | iou = matrix_iou(boxes, roi[np.newaxis]) 60 | 61 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 62 | continue 63 | 64 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 65 | 66 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 67 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 68 | .all(axis=1) 69 | boxes_t = boxes[mask].copy() 70 | labels_t = labels[mask].copy() 71 | if len(boxes_t) == 0: 72 | continue 73 | 74 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 75 | boxes_t[:, :2] -= roi[:2] 76 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 77 | boxes_t[:, 2:] -= roi[:2] 78 | 79 | return image_t, boxes_t, labels_t 80 | 81 | 82 | def _distort(image): 83 | def _convert(image, alpha=1, beta=0): 84 | tmp = image.astype(float) * alpha + beta 85 | tmp[tmp < 0] = 0 86 | tmp[tmp > 255] = 255 87 | image[:] = tmp 88 | 89 | image = image.copy() 90 | 91 | if random.randrange(2): 92 | _convert(image, beta=random.uniform(-32, 32)) 93 | 94 | if random.randrange(2): 95 | _convert(image, alpha=random.uniform(0.5, 1.5)) 96 | 97 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 98 | 99 | if random.randrange(2): 100 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 101 | tmp %= 180 102 | image[:, :, 0] = tmp 103 | 104 | if random.randrange(2): 105 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 106 | 107 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 108 | 109 | return image 110 | 111 | 112 | def _expand(image, boxes, fill, p): 113 | if random.random() > p: 114 | return image, boxes 115 | 116 | height, width, depth = image.shape 117 | for _ in range(50): 118 | scale = random.uniform(1, 4) 119 | 120 | min_ratio = max(0.5, 1. 
/ scale / scale) 121 | max_ratio = min(2, scale * scale) 122 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 123 | ws = scale * ratio 124 | hs = scale / ratio 125 | if ws < 1 or hs < 1: 126 | continue 127 | w = int(ws * width) 128 | h = int(hs * height) 129 | 130 | left = random.randint(0, w - width) 131 | top = random.randint(0, h - height) 132 | 133 | boxes_t = boxes.copy() 134 | boxes_t[:, :2] += (left, top) 135 | boxes_t[:, 2:] += (left, top) 136 | 137 | expand_image = np.empty((h, w, depth), dtype=image.dtype) 138 | expand_image[:, :] = fill 139 | expand_image[top:top + height, left:left + width] = image 140 | image = expand_image 141 | 142 | return image, boxes_t 143 | 144 | 145 | def _mirror(image, boxes): 146 | _, width, _ = image.shape 147 | if random.randrange(2): 148 | image = image[:, ::-1] 149 | boxes = boxes.copy() 150 | boxes[:, 0::2] = width - boxes[:, 2::-2] 151 | return image, boxes 152 | 153 | 154 | def preproc_for_test(image, resize_wh, mean): 155 | interp_methods = [ 156 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, 157 | cv2.INTER_LANCZOS4 158 | ] 159 | interp_method = interp_methods[random.randrange(5)] 160 | # interp_method = interp_methods[0] 161 | image = cv2.resize( 162 | image, (resize_wh[0], resize_wh[1]), interpolation=interp_method) 163 | image = image.astype(np.float32) 164 | image -= mean 165 | # to rgb 166 | # image = image[:, :, (2, 1, 0)] 167 | return image.transpose(2, 0, 1) 168 | 169 | 170 | class preproc(object): 171 | def __init__(self, resize_wh, rgb_means, p): 172 | self.means = rgb_means 173 | self.resize_wh = resize_wh 174 | self.p = p 175 | 176 | def __call__(self, image, targets): 177 | boxes = targets[:, :-1].copy() 178 | labels = targets[:, -1].copy() 179 | if len(boxes) == 0: 180 | #boxes = np.empty((0, 4)) 181 | targets = np.zeros((1, 5)) 182 | image = preproc_for_test(image, self.resize_wh, self.means) 183 | return torch.from_numpy(image), targets 184 | 185 | image_o = image.copy() 186 | targets_o = targets.copy() 187 | height_o, width_o, _ = image_o.shape 188 | boxes_o = targets_o[:, :-1] 189 | labels_o = targets_o[:, -1] 190 | boxes_o[:, 0::2] /= width_o 191 | boxes_o[:, 1::2] /= height_o 192 | labels_o = np.expand_dims(labels_o, 1) 193 | targets_o = np.hstack((boxes_o, labels_o)) 194 | 195 | image_t, boxes, labels = _crop(image, boxes, labels) 196 | image_t = _distort(image_t) 197 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 198 | image_t, boxes = _mirror(image_t, boxes) 199 | #image_t, boxes = _mirror(image, boxes) 200 | 201 | height, width, _ = image_t.shape 202 | image_t = preproc_for_test(image_t, self.resize_wh, self.means) 203 | boxes = boxes.copy() 204 | boxes[:, 0::2] /= width 205 | boxes[:, 1::2] /= height 206 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 207 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 
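        # (Editorial comment.) Boxes are now in normalised [0, 1] coordinates, so the
        # mask below keeps only boxes whose width and height both exceed 1% of the
        # image; anything smaller is treated as degenerate after cropping/expansion
        # and is dropped together with its label.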
208 | mask_b = np.minimum(b_w, b_h) > 0.01 209 | boxes_t = boxes[mask_b] 210 | labels_t = labels[mask_b].copy() 211 | 212 | if len(boxes_t) == 0: 213 | image = preproc_for_test(image_o, self.resize_wh, self.means) 214 | return torch.from_numpy(image), targets_o 215 | 216 | labels_t = np.expand_dims(labels_t, 1) 217 | targets_t = np.hstack((boxes_t, labels_t)) 218 | 219 | return torch.from_numpy(image_t), targets_t 220 | 221 | 222 | class BaseTransform_img(object): 223 | """Defines the transformations that should be applied to test PIL image 224 | for input into the network 225 | 226 | dimension -> tensorize -> color adj 227 | 228 | Arguments: 229 | resize (int): input dimension to SSD 230 | rgb_means ((int,int,int)): average RGB of the dataset 231 | (104,117,123) 232 | swap ((int,int,int)): final order of channels 233 | Returns: 234 | transform (transform) : callable transform to be applied to test/val 235 | data 236 | """ 237 | 238 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 239 | self.means = rgb_means 240 | self.resize_wh = resize_wh 241 | self.swap = swap 242 | 243 | # assume input is cv2 img for now 244 | def __call__(self, img, target=None): 245 | 246 | interp_methods = [ 247 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 248 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 249 | ] 250 | interp_method = interp_methods[0] 251 | img = cv2.resize( 252 | np.array(img), (int(self.resize_wh), int(self.resize_wh)), 253 | interpolation=interp_method).astype(np.float32) 254 | img -= self.means 255 | img = img.transpose(self.swap) 256 | return torch.from_numpy(img) 257 | 258 | 259 | class BaseTransform_ration(object): 260 | """Defines the transformations that should be applied to test PIL image 261 | for input into the network 262 | 263 | dimension -> tensorize -> color adj 264 | 265 | Arguments: 266 | resize (int): input dimension to SSD 267 | rgb_means ((int,int,int)): average RGB of the dataset 268 | (104,117,123) 269 | swap ((int,int,int)): final order of channels 270 | Returns: 271 | transform (transform) : callable transform to be applied to test/val 272 | data 273 | """ 274 | 275 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 276 | self.means = rgb_means 277 | self.resize_wh = resize_wh 278 | self.swap = swap 279 | 280 | # assume input is cv2 img for now 281 | def __call__(self, img, target=None): 282 | 283 | interp_methods = [ 284 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 285 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 286 | ] 287 | interp_method = interp_methods[0] 288 | img = cv2.resize( 289 | np.array(img), None, None, fx=self.resize_wh[0], fy = self.resize_wh[1], 290 | interpolation=interp_method).astype(np.float32) 291 | img -= self.means 292 | img = img.transpose(self.swap) 293 | return torch.from_numpy(img) 294 | 295 | 296 | class BaseTransform(object): 297 | """Defines the transformations that should be applied to test PIL image 298 | for input into the network 299 | 300 | dimension -> tensorize -> color adj 301 | 302 | Arguments: 303 | resize (int): input dimension to SSD 304 | rgb_means ((int,int,int)): average RGB of the dataset 305 | (104,117,123) 306 | swap ((int,int,int)): final order of channels 307 | Returns: 308 | transform (transform) : callable transform to be applied to test/val 309 | data 310 | """ 311 | 312 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 313 | self.means = rgb_means 314 | self.resize_wh = resize_wh 315 | self.swap = swap 316 | 317 | # assume input is cv2 img for now 318 | def __call__(self, img, 
target=None): 319 | 320 | interp_methods = [ 321 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 322 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 323 | ] 324 | interp_method = interp_methods[0] 325 | img = cv2.resize( 326 | np.array(img), (self.resize_wh[0], self.resize_wh[1]), 327 | interpolation=interp_method).astype(np.float32) 328 | img -= self.means 329 | img = img.transpose(self.swap) 330 | return torch.from_numpy(img), target 331 | # return torch.from_numpy(img), target -------------------------------------------------------------------------------- /models/model_builder_resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from layers import * 8 | import os 9 | from models.model_helper import weights_init 10 | import importlib 11 | from layers.functions.prior_layer import PriorLayer 12 | from dcn.modules.deform_conv import DeformConv, ModulatedDeformConv 13 | 14 | def get_func(func_name): 15 | """Helper to return a function object by name. func_name must identify a 16 | function in this module or the path to a function relative to the base 17 | 'modeling' module. 18 | """ 19 | if func_name == '': 20 | return None 21 | try: 22 | parts = func_name.split('.') 23 | # Refers to a function in this module 24 | if len(parts) == 1: 25 | return globals()[parts[0]] 26 | # Otherwise, assume we're referencing a module under modeling 27 | module_name = 'models.' + '.'.join(parts[:-1]) 28 | module = importlib.import_module(module_name) 29 | return getattr(module, parts[-1]) 30 | except Exception: 31 | print('Failed to find function: %s', func_name) 32 | raise 33 | 34 | class BasicConv(nn.Module): 35 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 36 | super(BasicConv, self).__init__() 37 | self.out_channels = out_planes 38 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 39 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 40 | self.relu = nn.ReLU(inplace=True) if relu else None 41 | 42 | def forward(self, x): 43 | x = self.conv(x) 44 | if self.bn is not None: 45 | x = self.bn(x) 46 | if self.relu is not None: 47 | x = self.relu(x) 48 | return x 49 | 50 | class Basic2Conv(nn.Module): 51 | 52 | def __init__(self, in_planes, out_planes): 53 | super(Basic2Conv, self).__init__() 54 | self.branch1 = BasicConv(in_planes, out_planes, kernel_size=1) 55 | self.branch2 = BasicConv(out_planes, out_planes, kernel_size=1) 56 | 57 | def forward(self, x): 58 | x1 = self.branch1(x) 59 | x2 = self.branch2(x1) 60 | 61 | return x2 62 | 63 | def add_dcn_dilas(): 64 | 65 | planes = [512,1024,512,256] 66 | deformable_groups = 1 67 | conv_layers = [] 68 | for i in range(4): 69 | conv_layers += [DeformConv( 70 | planes[i], 71 | 256, 72 | kernel_size=3, 73 | stride=1, 74 | padding=5-i, 75 | dilation=5-i, 76 | deformable_groups=deformable_groups, 77 | bias=False)] 78 | return conv_layers 79 | 80 | def BN_layers(): 81 | bn_layers =[] 82 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 83 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 84 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, momentum=0.01, affine=True)] 85 | bn_layers += [nn.BatchNorm2d(256,eps=1e-5, 
momentum=0.01, affine=True)] 86 | 87 | return bn_layers 88 | 89 | class SSD(nn.Module): 90 | """Single Shot Multibox Architecture 91 | The network is composed of a base VGG network followed by the 92 | added multibox conv layers. Each multibox layer branches into 93 | 1) conv2d for class conf scores 94 | 2) conv2d for localization predictions 95 | 3) associated priorbox layer to produce default bounding 96 | boxes specific to the layer's feature map size. 97 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 98 | 99 | Args: 100 | phase: (string) Can be "test" or "train" 101 | base: VGG16 layers for input, size of either 300 or 500 102 | extras: extra layers that feed to multibox loc and conf layers 103 | head: "multibox head" consists of loc and conf conv layers 104 | """ 105 | 106 | def _init_modules(self): 107 | self.arm_loc.apply(weights_init) 108 | self.arm_conf.apply(weights_init) 109 | if self.cfg.MODEL.REFINE: 110 | self.odm_loc.apply(weights_init) 111 | self.odm_conf.apply(weights_init) 112 | 113 | self.loc_offset_conv.apply(weights_init) 114 | self.dcn_convs.apply(weights_init) 115 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 116 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 117 | print("load pretrain model {}".format( 118 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 119 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 120 | self.extractor.vgg.load_state_dict(weights) 121 | else: 122 | self.extractor.load_state_dict(weights, strict=False) 123 | 124 | def __init__(self, cfg): 125 | super(SSD, self).__init__() 126 | self.cfg = cfg 127 | self.size = cfg.MODEL.SIZE 128 | if self.size == '300': 129 | size_cfg = cfg.SMALL 130 | else: 131 | size_cfg = cfg.BIG 132 | self.num_classes = cfg.MODEL.NUM_CLASSES 133 | self.prior_layer = PriorLayer(cfg) 134 | self.priorbox = PriorBox(cfg) 135 | self.priors = self.priorbox.forward() 136 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 137 | cfg.TRAIN.CHANNEL_SIZE) 138 | if cfg.MODEL.REFINE: 139 | self.odm_channels = size_cfg.ODM_CHANNELS 140 | self.arm_num_classes = 2 141 | self.odm_loc = nn.ModuleList() 142 | self.odm_conf = nn.ModuleList() 143 | 144 | self.loc_offset_conv = nn.ModuleList() 145 | self.dcn_convs = nn.ModuleList(add_dcn_dilas()) 146 | self.bn_layers = nn.ModuleList(BN_layers()) 147 | 148 | self.arm_loc = nn.ModuleList() 149 | self.arm_conf = nn.ModuleList() 150 | self.arm_channels = size_cfg.ARM_CHANNELS 151 | self.num_anchors = size_cfg.NUM_ANCHORS 152 | self.input_fixed = size_cfg.INPUT_FIXED 153 | self.arm_loc = nn.ModuleList() 154 | self.arm_conf = nn.ModuleList() 155 | 156 | for i in range(len(self.arm_channels)): 157 | if cfg.MODEL.REFINE: 158 | self.arm_loc += [ 159 | nn.Conv2d( 160 | self.arm_channels[i], 161 | self.num_anchors[i] * 4, 162 | kernel_size=3, 163 | padding=1) 164 | ] 165 | self.arm_conf += [ 166 | nn.Conv2d( 167 | self.arm_channels[i], 168 | self.num_anchors[i] * self.arm_num_classes, 169 | kernel_size=3, 170 | padding=1) 171 | ] 172 | 173 | self.loc_offset_conv +=[BasicConv(self.num_anchors[i] * 2, 18, kernel_size=1)] 174 | self.odm_loc += [nn.Sequential(Basic2Conv(self.odm_channels[i], 512), 175 | nn.Conv2d(512, self.num_anchors[i] * 4, kernel_size=3, padding=1)) 176 | ] 177 | self.odm_conf += [ 178 | nn.Sequential(Basic2Conv(self.odm_channels[i], 512), 179 | nn.Conv2d(512, self.num_anchors[i] * self.num_classes, kernel_size=3, padding=1)) 180 | ] 181 | else: 182 | self.arm_loc += [ 183 | nn.Conv2d( 184 | self.arm_channels[i], 185 | self.num_anchors[i] * 4, 186 | kernel_size=3, 187 | 
padding=1) 188 | ] 189 | self.arm_conf += [ 190 | nn.Conv2d( 191 | self.arm_channels[i], 192 | self.num_anchors[i] * self.num_classes, 193 | kernel_size=3, 194 | padding=1) 195 | ] 196 | if cfg.TRAIN.TRAIN_ON: 197 | self._init_modules() 198 | 199 | def forward(self, input): 200 | 201 | arm_loc = list() 202 | arm_conf = list() 203 | if self.cfg.MODEL.REFINE: 204 | odm_loc = list() 205 | odm_conf = list() 206 | conf = list() 207 | odm_xs_n = list() 208 | arm_loc_list = list() 209 | arm_xs, odm_xs = self.extractor(input) 210 | 211 | 212 | for (x, l, c) in zip(arm_xs, self.arm_loc, self.arm_conf): 213 | arm_loc_conv = l(x) 214 | cc = c(x) 215 | conf.append(cc) 216 | arm_loc_list.append(torch.cat([arm_loc_conv[:,0::4,:,:], arm_loc_conv[:,1::4,:,:]], 1)) 217 | arm_loc.append(arm_loc_conv.permute(0, 2, 3, 1).contiguous()) 218 | arm_conf.append(cc.permute(0, 2, 3, 1).contiguous()) 219 | 220 | 221 | for (conf_fea, odm_xs_fea) in zip(conf, odm_xs): 222 | conf_obj = conf_fea[:, 1::2, :, :] 223 | conf_max, _ = torch.max(conf_obj, dim=1, keepdim=True) 224 | conf_attention = conf_max.sigmoid() 225 | odm_xs_fea_n = odm_xs_fea * conf_attention + odm_xs_fea 226 | odm_xs_n.append(odm_xs_fea_n) 227 | 228 | offset_0 = self.loc_offset_conv[0](arm_loc_list[0]) 229 | d0 = F.relu(self.bn_layers[0](self.dcn_convs[0](odm_xs_n[0], offset_0)), inplace=True) 230 | 231 | offset_1 = self.loc_offset_conv[1](arm_loc_list[1]) 232 | d1 = F.relu(self.bn_layers[1](self.dcn_convs[1](odm_xs_n[1], offset_1)), inplace=True) 233 | 234 | offset_2 = self.loc_offset_conv[2](arm_loc_list[2]) 235 | d2 = F.relu(self.bn_layers[2](self.dcn_convs[2](odm_xs_n[2], offset_2)), inplace=True) 236 | 237 | offset_3 = self.loc_offset_conv[3](arm_loc_list[3]) 238 | d3 = F.relu(self.bn_layers[3](self.dcn_convs[3](odm_xs_n[3], offset_3)), inplace=True) 239 | odm_xs_new = [d0,d1,d2,d3] 240 | 241 | for (x, l, c) in zip(odm_xs_new, self.odm_loc, self.odm_conf): 242 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 243 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 244 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 245 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 246 | else: 247 | arm_xs = self.extractor(input) 248 | img_wh = (input.size(3), input.size(2)) 249 | feature_maps_wh = [(t.size(3), t.size(2)) for t in arm_xs] 250 | 251 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 252 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 253 | if self.cfg.MODEL.REFINE: 254 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 255 | arm_conf.view( 256 | arm_conf.size(0), -1, self.arm_num_classes), 257 | odm_loc.view(odm_loc.size(0), -1, 4), 258 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 259 | self.priors if self.input_fixed else self.prior_layer( 260 | img_wh, feature_maps_wh)) 261 | else: 262 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 263 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 264 | self.priors if self.input_fixed else self.prior_layer( 265 | img_wh, feature_maps_wh)) 266 | return output 267 | -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import os.path 11 | import pickle 12 | 
import sys 13 | import torch 14 | import torch.utils.data as data 15 | from PIL import Image, ImageDraw, ImageFont 16 | import cv2 17 | import numpy as np 18 | from .voc_eval import voc_eval 19 | if sys.version_info[0] == 2: 20 | import xml.etree.cElementTree as ET 21 | else: 22 | import xml.etree.ElementTree as ET 23 | 24 | VOC_CLASSES = ( 25 | '__background__', # always index 0 26 | 'aeroplane', 27 | 'bicycle', 28 | 'bird', 29 | 'boat', 30 | 'bottle', 31 | 'bus', 32 | 'car', 33 | 'cat', 34 | 'chair', 35 | 'cow', 36 | 'diningtable', 37 | 'dog', 38 | 'horse', 39 | 'motorbike', 40 | 'person', 41 | 'pottedplant', 42 | 'sheep', 43 | 'sofa', 44 | 'train', 45 | 'tvmonitor') 46 | 47 | # for making bounding boxes pretty 48 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 49 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 50 | 51 | 52 | class AnnotationTransform(object): 53 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 54 | Initialized with a dictionary lookup of classnames to indexes 55 | 56 | Arguments: 57 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 58 | (default: alphabetic indexing of VOC's 20 classes) 59 | keep_difficult (bool, optional): keep difficult instances or not 60 | (default: False) 61 | height (int): height 62 | width (int): width 63 | """ 64 | 65 | def __init__(self, class_to_ind=None, keep_difficult=False): 66 | self.class_to_ind = class_to_ind or dict( 67 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 68 | self.keep_difficult = keep_difficult 69 | 70 | def __call__(self, target, width, height): 71 | """ 72 | Arguments: 73 | target (annotation) : the target annotation to be made usable 74 | will be an ET.Element 75 | Returns: 76 | a numpy array of bounding boxes [xmin, ymin, xmax, ymax, label_ind] 77 | """ 78 | res = np.empty((0, 5)) 79 | for obj in target.iter('object'): 80 | difficult = int(obj.find('difficult').text) == 1 81 | if not self.keep_difficult and difficult: 82 | continue 83 | name = obj.find('name').text.lower().strip() 84 | bbox = obj.find('bndbox') 85 | 86 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 87 | bndbox = [] 88 | for i, pt in enumerate(pts): 89 | cur_pt = int(bbox.find(pt).text) - 1 90 | # scale height or width 91 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 92 | bndbox.append(cur_pt) 93 | label_idx = self.class_to_ind[name] 94 | bndbox.append(label_idx) 95 | # res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 96 | res = np.vstack((res, bndbox)) 97 | # img_id = target.find('filename').text[:-4] 98 | if len(res) == 0: 99 | res = np.vstack((res, [0, 0, 0, 0, 0])) 100 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 101 | 102 | 103 | class VOCDetection(data.Dataset): 104 | """VOC Detection Dataset Object 105 | 106 | input is image, target is annotation 107 | 108 | Arguments: 109 | root (string): filepath to VOCdevkit folder. 110 | image_set (string): imageset to use (eg.
'train', 'val', 'test') 111 | transform (callable, optional): transformation to perform on the 112 | input image 113 | target_transform (callable, optional): transformation to perform on the 114 | target `annotation` 115 | (eg: take in caption string, return tensor of word indices) 116 | dataset_name (string, optional): which dataset to load 117 | (default: 'VOC2007') 118 | """ 119 | 120 | def __init__(self, 121 | root, 122 | image_sets, 123 | transform=None, 124 | dataset_name='VOC0712'): 125 | self.root = root 126 | self.image_set = image_sets 127 | self.transform = transform 128 | self.target_transform = AnnotationTransform() 129 | self.name = dataset_name 130 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 131 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 132 | self.ids = list() 133 | for (year, name) in image_sets: 134 | self._year = year 135 | rootpath = os.path.join(self.root, 'VOC' + year) 136 | # rootpath = os.path.join(self.root, dataset_name + year) 137 | for line in open( 138 | os.path.join(rootpath, 'ImageSets', 'Main', 139 | name + '.txt')): 140 | self.ids.append((rootpath, line.strip())) 141 | 142 | def __getitem__(self, index): 143 | im, gt, img_info = self.pull_item(index) 144 | return im, gt, img_info 145 | 146 | def __len__(self): 147 | return len(self.ids) 148 | 149 | def pull_item(self, index): 150 | img_id = self.ids[index] 151 | 152 | if self.name != 'test': 153 | target = ET.parse(self._annopath % img_id).getroot() 154 | else: 155 | target = np.zeros((1, 5)) 156 | img = cv2.imread(self._imgpath % img_id) 157 | im_h, im_w, channels = img.shape 158 | img_info = [im_w, im_h] 159 | if self.target_transform is not None: 160 | target = self.target_transform(target, im_w, im_h) 161 | 162 | if self.name != 'test': 163 | if self.transform is not None: 164 | img, target = self.transform(img, target) 165 | else: 166 | if self.transform is not None: 167 | img = self.transform(img) 168 | 169 | return img, target, img_info 170 | 171 | def pull_image(self, index): 172 | '''Returns the original image object at index as an OpenCV BGR ndarray 173 | 174 | Note: not using self.__getitem__(), as any transformations passed in 175 | could mess up this functionality. 176 | 177 | Argument: 178 | index (int): index of img to show 179 | Return: 180 | cv2 img (BGR numpy array) 181 | ''' 182 | img_id = self.ids[index] 183 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 184 | 185 | def pull_anno(self, index): 186 | '''Returns the original annotation of image at index 187 | 188 | Note: not using self.__getitem__(), as any transformations passed in 189 | could mess up this functionality. 190 | 191 | Argument: 192 | index (int): index of img to get annotation of 193 | Return: 194 | list: [img_id, [(label, bbox coords),...]] 195 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 196 | ''' 197 | img_id = self.ids[index] 198 | anno = ET.parse(self._annopath % img_id).getroot() 199 | gt = self.target_transform(anno, 1, 1) 200 | return img_id[1], gt 201 | 202 | def pull_tensor(self, index): 203 | '''Returns the original image at an index in tensor form 204 | 205 | Note: not using self.__getitem__(), as any transformations passed in 206 | could mess up this functionality. 207 | 208 | Argument: 209 | index (int): index of img to show 210 | Return: 211 | tensorized version of img with a leading batch dimension added 212 | ''' 213 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 214 | 215 | def evaluate_detections(self, all_boxes, output_dir=None): 216 | """ 217 | all_boxes is a list of length number-of-classes.
218 | Each list element is a list of length number-of-images. 219 | Each of those list elements is either an empty list [] 220 | or a numpy array of detections. 221 | 222 | all_boxes[class][image] = [] or np.array of shape #dets x 5 223 | """ 224 | self._write_voc_results_file(all_boxes) 225 | self._do_python_eval(output_dir) 226 | 227 | def _get_voc_results_file_template(self): 228 | filename = 'comp3_det_test' + '_{:s}.txt' 229 | filedir = os.path.join(self.root, 'results', 'VOC' + self._year, 230 | 'Main') 231 | if not os.path.exists(filedir): 232 | os.makedirs(filedir) 233 | path = os.path.join(filedir, filename) 234 | return path 235 | 236 | def _write_voc_results_file(self, all_boxes): 237 | for cls_ind, cls in enumerate(VOC_CLASSES): 238 | if cls == '__background__': 239 | continue 240 | print('Writing {} VOC results file'.format(cls)) 241 | filename = self._get_voc_results_file_template().format(cls) 242 | # print(filename) 243 | with open(filename, 'wt') as f: 244 | for im_ind, index in enumerate(self.ids): 245 | index = index[1] 246 | dets = all_boxes[cls_ind][im_ind] 247 | if len(dets) == 0: 248 | continue 249 | for k in range(dets.shape[0]): 250 | f.write( 251 | '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.format( 252 | index, dets[k, -1], dets[k, 0] + 1, 253 | dets[k, 1] + 1, dets[k, 2] + 1, 254 | dets[k, 3] + 1)) 255 | 256 | def _do_python_eval(self, output_dir='output'): 257 | rootpath = os.path.join(self.root, 'VOC' + self._year) 258 | name = self.image_set[0][1] 259 | annopath = os.path.join(rootpath, 'Annotations', '{:s}.xml') 260 | imagesetfile = os.path.join(rootpath, 'ImageSets', 'Main', 261 | name + '.txt') 262 | cachedir = os.path.join(self.root, 'annotations_cache') 263 | aps = [] 264 | # The PASCAL VOC metric changed in 2010 265 | use_07_metric = int(self._year) < 2010 266 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 267 | if output_dir is not None and not os.path.isdir(output_dir): 268 | os.mkdir(output_dir) 269 | for i, cls in enumerate(VOC_CLASSES): 270 | if cls == '__background__': 271 | continue 272 | 273 | filename = self._get_voc_results_file_template().format(cls) 274 | rec, prec, ap = voc_eval( 275 | filename, 276 | annopath, 277 | imagesetfile, 278 | cls, 279 | cachedir, 280 | ovthresh=0.5, 281 | use_07_metric=use_07_metric) 282 | aps += [ap] 283 | print('AP for {} = {:.4f}'.format(cls, ap)) 284 | if output_dir is not None: 285 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 286 | 'wb') as f: 287 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 288 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 289 | print('~~~~~~~~') 290 | print('Results:') 291 | for ap in aps: 292 | print('{:.3f}'.format(ap)) 293 | print('{:.3f}'.format(np.mean(aps))) 294 | print('~~~~~~~~') 295 | print('') 296 | print('--------------------------------------------------------------') 297 | print('Results computed with the **unofficial** Python eval code.') 298 | print('Results should be very close to the official MATLAB eval code.') 299 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 300 | print('-- Thanks, The Management') 301 | print('--------------------------------------------------------------') 302 | 303 | 304 | def detection_collate(batch): 305 | """Custom collate fn for dealing with batches of images that have a different 306 | number of associated object annotations (bounding boxes).
307 | 308 | Arguments: 309 | batch: (tuple) A tuple of tensor images and lists of annotations 310 | 311 | Return: 312 | A tuple containing: 313 | 1) (tensor) batch of images stacked on their 0 dim 314 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 315 | """ 316 | targets = [] 317 | imgs = [] 318 | img_info = [] 319 | for sample in batch: 320 | imgs.append(sample[0]) 321 | targets.append(torch.FloatTensor(sample[1])) 322 | img_info.append(torch.FloatTensor(sample[2])) 323 | return torch.stack(imgs, 0), targets, img_info 324 | -------------------------------------------------------------------------------- /configs/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | from utils.collections2 import AttrDict 7 | import six 8 | import yaml 9 | import torch 10 | import torch.nn as nn 11 | from torch.nn import init 12 | import numpy as np 13 | import copy 14 | from ast import literal_eval 15 | 16 | __C = AttrDict() 17 | cfg = __C 18 | 19 | __C.MODEL = AttrDict() 20 | 21 | __C.MODEL.NUM_CLASSES = -1 22 | __C.MODEL.TYPE = '' 23 | __C.MODEL.SIZE = '300' 24 | __C.MODEL.CONV_BODY = '' 25 | __C.MODEL.REFINE = False 26 | __C.MODEL.LOAD_PRETRAINED_WEIGHTS = False 27 | __C.MODEL.PRETRAIN_WEIGHTS = '' 28 | __C.MODEL.OBJECT_SCORE = 0.01 29 | 30 | __C.TRAIN = AttrDict() 31 | __C.TRAIN.OVERLAP = 0.5 32 | __C.TRAIN.OHEM = True 33 | __C.TRAIN.NEG_RATIO = 3 34 | __C.TRAIN.FOCAL_LOSS = False 35 | __C.TRAIN.FOCAL_LOSS_TYPE = 'SOFTMAX' 36 | __C.TRAIN.BGR_MEAN = [104, 117, 123] 37 | __C.TRAIN.BATCH_SIZE = 1 38 | __C.TRAIN.CHANNEL_SIZE = '48' 39 | __C.TRAIN.WARMUP = True 40 | __C.TRAIN.WARMUP_EPOCH = 2 41 | __C.TRAIN.DEVICE_IDS = [0] 42 | __C.TRAIN.TRAIN_ON = True 43 | 44 | __C.SMALL = AttrDict() 45 | 46 | __C.SMALL.FEATURE_MAPS = [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 47 | __C.SMALL.FEATURE_MAPS_SMALL = [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 48 | __C.SMALL.FEATURE_MAPS_LARGE = [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 49 | 50 | 51 | __C.SMALL.ARM_CHANNELS = [512, 1024, 512, 256, 256, 256] 52 | __C.SMALL.ODM_CHANNELS = [256, 256, 256, 256] 53 | __C.SMALL.NUM_ANCHORS = [4, 6, 6, 6, 4, 4] 54 | __C.SMALL.STEPS = [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], 55 | [300, 300]] 56 | 57 | __C.SMALL.STEPS_SMALL = [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], 58 | [300, 300]] 59 | 60 | __C.SMALL.STEPS_LARGE = [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], 61 | [300, 300]] 62 | 63 | 64 | 65 | __C.SMALL.MIN_SIZES = [30, 60, 111, 162, 213, 264] 66 | __C.SMALL.MIN_SIZES_SMALL = [30, 60, 111, 162, 213, 264] 67 | __C.SMALL.MIN_SIZES_LARGE = [30, 60, 111, 162, 213, 264] 68 | 69 | __C.SMALL.MAX_SIZES = [60, 111, 162, 213, 264, 315] 70 | __C.SMALL.MAX_SIZES_SMALL = [60, 111, 162, 213, 264, 315] 71 | __C.SMALL.MAX_SIZES_LARGE = [60, 111, 162, 213, 264, 315] 72 | 73 | __C.SMALL.ASPECT_RATIOS = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 74 | [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 75 | __C.SMALL.ASPECT_RATIOS_SMALL = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 76 | [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 77 | __C.SMALL.ASPECT_RATIOS_LARGE = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 78 | [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 79 | 80 | __C.SMALL.VARIANCE = [0.1, 0.2] 81 | __C.SMALL.CLIP = True 82 | __C.SMALL.IMG_WH = [300, 
300] 83 | __C.SMALL.INPUT_FIXED = True 84 | __C.SMALL.USE_MAX_SIZE = True 85 | 86 | __C.BIG = AttrDict() 87 | __C.BIG.FEATURE_MAPS = [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], 88 | [1, 1]] 89 | __C.BIG.ARM_CHANNELS = [512, 1024, 512, 256, 256, 256, 256] 90 | __C.BIG.ODM_CHANNELS = [256, 256, 256, 256] 91 | __C.BIG.NUM_ANCHORS = [4, 6, 6, 6, 6, 4, 4] 92 | __C.BIG.STEPS = [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], 93 | [512, 512]] 94 | __C.BIG.MIN_SIZES = [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 95 | __C.BIG.MAX_SIZES = [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 96 | __C.BIG.ASPECT_RATIOS = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 97 | [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], 98 | [2, 0.5]] 99 | __C.BIG.VARIANCE = [0.1, 0.2] 100 | __C.BIG.CLIP = True 101 | __C.BIG.IMG_WH = [512, 512] 102 | __C.BIG.INPUT_FIXED = True 103 | __C.BIG.USE_MAX_SIZE = True 104 | 105 | __C.SOLVER = AttrDict() 106 | 107 | __C.SOLVER.WEIGHT_DECAY = 0.0005 108 | __C.SOLVER.BASE_LR = 0.001 109 | __C.SOLVER.GAMMA = 0.1 110 | __C.SOLVER.MOMENTUM = 0.9 111 | __C.SOLVER.EPOCH_STEPS = [] 112 | __C.SOLVER.LR = [] 113 | __C.SOLVER.END_EPOCH = 1 114 | __C.SOLVER.START_EPOCH = 0 115 | 116 | __C.DATASETS = AttrDict() 117 | 118 | VOCROOT = 'data/datasets/VOCdevkit0712/' 119 | COCOROOT = 'data/datasets/coco2015' 120 | UAVROOT = '/raid/jing/data/dataset/UAVdevkit2017/' 121 | DOTAROOT = '/raid/jing/data/dataset/DOTAdevkit2019/' 122 | IIAI_SATROOT = '/raid/flx/data/jing_detection/data/IIAI_SATdevkit2019/' 123 | Objects365_TinyROOT = '/raid/jing/data/dataset/Objects365/' 124 | 125 | __C.DATASETS.TRAIN_TYPE = [] 126 | __C.DATASETS.VAL_TYPE = [] 127 | __C.DATASETS.DATAROOT = VOCROOT 128 | __C.DATASETS.DATA_TYPE = '' 129 | 130 | __C.DATASETS.SETS = AttrDict() 131 | __C.DATASETS.SETS.VOC = [['0712', '0712_trainval']] 132 | __C.DATASETS.SETS.VOC0712PLUS = [['0712', '0712_trainval_test']] 133 | __C.DATASETS.SETS.VOC0712 = [['2012', '2012_trainval']] 134 | __C.DATASETS.SETS.VOC2007 = [['0712', "2007_test"]] 135 | __C.DATASETS.SETS.COCO = [['2014', 'train'], ['2014', 'valminusminival']] 136 | __C.DATASETS.SETS.COCOval = [['2014', 'minival']] 137 | __C.DATASETS.SETS.UAV = [['2017', '2017_trainval']] 138 | __C.DATASETS.SETS.UAVval = [['2017', 'test']] 139 | __C.DATASETS.SETS.DOTA = [['2019', 'train']] 140 | __C.DATASETS.SETS.IIAI_SAT = [['2019', 'train']] 141 | __C.DATASETS.SETS.IIAI_SATval = [['2019', 'val']] 142 | __C.DATASETS.SETS.DOTAval = [['2019', 'val']] 143 | __C.DATASETS.SETS.Objects365_Tiny = [['train']] 144 | __C.DATASETS.SETS.Objects365_Tinyval = [['val']] 145 | __C.DATASETS.SETS.VOCROOT = VOCROOT 146 | __C.DATASETS.SETS.COCOROOT = COCOROOT 147 | __C.DATASETS.SETS.UAVROOT = UAVROOT 148 | __C.DATASETS.SETS.DOTAROOT = DOTAROOT 149 | __C.DATASETS.SETS.IIAI_SATROOT = IIAI_SATROOT 150 | __C.DATASETS.SETS.Objects365_TinyROOT = Objects365_TinyROOT 151 | 152 | __C.TEST = AttrDict() 153 | __C.TEST.INPUT_WH = [300, 300] 154 | __C.TEST.CONFIDENCE_THRESH = 0.01 155 | __C.TEST.NMS_TYPE = 'NMS' 156 | __C.TEST.NMS_OVERLAP = 0.45 157 | __C.TEST.BATCH_SIZE = 16 158 | 159 | VOC_CLASSES = ( 160 | '__background__', # always index 0 161 | 'aeroplane', 162 | 'bicycle', 163 | 'bird', 164 | 'boat', 165 | 'bottle', 166 | 'bus', 167 | 'car', 168 | 'cat', 169 | 'chair', 170 | 'cow', 171 | 'diningtable', 172 | 'dog', 173 | 'horse', 174 | 'motorbike', 175 | 'person', 176 | 'pottedplant', 177 | 'sheep', 178 | 'sofa', 179 | 'train', 180 | 'tvmonitor') 181 | 182 | UAV_CLASSES = ( 183 | 
'__background__', # always index 0 184 | 'car') 185 | 186 | DOTA_CLASSES = ( 187 | '__background__', # always index 0 188 | 'plane', 189 | 'ship', 190 | 'storge-tank', 191 | 'baseball-diamond', 192 | 'tennis-court', 193 | 'baskeball-court', 194 | 'ground-track-field', 195 | 'harbor', 196 | 'bridge', 197 | 'large-vehicle', 198 | 'small-vehicle', 199 | 'helicopter', 200 | 'roundabout', 201 | 'soccer-ball-field', 202 | 'swimming-pool', 203 | 'container-crane' 204 | ) 205 | 206 | COCO_CLASSES = ('__background__', 'person', 'bicycle', 'car', 'motorbike', 207 | 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 208 | 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 209 | 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 210 | 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 211 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 212 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 213 | 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 214 | 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 215 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 216 | 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 217 | 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 218 | 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 219 | 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 220 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 221 | 222 | Objects365_Tiny_CLASSES = ('__background__', 'pomelo', 'pig', 'race car', 'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 223 | 'hair drier', 'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', 224 | 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', 'shark', 'steak', 'poker card', 225 | 'binoculars', 'llama', 'radish', 'noodles', 'mop', 'yak', 'crab', 'microscope', 'barbell', 'bread/bun', 226 | 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'mangosteen', 'seal', 'comb', 'eraser', 'pitaya', 227 | 'scallop', 'pencil case', 'saw', 'table tennis paddle', 'okra', 'starfish', 'eagle', 'monkey', 'durian', 228 | 'rabbit', 'game board', 'french horn', 'ambulance', 'hoverboard', 'asparagus', 'pasta', 'target', 229 | 'hotair balloon', 'chainsaw', 'lobster', 'iron', 'flashlight') 230 | 231 | 232 | 233 | def merge_cfg_from_file(cfg_filename): 234 | """Load a yaml config file and merge it into the global config.""" 235 | with open(cfg_filename, 'r') as f: 236 | yaml_cfg = AttrDict(yaml.safe_load(f)) 237 | _merge_a_into_b(yaml_cfg, __C) 238 | 239 | 240 | cfg_from_file = merge_cfg_from_file 241 | 242 | 243 | def merge_cfg_from_cfg(cfg_other): 244 | """Merge `cfg_other` into the global config.""" 245 | _merge_a_into_b(cfg_other, __C) 246 | 247 | 248 | def _merge_a_into_b(a, b, stack=None): 249 | """Merge config dictionary a into config dictionary b, clobbering the 250 | options in b whenever they are also specified in a. 251 | """ 252 | assert isinstance(a, AttrDict), 'Argument `a` must be an AttrDict' 253 | assert isinstance(b, AttrDict), 'Argument `b` must be an AttrDict' 254 | 255 | for k, v_ in a.items(): 256 | full_key = '.'.join(stack) + '.'
+ k if stack is not None else k 257 | # a must specify keys that are in b 258 | if k not in b: 259 | raise KeyError('Non-existent config key: {}'.format(full_key)) 260 | 261 | v = copy.deepcopy(v_) 262 | v = _decode_cfg_value(v) 263 | v = _check_and_coerce_cfg_value_type(v, b[k], k, full_key) 264 | 265 | # Recursively merge dicts 266 | if isinstance(v, AttrDict): 267 | try: 268 | stack_push = [k] if stack is None else stack + [k] 269 | _merge_a_into_b(v, b[k], stack=stack_push) 270 | except BaseException: 271 | raise 272 | else: 273 | b[k] = v 274 | 275 | 276 | def _decode_cfg_value(v): 277 | """Decodes a raw config value (e.g., from a yaml config file or command 278 | line argument) into a Python object. 279 | """ 280 | # Configs parsed from raw yaml will contain dictionary keys that need to be 281 | # converted to AttrDict objects 282 | if isinstance(v, dict): 283 | return AttrDict(v) 284 | # All remaining processing is only applied to strings 285 | if not isinstance(v, six.string_types): 286 | return v 287 | # Try to interpret `v` as a: 288 | # string, number, tuple, list, dict, boolean, or None 289 | try: 290 | v = literal_eval(v) 291 | # The following two excepts allow v to pass through when it represents a 292 | # string. 293 | # 294 | # Longer explanation: 295 | # The type of v is always a string (before calling literal_eval), but 296 | # sometimes it *represents* a string and other times a data structure, like 297 | # a list. In the case that v represents a string, what we got back from the 298 | # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is 299 | # ok with '"foo"', but will raise a ValueError if given 'foo'. In other 300 | # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval 301 | # will raise a SyntaxError. 302 | except ValueError: 303 | pass 304 | except SyntaxError: 305 | pass 306 | return v 307 | 308 | 309 | def _check_and_coerce_cfg_value_type(value_a, value_b, key, full_key): 310 | """Checks that `value_a`, which is intended to replace `value_b`, is of the 311 | right type. The type is correct if it matches exactly or is one of a few 312 | cases in which the type can be easily coerced. 313 | """ 314 | # The types must match (with some exceptions) 315 | type_b = type(value_b) 316 | type_a = type(value_a) 317 | if type_a is type_b: 318 | return value_a 319 | 320 | # Exceptions: numpy arrays, strings, tuple<->list 321 | if isinstance(value_b, np.ndarray): 322 | value_a = np.array(value_a, dtype=value_b.dtype) 323 | elif isinstance(value_b, six.string_types): 324 | value_a = str(value_a) 325 | elif isinstance(value_a, tuple) and isinstance(value_b, list): 326 | value_a = list(value_a) 327 | elif isinstance(value_a, list) and isinstance(value_b, tuple): 328 | value_a = tuple(value_a) 329 | else: 330 | raise ValueError( 331 | 'Type mismatch ({} vs. {}) with values ({} vs. {}) for config ' 332 | 'key: {}'.format(type_b, type_a, value_b, value_a, full_key)) 333 | return value_a 334 | --------------------------------------------------------------------------------
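For orientation, the sketch below shows how the files above are typically wired together: an experiment YAML is merged into the global cfg, a VOCDetection dataset is built, and batches are formed with detection_collate. This is a minimal illustration, not the repo's training entry point; the YAML filename, the image-set list, and the to_tensor_transform stand-in (the real augmentation lives in data/data_augment.py and may use a different box convention) are assumptions chosen to match the defaults in config.py.

import cv2
import numpy as np
import torch
import torch.utils.data as data

from configs.config import cfg, merge_cfg_from_file
from data.voc0712 import VOCDetection, detection_collate

# Merge an experiment YAML into the global AttrDict config.
merge_cfg_from_file('configs/EFGRNet_vgg_coco_dcn.yaml')


def to_tensor_transform(img, target):
    # Stand-in for the augmentation pipeline in data/data_augment.py:
    # resize to the fixed input size, subtract the BGR mean, rescale the
    # boxes accordingly and return a CHW float tensor plus the adjusted
    # [xmin, ymin, xmax, ymax, label] rows.
    h, w = img.shape[:2]
    out_w, out_h = cfg.SMALL.IMG_WH
    img = cv2.resize(img, (out_w, out_h)).astype(np.float32)
    img -= np.array(cfg.TRAIN.BGR_MEAN, dtype=np.float32)
    target = target.copy()
    target[:, 0:4:2] *= out_w / float(w)
    target[:, 1:4:2] *= out_h / float(h)
    return torch.from_numpy(img).permute(2, 0, 1), target


dataset = VOCDetection(cfg.DATASETS.DATAROOT,         # VOCdevkit root
                       cfg.DATASETS.SETS.VOC,         # [['0712', '0712_trainval']]
                       transform=to_tensor_transform)

# detection_collate stacks the images but keeps one target tensor per image,
# since every image carries a different number of boxes.
loader = data.DataLoader(dataset,
                         batch_size=cfg.TRAIN.BATCH_SIZE,
                         shuffle=True,
                         collate_fn=detection_collate)

images, targets, img_info = next(iter(loader))
# images: (B, 3, H, W) float tensor; targets: list of (num_objects, 5) tensors.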