├── utils
│   ├── DCN
│   │   ├── make.sh
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   ├── deform_conv2d_func.py
│   │   │   └── modulated_deform_conv2d_func.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── modulated_deform_conv2d.py
│   │   │   └── deform_conv2d.py
│   │   ├── src
│   │   │   ├── vision.cpp
│   │   │   ├── cpu
│   │   │   │   ├── deform_conv2d_cpu.h
│   │   │   │   ├── deform_conv2d_cpu.cpp
│   │   │   │   ├── modulated_deform_conv2d_cpu.h
│   │   │   │   └── modulated_deform_conv2d_cpu.cpp
│   │   │   ├── cuda
│   │   │   │   ├── deform_conv2d_cuda.h
│   │   │   │   └── modulated_deform_conv2d_cuda.h
│   │   │   ├── deform_conv2d.h
│   │   │   └── modulated_deform_conv2d.h
│   │   ├── setup.py
│   │   └── deform_conv2d_naive.py
│   ├── __init__.py
│   ├── fp16_utils
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── fp16util.py
│   │   └── loss_scaler.py
│   ├── vis_utils.py
│   ├── utils.py
│   ├── distributed_util.py
│   ├── voc_evaluator.py
│   └── cocoapi_evaluator.py
├── dataset
│   ├── __init__.py
│   ├── mixupdetection.py
│   ├── cocodataset.py
│   ├── voc_eval.py
│   ├── dataloading.py
│   ├── vocdataset.py
│   └── data_augment.py
├── make.sh
├── doc
│   └── asff.png
├── example
│   └── test.jpg
├── .gitignore
├── config
│   ├── yolov3_mobile.cfg
│   └── yolov3_baseline.cfg
├── models
│   ├── utils_loss.py
│   ├── yolov3_baseline.py
│   ├── yolov3_asff.py
│   ├── yolov3_mobilev2.py
│   └── yolov3_head.py
├── demo.py
├── eval.py
└── README.md
/utils/DCN/make.sh: -------------------------------------------------------------------------------- 1 | python setup.py build install 2 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | cd utils/DCN 2 | 3 | python setup.py install 4 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | -------------------------------------------------------------------------------- /doc/asff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/ASFF/master/doc/asff.png -------------------------------------------------------------------------------- /example/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/ASFF/master/example/test.jpg -------------------------------------------------------------------------------- /utils/DCN/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv2d_func import DeformConv2dFunction 2 | from .modulated_deform_conv2d_func import ModulatedDeformConv2dFunction 3 | -------------------------------------------------------------------------------- /utils/DCN/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv2d import DeformConv2d, _DeformConv2d, DeformConv2dPack, DeformConv2dPackMore 2 | from .modulated_deform_conv2d import ModulatedDeformConv2d, _ModulatedDeformConv2d, ModulatedDeformConv2dPack 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.pyc 4 | 5 | # C 
extensions 6 | *.so 7 | *.o 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | 13 | *.swp 14 | 15 | weights/ 16 | log/ 17 | save/ 18 | trained_model/ 19 | dist/ 20 | *.egg-info/ 21 | 22 | -------------------------------------------------------------------------------- /config/yolov3_mobile.cfg: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: YOLOv3 3 | BACKBONE: mobile 4 | TRAIN: 5 | LR: 0.001 6 | MOMENTUM: 0.9 7 | DECAY: 0.0005 8 | BURN_IN: 5 9 | MAXEPOCH: 300 10 | COS: True 11 | SYBN: True 12 | MIX: True 13 | NO_MIXUP_EPOCHS: 30 14 | LABAL_SMOOTH: True 15 | BATCHSIZE: 8 16 | IMGSIZE: 416 17 | IGNORETHRE: 0.7 18 | RANDRESIZE: True 19 | TEST: 20 | CONFTHRE: 0.001 21 | NMSTHRE: 0.65 22 | -------------------------------------------------------------------------------- /config/yolov3_baseline.cfg: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: YOLOv3 3 | BACKBONE: darknet53 4 | TRAIN: 5 | LR: 0.001 6 | MOMENTUM: 0.9 7 | DECAY: 0.0005 8 | BURN_IN: 5 9 | MAXEPOCH: 300 10 | COS: True 11 | SYBN: True 12 | MIX: True 13 | NO_MIXUP_EPOCHS: 30 14 | LABAL_SMOOTH: True 15 | BATCHSIZE: 5 16 | IMGSIZE: 608 17 | IGNORETHRE: 0.7 18 | RANDRESIZE: True 19 | TEST: 20 | CONFTHRE: 0.01 21 | NMSTHRE: 0.65 22 | IMGSIZE: 608 23 | -------------------------------------------------------------------------------- /utils/fp16_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16util import ( 2 | BN_convert_float, 3 | network_to_half, 4 | prep_param_lists, 5 | model_grads_to_master_grads, 6 | master_params_to_model_params, 7 | tofp16, 8 | to_python_float, 9 | clip_grad_norm, 10 | convert_module, 11 | convert_network, 12 | FP16Model, 13 | ) 14 | 15 | from .fp16_optimizer import FP16_Optimizer 16 | from .loss_scaler import LossScaler, DynamicLossScaler 17 | -------------------------------------------------------------------------------- /utils/DCN/src/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "deform_conv2d.h" 3 | #include "modulated_deform_conv2d.h" 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("deform_conv2d_forward", &deform_conv2d_forward, "deform_conv2d_forward"); 7 | m.def("deform_conv2d_backward", &deform_conv2d_backward, "deform_conv2d_backward"); 8 | m.def("modulated_deform_conv2d_forward", &modulated_deform_conv2d_forward, "modulated_deform_conv2d_forward"); 9 | m.def("modulated_deform_conv2d_backward", &modulated_deform_conv2d_backward, "modulated_deform_conv2d_backward"); 10 | } 11 | -------------------------------------------------------------------------------- /utils/fp16_utils/README.md: -------------------------------------------------------------------------------- 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user. To use `FP16_Optimizer`, only two lines of one's Python model need to change. 
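A minimal sketch of that two-line change (the toy model, input, and loss-scale value are placeholders, not part of this repo; the wrapper is assumed to behave like the apex `FP16_Optimizer` documented in the links below):

```python
import torch
from utils.fp16_utils import FP16_Optimizer  # local copy; apex ships the same class

model = torch.nn.Linear(16, 4).cuda().half()                    # placeholder FP16 model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)  # change 1: wrap the optimizer

x = torch.randn(8, 16, device="cuda", dtype=torch.float16)
loss = model(x).float().sum()                                   # placeholder loss
optimizer.zero_grad()
optimizer.backward(loss)                                        # change 2: replaces loss.backward()
optimizer.step()
```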
2 | 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling) 4 | 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple) 6 | 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 8 | 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) 10 | 11 | 12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses. 13 | 14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management) 15 | 16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling. These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically. 17 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/deform_conv2d_cpu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | deform_conv2d_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const int kernel_h, 10 | const int kernel_w, 11 | const int stride_h, 12 | const int stride_w, 13 | const int pad_h, 14 | const int pad_w, 15 | const int dilation_h, 16 | const int dilation_w, 17 | const int group, 18 | const int deformable_group, 19 | const int im2col_step); 20 | 21 | std::vector 22 | deform_conv2d_cpu_backward(const at::Tensor &input, 23 | const at::Tensor &weight, 24 | const at::Tensor &bias, 25 | const at::Tensor &offset, 26 | const at::Tensor &grad_output, 27 | const int kernel_h, 28 | const int kernel_w, 29 | const int stride_h, 30 | const int stride_w, 31 | const int pad_h, 32 | const int pad_w, 33 | const int dilation_h, 34 | const int dilation_w, 35 | const int group, 36 | const int deformable_group, 37 | const int im2col_step); 38 | 39 | 40 | -------------------------------------------------------------------------------- /utils/DCN/src/cuda/deform_conv2d_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | deform_conv2d_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const int kernel_h, 10 | const int kernel_w, 11 | const int stride_h, 12 | const int stride_w, 13 | const int pad_h, 14 | const int pad_w, 15 | const int dilation_h, 16 | const int dilation_w, 17 | const int group, 18 | const int deformable_group, 19 | const int im2col_step); 20 | 21 | std::vector 22 | deform_conv2d_cuda_backward(const at::Tensor &input, 23 | const at::Tensor &weight, 24 | const at::Tensor &bias, 25 | const at::Tensor &offset, 26 | const at::Tensor &grad_output, 27 | const int kernel_h, 28 | const int kernel_w, 29 | const int stride_h, 30 | const int stride_w, 31 | const int pad_h, 32 | const int pad_w, 33 | const int dilation_h, 34 | const int dilation_w, 35 | const int group, 36 | const int deformable_group, 37 | const int 
im2col_step); 38 | 39 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/deform_conv2d_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | at::Tensor 8 | deform_conv2d_cpu_forward(const at::Tensor &input, 9 | const at::Tensor &weight, 10 | const at::Tensor &bias, 11 | const at::Tensor &offset, 12 | const int kernel_h, 13 | const int kernel_w, 14 | const int stride_h, 15 | const int stride_w, 16 | const int pad_h, 17 | const int pad_w, 18 | const int dilation_h, 19 | const int dilation_w, 20 | const int group, 21 | const int deformable_group, 22 | const int im2col_step) 23 | { 24 | AT_ERROR("Not implement on cpu"); 25 | } 26 | 27 | std::vector 28 | deform_conv2d_cpu_backward(const at::Tensor &input, 29 | const at::Tensor &weight, 30 | const at::Tensor &bias, 31 | const at::Tensor &offset, 32 | const at::Tensor &grad_output, 33 | const int kernel_h, 34 | const int kernel_w, 35 | const int stride_h, 36 | const int stride_w, 37 | const int pad_h, 38 | const int pad_w, 39 | const int dilation_h, 40 | const int dilation_w, 41 | const int group, 42 | const int deformable_group, 43 | const int im2col_step) 44 | { 45 | AT_ERROR("Not implement on cpu"); 46 | } 47 | 48 | -------------------------------------------------------------------------------- /utils/DCN/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import glob 5 | 6 | import torch 7 | 8 | from torch.utils.cpp_extension import CUDA_HOME 9 | from torch.utils.cpp_extension import CppExtension 10 | from torch.utils.cpp_extension import CUDAExtension 11 | 12 | from setuptools import find_packages 13 | from setuptools import setup 14 | 15 | requirements = ["torch", "torchvision"] 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "src") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | extra_compile_args = {"cxx": []} 28 | define_macros = [] 29 | 30 | if torch.cuda.is_available() and CUDA_HOME is not None: 31 | extension = CUDAExtension 32 | sources += source_cuda 33 | define_macros += [("WITH_CUDA", None)] 34 | extra_compile_args["nvcc"] = [ 35 | "-DCUDA_HAS_FP16=1", 36 | "-D__CUDA_NO_HALF_OPERATORS__", 37 | "-D__CUDA_NO_HALF_CONVERSIONS__", 38 | "-D__CUDA_NO_HALF2_OPERATORS__", 39 | ] 40 | else: 41 | raise NotImplementedError('Cuda is not availabel') 42 | 43 | sources = [os.path.join(extensions_dir, s) for s in sources] 44 | include_dirs = [extensions_dir] 45 | ext_modules = [ 46 | extension( 47 | "DCN", 48 | sources, 49 | include_dirs=include_dirs, 50 | define_macros=define_macros, 51 | extra_compile_args=extra_compile_args, 52 | ) 53 | ] 54 | return ext_modules 55 | 56 | setup( 57 | name="DCN", 58 | version="1.0", 59 | description="deformable convolutional networks", 60 | packages=find_packages(exclude=("configs", "tests",)), 61 | ext_modules=get_extensions(), 62 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 63 | ) 64 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/modulated_deform_conv2d_cpu.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | modulated_deform_conv2d_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int group, 19 | const int deformable_group, 20 | const int im2col_step); 21 | 22 | std::vector 23 | modulated_deform_conv2d_cpu_backward(const at::Tensor &input, 24 | const at::Tensor &weight, 25 | const at::Tensor &bias, 26 | const at::Tensor &offset, 27 | const at::Tensor &mask, 28 | const at::Tensor &grad_output, 29 | const int kernel_h, 30 | const int kernel_w, 31 | const int stride_h, 32 | const int stride_w, 33 | const int pad_h, 34 | const int pad_w, 35 | const int dilation_h, 36 | const int dilation_w, 37 | const int group, 38 | const int deformable_group, 39 | const int im2col_step); 40 | 41 | 42 | -------------------------------------------------------------------------------- /utils/DCN/src/cuda/modulated_deform_conv2d_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | modulated_deform_conv2d_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int group, 19 | const int deformable_group, 20 | const int im2col_step); 21 | 22 | std::vector 23 | modulated_deform_conv2d_cuda_backward(const at::Tensor &input, 24 | const at::Tensor &weight, 25 | const at::Tensor &bias, 26 | const at::Tensor &offset, 27 | const at::Tensor &mask, 28 | const at::Tensor &grad_output, 29 | const int kernel_h, 30 | const int kernel_w, 31 | const int stride_h, 32 | const int stride_w, 33 | const int pad_h, 34 | const int pad_w, 35 | const int dilation_h, 36 | const int dilation_w, 37 | const int group, 38 | const int deformable_group, 39 | const int im2col_step); 40 | 41 | -------------------------------------------------------------------------------- /models/utils_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | 7 | class IOUWH_loss(nn.Module): #used for anchor guiding 8 | def __init__(self, reduction='none'): 9 | super(IOUWH_loss, self).__init__() 10 | self.reduction = reduction 11 | 12 | def forward(self, pred, target): 13 | orig_shape = pred.shape 14 | pred = pred.view(-1,4) 15 | target = target.view(-1,4) 16 | target[:,:2] = 0 17 | tl = torch.max((target[:, :2]-pred[:,2:]/2), 18 | (target[:, :2] - target[:, 2:]/2)) 19 | 20 | br = torch.min((target[:, :2]+pred[:,2:]/2), 21 | (target[:, :2] + target[:, 2:]/2)) 22 | 23 | area_p = torch.prod(pred[:,2:], 1) 24 | area_g = torch.prod(target[:,2:], 1) 25 | 26 | en = (tl< br).type(tl.type()).prod(dim=1) 27 | area_i = torch.prod(br-tl, 1) * en 28 | U = area_p+area_g-area_i+ 1e-16 29 | iou= area_i / U 30 | 31 | loss = 1-iou**2 32 | if self.reduction =='mean': 33 | loss = loss.mean() 34 | elif 
self.reduction == 'sum': 35 | loss = loss.sum() 36 | 37 | return loss 38 | 39 | class IOUloss(nn.Module): 40 | def __init__(self, reduction='none'): 41 | super(IOUloss, self).__init__() 42 | self.reduction = reduction 43 | 44 | def forward(self, pred, target): 45 | orig_shape = pred.shape 46 | pred = pred.view(-1,4) 47 | target = target.view(-1,4) 48 | tl = torch.max((pred[:, :2]-pred[:,2:]/2), 49 | (target[:, :2] - target[:, 2:]/2)) 50 | br = torch.min((pred[:, :2]+pred[:,2:]/2), 51 | (target[:, :2] + target[:, 2:]/2)) 52 | 53 | area_p = torch.prod(pred[:,2:], 1) 54 | area_g = torch.prod(target[:,2:], 1) 55 | 56 | en = (tl< br).type(tl.type()).prod(dim=1) 57 | area_i = torch.prod(br-tl, 1) * en 58 | iou= (area_i) / (area_p+area_g-area_i+ 1e-16) 59 | 60 | loss = 1-iou**2 61 | if self.reduction =='mean': 62 | loss = loss.mean() 63 | elif self.reduction == 'sum': 64 | loss = loss.sum() 65 | 66 | return loss 67 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/modulated_deform_conv2d_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | at::Tensor 8 | modulated_deform_conv2d_cpu_forward(const at::Tensor &input, 9 | const at::Tensor &weight, 10 | const at::Tensor &bias, 11 | const at::Tensor &offset, 12 | const at::Tensor &mask, 13 | const int kernel_h, 14 | const int kernel_w, 15 | const int stride_h, 16 | const int stride_w, 17 | const int pad_h, 18 | const int pad_w, 19 | const int dilation_h, 20 | const int dilation_w, 21 | const int group, 22 | const int deformable_group, 23 | const int im2col_step) 24 | { 25 | AT_ERROR("Not implement on cpu"); 26 | } 27 | 28 | std::vector 29 | modulated_deform_conv2d_cpu_backward(const at::Tensor &input, 30 | const at::Tensor &weight, 31 | const at::Tensor &bias, 32 | const at::Tensor &offset, 33 | const at::Tensor &mask, 34 | const at::Tensor &grad_output, 35 | const int kernel_h, 36 | const int kernel_w, 37 | const int stride_h, 38 | const int stride_w, 39 | const int pad_h, 40 | const int pad_w, 41 | const int dilation_h, 42 | const int dilation_w, 43 | const int group, 44 | const int deformable_group, 45 | const int im2col_step) 46 | { 47 | AT_ERROR("Not implement on cpu"); 48 | } 49 | 50 | -------------------------------------------------------------------------------- /utils/DCN/functions/deform_conv2d_func.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import math 7 | import torch 8 | from torch import nn 9 | from torch.autograd import Function 10 | from torch.nn.modules.utils import _pair 11 | from torch.autograd.function import once_differentiable 12 | from apex import amp 13 | import DCN 14 | 15 | class DeformConv2dFunction(Function): 16 | @staticmethod 17 | @amp.float_function 18 | def forward(ctx, input, offset, weight, bias, 19 | stride, padding, dilation, group, deformable_groups, im2col_step): 20 | ctx.stride = _pair(stride) 21 | ctx.padding = _pair(padding) 22 | ctx.dilation = _pair(dilation) 23 | ctx.kernel_size = _pair(weight.shape[2:4]) 24 | ctx.group = group 25 | ctx.deformable_groups = deformable_groups 26 | ctx.im2col_step = im2col_step 27 | output = DCN.deform_conv2d_forward(input, weight, bias, 28 | offset, 29 | ctx.kernel_size[0], ctx.kernel_size[1], 30 | ctx.stride[0], ctx.stride[1], 31 | 
ctx.padding[0], ctx.padding[1], 32 | ctx.dilation[0], ctx.dilation[1], 33 | ctx.group, 34 | ctx.deformable_groups, 35 | ctx.im2col_step) 36 | ctx.save_for_backward(input, offset, weight, bias) 37 | return output 38 | 39 | @staticmethod 40 | @once_differentiable 41 | @amp.float_function 42 | def backward(ctx, grad_output): 43 | input, offset, weight, bias = ctx.saved_tensors 44 | grad_input, grad_offset, grad_weight, grad_bias = \ 45 | DCN.deform_conv2d_backward(input, weight, 46 | bias, 47 | offset, 48 | grad_output, 49 | ctx.kernel_size[0], ctx.kernel_size[1], 50 | ctx.stride[0], ctx.stride[1], 51 | ctx.padding[0], ctx.padding[1], 52 | ctx.dilation[0], ctx.dilation[1], 53 | ctx.group, 54 | ctx.deformable_groups, 55 | ctx.im2col_step) 56 | 57 | return grad_input, grad_offset, grad_weight, grad_bias,\ 58 | None, None, None, None, None, None 59 | -------------------------------------------------------------------------------- /utils/DCN/functions/modulated_deform_conv2d_func.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import math 7 | import torch 8 | from torch import nn 9 | from torch.autograd import Function 10 | from torch.nn.modules.utils import _pair 11 | from torch.autograd.function import once_differentiable 12 | 13 | import DCN 14 | 15 | class ModulatedDeformConv2dFunction(Function): 16 | @staticmethod 17 | def forward(ctx, input, offset, mask, weight, bias, 18 | stride, padding, dilation, groups, deformable_groups, im2col_step): 19 | ctx.stride = _pair(stride) 20 | ctx.padding = _pair(padding) 21 | ctx.dilation = _pair(dilation) 22 | ctx.kernel_size = _pair(weight.shape[2:4]) 23 | ctx.groups = groups 24 | ctx.deformable_groups = deformable_groups 25 | ctx.im2col_step = im2col_step 26 | output = DCN.modulated_deform_conv2d_forward(input, weight, bias, 27 | offset, mask, 28 | ctx.kernel_size[0], ctx.kernel_size[1], 29 | ctx.stride[0], ctx.stride[1], 30 | ctx.padding[0], ctx.padding[1], 31 | ctx.dilation[0], ctx.dilation[1], 32 | ctx.groups, 33 | ctx.deformable_groups, 34 | ctx.im2col_step) 35 | ctx.save_for_backward(input, offset, mask, weight, bias) 36 | return output 37 | 38 | @staticmethod 39 | @once_differentiable 40 | def backward(ctx, grad_output): 41 | input, offset, mask, weight, bias = ctx.saved_tensors 42 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ 43 | DCN.modulated_deform_conv2d_backward(input, weight, 44 | bias, 45 | offset, mask, 46 | grad_output, 47 | ctx.kernel_size[0], ctx.kernel_size[1], 48 | ctx.stride[0], ctx.stride[1], 49 | ctx.padding[0], ctx.padding[1], 50 | ctx.dilation[0], ctx.dilation[1], 51 | ctx.groups, 52 | ctx.deformable_groups, 53 | ctx.im2col_step) 54 | 55 | return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ 56 | None, None, None, None, None, None 57 | -------------------------------------------------------------------------------- /utils/DCN/src/deform_conv2d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/deform_conv2d_cpu.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/deform_conv2d_cuda.h" 7 | #endif 8 | 9 | 10 | at::Tensor 11 | deform_conv2d_forward(const at::Tensor &input, 12 | const at::Tensor &weight, 13 | const at::Tensor &bias, 14 | const at::Tensor &offset, 15 | const int kernel_h, 16 | const int kernel_w, 17 | const int stride_h, 18 | 
const int stride_w, 19 | const int pad_h, 20 | const int pad_w, 21 | const int dilation_h, 22 | const int dilation_w, 23 | const int group, 24 | const int deformable_group, 25 | const int im2col_step) 26 | { 27 | if (input.type().is_cuda()) 28 | { 29 | #ifdef WITH_CUDA 30 | return deform_conv2d_cuda_forward(input, weight, bias, offset, 31 | kernel_h, kernel_w, 32 | stride_h, stride_w, 33 | pad_h, pad_w, 34 | dilation_h, dilation_w, 35 | group, 36 | deformable_group, 37 | im2col_step); 38 | #else 39 | AT_ERROR("Not compiled with GPU support"); 40 | #endif 41 | } 42 | AT_ERROR("Not implemented on the CPU"); 43 | } 44 | 45 | std::vector 46 | deform_conv2d_backward(const at::Tensor &input, 47 | const at::Tensor &weight, 48 | const at::Tensor &bias, 49 | const at::Tensor &offset, 50 | const at::Tensor &grad_output, 51 | const int kernel_h, 52 | const int kernel_w, 53 | const int stride_h, 54 | const int stride_w, 55 | const int pad_h, 56 | const int pad_w, 57 | const int dilation_h, 58 | const int dilation_w, 59 | const int group, 60 | const int deformable_group, 61 | const int im2col_step) 62 | { 63 | if (input.type().is_cuda()) 64 | { 65 | #ifdef WITH_CUDA 66 | return deform_conv2d_cuda_backward(input, 67 | weight, 68 | bias, 69 | offset, 70 | grad_output, 71 | kernel_h, kernel_w, 72 | stride_h, stride_w, 73 | pad_h, pad_w, 74 | dilation_h, dilation_w, 75 | group, 76 | deformable_group, 77 | im2col_step); 78 | #else 79 | AT_ERROR("Not compiled with GPU support"); 80 | #endif 81 | } 82 | AT_ERROR("Not implemented on the CPU"); 83 | } 84 | 85 | -------------------------------------------------------------------------------- /utils/DCN/src/modulated_deform_conv2d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/modulated_deform_conv2d_cpu.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/modulated_deform_conv2d_cuda.h" 7 | #endif 8 | 9 | 10 | at::Tensor 11 | modulated_deform_conv2d_forward(const at::Tensor &input, 12 | const at::Tensor &weight, 13 | const at::Tensor &bias, 14 | const at::Tensor &offset, 15 | const at::Tensor &mask, 16 | const int kernel_h, 17 | const int kernel_w, 18 | const int stride_h, 19 | const int stride_w, 20 | const int pad_h, 21 | const int pad_w, 22 | const int dilation_h, 23 | const int dilation_w, 24 | const int group, 25 | const int deformable_group, 26 | const int im2col_step) 27 | { 28 | if (input.type().is_cuda()) 29 | { 30 | #ifdef WITH_CUDA 31 | return modulated_deform_conv2d_cuda_forward(input, weight, bias, offset, mask, 32 | kernel_h, kernel_w, 33 | stride_h, stride_w, 34 | pad_h, pad_w, 35 | dilation_h, dilation_w, 36 | group, 37 | deformable_group, 38 | im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | modulated_deform_conv2d_backward(const at::Tensor &input, 48 | const at::Tensor &weight, 49 | const at::Tensor &bias, 50 | const at::Tensor &offset, 51 | const at::Tensor &mask, 52 | const at::Tensor &grad_output, 53 | const int kernel_h, 54 | const int kernel_w, 55 | const int stride_h, 56 | const int stride_w, 57 | const int pad_h, 58 | const int pad_w, 59 | const int dilation_h, 60 | const int dilation_w, 61 | const int group, 62 | const int deformable_group, 63 | const int im2col_step) 64 | { 65 | if (input.type().is_cuda()) 66 | { 67 | #ifdef WITH_CUDA 68 | return modulated_deform_conv2d_cuda_backward(input, 69 | weight, 70 | bias, 71 | offset, 72 | mask, 73 | 
grad_output, 74 | kernel_h, kernel_w, 75 | stride_h, stride_w, 76 | pad_h, pad_w, 77 | dilation_h, dilation_w, 78 | group, 79 | deformable_group, 80 | im2col_step); 81 | #else 82 | AT_ERROR("Not compiled with GPU support"); 83 | #endif 84 | } 85 | AT_ERROR("Not implemented on the CPU"); 86 | } 87 | 88 | -------------------------------------------------------------------------------- /utils/vis_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import matplotlib 6 | 7 | matplotlib.use('AGG') 8 | 9 | import matplotlib.pyplot as plt 10 | import torch 11 | import cv2 12 | import math 13 | from skimage import transform 14 | 15 | def make_vis(dataset, index, img, fuse_weights, fused_fs): 16 | save_dir = 'vis_output/{}/{}'.format(dataset,index) 17 | os.makedirs(save_dir, exist_ok=True) 18 | 19 | for i in range(len(fuse_weights)): 20 | weights = fuse_weights[i].float().cpu().squeeze().numpy() 21 | max_v = weights.max() 22 | min_v = weights.min() 23 | for j in range(3): 24 | v = weights[j,:,:] 25 | save_name = os.path.join(save_dir, 'level_{}_weight_{}.png'.format(i+1,j+1)) 26 | add_heat(img, v, max_v, min_v, save=save_name) 27 | 28 | fused_f = fused_fs[i].float().cpu().squeeze().numpy() 29 | max_f = fused_f.max() 30 | min_f = fused_f.min() 31 | save_f_name = os.path.join(save_dir, 'fused_feature_level_{}.png'.format(i+1)) 32 | add_heat(img, fused_f, max_f, min_f, save=save_f_name) 33 | 34 | def make_pred_vis(dataset,index, img, class_names, bboxes, cls, scores): 35 | save_preddir = 'vis_output/{}/pred/'.format(dataset) 36 | os.makedirs(save_preddir, exist_ok=True) 37 | 38 | save_pred_name = os.path.join(save_preddir,'{}.png'.format(index)) 39 | 40 | bboxes = bboxes.numpy() 41 | scores = scores.numpy() 42 | cls_ids = cls.numpy() 43 | 44 | im = vis(img, bboxes, scores, cls_ids, class_names) 45 | 46 | cv2.imwrite(save_pred_name, im) 47 | 48 | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None, color=None): 49 | 50 | colors = torch.FloatTensor([[1,0,1],[0,0,1],[0,1,1],[0,1,0],[1,1,0],[1,0,0]]); 51 | def get_color(c, x, max_val): 52 | ratio = float(x)/max_val * 5 53 | i = int(math.floor(ratio)) 54 | j = int(math.ceil(ratio)) 55 | ratio = ratio - i 56 | r = (1-ratio) * colors[i][c] + ratio*colors[j][c] 57 | return int(r*255) 58 | 59 | width = img.shape[1] 60 | height = img.shape[0] 61 | for i in range(len(boxes)): 62 | box = boxes[i] 63 | cls_conf = scores[i] 64 | if cls_conf < conf: 65 | continue 66 | x1 = int(box[0]) 67 | y1 = int(box[1]) 68 | x2 = int(box[0]+box[2]) 69 | y2 = int(box[1]+box[3]) 70 | 71 | 72 | if color: 73 | rgb = color 74 | else: 75 | rgb = (255, 0, 0) 76 | if class_names is not None: 77 | cls_conf = scores[i] 78 | cls_id = int(cls_ids[i]) 79 | class_name = class_names[cls_id] 80 | classes = len(class_names) 81 | offset = cls_id * 123456 % classes 82 | red = get_color(2, offset, classes) 83 | green = get_color(1, offset, classes) 84 | blue = get_color(0, offset, classes) 85 | if color is None: 86 | rgb = (red, green, blue) 87 | img = cv2.putText(img, '%s: %.2f'%(class_name,cls_conf), (x1,y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, rgb, 1) 88 | img = cv2.rectangle(img, (x1,y1), (x2,y2), rgb, 1) 89 | return img 90 | 91 | def add_heat(image, heat_map, max_v, min_v, alpha=0.4, save=None, cmap='jet', axis='off'): 92 | height = image.shape[0] 93 | width = image.shape[1] 94 | 95 | # resize heat map 96 | heat_map_resized = transform.resize(heat_map, (height, width)) 97 | 98 
| # normalize heat map 99 | max_value = max_v 100 | min_value = min_v 101 | normalized_heat_map = (heat_map_resized - min_value) / (max_value - min_value) 102 | 103 | # display 104 | plt.imshow(image) 105 | plt.imshow(255 * normalized_heat_map, alpha=alpha, cmap=cmap) 106 | plt.axis(axis) 107 | 108 | if save is not None: 109 | plt.savefig(save, bbox_inches='tight', pad_inches=0) 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /utils/DCN/deform_conv2d_naive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import init 4 | import math 5 | import numpy as np 6 | from torch.nn.modules.module import Module 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import _pair 9 | 10 | class deform_conv2d_naive(Module): 11 | def __init__(self, in_channels, out_channels, 12 | kernel_size, stride, padding, dilation=1, groups=1, deformable_groups=1, bias=True): 13 | super(deform_conv2d_naive, self).__init__() 14 | self.in_channels = in_channels 15 | self.out_channels = out_channels 16 | self.kernel_size = _pair(kernel_size) 17 | self.stride = _pair(stride) 18 | self.padding = _pair(padding) 19 | self.dilation = _pair(dilation) 20 | self.groups = groups 21 | self.deformable_groups = deformable_groups 22 | self.use_bias = bias 23 | 24 | self.weight = nn.Parameter(torch.Tensor( 25 | out_channels, in_channels//groups, *self.kernel_size)) 26 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 27 | self.reset_parameters() 28 | if not self.use_bias: 29 | self.bias.requires_grad = False 30 | self.bias.data.zero_() 31 | 32 | def reset_parameters(self): 33 | n = self.in_channels 34 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 35 | if self.bias is not None: 36 | fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) 37 | bound = 1 / math.sqrt(fan_in) 38 | init.uniform_(self.bias, -bound, bound) 39 | 40 | def forward(self, input, offset): 41 | N = input.size(0) 42 | in_channels = self.in_channels 43 | out_channels = self.out_channels 44 | in_h = input.size(2) 45 | in_w = input.size(3) 46 | out_h = offset.size(2) 47 | out_w = offset.size(3) 48 | kernel_h = self.kernel_size[0] 49 | kernel_w = self.kernel_size[1] 50 | # [1, kernel_h * kernel_w, out_h, out_w, 2] 51 | mesh = self.compute_mesh_grid(in_h, in_w).cuda(input.get_device()) 52 | offset = offset.view(N, self.deformable_groups, kernel_h, kernel_w, 2, out_h, out_w) 53 | # [N * dg * kernel_h * kernel_w, out_h, out_w, 2] 54 | offset = offset.permute(0, 1, 2, 3, 5, 6, 4).contiguous().view(N * self.deformable_groups * kernel_h * kernel_w, out_h, out_w, 2) 55 | offset_x_normalize = (offset[:, :, :, 1]) / ((in_w - 1) * 1.0 / 2) 56 | offset_y_normalize = (offset[:, :, :, 0]) / ((in_h - 1) * 1.0 / 2) 57 | # [N * dg * kernel_h * kernel_w, out_h, out_w, 2] 58 | offset = torch.cat([offset_x_normalize[..., None], offset_y_normalize[..., None]], dim=3) 59 | # [N * dg * kernel_h * kernel_w, out_h, out_w, 2] 60 | grid = mesh.expand(N * self.deformable_groups, -1, -1, -1, -1).contiguous().view(-1, out_h, out_w, 2) + offset 61 | # [N * kernel_h * kernel_w * dg, in_channels/dg, in_h, in_w] 62 | input = input[:, None, ...].expand(-1, kernel_h * kernel_w, -1, -1, -1).contiguous().view( 63 | N * kernel_h * kernel_w * self.deformable_groups, in_channels // self.deformable_groups, in_h, in_w) 64 | sampled_feat = F.grid_sample(input, grid).view(N, kernel_h * kernel_w, in_channels, out_h, 
out_w).permute(2, 1, 0, 3, 4).contiguous().view(in_channels * kernel_h * kernel_w, -1) 65 | output_feat = torch.matmul(self.weight.view(self.weight.size(0), -1), sampled_feat).view(out_channels, N, out_h, out_w).permute(1,0,2,3) 66 | return output_feat 67 | 68 | def compute_mesh_grid(self, in_h, in_w): 69 | kernel_h, kernel_w = self.kernel_size 70 | stride_h, stride_w = self.stride 71 | dilation_h, dilation_w = self.dilation 72 | padding_h, padding_w = self.padding 73 | out_h = (in_h + 2 * padding_h - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1 74 | out_w = (in_w + 2 * padding_w - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1 75 | # [out_h, out_w] 76 | mesh_y, mesh_x = torch.meshgrid(torch.arange(out_h), torch.arange(out_w)) 77 | mesh_y = mesh_y * stride_h - padding_h 78 | mesh_x = mesh_x * stride_w - padding_w 79 | # [1, out_h, out_w] 80 | mesh_y = mesh_y.unsqueeze(0).float() 81 | mesh_x = mesh_x.unsqueeze(0).float() 82 | # [kernel_h, kernel_w] 83 | kernel_offset_y, kernel_offset_x = torch.meshgrid(torch.arange(kernel_h), torch.arange(kernel_w)) 84 | # [kernel_h * kernel_w, 1, 1] 85 | kernel_offset_y = kernel_offset_y.float().view(kernel_h * kernel_w, 1, 1) * dilation_h 86 | kernel_offset_x = kernel_offset_x.float().view(kernel_h * kernel_w, 1, 1) * dilation_w 87 | # [kernel_h * kernel_w, out_h, out_w] 88 | mesh_y = mesh_y + kernel_offset_y 89 | mesh_x = mesh_x + kernel_offset_x 90 | mesh_y = (mesh_y - (in_h - 1) / 2.) / ((in_h - 1) / 2.) 91 | mesh_x = (mesh_x - (in_w - 1) / 2.) / ((in_w - 1) / 2.) 92 | mesh = torch.cat([mesh_x[None, ..., None], mesh_y[None, ..., None]], dim=4) 93 | return mesh 94 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | from dataset.vocdataset import VOC_CLASSES 3 | from dataset.cocodataset import COCO_CLASSES 4 | from dataset.data_augment import ValTransform 5 | from utils.vis_utils import vis 6 | 7 | import os 8 | import sys 9 | import argparse 10 | import yaml 11 | import cv2 12 | cv2.setNumThreads(0) 13 | 14 | import torch 15 | from torch.autograd import Variable 16 | import time 17 | 18 | ######## unlimit the resource in some dockers or cloud machines ####### 19 | #import resource 20 | #rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 21 | #resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 22 | 23 | 24 | def parse_args(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--cfg', type=str, default='config/yolov3_baseline.cfg', 27 | help='config file. see readme') 28 | parser.add_argument('-d', '--dataset', type=str, default='COCO') 29 | parser.add_argument('-i', '--img', type=str, default='example/test.jpg',) 30 | parser.add_argument('-c', '--checkpoint', type=str, 31 | help='pytorch checkpoint file path') 32 | parser.add_argument('-s', '--test_size', type=int, default=416) 33 | parser.add_argument('--half', dest='half', action='store_true', default=False, 34 | help='FP16 training') 35 | parser.add_argument('--rfb', dest='rfb', action='store_true', default=False, 36 | help='Use rfb block') 37 | parser.add_argument('--asff', dest='asff', action='store_true', default=False, 38 | help='Use ASFF module for yolov3') 39 | parser.add_argument('--use_cuda', type=bool, default=True) 40 | return parser.parse_args() 41 | 42 | def demo(): 43 | """ 44 | YOLOv3 demo. See README for details. 45 | """ 46 | args = parse_args() 47 | print("Setting Arguments.. 
: ", args) 48 | 49 | cuda = torch.cuda.is_available() and args.use_cuda 50 | 51 | # Parse config settings 52 | with open(args.cfg, 'r') as f: 53 | cfg = yaml.safe_load(f) 54 | 55 | print("successfully loaded config file: ", cfg) 56 | 57 | backbone=cfg['MODEL']['BACKBONE'] 58 | test_size = (args.test_size,args.test_size) 59 | 60 | if args.dataset == 'COCO': 61 | class_names = COCO_CLASSES 62 | num_class=80 63 | elif args.dataset == 'VOC': 64 | class_names = VOC_CLASSES 65 | num_class=20 66 | else: 67 | raise Exception("Only support COCO or VOC model now!") 68 | 69 | # Initiate model 70 | if args.asff: 71 | if backbone == 'mobile': 72 | from models.yolov3_mobilev2 import YOLOv3 73 | print("For mobilenet, we currently don't support dropblock, rfb and FeatureAdaption") 74 | else: 75 | from models.yolov3_asff import YOLOv3 76 | print('Training YOLOv3 with ASFF!') 77 | model = YOLOv3(num_classes = num_class, rfb=args.rfb, asff=args.asff) 78 | else: 79 | if backbone == 'mobile': 80 | from models.yolov3_mobilev2 import YOLOv3 81 | else: 82 | from models.yolov3_baseline import YOLOv3 83 | print('Training YOLOv3 strong baseline!') 84 | model = YOLOv3(num_classes = num_class, rfb=args.rfb) 85 | 86 | 87 | if args.checkpoint: 88 | print("loading pytorch ckpt...", args.checkpoint) 89 | cpu_device = torch.device("cpu") 90 | ckpt = torch.load(args.checkpoint, map_location=cpu_device) 91 | #model.load_state_dict(ckpt,strict=False) 92 | model.load_state_dict(ckpt) 93 | if cuda: 94 | print("using cuda") 95 | torch.backends.cudnn.benchmark = True 96 | device = torch.device("cuda") 97 | model = model.to(device) 98 | 99 | if args.half: 100 | model = model.half() 101 | 102 | model = model.eval() 103 | dtype = torch.float16 if args.half else torch.float32 104 | 105 | #load img 106 | transform = ValTransform(rgb_means=(0.485, 0.456, 0.406), std=(0.229,0.224,0.225)) 107 | im = cv2.imread(args.img) 108 | height, width, _ = im.shape 109 | ori_im = im.copy() 110 | im_input, _ = transform(im, None, test_size) 111 | if cuda: 112 | im_input = im_input.to(device) 113 | 114 | im_input = Variable(im_input.type(dtype).unsqueeze(0)) 115 | outputs= model(im_input) 116 | outputs = postprocess(outputs, num_class, 0.01, 0.65) 117 | 118 | outputs = outputs[0].cpu().data 119 | bboxes = outputs[:, 0:4] 120 | bboxes[:, 0::2] *= width / test_size[0] 121 | bboxes[:, 1::2] *= height / test_size[1] 122 | bboxes[:, 2] = bboxes[:,2] - bboxes[:,0] 123 | bboxes[:, 3] = bboxes[:,3] - bboxes[:,1] 124 | cls = outputs[:, 6] 125 | scores = outputs[:, 4]* outputs[:,5] 126 | 127 | pred_im=vis(ori_im, bboxes.numpy(), scores.numpy(), cls.numpy(), conf=0.6, class_names=class_names) 128 | cv2.imshow('Detection', pred_im) 129 | cv2.waitKey(0) 130 | cv2.destroyAllWindows() 131 | 132 | sys.exit(0) 133 | 134 | 135 | if __name__ == '__main__': 136 | demo() 137 | -------------------------------------------------------------------------------- /utils/DCN/modules/modulated_deform_conv2d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import torch 7 | import math 8 | from torch import nn 9 | from torch.nn import init 10 | from torch.nn.modules.utils import _pair 11 | 12 | from ..functions.modulated_deform_conv2d_func import ModulatedDeformConv2dFunction 13 | 14 | class ModulatedDeformConv2d(nn.Module): 15 | 16 | def __init__(self, in_channels, out_channels, 17 | 
kernel_size, stride, padding, dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True): 18 | super(ModulatedDeformConv2d, self).__init__() 19 | 20 | if in_channels % groups != 0: 21 | raise ValueError('in_channels {} must be divisible by groups {}'.format(in_channels, groups)) 22 | if out_channels % groups != 0: 23 | raise ValueError('out_channels {} must be divisible by groups {}'.format(out_channels, groups)) 24 | 25 | self.in_channels = in_channels 26 | self.out_channels = out_channels 27 | self.kernel_size = _pair(kernel_size) 28 | self.stride = _pair(stride) 29 | self.padding = _pair(padding) 30 | self.dilation = _pair(dilation) 31 | self.groups = groups 32 | self.deformable_groups = deformable_groups 33 | self.im2col_step = im2col_step 34 | self.use_bias = bias 35 | 36 | self.weight = nn.Parameter(torch.Tensor( 37 | out_channels, in_channels//groups, *self.kernel_size)) 38 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 39 | self.reset_parameters() 40 | if not self.use_bias: 41 | self.bias.requires_grad = False 42 | 43 | def reset_parameters(self): 44 | n = self.in_channels 45 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 46 | if self.bias is not None: 47 | fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) 48 | bound = 1 / math.sqrt(fan_in) 49 | init.uniform_(self.bias, -bound, bound) 50 | 51 | def forward(self, input, offset, mask): 52 | assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 53 | offset.shape[1] 54 | assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 55 | mask.shape[1] 56 | return ModulatedDeformConv2dFunction.apply(input, offset, mask, 57 | self.weight, 58 | self.bias, 59 | self.stride, 60 | self.padding, 61 | self.dilation, 62 | self.groups, 63 | self.deformable_groups, 64 | self.im2col_step) 65 | 66 | _ModulatedDeformConv2d = ModulatedDeformConv2dFunction.apply 67 | 68 | class ModulatedDeformConv2dPack(ModulatedDeformConv2d): 69 | 70 | def __init__(self, in_channels, out_channels, 71 | kernel_size, stride, padding, 72 | dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True, lr_mult=0.1): 73 | super(ModulatedDeformConv2dPack, self).__init__(in_channels, out_channels, 74 | kernel_size, stride, padding, dilation, groups, deformable_groups, im2col_step, bias) 75 | 76 | out_channels = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] 77 | self.conv_offset_mask = nn.Conv2d(self.in_channels, 78 | out_channels, 79 | kernel_size=self.kernel_size, 80 | stride=self.stride, 81 | padding=self.padding, 82 | bias=True) 83 | self.conv_offset_mask.lr_mult = lr_mult 84 | self.conv_offset_mask.inited = True 85 | self.init_offset() 86 | 87 | def init_offset(self): 88 | self.conv_offset_mask.weight.data.zero_() 89 | self.conv_offset_mask.bias.data.zero_() 90 | 91 | def forward(self, input): 92 | out = self.conv_offset_mask(input) 93 | o1, o2, mask = torch.chunk(out, 3, dim=1) 94 | offset = torch.cat((o1, o2), dim=1) 95 | mask = torch.sigmoid(mask) 96 | return ModulatedDeformConv2dFunction.apply(input, offset, mask, 97 | self.weight, 98 | self.bias, 99 | self.stride, 100 | self.padding, 101 | self.dilation, 102 | self.groups, 103 | self.deformable_groups, 104 | self.im2col_step) 105 | 106 | -------------------------------------------------------------------------------- /dataset/mixupdetection.py: -------------------------------------------------------------------------------- 1 | """Mixup detection dataset wrapper.""" 2 | from __future__ import 
absolute_import 3 | import numpy as np 4 | import torch 5 | #from mxnet.gluon.data import Dataset 6 | from .dataloading import Dataset 7 | 8 | 9 | class MixupDetection(Dataset): 10 | """Detection dataset wrapper that performs mixup for normal dataset. 11 | Parameters 12 | ---------- 13 | dataset : mx.gluon.data.Dataset 14 | Gluon dataset object. 15 | mixup : callable random generator, e.g. np.random.uniform 16 | A random mixup ratio sampler, preferably a random generator from numpy.random 17 | A random float will be sampled each time with mixup(*args). 18 | Use None to disable. 19 | *args : list 20 | Additional arguments for mixup random sampler. 21 | """ 22 | def __init__(self, dataset, mixup=None, preproc=None, *args): 23 | super().__init__(dataset.input_dim) 24 | self._dataset = dataset 25 | self.preproc = preproc 26 | self._mixup = mixup 27 | self._mixup_args = args 28 | 29 | def set_mixup(self, mixup=None, *args): 30 | """Set mixup random sampler, use None to disable. 31 | Parameters 32 | ---------- 33 | mixup : callable random generator, e.g. np.random.uniform 34 | A random mixup ratio sampler, preferably a random generator from numpy.random 35 | A random float will be sampled each time with mixup(*args) 36 | *args : list 37 | Additional arguments for mixup random sampler. 38 | """ 39 | self._mixup = mixup 40 | self._mixup_args = args 41 | 42 | def __len__(self): 43 | return len(self._dataset) 44 | 45 | @Dataset.resize_getitem 46 | def __getitem__(self, idx): 47 | self._dataset._input_dim = self.input_dim 48 | # first image 49 | img1, label1, _, _= self._dataset.pull_item(idx) 50 | lambd = 1 51 | 52 | # draw a random lambda ratio from distribution 53 | if self._mixup is not None: 54 | lambd = max(0, min(1, self._mixup(*self._mixup_args))) 55 | 56 | if lambd >= 1: 57 | weights1 = np.ones((label1.shape[0], 1)) 58 | label1 = np.hstack((label1, weights1)) 59 | height, width, _ = img1.shape 60 | img_info = (width, height) 61 | if self.preproc is not None: 62 | img_o, target_o = self.preproc(img1, label1, self.input_dim) 63 | return img_o, target_o, img_info, idx 64 | 65 | # second image 66 | idx2 = int(np.random.choice(np.delete(np.arange(len(self)), idx))) 67 | img2, label2, _, _ = self._dataset.pull_item(idx2) 68 | 69 | # mixup two images 70 | height = max(img1.shape[0], img2.shape[0]) 71 | width = max(img1.shape[1], img2.shape[1]) 72 | mix_img = np.zeros((height, width, 3),dtype=np.float32) 73 | mix_img[:img1.shape[0], :img1.shape[1], :] = img1.astype(np.float32) * lambd 74 | mix_img[:img2.shape[0], :img2.shape[1], :] += img2.astype(np.float32) * (1. - lambd) 75 | mix_img = mix_img.astype(np.uint8) 76 | 77 | y1 = np.hstack((label1, np.full((label1.shape[0], 1), lambd))) 78 | y2 = np.hstack((label2, np.full((label2.shape[0], 1), 1. 
- lambd))) 79 | mix_label = np.vstack((y1, y2)) 80 | if self.preproc is not None: 81 | mix_img, padded_labels = self.preproc(mix_img, mix_label, self.input_dim) 82 | 83 | img_info = (width, height) 84 | 85 | return mix_img, padded_labels, img_info , idx 86 | 87 | def pull_item(self, idx): 88 | self._dataset._input_dim = self.input_dim 89 | # first image 90 | img1, label1, _, _= self._dataset.pull_item(idx) 91 | lambd = 1 92 | 93 | # draw a random lambda ratio from distribution 94 | if self._mixup is not None: 95 | lambd = max(0, min(1, self._mixup(*self._mixup_args))) 96 | 97 | if lambd >= 1: 98 | weights1 = np.ones((label1.shape[0], 1)) 99 | label1 = np.hstack((label1, weights1)) 100 | height, width, _ = img1.shape 101 | img_info = (width, height) 102 | if self.preproc is not None: 103 | img_o, target_o = self.preproc(img1, label1, self.input_dim) 104 | return img_o, target_o, img_info, idx 105 | 106 | # second image 107 | idx2 = int(np.random.choice(np.delete(np.arange(len(self)), idx))) 108 | img2, label2 = self._dataset.pull_item(idx2) 109 | 110 | # mixup two images 111 | height = max(img1.shape[0], img2.shape[0]) 112 | width = max(img1.shape[1], img2.shape[1]) 113 | mix_img = np.zeros((height, width, 3),dtype=np.float32) 114 | mix_img[:img1.shape[0], :img1.shape[1], :] = img1.astype(np.float32) * lambd 115 | mix_img[:img2.shape[0], :img2.shape[1], :] += img2.astype(np.float32) * (1. - lambd) 116 | mix_img = mix_img.astype(np.uint8) 117 | 118 | y1 = np.hstack((label1, np.full((label1.shape[0], 1), lambd))) 119 | y2 = np.hstack((label2, np.full((label2.shape[0], 1), 1. - lambd))) 120 | mix_label = np.vstack((y1, y2)) 121 | if self.preproc is not None: 122 | mix_img, padded_labels = self.preproc(mix_img, mix_label, self.input_dim) 123 | 124 | img_info = (width, height) 125 | return mix_img, padded_labels, img_info , idx 126 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import torchvision 4 | import numpy as np 5 | import cv2 6 | 7 | def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): 8 | """ 9 | Postprocess for the output of YOLO model 10 | perform box transformation, specify the class for each detection, 11 | and perform class-wise non-maximum suppression. 12 | Args: 13 | prediction (torch tensor): The shape is :math:`(N, B, 4)`. 14 | :math:`N` is the number of predictions, 15 | :math:`B` the number of boxes. The last axis consists of 16 | :math:`xc, yc, w, h` where `xc` and `yc` represent a center 17 | of a bounding box. 18 | num_classes (int): 19 | number of dataset classes. 20 | conf_thre (float): 21 | confidence threshold ranging from 0 to 1, 22 | which is defined in the config file. 23 | nms_thre (float): 24 | IoU threshold of non-max suppression ranging from 0 to 1. 
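        Example (illustrative sketch; mirrors the call in demo.py — `model` and
        `imgs` are placeholder names)::

            raw = model(imgs)  # (batch, n_boxes, 5 + num_classes), boxes as (xc, yc, w, h)
            dets = postprocess(raw, num_classes=80, conf_thre=0.01, nms_thre=0.65)
            # dets[i] is None or a tensor of rows (x1, y1, x2, y2, obj_conf, class_conf, class_pred)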
25 | 26 | Returns: 27 | output (list of torch tensor): 28 | 29 | """ 30 | box_corner = prediction.new(prediction.shape) 31 | box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 32 | box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 33 | box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 34 | box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 35 | prediction[:, :, :4] = box_corner[:, :, :4] 36 | 37 | output = [None for _ in range(len(prediction))] 38 | for i, image_pred in enumerate(prediction): 39 | 40 | # If none are remaining => process next image 41 | if not image_pred.size(0): 42 | continue 43 | # Get score and class with highest confidence 44 | class_conf, class_pred = torch.max( 45 | image_pred[:, 5:5 + num_classes], 1, keepdim=True) 46 | 47 | conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() 48 | # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) 49 | detections = torch.cat( 50 | (image_pred[:, :5], class_conf, class_pred.float()), 1) 51 | detections = detections[conf_mask] 52 | if not detections.size(0): 53 | continue 54 | 55 | # Iterate through all predicted classes 56 | unique_labels = detections[:, -1].unique() 57 | 58 | for c in unique_labels: 59 | # Get the detections with the particular class 60 | detections_class = detections[detections[:, -1] == c] 61 | nms_out_index = torchvision.ops.nms( 62 | detections_class[:, :4], detections_class[:, 4]*detections_class[:, 5], nms_thre) 63 | detections_class = detections_class[nms_out_index] 64 | if output[i] is None: 65 | output[i] = detections_class 66 | else: 67 | output[i] = torch.cat((output[i], detections_class)) 68 | 69 | return output 70 | 71 | 72 | def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): 73 | """Calculate the Intersection of Unions (IoUs) between bounding boxes. 74 | IoU is calculated as a ratio of area of the intersection 75 | and area of the union. 76 | 77 | Args: 78 | bbox_a (array): An array whose shape is :math:`(N, 4)`. 79 | :math:`N` is the number of bounding boxes. 80 | The dtype should be :obj:`numpy.float32`. 81 | bbox_b (array): An array similar to :obj:`bbox_a`, 82 | whose shape is :math:`(K, 4)`. 83 | The dtype should be :obj:`numpy.float32`. 84 | Returns: 85 | array: 86 | An array whose shape is :math:`(N, K)`. \ 87 | An element at index :math:`(n, k)` contains IoUs between \ 88 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ 89 | box in :obj:`bbox_b`. 
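        Example (illustrative; `boxes_a` and `boxes_b` are placeholder torch tensors)::

            # boxes_a: (N, 4), boxes_b: (K, 4), both in (x1, y1, x2, y2) form
            ious = bboxes_iou(boxes_a, boxes_b, xyxy=True)  # -> (N, K) pairwise IoU matrix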
90 | 91 | from: https://github.com/chainer/chainercv 92 | """ 93 | if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: 94 | raise IndexError 95 | 96 | if xyxy: 97 | tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) 98 | br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) 99 | area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) 100 | area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) 101 | else: 102 | tl = torch.max((bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), 103 | (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2)) 104 | br = torch.min((bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), 105 | (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2)) 106 | 107 | area_a = torch.prod(bboxes_a[:, 2:], 1) 108 | area_b = torch.prod(bboxes_b[:, 2:], 1) 109 | en = (tl < br).type(tl.type()).prod(dim=2) 110 | area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) 111 | return area_i / (area_a[:, None] + area_b - area_i) 112 | 113 | 114 | def matrix_iou(a,b): 115 | """ 116 | return iou of a and b, numpy version for data augenmentation 117 | """ 118 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 119 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 120 | 121 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 122 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 123 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 124 | return area_i / (area_a[:, np.newaxis] + area_b - area_i+1e-12) 125 | 126 | def visual(img, boxes, scores): 127 | 128 | COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)] 129 | FONT = cv2.FONT_HERSHEY_SIMPLEX 130 | for i in range(boxes.shape[0]): 131 | 132 | cv2.rectangle(img, (int(boxes[i][0]),int(boxes[i][1])),(int(boxes[i][2]),int(boxes[i][3])),COLORS[i%3],2) 133 | cv2.putText(img, 'Object: %.2f'%scores[i],(int(boxes[i][0])-3,int(boxes[i][1])-5), FONT, 134 | 0.4, (0,0,0),2) 135 | 136 | return img 137 | 138 | 139 | -------------------------------------------------------------------------------- /utils/distributed_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import tempfile 4 | import time 5 | 6 | import torch 7 | 8 | 9 | def get_world_size(): 10 | if not torch.distributed.is_initialized(): 11 | return 1 12 | return torch.distributed.get_world_size() 13 | 14 | 15 | def get_rank(): 16 | if not torch.distributed.is_initialized(): 17 | return 0 18 | return torch.distributed.get_rank() 19 | 20 | 21 | def is_main_process(): 22 | if not torch.distributed.is_initialized(): 23 | return True 24 | return torch.distributed.get_rank() == 0 25 | 26 | 27 | def synchronize(): 28 | """ 29 | Helper function to synchronize between multiple processes when 30 | using distributed training 31 | """ 32 | if not torch.distributed.is_initialized(): 33 | return 34 | world_size = torch.distributed.get_world_size() 35 | rank = torch.distributed.get_rank() 36 | if world_size == 1: 37 | return 38 | 39 | def _send_and_wait(r): 40 | if rank == r: 41 | tensor = torch.tensor(0, device="cuda") 42 | else: 43 | tensor = torch.tensor(1, device="cuda") 44 | torch.distributed.broadcast(tensor, r) 45 | while tensor.item() == 1: 46 | time.sleep(1) 47 | 48 | _send_and_wait(0) 49 | # now sync on the main process 50 | _send_and_wait(1) 51 | 52 | 53 | def _encode(encoded_data, data): 54 | # gets a byte representation for the data 55 | encoded_bytes = pickle.dumps(data) 56 | # convert this byte string into a byte tensor 57 | storage = torch.ByteStorage.from_buffer(encoded_bytes) 58 | tensor = torch.ByteTensor(storage).to("cuda") 59 | # 
encoding: first byte is the size and then rest is the data 60 | s = tensor.numel() 61 | assert s <= 255, "Can't encode data greater than 255 bytes" 62 | # put the encoded data in encoded_data 63 | encoded_data[0] = s 64 | encoded_data[1: (s + 1)] = tensor 65 | 66 | 67 | def _decode(encoded_data): 68 | size = encoded_data[0] 69 | encoded_tensor = encoded_data[1: (size + 1)].to("cpu") 70 | return pickle.loads(bytearray(encoded_tensor.tolist())) 71 | 72 | 73 | # TODO try to use tensor in shared-memory instead of serializing to disk 74 | # this involves getting the all_gather to work 75 | def scatter_gather(data): 76 | """ 77 | This function gathers data from multiple processes, and returns them 78 | in a list, as they were obtained from each process. 79 | This function is useful for retrieving data from multiple processes, 80 | when launching the code with torch.distributed.launch 81 | Note: this function is slow and should not be used in tight loops, i.e., 82 | do not use it in the training loop. 83 | Arguments: 84 | data: the object to be gathered from multiple processes. 85 | It must be serializable 86 | Returns: 87 | result (list): a list with as many elements as there are processes, 88 | where each element i in the list corresponds to the data that was 89 | gathered from the process of rank i. 90 | """ 91 | # strategy: the main process creates a temporary directory, and communicates 92 | # the location of the temporary directory to all other processes. 93 | # each process will then serialize the data to the folder defined by 94 | # the main process, and then the main process reads all of the serialized 95 | # files and returns them in a list 96 | if not torch.distributed.is_initialized(): 97 | return [data] 98 | synchronize() 99 | # get rank of the current process 100 | rank = torch.distributed.get_rank() 101 | 102 | # the data to communicate should be small 103 | data_to_communicate = torch.empty(256, dtype=torch.uint8, device="cuda") 104 | if rank == 0: 105 | # manually creates a temporary directory, that needs to be cleaned 106 | # afterwards 107 | tmp_dir = tempfile.mkdtemp() 108 | _encode(data_to_communicate, tmp_dir) 109 | 110 | synchronize() 111 | # the main process (rank=0) communicates the data to all processes 112 | torch.distributed.broadcast(data_to_communicate, 0) 113 | 114 | # get the data that was communicated 115 | tmp_dir = _decode(data_to_communicate) 116 | 117 | # each process serializes to a different file 118 | file_template = "file{}.pth" 119 | tmp_file = os.path.join(tmp_dir, file_template.format(rank)) 120 | torch.save(data, tmp_file) 121 | 122 | # synchronize before loading the data 123 | synchronize() 124 | 125 | # only the master process returns the data 126 | if rank == 0: 127 | data_list = [] 128 | world_size = torch.distributed.get_world_size() 129 | for r in range(world_size): 130 | file_path = os.path.join(tmp_dir, file_template.format(r)) 131 | d = torch.load(file_path) 132 | data_list.append(d) 133 | # cleanup 134 | os.remove(file_path) 135 | # cleanup 136 | os.rmdir(tmp_dir) 137 | return data_list 138 | 139 | 140 | def reduce_loss_dict(loss_dict): 141 | """ 142 | Reduce the loss dictionary from all processes so that process with rank 143 | 0 has the averaged results. Returns a dict with the same fields as 144 | loss_dict, after reduction. 
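A usage sketch (assumes torch.distributed is initialized; model, imgs and labels are placeholders for a YOLOv3 model and one training batch):

    loss_dict = model(imgs, targets=labels)   # dict of scalar loss tensors, e.g. 'losses', 'conf_losses'
    reduced = reduce_loss_dict(loss_dict)
    if is_main_process():
        print({k: v.item() for k, v in reduced.items()})   # values averaged over all ranks on rank 0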
145 | """ 146 | world_size = get_world_size() 147 | if world_size < 2: 148 | return loss_dict 149 | with torch.no_grad(): 150 | loss_names = [] 151 | all_losses = [] 152 | for k in sorted(loss_dict.keys()): 153 | loss_names.append(k) 154 | all_losses.append(loss_dict[k]) 155 | all_losses = torch.stack(all_losses, dim=0) 156 | torch.distributed.reduce(all_losses, dst=0) 157 | if torch.distributed.get_rank() == 0: 158 | # only main process gets accumulated, so only divide by 159 | # world_size in this case 160 | all_losses /= world_size 161 | reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} 162 | return reduced_losses 163 | -------------------------------------------------------------------------------- /dataset/cocodataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | import torch 5 | from .dataloading import Dataset 6 | import cv2 7 | from pycocotools.coco import COCO 8 | 9 | from utils.utils import * 10 | 11 | COCO_CLASSES=( 12 | 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 13 | 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 14 | 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 15 | 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 16 | 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 17 | 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 18 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 19 | 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 20 | 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 21 | 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 22 | 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 23 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 24 | 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 25 | 26 | 27 | class COCODataset(Dataset): 28 | """ 29 | COCO dataset class. 30 | """ 31 | def __init__(self, data_dir='data/COCO', json_file='instances_train2017.json', 32 | name='train2017', img_size=(416,416), preproc=None, debug=False, voc=False): 33 | """ 34 | COCO dataset initialization. Annotation data are read into memory by COCO API. 35 | Args: 36 | data_dir (str): dataset root directory 37 | json_file (str): COCO json file name 38 | name (str): COCO data name (e.g. 
'train2017' or 'val2017') 39 | img_size (int): target image size after pre-processing 40 | preproc: data augmentation strategy 41 | debug (bool): if True, only one data id is selected from the dataset 42 | """ 43 | super().__init__(img_size) 44 | self.data_dir = data_dir 45 | self.json_file = json_file 46 | self.voc = voc 47 | if voc: 48 | self.coco = COCO(self.data_dir+'VOC2007/Annotations/'+self.json_file) 49 | else: 50 | self.coco = COCO(self.data_dir+'annotations/'+self.json_file) 51 | self.ids = self.coco.getImgIds() 52 | if debug: 53 | self.ids = self.ids[1:2] 54 | print("debug mode...", self.ids) 55 | self.class_ids = sorted(self.coco.getCatIds()) 56 | cats = self.coco.loadCats(self.coco.getCatIds()) 57 | self._classes = tuple([c['name'] for c in cats]) 58 | self.name = name 59 | self.max_labels = 50 60 | self.img_size = img_size 61 | self.preproc = preproc 62 | 63 | def __len__(self): 64 | return len(self.ids) 65 | 66 | def pull_item(self, index): 67 | 68 | id_ = self.ids[index] 69 | 70 | im_ann = self.coco.loadImgs(id_)[0] 71 | width = im_ann['width'] 72 | height = im_ann['height'] 73 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) 74 | annotations = self.coco.loadAnns(anno_ids) 75 | 76 | # load image and preprocess 77 | img_file = os.path.join(self.data_dir, 'images', self.name, 78 | #'COCO_'+self.name+'_'+'{:012}'.format(id_) + '.jpg') 79 | '{:012}'.format(id_) + '.jpg') 80 | 81 | if self.voc: 82 | file_name = im_ann['file_name'] 83 | img_file = os.path.join(self.data_dir, 'VOC2007', 'JPEGImages', 84 | file_name) 85 | 86 | img = cv2.imread(img_file) 87 | 88 | if self.json_file == 'instances_val5k.json' and img is None: 89 | img_file = os.path.join(self.data_dir, 'images', 'train2017', 90 | '{:012}'.format(id_) + '.jpg') 91 | img = cv2.imread(img_file) 92 | assert img is not None 93 | 94 | #img, info_img = preprocess(img, self.input_dim[0]) 95 | 96 | # load labels 97 | valid_objs = [] 98 | for obj in annotations: 99 | x1 = np.max((0, obj['bbox'][0])) 100 | y1 = np.max((0, obj['bbox'][1])) 101 | x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) 102 | y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) 103 | if obj['area'] > 0 and x2 >= x1 and y2 >= y1: 104 | obj['clean_bbox'] = [x1, y1, x2, y2] 105 | valid_objs.append(obj) 106 | objs = valid_objs 107 | num_objs = len(objs) 108 | 109 | res = np.zeros((num_objs, 5)) 110 | 111 | for ix, obj in enumerate(objs): 112 | cls = self.class_ids.index(obj['category_id']) 113 | res[ix, 0:4] = obj['clean_bbox'] 114 | res[ix, 4] = cls 115 | 116 | img_info = (width, height) 117 | 118 | return img, res, img_info, id_ 119 | 120 | @Dataset.resize_getitem 121 | def __getitem__(self, index): 122 | """ 123 | One image / label pair for the given index is picked up \ 124 | and pre-processed. 125 | Args: 126 | index (int): data index 127 | Returns: 128 | img (numpy.ndarray): pre-processed image 129 | padded_labels (torch.Tensor): pre-processed label data. \ 130 | The shape is :math:`[self.max_labels, 5]`. \ 131 | each label consists of [class, xc, yc, w, h]: 132 | class (float): class index. 133 | xc, yc (float) : center of bbox whose values range from 0 to 1. 134 | w, h (float) : size of bbox whose values range from 0 to 1. 135 | info_img : tuple of h, w, nh, nw, dx, dy. 136 | h, w (int): original shape of the image 137 | nh, nw (int): shape of the resized image without padding 138 | dx, dy (int): pad size 139 | id_ (int): same as the input index. Used for evaluation. 
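A construction sketch (illustrative; the paths must exist locally, and the ValTransform arguments follow the evaluators in utils/):

    from dataset.data_augment import ValTransform
    dataset = COCODataset(data_dir='data/COCO/', json_file='instances_val2017.json',
                          name='val2017', img_size=(416, 416),
                          preproc=ValTransform(rgb_means=(0.485, 0.456, 0.406),
                                               std=(0.229, 0.224, 0.225)))
    img, target, img_info, id_ = dataset[0]   # target is only defined when a preproc is supplied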
140 | """ 141 | img, res, img_info, id_ = self.pull_item(index) 142 | 143 | if self.preproc is not None: 144 | img, target = self.preproc(img, res, self.input_dim) 145 | 146 | 147 | return img, target, img_info, id_ 148 | -------------------------------------------------------------------------------- /utils/DCN/modules/deform_conv2d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import torch 7 | import math 8 | from torch import nn 9 | from torch.nn import init 10 | from torch.nn.modules.utils import _pair 11 | 12 | from ..functions.deform_conv2d_func import DeformConv2dFunction 13 | 14 | class DeformConv2d(nn.Module): 15 | 16 | def __init__(self, in_channels, out_channels, 17 | kernel_size, stride, padding, dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True): 18 | super(DeformConv2d, self).__init__() 19 | 20 | if in_channels % groups != 0: 21 | raise ValueError('in_channels {} must be divisible by groups {}'.format(in_channels, groups)) 22 | if out_channels % groups != 0: 23 | raise ValueError('out_channels {} must be divisible by groups {}'.format(out_channels, groups)) 24 | 25 | self.in_channels = in_channels 26 | self.out_channels = out_channels 27 | self.kernel_size = _pair(kernel_size) 28 | self.stride = _pair(stride) 29 | self.padding = _pair(padding) 30 | self.dilation = _pair(dilation) 31 | self.groups = groups 32 | self.deformable_groups = deformable_groups 33 | self.im2col_step = im2col_step 34 | self.use_bias = bias 35 | 36 | self.weight = nn.Parameter(torch.Tensor( 37 | out_channels, in_channels//groups, *self.kernel_size)) 38 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 39 | self.reset_parameters() 40 | if not self.use_bias: 41 | self.bias.requires_grad = False 42 | self.bias.data.zero_() 43 | 44 | def reset_parameters(self): 45 | n = self.in_channels 46 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 47 | if self.bias is not None: 48 | fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) 49 | bound = 1 / math.sqrt(fan_in) 50 | init.uniform_(self.bias, -bound, bound) 51 | 52 | def forward(self, input, offset): 53 | assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 54 | offset.shape[1] 55 | return DeformConv2dFunction.apply(input, offset, 56 | self.weight, 57 | self.bias, 58 | self.stride, 59 | self.padding, 60 | self.dilation, 61 | self.groups, 62 | self.deformable_groups, 63 | self.im2col_step) 64 | 65 | _DeformConv2d = DeformConv2dFunction.apply 66 | 67 | class DeformConv2dPack(DeformConv2d): 68 | 69 | def __init__(self, in_channels, out_channels, 70 | kernel_size, stride, padding, 71 | dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True, lr_mult=0.1): 72 | super(DeformConv2dPack, self).__init__(in_channels, out_channels, 73 | kernel_size, stride, padding, dilation, groups, deformable_groups, im2col_step, bias) 74 | 75 | out_channels = self.deformable_groups * 2 * self.kernel_size[0] * self.kernel_size[1] 76 | self.conv_offset = nn.Conv2d(self.in_channels, 77 | out_channels, 78 | kernel_size=self.kernel_size, 79 | stride=self.stride, 80 | padding=self.padding, 81 | bias=True) 82 | self.conv_offset.lr_mult = lr_mult 83 | self.conv_offset.inited = True 84 | self.init_offset() 85 | 86 | def init_offset(self): 87 | self.conv_offset.weight.data.zero_() 88 | self.conv_offset.bias.data.zero_() 89 | 90 
| def forward(self, input): 91 | offset = self.conv_offset(input) 92 | return DeformConv2dFunction.apply(input, offset, 93 | self.weight, 94 | self.bias, 95 | self.stride, 96 | self.padding, 97 | self.dilation, 98 | self.groups, 99 | self.deformable_groups, 100 | self.im2col_step) 101 | 102 | 103 | class DeformConv2dPackMore(DeformConv2d): 104 | 105 | def __init__(self, in_channels, out_channels, 106 | kernel_size, stride, padding, 107 | dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True, lr_mult=0.1): 108 | super(DeformConv2dPackMore, self).__init__(in_channels, out_channels, 109 | kernel_size, stride, padding, dilation, groups, deformable_groups, im2col_step, bias) 110 | 111 | out_channels = self.deformable_groups * 2 * self.kernel_size[0] * self.kernel_size[1] 112 | self.conv_offset = nn.Sequential( 113 | nn.Conv2d(self.in_channels, self.in_channels//4, kernel_size=1, bias=False), 114 | nn.BatchNorm2d(self.in_channels//4), 115 | nn.ReLU(inplace=True), 116 | nn.Conv2d(self.in_channels//4, out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, bias=True) 117 | ) 118 | self.conv_offset[-1].lr_mult = lr_mult 119 | self.conv_offset[-1].inited = True 120 | self.init_offset() 121 | 122 | def init_offset(self): 123 | self.conv_offset[-1].weight.data.zero_() 124 | self.conv_offset[-1].bias.data.zero_() 125 | 126 | def forward(self, input): 127 | offset = self.conv_offset(input) 128 | return DeformConv2dFunction.apply(input, offset, 129 | self.weight, 130 | self.bias, 131 | self.stride, 132 | self.padding, 133 | self.dilation, 134 | self.groups, 135 | self.deformable_groups, 136 | self.im2col_step) 137 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | from utils.cocoapi_evaluator import COCOAPIEvaluator 3 | from utils.voc_evaluator import VOCEvaluator 4 | from utils import distributed_util 5 | from utils.distributed_util import reduce_loss_dict 6 | from dataset.cocodataset import * 7 | from dataset.vocdataset import * 8 | from dataset.data_augment import TrainTransform 9 | from dataset.dataloading import * 10 | 11 | import os 12 | import sys 13 | import argparse 14 | import yaml 15 | import random 16 | import math 17 | import cv2 18 | cv2.setNumThreads(0) 19 | 20 | import torch 21 | import torch.nn.init as init 22 | from torch.autograd import Variable 23 | import torch.distributed as dist 24 | import time 25 | 26 | import apex 27 | 28 | ######## unlimit the resource in some dockers or cloud machines ####### 29 | #import resource 30 | #rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 31 | #resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 32 | 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--cfg', type=str, default='config/yolov3_baseline.cfg', 37 | help='config file. 
see README') 38 | parser.add_argument('-d', '--dataset', type=str, 39 | default='COCO', help='COCO or VOC dataset') 40 | parser.add_argument('--n_cpu', type=int, default=4, 41 | help='number of workers') 42 | parser.add_argument('--distributed', dest='distributed', action='store_true', default=False, 43 | help='distributed evaluation') 44 | parser.add_argument('--local_rank', type=int, 45 | default=0, help='local_rank') 46 | parser.add_argument('--ngpu', type=int, default=10, 47 | help='number of GPUs') 48 | parser.add_argument('-c', '--checkpoint', type=str, 49 | help='pytorch checkpoint file path') 50 | parser.add_argument('-s', '--test_size', type=int, default=416) 51 | parser.add_argument('--testset', dest='testset', action='store_true', default=False, 52 | help='test set evaluation') 53 | parser.add_argument('--half', dest='half', action='store_true', default=False, 54 | help='FP16 inference') 55 | parser.add_argument('--rfb', dest='rfb', action='store_true', default=False, 56 | help='Use rfb block') 57 | parser.add_argument('--asff', dest='asff', action='store_true', default=False, 58 | help='Use ASFF module for yolov3') 59 | parser.add_argument('--vis', dest='vis', action='store_true', default=False, 60 | help='visualize fusion weight and detection results') 61 | parser.add_argument('--use_cuda', type=bool, default=True) 62 | parser.add_argument('--debug', action='store_true', default=False, 63 | help='debug mode where only one image is evaluated') 64 | return parser.parse_args() 65 | 66 | def eval(): 67 | """ 68 | YOLOv3 evaluator. See README for details. 69 | """ 70 | args = parse_args() 71 | print("Setting Arguments.. : ", args) 72 | 73 | cuda = torch.cuda.is_available() and args.use_cuda 74 | 75 | if args.distributed: 76 | torch.cuda.set_device(args.local_rank) 77 | torch.distributed.init_process_group(backend="nccl", init_method="env://") 78 | 79 | 80 | # Parse config settings 81 | with open(args.cfg, 'r') as f: 82 | cfg = yaml.safe_load(f) 83 | 84 | print("successfully loaded config file: ", cfg) 85 | 86 | backbone=cfg['MODEL']['BACKBONE'] 87 | test_size = (args.test_size,args.test_size) 88 | 89 | if args.dataset == 'COCO': 90 | evaluator = COCOAPIEvaluator( 91 | data_dir='data/COCO/', 92 | img_size=test_size, 93 | confthre=0.001, 94 | nmsthre=0.65, 95 | testset=args.testset, 96 | vis=args.vis) 97 | 98 | num_class=80 99 | 100 | elif args.dataset == 'VOC': 101 | ''' 102 | # COCO style evaluation, you have to convert xml annotation files into a json file.
103 | evaluator = COCOAPIEvaluator( 104 | data_dir='data/VOC/', 105 | img_size=test_size, 106 | confthre=cfg['TEST']['CONFTHRE'], 107 | nmsthre=cfg['TEST']['NMSTHRE'], 108 | testset=args.testset, 109 | voc = True) 110 | ''' 111 | evaluator = VOCEvaluator( 112 | data_dir='data/VOC/', 113 | img_size=test_size, 114 | confthre=0.001, 115 | nmsthre=0.65, 116 | vis=args.vis) 117 | num_class=20 118 | # Initiate model 119 | if args.asff: 120 | if backbone == 'mobile': 121 | from models.yolov3_mobilev2 import YOLOv3 122 | print("For mobilenet, we currently don't support dropblock, rfb and FeatureAdaption") 123 | else: 124 | from models.yolov3_asff import YOLOv3 125 | print('Training YOLOv3 with ASFF!') 126 | model = YOLOv3(num_classes = num_class, rfb=args.rfb, vis=args.vis, asff=args.asff) 127 | else: 128 | if backbone == 'mobile': 129 | from models.yolov3_mobilev2 import YOLOv3 130 | else: 131 | from models.yolov3_baseline import YOLOv3 132 | print('Training YOLOv3 strong baseline!') 133 | if args.vis: 134 | print('Visualization is not supported for YOLOv3 baseline model') 135 | args.vis = False 136 | model = YOLOv3(num_classes = num_class, rfb=args.rfb) 137 | 138 | save_to_disk = (not args.distributed) or distributed_util.get_rank() == 0 139 | 140 | if args.checkpoint: 141 | print("loading pytorch ckpt...", args.checkpoint) 142 | cpu_device = torch.device("cpu") 143 | ckpt = torch.load(args.checkpoint, map_location=cpu_device) 144 | #model.load_state_dict(ckpt,strict=False) 145 | model.load_state_dict(ckpt) 146 | if cuda: 147 | print("using cuda") 148 | torch.backends.cudnn.benchmark = True 149 | device = torch.device("cuda") 150 | model = model.to(device) 151 | 152 | if args.half: 153 | model = model.half() 154 | 155 | if args.ngpu > 1: 156 | if args.distributed: 157 | model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True) 158 | #model = apex.parallel.DistributedDataParallel(model) 159 | else: 160 | model = nn.DataParallel(model) 161 | 162 | dtype = torch.float16 if args.half else torch.float32 163 | 164 | if args.distributed: 165 | distributed_util.synchronize() 166 | 167 | ap50_95, ap50 = evaluator.evaluate(model, args.half, args.distributed) 168 | 169 | if args.distributed: 170 | distributed_util.synchronize() 171 | sys.exit(0) 172 | 173 | 174 | if __name__ == '__main__': 175 | eval() 176 | -------------------------------------------------------------------------------- /models/yolov3_baseline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import defaultdict 5 | from .network_blocks import * 6 | from .yolov3_head import YOLOv3Head 7 | 8 | def create_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb): 9 | """ 10 | Build yolov3 layer modules. 11 | Args: 12 | ignore_thre (float): used in YOLOLayer. 13 | Returns: 14 | mlist (ModuleList): YOLOv3 module list. 
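A quick sanity check (counts read off the index comments below):

    mlist = create_yolov3_modules(num_classes=80, ignore_thre=0.7,
                                  label_smooth=False, rfb=False)
    len(mlist)    # 38 modules (#0-#37); the three YOLOv3Head layers sit at indices 19, 28 and 37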
15 | """ 16 | # DarkNet53 17 | mlist = nn.ModuleList() 18 | mlist.append(add_conv(in_ch=3, out_ch=32, ksize=3, stride=1)) #0 19 | mlist.append(add_conv(in_ch=32, out_ch=64, ksize=3, stride=2)) #1 20 | mlist.append(resblock(ch=64)) #2 21 | mlist.append(add_conv(in_ch=64, out_ch=128, ksize=3, stride=2)) #3 22 | mlist.append(resblock(ch=128, nblocks=2)) #4 23 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=2)) #5 24 | mlist.append(resblock(ch=256, nblocks=8)) # shortcut 1 from here #6 25 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=2)) #7 26 | mlist.append(resblock(ch=512, nblocks=8)) # shortcut 2 from here #8 27 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=2)) #9 28 | mlist.append(resblock(ch=1024, nblocks=4)) #10 29 | 30 | # YOLOv3 31 | mlist.append(resblock(ch=1024, nblocks=1, shortcut=False)) #11 32 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #12 33 | #SPP Layer 34 | mlist.append(SPPLayer()) #13 35 | 36 | mlist.append(add_conv(in_ch=2048, out_ch=512, ksize=1, stride=1)) #14 37 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1)) #15 38 | mlist.append(DropBlock(block_size=1, keep_prob=1.0)) #16 39 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #17 40 | # 1st yolo branch 41 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1)) #18 42 | mlist.append( 43 | YOLOv3Head(anch_mask=[6, 7, 8], n_classes=num_classes, stride=32, in_ch=1024, 44 | ignore_thre=ignore_thre,label_smooth = label_smooth, rfb=rfb)) #19 45 | 46 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #20 47 | mlist.append(upsample(scale_factor=2, mode='nearest')) #21 48 | mlist.append(add_conv(in_ch=768, out_ch=256, ksize=1, stride=1)) #22 49 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1)) #23 50 | mlist.append(DropBlock(block_size=1, keep_prob=1.0)) #24 51 | mlist.append(resblock(ch=512, nblocks=1, shortcut=False)) #25 52 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #26 53 | # 2nd yolo branch 54 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1)) #27 55 | mlist.append( 56 | YOLOv3Head(anch_mask=[3, 4, 5], n_classes=num_classes, stride=16, in_ch=512, 57 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb)) #28 58 | 59 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #29 60 | mlist.append(upsample(scale_factor=2, mode='nearest')) #30 61 | mlist.append(add_conv(in_ch=384, out_ch=128, ksize=1, stride=1)) #31 62 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #32 63 | mlist.append(DropBlock(block_size=1, keep_prob=1.0)) #33 64 | mlist.append(resblock(ch=256, nblocks=1, shortcut=False)) #34 65 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #35 66 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #36 67 | mlist.append( 68 | YOLOv3Head(anch_mask=[0, 1, 2], n_classes=num_classes, stride=8, in_ch=256, 69 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb)) #37 70 | 71 | return mlist 72 | 73 | 74 | class YOLOv3(nn.Module): 75 | """ 76 | YOLOv3 model module. The module list is defined by create_yolov3_modules function. \ 77 | The network returns loss values from three YOLO layers during training \ 78 | and detection results during test. 
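A forward-pass sketch (batch size, image size and the labels tensor of shape (N, 50, 5) are illustrative assumptions):

    model = YOLOv3(num_classes=80)
    dets = model(torch.randn(2, 3, 416, 416))                       # no targets: concatenated detections
    loss_dict = model(torch.randn(2, 3, 416, 416), targets=labels)  # with targets: dict of summed loss terms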
79 | """ 80 | def __init__(self, num_classes = 80, ignore_thre=0.7, label_smooth = False, rfb=False): 81 | 82 | super(YOLOv3, self).__init__() 83 | self.module_list = create_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb) 84 | 85 | def forward(self, x, targets=None, epoch=0): 86 | 87 | train = targets is not None 88 | output = [] 89 | anchor_losses= [] 90 | iou_losses = [] 91 | l1_losses = [] 92 | conf_losses = [] 93 | cls_losses = [] 94 | route_layers = [] 95 | for i, module in enumerate(self.module_list): 96 | 97 | # yolo layers 98 | if i in [19, 28, 37]: 99 | if train: 100 | x, anchor_loss, iou_loss, l1_loss, conf_loss, cls_loss = module(x, targets) 101 | anchor_losses.append(anchor_loss) 102 | iou_losses.append(iou_loss) 103 | l1_losses.append(l1_loss) 104 | conf_losses.append(conf_loss) 105 | cls_losses.append(cls_loss) 106 | else: 107 | x = module(x) 108 | 109 | output.append(x) 110 | else: 111 | x = module(x) 112 | 113 | # route layers 114 | if i in [6, 8, 17, 26]: 115 | route_layers.append(x) 116 | if i == 19: 117 | x = route_layers[2] 118 | if i == 28: # yolo 2nd 119 | x = route_layers[3] 120 | if i == 21: 121 | x = torch.cat((x, route_layers[1]), 1) 122 | if i == 30: 123 | x = torch.cat((x, route_layers[0]), 1) 124 | 125 | if train: 126 | losses = torch.stack(output, 0).unsqueeze(0).sum(1,keepdim=True) 127 | anchor_losses = torch.stack(anchor_losses, 0).unsqueeze(0).sum(1,keepdim=True) 128 | iou_losses = torch.stack(iou_losses, 0).unsqueeze(0).sum(1,keepdim=True) 129 | l1_losses = torch.stack(l1_losses, 0).unsqueeze(0).sum(1,keepdim=True) 130 | conf_losses = torch.stack(conf_losses, 0).unsqueeze(0).sum(1,keepdim=True) 131 | cls_losses = torch.stack(cls_losses, 0).unsqueeze(0).sum(1,keepdim=True) 132 | loss_dict = dict( 133 | losses = losses, 134 | anchor_losses = anchor_losses, 135 | iou_losses = iou_losses, 136 | l1_losses = l1_losses, 137 | conf_losses = conf_losses, 138 | cls_losses = cls_losses, 139 | ) 140 | return loss_dict 141 | else: 142 | return torch.cat(output, 1) 143 | 144 | -------------------------------------------------------------------------------- /utils/fp16_utils/fp16util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 5 | 6 | 7 | class tofp16(nn.Module): 8 | """ 9 | Utility module that implements:: 10 | 11 | def forward(self, input): 12 | return input.half() 13 | """ 14 | 15 | def __init__(self): 16 | super(tofp16, self).__init__() 17 | 18 | def forward(self, input): 19 | return input.half() 20 | 21 | 22 | def BN_convert_float(module): 23 | """ 24 | Utility function for network_to_half(). 25 | 26 | Retained for legacy purposes. 27 | """ 28 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 29 | module.float() 30 | for child in module.children(): 31 | BN_convert_float(child) 32 | return module 33 | 34 | 35 | def network_to_half(network): 36 | """ 37 | Convert model to half precision in a batchnorm-safe way. 38 | 39 | Retained for legacy purposes. It is recommended to use FP16Model. 40 | """ 41 | return nn.Sequential(tofp16(), BN_convert_float(network.half())) 42 | 43 | 44 | def convert_module(module, dtype): 45 | """ 46 | Converts a module's immediate parameters and buffers to dtype. 
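A two-line sketch:

    conv = torch.nn.Conv2d(3, 16, kernel_size=3)
    convert_module(conv, torch.half)   # weight/bias (and any floating-point buffers) become float16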
47 | """ 48 | for param in module.parameters(recurse=False): 49 | if param is not None: 50 | if param.data.dtype.is_floating_point: 51 | param.data = param.data.to(dtype=dtype) 52 | if param._grad is not None and param._grad.data.dtype.is_floating_point: 53 | param._grad.data = param._grad.data.to(dtype=dtype) 54 | 55 | for buf in module.buffers(recurse=False): 56 | if buf is not None and buf.data.dtype.is_floating_point: 57 | buf.data = buf.data.to(dtype=dtype) 58 | 59 | 60 | def convert_network(network, dtype): 61 | """ 62 | Converts a network's parameters and buffers to dtype. 63 | """ 64 | for module in network.modules(): 65 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 66 | continue 67 | convert_module(module, dtype) 68 | return network 69 | 70 | 71 | class FP16Model(nn.Module): 72 | """ 73 | Convert model to half precision in a batchnorm-safe way. 74 | """ 75 | 76 | def __init__(self, network): 77 | super(FP16Model, self).__init__() 78 | self.network = convert_network(network, dtype=torch.half) 79 | 80 | def forward(self, *inputs): 81 | inputs = tuple(t.half() for t in inputs) 82 | return self.network(*inputs) 83 | 84 | 85 | def backwards_debug_hook(grad): 86 | raise RuntimeError("master_params recieved a gradient in the backward pass!") 87 | 88 | def prep_param_lists(model, flat_master=False): 89 | """ 90 | Creates a list of FP32 master parameters for a given model, as in 91 | `Training Neural Networks with Mixed Precision: Real Examples`_. 92 | 93 | Args: 94 | model (torch.nn.Module): Existing Pytorch model 95 | flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. 96 | Returns: 97 | A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. 98 | 99 | Example:: 100 | 101 | model_params, master_params = prep_param_lists(model) 102 | 103 | .. warning:: 104 | Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. 105 | 106 | .. _`Training Neural Networks with Mixed Precision: Real Examples`: 107 | http://on-demand.gputechconf.com/gtc/2018/video/S81012/ 108 | """ 109 | model_params = [param for param in model.parameters() if param.requires_grad] 110 | 111 | if flat_master: 112 | # Give the user some more useful error messages 113 | try: 114 | # flatten_dense_tensors returns a contiguous flat array. 115 | # http://pytorch.org/docs/master/_modules/torch/_utils.html 116 | master_params = _flatten_dense_tensors([param.data for param in model_params]).float() 117 | except: 118 | print("Error in prep_param_lists: model may contain a mixture of parameters " 119 | "of different types. 
Use flat_master=False, or use F16_Optimizer.") 120 | raise 121 | master_params = torch.nn.Parameter(master_params) 122 | master_params.requires_grad = True 123 | # master_params.register_hook(backwards_debug_hook) 124 | if master_params.grad is None: 125 | master_params.grad = master_params.new(*master_params.size()) 126 | return model_params, [master_params] 127 | else: 128 | master_params = [param.clone().float().detach() for param in model_params] 129 | for param in master_params: 130 | param.requires_grad = True 131 | return model_params, master_params 132 | 133 | 134 | def model_grads_to_master_grads(model_params, master_params, flat_master=False): 135 | """ 136 | Copy model gradients to master gradients. 137 | 138 | Args: 139 | model_params: List of model parameters created by :func:`prep_param_lists`. 140 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. 141 | """ 142 | if flat_master: 143 | # The flattening may incur one more deep copy than is necessary. 144 | master_params[0].grad.data.copy_( 145 | _flatten_dense_tensors([p.grad.data for p in model_params])) 146 | else: 147 | for model, master in zip(model_params, master_params): 148 | if model.grad is not None: 149 | if master.grad is None: 150 | master.grad = Variable(master.data.new(*master.data.size())) 151 | master.grad.data.copy_(model.grad.data) 152 | else: 153 | master.grad = None 154 | 155 | 156 | def master_params_to_model_params(model_params, master_params, flat_master=False): 157 | """ 158 | Copy master parameters to model parameters. 159 | 160 | Args: 161 | model_params: List of model parameters created by :func:`prep_param_lists`. 162 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. 
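One mixed-precision update step sketched with these helpers (assumes the optimizer was built over master_params):

    model_params, master_params = prep_param_lists(model)
    # ... fp16 forward/backward ...
    model_grads_to_master_grads(model_params, master_params)
    optimizer.step()                                              # update the FP32 master copies
    master_params_to_model_params(model_params, master_params)    # copy back into the fp16 model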
163 | """ 164 | if flat_master: 165 | for model, master in zip(model_params, 166 | _unflatten_dense_tensors(master_params[0].data, model_params)): 167 | model.data.copy_(master) 168 | else: 169 | for model, master in zip(model_params, master_params): 170 | model.data.copy_(master.data) 171 | 172 | # Backward compatibility fixes 173 | 174 | def to_python_float(t): 175 | if hasattr(t, 'item'): 176 | return t.item() 177 | else: 178 | return t[0] 179 | 180 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 181 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 182 | if TORCH_MAJOR == 0 and TORCH_MINOR <= 4: 183 | clip_grad_norm = torch.nn.utils.clip_grad_norm 184 | else: 185 | clip_grad_norm = torch.nn.utils.clip_grad_norm_ 186 | -------------------------------------------------------------------------------- /dataset/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | 13 | 14 | def parse_rec(filename): 15 | """ Parse a PASCAL VOC xml file """ 16 | tree = ET.parse(filename) 17 | objects = [] 18 | for obj in tree.findall('object'): 19 | obj_struct = {} 20 | obj_struct['name'] = obj.find('name').text 21 | obj_struct['pose'] = obj.find('pose').text 22 | obj_struct['truncated'] = int(obj.find('truncated').text) 23 | obj_struct['difficult'] = int(obj.find('difficult').text) 24 | bbox = obj.find('bndbox') 25 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 26 | int(bbox.find('ymin').text), 27 | int(bbox.find('xmax').text), 28 | int(bbox.find('ymax').text)] 29 | objects.append(obj_struct) 30 | 31 | return objects 32 | 33 | 34 | 35 | def voc_ap(rec, prec, use_07_metric=False): 36 | """ ap = voc_ap(rec, prec, [use_07_metric]) 37 | Compute VOC AP given precision and recall. 38 | If use_07_metric is true, uses the 39 | VOC 07 11 point method (default:False). 40 | """ 41 | if use_07_metric: 42 | # 11 point metric 43 | ap = 0. 44 | for t in np.arange(0., 1.1, 0.1): 45 | if np.sum(rec >= t) == 0: 46 | p = 0 47 | else: 48 | p = np.max(prec[rec >= t]) 49 | ap = ap + p / 11. 50 | else: 51 | # correct AP calculation 52 | # first append sentinel values at the end 53 | mrec = np.concatenate(([0.], rec, [1.])) 54 | mpre = np.concatenate(([0.], prec, [0.])) 55 | 56 | # compute the precision envelope 57 | for i in range(mpre.size - 1, 0, -1): 58 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 59 | 60 | # to calculate area under PR curve, look for points 61 | # where X axis (recall) changes value 62 | i = np.where(mrec[1:] != mrec[:-1])[0] 63 | 64 | # and sum (\Delta recall) * prec 65 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 66 | return ap 67 | 68 | def voc_eval(detpath, 69 | annopath, 70 | imagesetfile, 71 | classname, 72 | cachedir, 73 | ovthresh=0.5, 74 | use_07_metric=False): 75 | """rec, prec, ap = voc_eval(detpath, 76 | annopath, 77 | imagesetfile, 78 | classname, 79 | [ovthresh], 80 | [use_07_metric]) 81 | 82 | Top level function that does the PASCAL VOC evaluation. 83 | 84 | detpath: Path to detections 85 | detpath.format(classname) should produce the detection results file. 86 | annopath: Path to annotations 87 | annopath.format(imagename) should be the xml annotations file. 
88 | imagesetfile: Text file containing the list of images, one image per line. 89 | classname: Category name (duh) 90 | cachedir: Directory for caching the annotations 91 | [ovthresh]: Overlap threshold (default = 0.5) 92 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 93 | (default False) 94 | """ 95 | # assumes detections are in detpath.format(classname) 96 | # assumes annotations are in annopath.format(imagename) 97 | # assumes imagesetfile is a text file with each line an image name 98 | # cachedir caches the annotations in a pickle file 99 | 100 | # first load gt 101 | if not os.path.isdir(cachedir): 102 | os.mkdir(cachedir) 103 | cachefile = os.path.join(cachedir, 'annots.pkl') 104 | # read list of images 105 | with open(imagesetfile, 'r') as f: 106 | lines = f.readlines() 107 | imagenames = [x.strip() for x in lines] 108 | 109 | if not os.path.isfile(cachefile): 110 | # load annots 111 | recs = {} 112 | for i, imagename in enumerate(imagenames): 113 | recs[imagename] = parse_rec(annopath.format(imagename)) 114 | if i % 100 == 0: 115 | print('Reading annotation for {:d}/{:d}'.format( 116 | i + 1, len(imagenames))) 117 | # save 118 | print('Saving cached annotations to {:s}'.format(cachefile)) 119 | with open(cachefile, 'wb') as f: 120 | pickle.dump(recs, f) 121 | else: 122 | # load 123 | with open(cachefile, 'rb') as f: 124 | recs = pickle.load(f) 125 | 126 | # extract gt objects for this class 127 | class_recs = {} 128 | npos = 0 129 | for imagename in imagenames: 130 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 131 | bbox = np.array([x['bbox'] for x in R]) 132 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 133 | det = [False] * len(R) 134 | npos = npos + sum(~difficult) 135 | class_recs[imagename] = {'bbox': bbox, 136 | 'difficult': difficult, 137 | 'det': det} 138 | 139 | # read dets 140 | detfile = detpath.format(classname) 141 | with open(detfile, 'r') as f: 142 | lines = f.readlines() 143 | 144 | if len(lines) == 0: 145 | return 0, 0, 0 146 | 147 | splitlines = [x.strip().split(' ') for x in lines] 148 | image_ids = [x[0] for x in splitlines] 149 | confidence = np.array([float(x[1]) for x in splitlines]) 150 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 151 | 152 | # sort by confidence 153 | sorted_ind = np.argsort(-confidence) 154 | sorted_scores = np.sort(-confidence) 155 | BB = BB[sorted_ind, :] 156 | image_ids = [image_ids[x] for x in sorted_ind] 157 | 158 | # go down dets and mark TPs and FPs 159 | nd = len(image_ids) 160 | tp = np.zeros(nd) 161 | fp = np.zeros(nd) 162 | for d in range(nd): 163 | R = class_recs[image_ids[d]] 164 | bb = BB[d, :].astype(float) 165 | ovmax = -np.inf 166 | BBGT = R['bbox'].astype(float) 167 | 168 | if BBGT.size > 0: 169 | # compute overlaps 170 | # intersection 171 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 172 | iymin = np.maximum(BBGT[:, 1], bb[1]) 173 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 174 | iymax = np.minimum(BBGT[:, 3], bb[3]) 175 | iw = np.maximum(ixmax - ixmin + 1., 0.) 176 | ih = np.maximum(iymax - iymin + 1., 0.) 177 | inters = iw * ih 178 | 179 | # union 180 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 181 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 182 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 183 | 184 | overlaps = inters / uni 185 | ovmax = np.max(overlaps) 186 | jmax = np.argmax(overlaps) 187 | 188 | if ovmax > ovthresh: 189 | if not R['difficult'][jmax]: 190 | if not R['det'][jmax]: 191 | tp[d] = 1. 
192 | R['det'][jmax] = 1 193 | else: 194 | fp[d] = 1. 195 | else: 196 | fp[d] = 1. 197 | 198 | # compute precision recall 199 | fp = np.cumsum(fp) 200 | tp = np.cumsum(tp) 201 | rec = tp / float(npos) 202 | # avoid divide by zero in case the first detection matches a difficult 203 | # ground truth 204 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 205 | ap = voc_ap(rec, prec, use_07_metric) 206 | 207 | return rec, prec, ap 208 | -------------------------------------------------------------------------------- /utils/fp16_utils/loss_scaler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # item() is a recent addition, so this helps with backward compatibility. 4 | def to_python_float(t): 5 | if hasattr(t, 'item'): 6 | return t.item() 7 | else: 8 | return t[0] 9 | 10 | class LossScaler: 11 | """ 12 | Class that manages a static loss scale. This class is intended to interact with 13 | :class:`FP16_Optimizer`, and should not be directly manipulated by the user. 14 | 15 | Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to 16 | :class:`FP16_Optimizer`'s constructor. 17 | 18 | Args: 19 | scale (float, optional, default=1.0): The loss scale. 20 | """ 21 | 22 | def __init__(self, scale=1): 23 | self.cur_scale = scale 24 | 25 | # `params` is a list / generator of torch.Variable 26 | def has_overflow(self, params): 27 | return False 28 | 29 | # `x` is a torch.Tensor 30 | def _has_inf_or_nan(x): 31 | return False 32 | 33 | def update_scale(self, overflow): 34 | pass 35 | 36 | @property 37 | def loss_scale(self): 38 | return self.cur_scale 39 | 40 | def scale_gradient(self, module, grad_in, grad_out): 41 | return tuple(self.loss_scale * g for g in grad_in) 42 | 43 | def backward(self, loss, retain_graph=False): 44 | scaled_loss = loss*self.loss_scale 45 | scaled_loss.backward(retain_graph=retain_graph) 46 | 47 | class DynamicLossScaler: 48 | """ 49 | Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` 50 | indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of 51 | :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` 52 | operates, because the default options can be changed using the 53 | the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. 54 | 55 | Loss scaling is designed to combat the problem of underflowing gradients encountered at long 56 | times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss 57 | scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are 58 | encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has 59 | occurred. 60 | :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, 61 | and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. 62 | If a certain number of iterations occur without overflowing gradients detected, 63 | :class:`DynamicLossScaler` increases the loss scale once more. 64 | In this way :class:`DynamicLossScaler` attempts to "ride the edge" of 65 | always using the highest loss scale possible without incurring overflow. 66 | 67 | Args: 68 | init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` 69 | scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. 
If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. 70 | scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 71 | """ 72 | 73 | def __init__(self, 74 | init_scale=2**32, 75 | scale_factor=2., 76 | scale_window=1000): 77 | self.cur_scale = init_scale 78 | self.cur_iter = 0 79 | self.last_overflow_iter = -1 80 | self.scale_factor = scale_factor 81 | self.scale_window = scale_window 82 | 83 | # `params` is a list / generator of torch.Variable 84 | def has_overflow(self, params): 85 | for p in params: 86 | if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): 87 | return True 88 | 89 | return False 90 | 91 | # `x` is a torch.Tensor 92 | def _has_inf_or_nan(x): 93 | try: 94 | # if x is half, the .float() incurs an additional deep copy, but it's necessary if 95 | # Pytorch's .sum() creates a one-element tensor of the same type as x 96 | # (which is true for some recent version of pytorch). 97 | cpu_sum = float(x.float().sum()) 98 | # More efficient version that can be used if .sum() returns a Python scalar 99 | # cpu_sum = float(x.sum()) 100 | except RuntimeError as instance: 101 | # We want to check if inst is actually an overflow exception. 102 | # RuntimeError could come from a different error. 103 | # If so, we still want the exception to propagate. 104 | if "value cannot be converted" not in instance.args[0]: 105 | raise 106 | return True 107 | else: 108 | if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: 109 | return True 110 | return False 111 | 112 | # `overflow` is boolean indicating whether the gradient overflowed 113 | def update_scale(self, overflow): 114 | if overflow: 115 | # self.cur_scale /= self.scale_factor 116 | self.cur_scale = max(self.cur_scale/self.scale_factor, 1) 117 | self.last_overflow_iter = self.cur_iter 118 | else: 119 | if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: 120 | self.cur_scale *= self.scale_factor 121 | self.cur_iter += 1 122 | 123 | @property 124 | def loss_scale(self): 125 | return self.cur_scale 126 | 127 | def scale_gradient(self, module, grad_in, grad_out): 128 | return tuple(self.loss_scale * g for g in grad_in) 129 | 130 | def backward(self, loss, retain_graph=False): 131 | scaled_loss = loss*self.loss_scale 132 | scaled_loss.backward(retain_graph=retain_graph) 133 | 134 | ############################################################## 135 | # Example usage below here -- assuming it's in a separate file 136 | ############################################################## 137 | """ 138 | TO-DO separate out into an example. 139 | if __name__ == "__main__": 140 | import torch 141 | from torch.autograd import Variable 142 | from dynamic_loss_scaler import DynamicLossScaler 143 | 144 | # N is batch size; D_in is input dimension; 145 | # H is hidden dimension; D_out is output dimension. 146 | N, D_in, H, D_out = 64, 1000, 100, 10 147 | 148 | # Create random Tensors to hold inputs and outputs, and wrap them in Variables. 
149 | x = Variable(torch.randn(N, D_in), requires_grad=False) 150 | y = Variable(torch.randn(N, D_out), requires_grad=False) 151 | 152 | w1 = Variable(torch.randn(D_in, H), requires_grad=True) 153 | w2 = Variable(torch.randn(H, D_out), requires_grad=True) 154 | parameters = [w1, w2] 155 | 156 | learning_rate = 1e-6 157 | optimizer = torch.optim.SGD(parameters, lr=learning_rate) 158 | loss_scaler = DynamicLossScaler() 159 | 160 | for t in range(500): 161 | y_pred = x.mm(w1).clamp(min=0).mm(w2) 162 | loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale 163 | print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) 164 | print('Iter {} scaled loss: {}'.format(t, loss.data[0])) 165 | print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) 166 | 167 | # Run backprop 168 | optimizer.zero_grad() 169 | loss.backward() 170 | 171 | # Check for overflow 172 | has_overflow = DynamicLossScaler.has_overflow(parameters) 173 | 174 | # If no overflow, unscale grad and update as usual 175 | if not has_overflow: 176 | for param in parameters: 177 | param.grad.data.mul_(1. / loss_scaler.loss_scale) 178 | optimizer.step() 179 | # Otherwise, don't do anything -- ie, skip iteration 180 | else: 181 | print('OVERFLOW!') 182 | 183 | # Update loss scale for next iteration 184 | loss_scaler.update_scale(has_overflow) 185 | 186 | """ 187 | -------------------------------------------------------------------------------- /models/yolov3_asff.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .network_blocks import * 5 | from .yolov3_head import YOLOv3Head 6 | 7 | from collections import defaultdict 8 | 9 | def build_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb): 10 | """ 11 | Build yolov3 layer modules. 12 | Args: 13 | ignore_thre (float): used in YOLOLayer. 14 | Returns: 15 | mlist (ModuleList): YOLOv3 module list. 
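Note that, unlike the baseline builder, this list stops after the feature pyramid; the ASFF fusion blocks and YOLO heads are attached separately in YOLOv3.__init__ below. A sketch:

    mlist = build_yolov3_modules(num_classes=80, ignore_thre=0.7,
                                 label_smooth=False, rfb=False)
    len(mlist)    # 33 modules (#0-#32), none of them a YOLOv3Head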
16 | """ 17 | # DarkNet53 18 | mlist = nn.ModuleList() 19 | mlist.append(add_conv(in_ch=3, out_ch=32, ksize=3, stride=1)) #0 20 | mlist.append(add_conv(in_ch=32, out_ch=64, ksize=3, stride=2)) #1 21 | mlist.append(resblock(ch=64)) #2 22 | mlist.append(add_conv(in_ch=64, out_ch=128, ksize=3, stride=2)) #3 23 | mlist.append(resblock(ch=128, nblocks=2)) #4 24 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=2)) #5 25 | mlist.append(resblock(ch=256, nblocks=8)) # shortcut 1 from here #6 26 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=2)) #7 27 | mlist.append(resblock(ch=512, nblocks=8)) # shortcut 2 from here #8 28 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=2)) #9 29 | mlist.append(resblock(ch=1024, nblocks=4)) #10 30 | 31 | # YOLOv3 32 | mlist.append(resblock(ch=1024, nblocks=1, shortcut=False)) #11 33 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #12 34 | #SPP Layer 35 | mlist.append(SPPLayer()) #13 36 | 37 | mlist.append(add_conv(in_ch=2048, out_ch=512, ksize=1, stride=1)) #14 38 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1)) #15 39 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #16 40 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #17 41 | 42 | # 1st yolo branch 43 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #18 44 | mlist.append(upsample(scale_factor=2, mode='nearest')) #19 45 | mlist.append(add_conv(in_ch=768, out_ch=256, ksize=1, stride=1)) #20 46 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1)) #21 47 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #22 48 | mlist.append(resblock(ch=512, nblocks=1, shortcut=False)) #23 49 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #24 50 | # 2nd yolo branch 51 | 52 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #25 53 | mlist.append(upsample(scale_factor=2, mode='nearest')) #26 54 | mlist.append(add_conv(in_ch=384, out_ch=128, ksize=1, stride=1)) #27 55 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #28 56 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #29 57 | mlist.append(resblock(ch=256, nblocks=1, shortcut=False)) #30 58 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #31 59 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #32 60 | 61 | return mlist 62 | 63 | 64 | class YOLOv3(nn.Module): 65 | """ 66 | YOLOv3 model module. The module list is defined by create_yolov3_modules function. \ 67 | The network returns loss values from three YOLO layers during training \ 68 | and detection results during test. 69 | """ 70 | def __init__(self, num_classes = 80, ignore_thre=0.7, label_smooth = False, rfb=False, vis=False, asff=False): 71 | """ 72 | Initialization of YOLOv3 class. 73 | Args: 74 | ignore_thre (float): used in YOLOLayer. 
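A construction sketch (the three return values with vis=True mirror forward() below):

    model = YOLOv3(num_classes=80, rfb=False, vis=True)
    dets, fuse_weights, fused_features = model(torch.randn(1, 3, 608, 608))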
75 | """ 76 | super(YOLOv3, self).__init__() 77 | self.module_list = build_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb) 78 | 79 | 80 | self.level_0_fusion = ASFF(level=0,rfb=rfb,vis=vis) 81 | 82 | self.level_0_header = YOLOv3Head(anch_mask=[6, 7, 8], n_classes=num_classes, stride=32, in_ch=1024, 83 | ignore_thre=ignore_thre,label_smooth = label_smooth, rfb=rfb) 84 | 85 | self.level_1_fusion = ASFF(level=1,rfb=rfb,vis=vis) 86 | 87 | self.level_1_header = YOLOv3Head(anch_mask=[3, 4, 5], n_classes=num_classes, stride=16, in_ch=512, 88 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb) 89 | 90 | self.level_2_fusion = ASFF(level=2,rfb=rfb,vis=vis) 91 | 92 | self.level_2_header = YOLOv3Head(anch_mask=[0, 1, 2], n_classes=num_classes, stride=8, in_ch=256, 93 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb) 94 | self.vis=vis 95 | 96 | def forward(self, x, targets=None, epoch=0): 97 | """ 98 | Forward path of YOLOv3. 99 | Args: 100 | x (torch.Tensor) : input data whose shape is :math:`(N, C, H, W)`, \ 101 | where N, C are batchsize and num. of channels. 102 | targets (torch.Tensor) : label array whose shape is :math:`(N, 50, 5)` 103 | 104 | Returns: 105 | training: 106 | output (torch.Tensor): loss tensor for backpropagation. 107 | test: 108 | output (torch.Tensor): concatenated detection results. 109 | """ 110 | 111 | train = targets is not None 112 | output = [] 113 | anchor_losses= [] 114 | iou_losses = [] 115 | l1_losses = [] 116 | conf_losses = [] 117 | cls_losses = [] 118 | route_layers = [] 119 | if self.vis: 120 | fuse_wegihts = [] 121 | fuse_fs = [] 122 | 123 | for i, module in enumerate(self.module_list): 124 | 125 | # yolo layers 126 | x = module(x) 127 | 128 | # route layers 129 | if i in [6, 8, 17, 24, 32]: 130 | route_layers.append(x) 131 | if i == 19: 132 | x = torch.cat((x, route_layers[1]), 1) 133 | if i == 26: 134 | x = torch.cat((x, route_layers[0]), 1) 135 | 136 | 137 | for l in range(3): 138 | fusion = getattr(self, 'level_{}_fusion'.format(l)) 139 | header = getattr(self, 'level_{}_header'.format(l)) 140 | 141 | if self.vis: 142 | fused, weight, fuse_f = fusion(route_layers[2],route_layers[3],route_layers[4]) 143 | fuse_wegihts.append(weight) 144 | fuse_fs.append(fuse_f) 145 | else: 146 | fused = fusion(route_layers[2],route_layers[3],route_layers[4]) 147 | 148 | if train: 149 | x, anchor_loss, iou_loss, l1_loss, conf_loss, cls_loss = header(fused, targets) 150 | anchor_losses.append(anchor_loss) 151 | iou_losses.append(iou_loss) 152 | l1_losses.append(l1_loss) 153 | conf_losses.append(conf_loss) 154 | cls_losses.append(cls_loss) 155 | else: 156 | x = header(fused) 157 | 158 | output.append(x) 159 | 160 | if train: 161 | losses = torch.stack(output, 0).unsqueeze(0).sum(1,keepdim=True) 162 | anchor_losses = torch.stack(anchor_losses, 0).unsqueeze(0).sum(1,keepdim=True) 163 | iou_losses = torch.stack(iou_losses, 0).unsqueeze(0).sum(1,keepdim=True) 164 | l1_losses = torch.stack(l1_losses, 0).unsqueeze(0).sum(1,keepdim=True) 165 | conf_losses = torch.stack(conf_losses, 0).unsqueeze(0).sum(1,keepdim=True) 166 | cls_losses = torch.stack(cls_losses, 0).unsqueeze(0).sum(1,keepdim=True) 167 | loss_dict = dict( 168 | losses = losses, 169 | anchor_losses = anchor_losses, 170 | iou_losses = iou_losses, 171 | l1_losses = l1_losses, 172 | conf_losses = conf_losses, 173 | cls_losses = cls_losses, 174 | ) 175 | return loss_dict 176 | else: 177 | if self.vis: 178 | return torch.cat(output, 1), fuse_wegihts, fuse_fs 179 | else: 180 | return 
torch.cat(output, 1) 181 | 182 | -------------------------------------------------------------------------------- /utils/voc_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | import sys 4 | from tqdm import tqdm 5 | 6 | from pycocotools.cocoeval import COCOeval 7 | from torch.autograd import Variable 8 | 9 | from dataset.vocdataset import * 10 | from dataset.data_augment import ValTransform 11 | from utils.utils import * 12 | from utils import distributed_util 13 | from utils.vis_utils import make_vis, make_pred_vis 14 | 15 | import time 16 | 17 | #DEBUG = True 18 | DEBUG = False 19 | 20 | def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): 21 | all_predictions = distributed_util.scatter_gather(predictions_per_gpu) 22 | if not distributed_util.is_main_process(): 23 | return 24 | # merge the list of dicts 25 | predictions = {} 26 | for p in all_predictions: 27 | predictions.update(p) 28 | # convert a dict where the key is the index in a list 29 | image_ids = list(sorted(predictions.keys())) 30 | if len(image_ids) != image_ids[-1] + 1: 31 | print('num_imgs: ',len(image_ids)) 32 | print('last img_id: ',image_ids[-1]) 33 | print( 34 | "Number of images that were gathered from multiple processes is not " 35 | "a contiguous set. Some images might be missing from the evaluation" 36 | ) 37 | 38 | # convert to a list 39 | predictions = [predictions[i] for i in image_ids] 40 | return predictions 41 | 42 | 43 | class VOCEvaluator(): 44 | """ 45 | COCO AP Evaluation class. 46 | All the data in the val2017 dataset are processed \ 47 | and evaluated by COCO API. 48 | """ 49 | def __init__(self, data_dir, img_size, confthre, nmsthre,vis=False): 50 | """ 51 | Args: 52 | data_dir (str): dataset root directory 53 | img_size (int): image size after preprocess. images are resized \ 54 | to squares whose shape is (img_size, img_size). 55 | confthre (float): 56 | confidence threshold ranging from 0 to 1, \ 57 | which is defined in the config file. 58 | nmsthre (float): 59 | IoU threshold of non-max supression ranging from 0 to 1. 60 | """ 61 | test_sets = [('2007', 'test'),] 62 | self.dataset = VOCDetection( 63 | root=data_dir, 64 | image_sets = test_sets, 65 | input_dim=img_size, 66 | preproc = ValTransform(rgb_means=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)),) 67 | self.num_images = len(self.dataset) 68 | self.dataloader = torch.utils.data.DataLoader( 69 | self.dataset, batch_size=1, shuffle=False, num_workers=0) 70 | self.img_size = img_size 71 | self.confthre = confthre 72 | self.nmsthre = nmsthre 73 | self.vis=vis 74 | 75 | def evaluate(self, model, half=False): 76 | """ 77 | COCO average precision (AP) Evaluation. Iterate inference on the test dataset 78 | and the results are evaluated by COCO API. 
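A call sketch (thresholds follow eval.py; model is an already-loaded network):

    evaluator = VOCEvaluator(data_dir='data/VOC/', img_size=(416, 416),
                             confthre=0.001, nmsthre=0.65)
    mAP50, mAP70 = evaluator.evaluate(model, half=False)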
79 | Args: 80 | model : model object 81 | Returns: 82 | ap50_95 (float) : calculated COCO AP for IoU=50:95 83 | ap50 (float) : calculated COCO AP for IoU=50 84 | """ 85 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 86 | model = model.module 87 | model.eval() 88 | cuda = torch.cuda.is_available() 89 | if half: 90 | Tensor = torch.cuda.HalfTensor if cuda else torch.HalfTensor 91 | else: 92 | Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor 93 | 94 | ids = [] 95 | data_dict = [] 96 | dataiterator = iter(self.dataloader) 97 | img_num = 0 98 | indices = list(range(self.num_images)) 99 | dis_indices = indices[distributed_util.get_rank()::distributed_util.get_world_size()] 100 | progress_bar = tqdm if distributed_util.is_main_process() else iter 101 | num_classes = 20 102 | predictions = {} 103 | 104 | if distributed_util.is_main_process(): 105 | inference_time=0 106 | nms_time=0 107 | n_samples=len(dis_indices) 108 | 109 | for i in progress_bar(dis_indices): 110 | img, _, info_img, id_ = self.dataset[i] # load a batch 111 | info_img = [float(info) for info in info_img] 112 | ids.append(id_) 113 | with torch.no_grad(): 114 | img = Variable(img.type(Tensor).unsqueeze(0)) 115 | 116 | if distributed_util.is_main_process() and i > 9: 117 | start=time.time() 118 | 119 | if self.vis: 120 | outputs,fuse_weights,fused_f = model(img) 121 | else: 122 | outputs = model(img) 123 | 124 | if distributed_util.is_main_process() and i > 9: 125 | infer_end=time.time() 126 | inference_time += (infer_end-start) 127 | 128 | outputs = postprocess( 129 | outputs, 20, self.confthre, self.nmsthre) 130 | 131 | 132 | if distributed_util.is_main_process() and i > 9: 133 | nms_end=time.time() 134 | nms_time +=(nms_end-infer_end) 135 | 136 | if outputs[0] is None: 137 | predictions[i] = (None, None, None) 138 | continue 139 | outputs = outputs[0].cpu().data 140 | 141 | bboxes = outputs[:, 0:4] 142 | bboxes[:, 0::2] *= info_img[0] / self.img_size[0] 143 | bboxes[:, 1::2] *= info_img[1] / self.img_size[1] 144 | cls = outputs[:, 6] 145 | scores = outputs[:, 4]* outputs[:,5] 146 | predictions[i] = (bboxes, cls, scores) 147 | 148 | if self.vis: 149 | o_img,_,_,_ = self.dataset.pull_item(i) 150 | make_vis('VOC', i, o_img, fuse_weights, fused_f) 151 | class_names = self.dataset._classes 152 | 153 | bbox = bboxes.clone() 154 | bbox[:, 2] = bbox[:,2] - bbox[:,0] 155 | bbox[:, 3] = bbox[:,3] - bbox[:,1] 156 | 157 | make_pred_vis('VOC', i, o_img, class_names, bbox, cls, scores) 158 | 159 | if DEBUG and distributed_util.is_main_process(): 160 | o_img,_,_,_ = self.dataset.pull_item(i) 161 | class_names = self.dataset._classes 162 | bbox = bboxes.clone() 163 | bbox[:, 2] = bbox[:,2] - bbox[:,0] 164 | bbox[:, 3] = bbox[:,3] - bbox[:,1] 165 | make_pred_vis('VOC', i, o_img, class_names, bbox, cls, scores) 166 | 167 | distributed_util.synchronize() 168 | predictions = _accumulate_predictions_from_multiple_gpus(predictions) 169 | if not distributed_util.is_main_process(): 170 | return 0, 0 171 | 172 | 173 | print('Main process Evaluating...') 174 | 175 | a_infer_time = 1000*inference_time / (n_samples-10) 176 | a_nms_time= 1000*nms_time / (n_samples-10) 177 | 178 | print('Average forward time: %.2f ms, Average NMS time: %.2f ms, Average inference time: %.2f ms' %(a_infer_time, \ 179 | a_nms_time, (a_infer_time+a_nms_time))) 180 | 181 | all_boxes = [[[] for _ in range(self.num_images)] 182 | for _ in range(num_classes)] 183 | for img_num in range(self.num_images): 184 | bboxes, cls, scores = 
predictions[img_num] 185 | if bboxes is None: 186 | for j in range(num_classes): 187 | all_boxes[j][img_num] = np.empty([0,5],dtype=np.float32) 188 | continue 189 | for j in range(num_classes): 190 | mask_c = (cls == j) 191 | if sum(mask_c) ==0: 192 | all_boxes[j][img_num] = np.empty([0,5],dtype=np.float32) 193 | continue 194 | 195 | c_dets = torch.cat((bboxes, scores.unsqueeze(1)),dim=1) 196 | all_boxes[j][img_num] = c_dets[mask_c].numpy() 197 | 198 | sys.stdout.write('im_eval: {:d}/{:d} \r'.format(img_num+1, self.num_images)) 199 | sys.stdout.flush() 200 | 201 | with tempfile.TemporaryDirectory() as tempdir: 202 | mAP50, mAP70 = self.dataset.evaluate_detections(all_boxes, tempdir) 203 | return mAP50,mAP70 204 | 205 | -------------------------------------------------------------------------------- /utils/cocoapi_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | import sys 4 | from tqdm import tqdm 5 | 6 | from pycocotools.cocoeval import COCOeval 7 | from torch.autograd import Variable 8 | 9 | from dataset.cocodataset import * 10 | from dataset.data_augment import ValTransform 11 | from utils.utils import * 12 | from utils import distributed_util 13 | from utils.vis_utils import make_vis, make_pred_vis 14 | import time 15 | import apex 16 | 17 | DEBUG =False 18 | 19 | def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): 20 | all_predictions = distributed_util.scatter_gather(predictions_per_gpu) 21 | if not distributed_util.is_main_process(): 22 | return 23 | # merge the list of dicts 24 | predictions = [] 25 | for p in all_predictions: 26 | for a in p: 27 | predictions.append(a) 28 | 29 | return predictions 30 | 31 | class COCOAPIEvaluator(): 32 | """ 33 | COCO AP Evaluation class. 34 | All the data in the val2017 dataset are processed \ 35 | and evaluated by COCO API. 36 | """ 37 | def __init__(self, data_dir, img_size, confthre, nmsthre, testset=False, voc=False, vis=False): 38 | """ 39 | Args: 40 | data_dir (str): dataset root directory 41 | img_size (int): image size after preprocess. images are resized \ 42 | to squares whose shape is (img_size, img_size). 43 | confthre (float): 44 | confidence threshold ranging from 0 to 1, \ 45 | which is defined in the config file. 46 | nmsthre (float): 47 | IoU threshold of non-max supression ranging from 0 to 1. 48 | """ 49 | json_f = 'instances_val2017.json' 50 | name='val2017' 51 | if testset: 52 | json_f = 'image_info_test-dev2017.json' 53 | name='test2017' 54 | if voc: 55 | json_f = 'pascal_test2007.json' 56 | 57 | self.testset= testset 58 | self.dataset = COCODataset(data_dir=data_dir, 59 | img_size=img_size, 60 | json_file=json_f, 61 | preproc = ValTransform(rgb_means=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)), 62 | name=name, 63 | voc = voc) 64 | 65 | self.num_images = len(self.dataset) 66 | self.dataloader = torch.utils.data.DataLoader( 67 | self.dataset, batch_size=1, shuffle=False, num_workers=0) 68 | self.img_size = img_size 69 | self.confthre = confthre 70 | self.nmsthre = nmsthre 71 | self.voc = voc 72 | self.vis = vis 73 | 74 | def evaluate(self, model, half=False, distributed=False): 75 | """ 76 | COCO average precision (AP) Evaluation. Iterate inference on the test dataset 77 | and the results are evaluated by COCO API. 
78 | Args: 79 | model : model object 80 | Returns: 81 | ap50_95 (float) : calculated COCO AP for IoU=50:95 82 | ap50 (float) : calculated COCO AP for IoU=50 83 | """ 84 | if isinstance(model, apex.parallel.DistributedDataParallel): 85 | model = model.module 86 | distributed=True 87 | 88 | model=model.eval() 89 | cuda = torch.cuda.is_available() 90 | if half: 91 | Tensor = torch.cuda.HalfTensor if cuda else torch.HalfTensor 92 | else: 93 | Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor 94 | ids = [] 95 | data_dict = [] 96 | img_num = 0 97 | 98 | indices = list(range(self.num_images)) 99 | if distributed: 100 | dis_indices = indices[distributed_util.get_rank()::distributed_util.get_world_size()] 101 | else: 102 | dis_indices = indices 103 | progress_bar = tqdm if distributed_util.is_main_process() else iter 104 | num_classes = 80 if not self.voc else 20 105 | 106 | inference_time=0 107 | nms_time=0 108 | n_samples=len(dis_indices)-10 109 | 110 | for k, i in enumerate(progress_bar(dis_indices)): 111 | img, _, info_img, id_ = self.dataset[i] # load a batch 112 | info_img = [float(info) for info in info_img] 113 | id_ = int(id_) 114 | ids.append(id_) 115 | with torch.no_grad(): 116 | img = Variable(img.type(Tensor).unsqueeze(0)) 117 | if k > 9: 118 | start=time.time() 119 | 120 | if self.vis: 121 | outputs,fuse_weights,fused_f = model(img) 122 | else: 123 | outputs = model(img) 124 | 125 | if k > 9: 126 | infer_end=time.time() 127 | inference_time += (infer_end-start) 128 | 129 | outputs = postprocess( 130 | outputs, num_classes, self.confthre, self.nmsthre) 131 | 132 | if k > 9: 133 | nms_end=time.time() 134 | nms_time +=(nms_end-infer_end) 135 | 136 | if outputs[0] is None: 137 | continue 138 | outputs = outputs[0].cpu().data 139 | 140 | bboxes = outputs[:, 0:4] 141 | bboxes[:, 0::2] *= info_img[0] / self.img_size[0] 142 | bboxes[:, 1::2] *= info_img[1] / self.img_size[1] 143 | bboxes[:, 2] = bboxes[:,2] - bboxes[:,0] 144 | bboxes[:, 3] = bboxes[:,3] - bboxes[:,1] 145 | cls = outputs[:, 6] 146 | scores = outputs[:, 4]* outputs[:,5] 147 | for ind in range(bboxes.shape[0]): 148 | label = self.dataset.class_ids[int(cls[ind])] 149 | A = {"image_id": id_, "category_id": label, "bbox": bboxes[ind].numpy().tolist(), 150 | "score": scores[ind].numpy().item(), "segmentation": []} # COCO json format 151 | data_dict.append(A) 152 | 153 | if self.vis: 154 | o_img,_,_,_ = self.dataset.pull_item(i) 155 | make_vis('COCO', i, o_img, fuse_weights, fused_f) 156 | class_names = self.dataset._classes 157 | make_pred_vis('COCO', i, o_img, class_names, bboxes, cls, scores) 158 | 159 | if DEBUG and distributed_util.is_main_process(): 160 | o_img,_ = self.dataset.pull_item(i) 161 | class_names = self.dataset._classes 162 | make_pred_vis('COCO', i, o_img, class_names, bboxes, cls, scores) 163 | 164 | if distributed: 165 | distributed_util.synchronize() 166 | data_dict = _accumulate_predictions_from_multiple_gpus(data_dict) 167 | inference_time = torch.FloatTensor(1).type(Tensor).fill_(inference_time) 168 | nms_time = torch.FloatTensor(1).type(Tensor).fill_(nms_time) 169 | n_samples = torch.LongTensor(1).type(Tensor).fill_(n_samples) 170 | distributed_util.synchronize() 171 | torch.distributed.reduce(inference_time, dst=0) 172 | torch.distributed.reduce(nms_time, dst=0) 173 | torch.distributed.reduce(n_samples, dst=0) 174 | inference_time = inference_time.item() 175 | nms_time = nms_time.item() 176 | n_samples = n_samples.item() 177 | 178 | if not distributed_util.is_main_process(): 179 | return 0, 0 
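        # NOTE (main-process-only path): only the main process gets past the early
        # return above. It averages the reduced timing counters over n_samples
        # (the first 10 warm-up iterations of each rank are already excluded),
        # dumps the gathered detections in data_dict to a COCO-format JSON file,
        # and scores them with pycocotools' COCOeval; stats[0] is AP@[0.5:0.95]
        # and stats[1] is AP@0.5, which are the two values this method returns.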
180 | 181 | 182 | print('Main process Evaluating...') 183 | 184 | annType = ['segm', 'bbox', 'keypoints'] 185 | a_infer_time = 1000*inference_time / (n_samples) 186 | a_nms_time= 1000*nms_time / (n_samples) 187 | 188 | print('Average forward time: %.2f ms, Average NMS time: %.2f ms, Average inference time: %.2f ms' %(a_infer_time, \ 189 | a_nms_time, (a_infer_time+a_nms_time))) 190 | 191 | # Evaluate the Dt (detection) json comparing with the ground truth 192 | if len(data_dict) > 0: 193 | cocoGt = self.dataset.coco 194 | # workaround: temporarily write data to json file because pycocotools can't process dict in py36. 195 | if self.testset: 196 | json.dump(data_dict, open('yolov3_2017.json', 'w')) 197 | cocoDt = cocoGt.loadRes('yolov3_2017.json') 198 | else: 199 | _, tmp = tempfile.mkstemp() 200 | json.dump(data_dict, open(tmp, 'w')) 201 | cocoDt = cocoGt.loadRes(tmp) 202 | cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1]) 203 | cocoEval.evaluate() 204 | cocoEval.accumulate() 205 | cocoEval.summarize() 206 | return cocoEval.stats[0], cocoEval.stats[1] 207 | else: 208 | return 0, 0 209 | 210 | -------------------------------------------------------------------------------- /models/yolov3_mobilev2.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from .network_blocks import * 3 | from .yolov3_head import YOLOv3Head 4 | 5 | 6 | def create_yolov3_mobilenet_v2(num_classes, width_mult=1.0, inverted_residual_setting=None, round_nearest=8): 7 | """ 8 | MobileNet V2 main class 9 | 10 | Args: 11 | num_classes (int): Number of classes 12 | width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount 13 | inverted_residual_setting: Network structure 14 | round_nearest (int): Round the number of channels in each layer to be a multiple of this number 15 | Set to 1 to turn off rounding 16 | """ 17 | block = InvertedResidual 18 | input_channel = 32 19 | last_channel = 1280 20 | 21 | if inverted_residual_setting is None: 22 | inverted_residual_setting = [ 23 | # t, c, n, s 24 | [1, 16, 1, 1], 25 | [6, 24, 2, 2], 26 | [6, 32, 3, 2], 27 | [6, 64, 4, 2], 28 | [6, 96, 3, 1], 29 | [6, 160, 3, 2], 30 | [6, 320, 1, 1], 31 | ] 32 | 33 | # only check the first element, assuming user knows t,c,n,s are required 34 | if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: 35 | raise ValueError("inverted_residual_setting should be non-empty " 36 | "or a 4-element list, got {}".format(inverted_residual_setting)) 37 | 38 | # building first layer 39 | input_channel = make_divisible(input_channel * width_mult, round_nearest) 40 | last_channel = make_divisible(last_channel * max(1.0, width_mult), round_nearest) 41 | mlist = nn.ModuleList() 42 | mlist.append(ConvBNReLU(3, input_channel, stride=2)) 43 | # building inverted residual blocks 44 | for t, c, n, s in inverted_residual_setting: 45 | output_channel =make_divisible(c * width_mult, round_nearest) 46 | for i in range(n): 47 | stride = s if i == 0 else 1 48 | mlist.append(block(input_channel, output_channel, stride, expand_ratio=t)) 49 | input_channel = output_channel 50 | # building last several layers 51 | mlist.append(ConvBNReLU(input_channel, last_channel, kernel_size=1)) #18 52 | 53 | # YOLOv3 54 | mlist.append(ressepblock(last_channel, 1024, in_ch=512, shortcut=False)) #19 55 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1,leaky=False)) #20 56 | # SPP Layer 57 | mlist.append(SPPLayer()) #21 58 | 59 | 
mlist.append(add_conv(in_ch=2048, out_ch=512, ksize=1, stride=1, leaky=False)) #22 60 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1,leaky=False)) #23 61 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #24 62 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1, leaky=False)) #25 (17) 63 | 64 | # 1st yolo branch 65 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1, leaky=False)) #26 66 | mlist.append(upsample(scale_factor=2, mode='nearest')) #27 67 | mlist.append(add_conv(in_ch=352, out_ch=256, ksize=1, stride=1,leaky=False)) #28 68 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1,leaky=False)) #29 69 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #30 70 | mlist.append(ressepblock(512, 512, in_ch=256,shortcut=False)) #31 71 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1,leaky=False)) #32 72 | # 2nd yolo branch 73 | 74 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1,leaky=False)) #33 75 | mlist.append(upsample(scale_factor=2, mode='nearest')) #34 76 | mlist.append(add_conv(in_ch=160, out_ch=128, ksize=1, stride=1,leaky=False)) #35 77 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1,leaky=False)) #36 78 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #37 79 | mlist.append(ressepblock(256, 256, in_ch=128,shortcut=False)) #38 80 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1,leaky=False)) #39 81 | 82 | return mlist 83 | 84 | 85 | class YOLOv3(nn.Module): 86 | """ 87 | YOLOv3 model module. The module list is defined by create_yolov3_modules function. \ 88 | The network returns loss values from three YOLO layers during training \ 89 | and detection results during test. 90 | """ 91 | def __init__(self, num_classes = 80, ignore_thre=0.7, label_smooth = False, rfb=False, vis=False, asff=False): 92 | """ 93 | Initialization of YOLOv3 class. 94 | Args: 95 | ignore_thre (float): used in YOLOLayer. 96 | """ 97 | super(YOLOv3, self).__init__() 98 | self.module_list = create_yolov3_mobilenet_v2(num_classes) 99 | 100 | if asff: 101 | self.level_0_conv =ASFFmobile(level=0,rfb=rfb,vis=vis) 102 | else: 103 | self.level_0_conv =add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1,leaky=False) 104 | 105 | self.level_0_header = YOLOv3Head(anch_mask=[6, 7, 8], n_classes=num_classes, stride=32, in_ch=1024, 106 | ignore_thre=ignore_thre,label_smooth = label_smooth, rfb=rfb, sep=True) 107 | 108 | if asff: 109 | self.level_1_conv =ASFFmobile(level=1,rfb=rfb,vis=vis) 110 | else: 111 | self.level_1_conv =add_conv(in_ch=256, out_ch=512, ksize=3, stride=1,leaky=False) 112 | 113 | self.level_1_header = YOLOv3Head(anch_mask=[3, 4, 5], n_classes=num_classes, stride=16, in_ch=512, 114 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb, sep=True) 115 | 116 | if asff: 117 | self.level_2_conv =ASFFmobile(level=2,rfb=rfb,vis=vis) 118 | else: 119 | self.level_2_conv =add_conv(in_ch=128, out_ch=256, ksize=3, stride=1,leaky=False) 120 | 121 | self.level_2_header = YOLOv3Head(anch_mask=[0, 1, 2], n_classes=num_classes, stride=8, in_ch=256, 122 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb, sep=True) 123 | self.asff = asff 124 | 125 | def forward(self, x, targets=None, epoch=0): 126 | """ 127 | Forward path of YOLOv3. 128 | Args: 129 | x (torch.Tensor) : input data whose shape is :math:`(N, C, H, W)`, \ 130 | where N, C are batchsize and num. of channels. 
131 | targets (torch.Tensor) : label array whose shape is :math:`(N, 50, 5)` 132 | 133 | Returns: 134 | training: 135 | output (torch.Tensor): loss tensor for backpropagation. 136 | test: 137 | output (torch.Tensor): concatenated detection results. 138 | """ 139 | 140 | train = targets is not None 141 | output = [] 142 | anchor_losses= [] 143 | iou_losses = [] 144 | l1_losses = [] 145 | conf_losses = [] 146 | cls_losses = [] 147 | route_layers = [] 148 | 149 | for i, module in enumerate(self.module_list): 150 | 151 | # yolo layers 152 | x = module(x) 153 | 154 | # route layers 155 | if i in [6, 13, 25, 32, 39]: 156 | route_layers.append(x) 157 | if i == 27: 158 | x = torch.cat((x, route_layers[1]), 1) 159 | if i == 34: 160 | x = torch.cat((x, route_layers[0]), 1) 161 | 162 | 163 | for l in range(3): 164 | conver = getattr(self, 'level_{}_conv'.format(l)) 165 | header = getattr(self, 'level_{}_header'.format(l)) 166 | if self.asff: 167 | f_conv= conver(route_layers[2],route_layers[3],route_layers[4]) 168 | else: 169 | f_conv = conver(route_layers[l+2]) 170 | if train: 171 | x, anchor_loss, iou_loss, l1_loss, conf_loss, cls_loss = header(f_conv, targets) 172 | anchor_losses.append(anchor_loss) 173 | iou_losses.append(iou_loss) 174 | l1_losses.append(l1_loss) 175 | conf_losses.append(conf_loss) 176 | cls_losses.append(cls_loss) 177 | else: 178 | x = header(f_conv) 179 | 180 | output.append(x) 181 | 182 | if train: 183 | losses = torch.stack(output, 0).unsqueeze(0).sum(1,keepdim=True) 184 | anchor_losses = torch.stack(anchor_losses, 0).unsqueeze(0).sum(1,keepdim=True) 185 | iou_losses = torch.stack(iou_losses, 0).unsqueeze(0).sum(1,keepdim=True) 186 | l1_losses = torch.stack(l1_losses, 0).unsqueeze(0).sum(1,keepdim=True) 187 | conf_losses = torch.stack(conf_losses, 0).unsqueeze(0).sum(1,keepdim=True) 188 | cls_losses = torch.stack(cls_losses, 0).unsqueeze(0).sum(1,keepdim=True) 189 | loss_dict = dict( 190 | losses = losses, 191 | anchor_losses = anchor_losses, 192 | iou_losses = iou_losses, 193 | l1_losses = l1_losses, 194 | conf_losses = conf_losses, 195 | cls_losses = cls_losses, 196 | ) 197 | return loss_dict 198 | else: 199 | return torch.cat(output, 1) 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Spatial Fusion for Single-Shot Object Detection 2 | 3 | By Songtao Liu, Di Huang, Yunhong Wang 4 | 5 | ### Introduction 6 | In this work, we propose a novel and data driven strategy for pyramidal feature fusion, referred to as adaptively spatial feature fusion (ASFF). It learns the way to spatially filter conflictive information to suppress the inconsistency, thus improving the scale-invariance of features, and introduces nearly free inference overhead. For more details, please refer to our [arXiv paper](https://arxiv.org/abs/1911.09516). 7 | 8 | 9 | 10 | ### Updates: 11 | - Add MobileNet V2! 12 | * The previous models actually are all trained with the wrong anchor setting, we fix the error on mobileNet model. 13 | * We currently not support rfb, dropblock and Feature Adaption for mobileNet V2. 14 | * FP16 training for mobileNet is not working now. I didn't figure it out. 15 | * FP16 testing for mobileNet drops about 0.2 mAP. 
16 | 17 | - Add a demo.py file 18 | 19 | - Faster NMS (adopt official implementation) 20 | 21 | ### COCO 22 | 23 | | System | *test-dev mAP* | **Time** (V100) | **Time** (2080ti)| 24 | |:-------|:-----:|:-------:|:-------:| 25 | | [YOLOv3 608](http://pjreddie.com/darknet/yolo/) | 33.0 | 20ms| 26ms| 26 | | YOLOv3 608+ [BoFs](https://arxiv.org/abs/1902.04103) | 37.0 | 20ms | 26ms| 27 | | YOLOv3 608 (our baseline) | **38.8** | 20ms | 26ms| 28 | | YOLOv3 608+ ASFF | **40.6** | 22ms | 30ms| 29 | | YOLOv3 608+ ASFF\* | **42.4** | 22ms | 30ms| 30 | | YOLOv3 800+ ASFF\* | **43.9** | 34ms | 38ms| 31 | | YOLOv3 MobileNetV1 416 + [BoFs](https://arxiv.org/abs/1902.04103)| 28.6 | - | 22 ms| 32 | | YOLOv3 MobileNetV2 416 (our baseline) | 29.0 | - | 22 ms| 33 | | YOLOv3 MobileNetV2 416 +ASFF | **30.6** | - | 24 ms| 34 | 35 | 36 | ### Citing 37 | Please cite our paper in your publications if it helps your research: 38 | 39 | @article{liu2019asff, 40 | title = {Learning Spatial Fusion for Single-Shot Object Detection}, 41 | author = {Songtao Liu, Di Huang and Yunhong Wang}, 42 | booktitle = {arxiv preprint arXiv:1911.09516}, 43 | year = {2019} 44 | } 45 | 46 | ### Contents 47 | 1. [Installation](#installation) 48 | 2. [Datasets](#datasets) 49 | 3. [Training](#training) 50 | 4. [Evaluation](#evaluation) 51 | 5. [Models](#models) 52 | 53 | ## Installation 54 | - Install [PyTorch-1.3.1](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 55 | - Clone this repository. 56 | * Note: We currently only support PyTorch-1.0.0+ and Python 3+. 57 | - Compile the DCN layer (ported from [DCNv2 implementation](https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0)): 58 | ```Shell 59 | ./make.sh 60 | ``` 61 | 62 | ### Prerequisites 63 | - We also use [apex](https://github.com/NVIDIA/apex), numpy, opencv, tqdm, pyyaml, matplotlib, scikit-image... 64 | * Note: We use apex for distributed training and synchronized batch normalization. For FP16 training, since the current apex version have some [issues](https://github.com/NVIDIA/apex/issues/318), we use the old version of FP16_Optimizer, and split the code in ./utils/fp_utils. 65 | 66 | - We also support tensorboard if you have installed it. 67 | 68 | ### Demo 69 | 70 | ```Shell 71 | python demo.py -i /path/to/your/image \ 72 | --cfg config/yolov3_baseline.cfg -d COCO \ 73 | --checkpoint /path/to/you/weights --half --asff --rfb -s 608 74 | ``` 75 | - Note: 76 | * -i, --img: image path. 77 | * --cfg: config files. 78 | * -d: choose datasets, COCO or VOC. 79 | * -c, --checkpoint: pretrained weights. 80 | * --half: FP16 testing. 81 | * -s: evaluation image size, from 320 to 608 as in YOLOv3. 82 | 83 | 84 | ## Datasets 85 | Note: We currently only support [COCO](http://mscoco.org/) and [VOC](http://host.robots.ox.ac.uk/pascal/VOC/). 86 | To make things easy, we provide simple COCO and VOC dataset loader that inherits `torch.utils.data.Dataset` making it fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html). 87 | 88 | Moreover, we also implement the Mix-up strategy in [BoFs](https://arxiv.org/abs/1902.04103) and distributed random resizing in YOLov3. 89 | ### COCO Dataset 90 | Install the MS COCO dataset at /path/to/coco from [official website](http://mscoco.org/), default is ./data/COCO, and a soft-link is recommended. 
91 | ``` 92 | ln -s /path/to/coco ./data/COCO 93 | ``` 94 | 95 | It should have this basic structure 96 | ```Shell 97 | $COCO/ 98 | $COCO/annotations/ 99 | $COCO/images/ 100 | $COCO/images/test2017/ 101 | $COCO/images/train2017/ 102 | $COCO/images/val2017/ 103 | ``` 104 | The current COCO dataset has released new *train2017* and *val2017* sets, and we defaultly train our model on *train2017* and evaluate on *val2017*. 105 | 106 | ### VOC Dataset 107 | Install the VOC dataset as ./data/VOC. We also recommend a soft-link: 108 | ``` 109 | ln -s /path/to/VOCdevkit ./data/VOC 110 | ``` 111 | 112 | ## Training 113 | 114 | - First download the mix-up pretrained [Darknet-53](https://arxiv.org/abs/1902.04103) PyTorch base network weights at: https://drive.google.com/open?id=1phqyYhV1K9KZLQZH1kENTAPprLBmymfP 115 | or from our [BaiduYun Driver](https://pan.baidu.com/s/19PaXl6p9vXHG2ZuGqtfLOg) 116 | 117 | - For MobileNetV2, we use the pytorch official [weights](https://drive.google.com/open?id=1LwMd9lK6YqGM8Yjf_ClBT2MG1-PHgUGa) (change the key name to fit our code), or from our [BaiduYun Driver](https://pan.baidu.com/s/12eScI6YNBvkVX0286cMEZA) 118 | 119 | - By default, we assume you have downloaded the file in the `ASFF/weights` dir: 120 | 121 | - Since random resizing consumes much more GPU memory, we implement FP16 training with an old version of apex. 122 | 123 | - We currently **ONLY** test the code with distributed training on multiple GPUs (10 2080ti or 4 Tesla V100). 124 | 125 | - To train YOLOv3 baseline (ours) using the train script simply specify the parameters listed in `main.py` as a flag or manually change them on config/yolov3_baseline.cfg: 126 | ```Shell 127 | python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} main.py \ 128 | --cfg config/yolov3_baseline.cfg -d COCO --tfboard --distributed --ngpu 10 \ 129 | --checkpoint weights/darknet53_feature_mx.pth --start_epoch 0 --half --log_dir log/COCO -s 608 130 | ``` 131 | - Note: 132 | * --cfg: config files. 133 | * --tfboard: use tensorboard. 134 | * --distributed: distributed training (we only test the code with distributed training) 135 | * -d: choose datasets, COCO or VOC. 136 | * --ngpu: number of GPUs. 137 | * -c, --checkpoint: pretrained weights or resume weights. You can pick-up training from a checkpoint by specifying the path as one of the training parameters (again, see `main.py` for options) 138 | 139 | * --start_epoch: used for resume training. 140 | * --half: FP16 training. 141 | * --log_dir: log dir for tensorboard. 142 | * -s: evaluation image size, from 320 to 608 as in YOLOv3. 143 | 144 | - To train YOLOv3 with ASFF or ASFF\*, you only need add some addional flags: 145 | ```Shell 146 | python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} main.py \ 147 | --cfg config/yolov3_baseline.cfg -d COCO --tfboard --distributed --ngpu 10 \ 148 | --checkpoint weights/darknet53_feature_mx.pth --start_epoch 0 --half --asff --rfb --dropblock \ 149 | --log_dir log/COCO_ASFF -s 608 150 | ``` 151 | - Note: 152 | * --asff: add ASFF module on YOLOv3. 153 | * --rfb: use [RFB](https://github.com/ruinmessi/RFBNet) moduel on ASFF. 154 | * --dropblock: use [DropBlock](https://arxiv.org/abs/1810.12890). 
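- To resume an interrupted run, reuse the same command but point `-c/--checkpoint` at the weights saved by the trainer and set `--start_epoch` to the epoch you stopped at. A minimal sketch (the checkpoint path and epoch number below are placeholders for your own run):
```Shell
python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} main.py \
        --cfg config/yolov3_baseline.cfg -d COCO --tfboard --distributed --ngpu 10 \
        --checkpoint /path/to/your/saved_checkpoint.pth --start_epoch 100 --half --asff --rfb --dropblock \
        --log_dir log/COCO_ASFF -s 608
```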
155 | 156 | ## Evaluation 157 | To evaluate a trained network, you can use the following command: 158 | 159 | ```Shell 160 | python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} eval.py \ 161 | --cfg config/yolov3_baseline.cfg -d COCO --distributed --ngpu 10 \ 162 | --checkpoint /path/to/you/weights --half --asff --rfb -s 608 163 | ``` 164 | - Note: 165 | * --vis: Visualization of ASFF. 166 | * --testset: evaluate on COCO *test-dev*. 167 | * -s: evaluation image size. 168 | 169 | By default, it will directly output the mAP results on COCO *val2017* or VOC *test 2007*. 170 | 171 | ## Models 172 | * yolov3 mobilenetv2 (ours)[weights](https://drive.google.com/open?id=1XGXJPXHIroimEuW8oujbInNapuEDALOB) [baiduYun](https://pan.baidu.com/s/100TivomBLDTRZSA1pkGiNA) [training tfboard log](https://pan.baidu.com/s/1P_00LAUvV-VOzxqoIxC_Yw) 173 | 174 | * yolov3 mobilenetv2 +asff [weights](https://drive.google.com/open?id=1cC-xGoaw3Wu5hYd3iXEq6xrAn4U_dW-w) [baiduYun](https://pan.baidu.com/s/1JxX8mYkljk1ap2s4zpLrSg) [training tfboard log](https://pan.baidu.com/s/1R2YL9uZ9baQWR6aht0qVlQ) 175 | 176 | * yolov3_baseline (ours) [weights](https://drive.google.com/open?id=1RbjUQbNxl4cEbk-6jFkFnOHRukJY5EQk) [baiduYun](https://pan.baidu.com/s/131JhlaOBbeL9l4tqiJO9yA) [training tfboard log](https://pan.baidu.com/s/1GcpVnq7mhIsrk8zrJ9FF2g) 177 | 178 | * yolov3_asff [weights](https://drive.google.com/open?id=1Dyf8ZEga_VT2O3_c5nrFJA5uON1aSJK-) [baiduYun](https://pan.baidu.com/s/1a-eQZ0kDpsnUooD4RtRdxg) [training tfboard log](https://pan.baidu.com/s/1MeMkAWwv1SFsVbvsTpj_xQ) 179 | 180 | * yolov3_asff\* (320-608) [weights](https://drive.google.com/open?id=1N668Za8OBbJbUStYde0ml9SZdM7tabXy) [baiduYun](https://pan.baidu.com/s/1d9hOQBj20HCy51qWbonxMQ) 181 | 182 | * yolov3_asff\* (480-800) [weights](https://drive.google.com/open?id=18N4_nNVqYbjawerEHQnwJGPcRvcLOe06) [baiduYun](https://pan.baidu.com/s/1HERhiP4vmUekxxm5KQrX8g) 183 | 184 | 185 | -------------------------------------------------------------------------------- /dataset/dataloading.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | from functools import wraps 4 | import torch 5 | from torch.utils.data.dataset import Dataset as torchDataset 6 | from torch.utils.data.sampler import BatchSampler as torchBatchSampler 7 | from torch.utils.data.dataloader import DataLoader as torchDataLoader 8 | from torch.utils.data.dataloader import default_collate 9 | 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class Dataset(torchDataset): 15 | """ This class is a subclass of the base :class:`torch.utils.data.Dataset`, 16 | that enables on the fly resizing of the ``input_dim`` with a :class:`lightnet.data.DataLoader`. 17 | 18 | Args: 19 | input_dimension (tuple): (width,height) tuple with default dimensions of the network 20 | """ 21 | def __init__(self, input_dimension): 22 | super().__init__() 23 | self.__input_dim = input_dimension[:2] 24 | 25 | @property 26 | def input_dim(self): 27 | """ Dimension that can be used by transforms to set the correct image size, etc. 28 | This allows transforms to have a single source of truth for the input dimension of the network. 
29 | 30 | Return: 31 | list: Tuple containing the current width,height 32 | """ 33 | if hasattr(self, '_input_dim'): 34 | return self._input_dim 35 | return self.__input_dim 36 | 37 | @staticmethod 38 | def resize_getitem(getitem_fn): 39 | """ Decorator method that needs to be used around the ``__getitem__`` method. |br| 40 | This decorator enables the on the fly resizing of the ``input_dim`` with our :class:`~lightnet.data.DataLoader` class. 41 | 42 | Example: 43 | >>> class CustomSet(ln.data.Dataset): 44 | ... def __len__(self): 45 | ... return 10 46 | ... @ln.data.Dataset.resize_getitem 47 | ... def __getitem__(self, index): 48 | ... # Should return (image, anno) but here we return input_dim 49 | ... return self.input_dim 50 | >>> data = CustomSet((200,200)) 51 | >>> data[0] 52 | (200, 200) 53 | >>> data[(480,320), 0] 54 | (480, 320) 55 | """ 56 | @wraps(getitem_fn) 57 | def wrapper(self, index): 58 | if not isinstance(index, int): 59 | has_dim = True 60 | self._input_dim = index[0] 61 | index = index[1] 62 | else: 63 | has_dim = False 64 | 65 | ret_val = getitem_fn(self, index) 66 | 67 | if has_dim: 68 | del self._input_dim 69 | 70 | return ret_val 71 | 72 | return wrapper 73 | 74 | 75 | class DataLoader(torchDataLoader): 76 | """ Lightnet dataloader that enables on the fly resizing of the images. 77 | See :class:`torch.utils.data.DataLoader` for more information on the arguments. 78 | 79 | Note: 80 | This dataloader only works with :class:`lightnet.data.Dataset` based datasets. 81 | 82 | Example: 83 | >>> class CustomSet(ln.data.Dataset): 84 | ... def __len__(self): 85 | ... return 4 86 | ... @ln.data.Dataset.resize_getitem 87 | ... def __getitem__(self, index): 88 | ... # Should return (image, anno) but here we return (input_dim,) 89 | ... return (self.input_dim,) 90 | >>> dl = ln.data.DataLoader( 91 | ... CustomSet((200,200)), 92 | ... batch_size = 2, 93 | ... collate_fn = ln.data.list_collate # We want the data to be grouped as a list 94 | ... ) 95 | >>> dl.dataset.input_dim # Default input_dim 96 | (200, 200) 97 | >>> for d in dl: 98 | ... d 99 | [[(200, 200), (200, 200)]] 100 | [[(200, 200), (200, 200)]] 101 | >>> dl.change_input_dim(320, random_range=None) 102 | (320, 320) 103 | >>> for d in dl: 104 | ... d 105 | [[(320, 320), (320, 320)]] 106 | [[(320, 320), (320, 320)]] 107 | >>> dl.change_input_dim((480, 320), random_range=None) 108 | (480, 320) 109 | >>> for d in dl: 110 | ... 
d 111 | [[(480, 320), (480, 320)]] 112 | [[(480, 320), (480, 320)]] 113 | """ 114 | def __init__(self, *args, **kwargs): 115 | super().__init__(*args, **kwargs) 116 | self.__initialized = False 117 | shuffle = False 118 | batch_sampler = None 119 | if len(args) > 5: 120 | shuffle = args[2] 121 | sampler = args[3] 122 | batch_sampler = args[4] 123 | elif len(args) > 4: 124 | shuffle = args[2] 125 | sampler = args[3] 126 | if 'batch_sampler' in kwargs: 127 | batch_sampler = kwargs['batch_sampler'] 128 | elif len(args) > 3: 129 | shuffle = args[2] 130 | if 'sampler' in kwargs: 131 | sampler = kwargs['sampler'] 132 | if 'batch_sampler' in kwargs: 133 | batch_sampler = kwargs['batch_sampler'] 134 | else: 135 | if 'shuffle' in kwargs: 136 | shuffle = kwargs['shuffle'] 137 | if 'sampler' in kwargs: 138 | sampler = kwargs['sampler'] 139 | if 'batch_sampler' in kwargs: 140 | batch_sampler = kwargs['batch_sampler'] 141 | 142 | # Use custom BatchSampler 143 | if batch_sampler is None: 144 | if sampler is None: 145 | if shuffle: 146 | sampler = torch.utils.data.sampler.RandomSampler(self.dataset) 147 | #sampler = torch.utils.data.DistributedSampler(self.dataset) 148 | else: 149 | sampler = torch.utils.data.sampler.SequentialSampler(self.dataset) 150 | batch_sampler = YoloBatchSampler(sampler, self.batch_size, self.drop_last, input_dimension=self.dataset.input_dim) 151 | #batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations = 152 | 153 | self.batch_sampler = batch_sampler 154 | 155 | self.__initialized = True 156 | 157 | def change_input_dim(self, multiple=32, random_range=(10, 19)): 158 | """ This function will compute a new size and update it on the next mini_batch. 159 | 160 | Args: 161 | multiple (int or tuple, optional): value (or values) to multiply the randomly generated range by; Default **32** 162 | random_range (tuple, optional): This (min, max) tuple sets the range for the randomisation; Default **(10, 19)** 163 | 164 | Return: 165 | tuple: width, height tuple with new dimension 166 | 167 | Note: 168 | The new size is generated as follows: |br| 169 | First we compute a random integer inside ``[random_range]``. 170 | We then multiply that number with the ``multiple`` argument, which gives our final new input size. |br| 171 | If ``multiple`` is an integer we generate a square size. If you give a tuple of **(width, height)**, 172 | the size is computed as :math:`rng * multiple[0], rng * multiple[1]`. 173 | 174 | Note: 175 | You can set the ``random_range`` argument to **None** to set an exact size of multiply. |br| 176 | See the example above for how this works. 177 | """ 178 | if random_range is None: 179 | size = 1 180 | else: 181 | size = random.randint(*random_range) 182 | 183 | if isinstance(multiple, int): 184 | size = (size * multiple, size * multiple) 185 | else: 186 | size = (size * multiple[0], size * multiple[1]) 187 | 188 | self.batch_sampler.new_input_dim = size 189 | 190 | return size 191 | 192 | 193 | class YoloBatchSampler(torchBatchSampler): 194 | """ This batch sampler will generate mini-batches of (dim, index) tuples from another sampler. 195 | It works just like the :class:`torch.utils.data.sampler.BatchSampler`, but it will prepend a dimension, 196 | whilst ensuring it stays the same across one mini-batch. 
197 | """ 198 | def __init__(self, *args, input_dimension=None, **kwargs): 199 | super().__init__(*args, **kwargs) 200 | self.input_dim = input_dimension 201 | self.new_input_dim = None 202 | 203 | def __iter__(self): 204 | self.__set_input_dim() 205 | for batch in super().__iter__(): 206 | yield [(self.input_dim, idx) for idx in batch] 207 | self.__set_input_dim() 208 | 209 | def __set_input_dim(self): 210 | """ This function randomly changes the the input dimension of the dataset. """ 211 | if self.new_input_dim is not None: 212 | log.info(f'Resizing network {self.new_input_dim[:2]}') 213 | self.input_dim = (self.new_input_dim[0], self.new_input_dim[1]) 214 | self.new_input_dim = None 215 | 216 | class IterationBasedBatchSampler(torchBatchSampler): 217 | """ 218 | Wraps a BatchSampler, resampling from it until 219 | a specified number of iterations have been sampled 220 | """ 221 | 222 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 223 | self.batch_sampler = batch_sampler 224 | self.num_iterations = num_iterations 225 | self.start_iter = start_iter 226 | 227 | def __iter__(self): 228 | iteration = self.start_iter 229 | while iteration <= self.num_iterations: 230 | # if the underlying sampler has a set_epoch method, like 231 | # DistributedSampler, used for making each process see 232 | # a different split of the dataset, then set it 233 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 234 | self.batch_sampler.sampler.set_epoch(iteration) 235 | for batch in self.batch_sampler: 236 | iteration += 1 237 | if iteration > self.num_iterations: 238 | break 239 | yield batch 240 | 241 | def __len__(self): 242 | return self.num_iterations 243 | 244 | def list_collate(batch): 245 | """ Function that collates lists or tuples together into one list (of lists/tuples). 246 | Use this as the collate function in a Dataloader, if you want to have a list of items as an output, as opposed to tensors (eg. Brambox.boxes). 
247 | """ 248 | items = list(zip(*batch)) 249 | 250 | for i in range(len(items)): 251 | if isinstance(items[i][0], (list, tuple)): 252 | items[i] = list(items[i]) 253 | else: 254 | items[i] = default_collate(items[i]) 255 | 256 | return items 257 | 258 | -------------------------------------------------------------------------------- /dataset/vocdataset.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import pickle 11 | import os.path 12 | import sys 13 | import torch 14 | import torch.utils.data as data 15 | import torchvision.transforms as transforms 16 | import cv2 17 | import numpy as np 18 | from .voc_eval import voc_eval 19 | from .dataloading import Dataset 20 | if sys.version_info[0] == 2: 21 | import xml.etree.cElementTree as ET 22 | else: 23 | import xml.etree.ElementTree as ET 24 | 25 | 26 | #VOC_CLASSES = ( '__background__', # always index 0 27 | VOC_CLASSES = ( 28 | 'aeroplane', 'bicycle', 'bird', 'boat', 29 | 'bottle', 'bus', 'car', 'cat', 'chair', 30 | 'cow', 'diningtable', 'dog', 'horse', 31 | 'motorbike', 'person', 'pottedplant', 32 | 'sheep', 'sofa', 'train', 'tvmonitor') 33 | 34 | # for making bounding boxes pretty 35 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 36 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 37 | 38 | 39 | 40 | class AnnotationTransform(object): 41 | 42 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 43 | Initilized with a dictionary lookup of classnames to indexes 44 | 45 | Arguments: 46 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 47 | (default: alphabetic indexing of VOC's 20 classes) 48 | keep_difficult (bool, optional): keep difficult instances or not 49 | (default: False) 50 | height (int): height 51 | width (int): width 52 | """ 53 | 54 | def __init__(self, class_to_ind=None, keep_difficult=True): 55 | self.class_to_ind = class_to_ind or dict( 56 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 57 | self.keep_difficult = keep_difficult 58 | 59 | def __call__(self, target): 60 | """ 61 | Arguments: 62 | target (annotation) : the target annotation to be made usable 63 | will be an ET.Element 64 | Returns: 65 | a list containing lists of bounding boxes [bbox coords, class name] 66 | """ 67 | res = np.empty((0,5)) 68 | for obj in target.iter('object'): 69 | difficult = int(obj.find('difficult').text) == 1 70 | if not self.keep_difficult and difficult: 71 | continue 72 | name = obj.find('name').text.lower().strip() 73 | bbox = obj.find('bndbox') 74 | 75 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 76 | bndbox = [] 77 | for i, pt in enumerate(pts): 78 | cur_pt = int(bbox.find(pt).text) - 1 79 | # scale height or width 80 | #cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 81 | bndbox.append(cur_pt) 82 | label_idx = self.class_to_ind[name] 83 | bndbox.append(label_idx) 84 | res = np.vstack((res,bndbox)) # [xmin, ymin, xmax, ymax, label_ind] 85 | # img_id = target.find('filename').text[:-4] 86 | 87 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 88 | 89 | 90 | class VOCDetection(Dataset): 91 | 92 | """VOC Detection Dataset Object 93 | 94 | input is image, target is annotation 95 | 96 | Arguments: 97 | root (string): filepath to VOCdevkit folder. 
98 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 99 | transform (callable, optional): transformation to perform on the 100 | input image 101 | target_transform (callable, optional): transformation to perform on the 102 | target `annotation` 103 | (eg: take in caption string, return tensor of word indices) 104 | dataset_name (string, optional): which dataset to load 105 | (default: 'VOC2007') 106 | """ 107 | 108 | def __init__(self, root, image_sets, preproc=None, target_transform=AnnotationTransform(), input_dim=(416,416), 109 | dataset_name='VOC0712'): 110 | super().__init__(input_dim) 111 | self.root = root 112 | self.image_set = image_sets 113 | self.preproc = preproc 114 | self.target_transform = target_transform 115 | self.name = dataset_name 116 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 117 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 118 | self._classes=VOC_CLASSES 119 | self.ids = list() 120 | for (year, name) in image_sets: 121 | self._year = year 122 | rootpath = os.path.join(self.root, 'VOC' + year) 123 | for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 124 | self.ids.append((rootpath, line.strip())) 125 | 126 | @Dataset.resize_getitem 127 | def __getitem__(self, index): 128 | img_id = self.ids[index] 129 | target = ET.parse(self._annopath % img_id).getroot() 130 | img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 131 | #img = Image.open(self._imgpath % img_id).convert('RGB') 132 | 133 | height, width, _ = img.shape 134 | 135 | if self.target_transform is not None: 136 | target = self.target_transform(target) 137 | 138 | 139 | if self.preproc is not None: 140 | img, target = self.preproc(img, target, self.input_dim) 141 | #print(img.size()) 142 | 143 | img_info = (width, height) 144 | 145 | return img, target, img_info, img_id 146 | 147 | def __len__(self): 148 | return len(self.ids) 149 | 150 | def pull_image(self, index): 151 | '''Returns the original image object at index in PIL form 152 | 153 | Note: not using self.__getitem__(), as any transformations passed in 154 | could mess up this functionality. 155 | 156 | Argument: 157 | index (int): index of img to show 158 | Return: 159 | PIL img 160 | ''' 161 | img_id = self.ids[index] 162 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 163 | 164 | def pull_anno(self, index): 165 | '''Returns the original annotation of image at index 166 | 167 | Note: not using self.__getitem__(), as any transformations passed in 168 | could mess up this functionality. 169 | 170 | Argument: 171 | index (int): index of img to get annotation of 172 | Return: 173 | list: [img_id, [(label, bbox coords),...]] 174 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 175 | ''' 176 | img_id = self.ids[index] 177 | anno = ET.parse(self._annopath % img_id).getroot() 178 | gt = self.target_transform(anno, 1, 1) 179 | return img_id[1], gt 180 | 181 | def pull_item(self, index): 182 | '''Returns the original image and target at an index for mixup 183 | 184 | Note: not using self.__getitem__(), as any transformations passed in 185 | could mess up this functionality. 
186 | 187 | Argument: 188 | index (int): index of img to show 189 | Return: 190 | img, target 191 | ''' 192 | img_id = self.ids[index] 193 | target = ET.parse(self._annopath % img_id).getroot() 194 | img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 195 | 196 | height, width, _ = img.shape 197 | 198 | img_info = (width, height) 199 | if self.target_transform is not None: 200 | target = self.target_transform(target) 201 | 202 | return img, target, img_info, img_id 203 | 204 | def evaluate_detections(self, all_boxes, output_dir=None): 205 | """ 206 | all_boxes is a list of length number-of-classes. 207 | Each list element is a list of length number-of-images. 208 | Each of those list elements is either an empty list [] 209 | or a numpy array of detection. 210 | 211 | all_boxes[class][image] = [] or np.array of shape #dets x 5 212 | """ 213 | self._write_voc_results_file(all_boxes) 214 | IouTh = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) 215 | mAPs = [] 216 | for iou in IouTh: 217 | mAP = self._do_python_eval(output_dir,iou) 218 | mAPs.append(mAP) 219 | 220 | print('--------------------------------------------------------------') 221 | print('map_5095:', np.mean(mAPs)) 222 | print('map_50:', mAPs[0]) 223 | print('--------------------------------------------------------------') 224 | return np.mean(mAPs), mAPs[0] 225 | 226 | def _get_voc_results_file_template(self): 227 | filename = 'comp4_det_test' + '_{:s}.txt' 228 | filedir = os.path.join( 229 | self.root, 'results', 'VOC' + self._year, 'Main') 230 | if not os.path.exists(filedir): 231 | os.makedirs(filedir) 232 | path = os.path.join(filedir, filename) 233 | return path 234 | 235 | def _write_voc_results_file(self, all_boxes): 236 | for cls_ind, cls in enumerate(VOC_CLASSES): 237 | cls_ind = cls_ind 238 | if cls == '__background__': 239 | continue 240 | print('Writing {} VOC results file'.format(cls)) 241 | filename = self._get_voc_results_file_template().format(cls) 242 | with open(filename, 'wt') as f: 243 | for im_ind, index in enumerate(self.ids): 244 | index = index[1] 245 | dets = all_boxes[cls_ind][im_ind] 246 | if dets == []: 247 | continue 248 | for k in range(dets.shape[0]): 249 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 250 | format(index, dets[k, -1], 251 | dets[k, 0] + 1, dets[k, 1] + 1, 252 | dets[k, 2] + 1, dets[k, 3] + 1)) 253 | 254 | def _do_python_eval(self, output_dir='output', iou = 0.5): 255 | rootpath = os.path.join(self.root, 'VOC' + self._year) 256 | name = self.image_set[0][1] 257 | annopath = os.path.join( 258 | rootpath, 259 | 'Annotations', 260 | '{:s}.xml') 261 | imagesetfile = os.path.join( 262 | rootpath, 263 | 'ImageSets', 264 | 'Main', 265 | name+'.txt') 266 | cachedir = os.path.join(self.root, 'annotations_cache', 'VOC'+self._year, name) 267 | if not os.path.exists(cachedir): 268 | os.makedirs(cachedir) 269 | aps = [] 270 | # The PASCAL VOC metric changed in 2010 271 | use_07_metric = True if int(self._year) < 2010 else False 272 | print('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) 273 | if output_dir is not None and not os.path.isdir(output_dir): 274 | os.mkdir(output_dir) 275 | for i, cls in enumerate(VOC_CLASSES): 276 | 277 | if cls == '__background__': 278 | continue 279 | 280 | filename = self._get_voc_results_file_template().format(cls) 281 | rec, prec, ap = voc_eval( 282 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=iou, 283 | use_07_metric=use_07_metric) 284 | aps += [ap] 285 | if iou == 0.5: 286 | print('AP for {} = {:.4f}'.format(cls, ap)) 287 | if output_dir is not None: 288 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 289 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 290 | if iou ==0.5: 291 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 292 | print('~~~~~~~~') 293 | print('Results:') 294 | for ap in aps: 295 | print('{:.3f}'.format(ap)) 296 | print('{:.3f}'.format(np.mean(aps))) 297 | print('~~~~~~~~') 298 | print('') 299 | print('--------------------------------------------------------------') 300 | print('Results computed with the **unofficial** Python eval code.') 301 | print('Results should be very close to the official MATLAB eval code.') 302 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 303 | print('-- Thanks, The Management') 304 | print('--------------------------------------------------------------') 305 | 306 | return np.mean(aps) 307 | -------------------------------------------------------------------------------- /models/yolov3_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from utils.utils import bboxes_iou 6 | import numpy as np 7 | from .utils_loss import * 8 | from .network_blocks import * 9 | 10 | class YOLOv3Head(nn.Module): 11 | def __init__(self, anch_mask, n_classes, stride, in_ch=1024, ignore_thre=0.7, label_smooth = False, rfb=False, sep=False): 12 | super(YOLOv3Head, self).__init__() 13 | self.anchors = [ 14 | (10, 13), (16, 30), (33, 23), 15 | (30, 61), (62, 45), (42, 119), 16 | (116, 90), (156, 198), (121, 240) ] 17 | if sep: 18 | self.anchors = [ 19 | (10, 13), (16, 30), (33, 23), 20 | (30, 61), (62, 45), (42, 119), 21 | (116, 90), (156, 198), (373, 326)] 22 | 23 | self.anch_mask = anch_mask 24 | self.n_anchors = 4 25 | self.n_classes = n_classes 26 | self.guide_wh = nn.Conv2d(in_channels=in_ch, 27 | out_channels=2*self.n_anchors, kernel_size=1, stride=1, padding=0) 28 | self.Feature_adaption=FeatureAdaption(in_ch, in_ch, self.n_anchors, rfb, sep) 29 | 30 | self.conv = nn.Conv2d(in_channels=in_ch, 31 | out_channels=self.n_anchors*(self.n_classes+5), kernel_size=1, stride=1, padding=0) 32 | self.ignore_thre = ignore_thre 33 | self.l1_loss = nn.L1Loss(reduction='none') 34 | #self.smooth_l1_loss = nn.SmoothL1Loss(reduction='none') 35 | self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction='none') 36 | self.bce_loss = nn.BCELoss(reduction='none') 37 | self.iou_loss = IOUloss(reduction='none') 38 | self.iou_wh_loss = IOUWH_loss(reduction='none') 39 | self.stride = stride 40 | self._label_smooth = label_smooth 41 | 42 | self.all_anchors_grid = self.anchors 43 | self.masked_anchors = [self.all_anchors_grid[i] 44 | for i in self.anch_mask] 45 | self.ref_anchors = np.zeros((len(self.all_anchors_grid), 4)) 46 | self.ref_anchors[:, 2:] = np.array(self.all_anchors_grid) 47 | self.ref_anchors = torch.FloatTensor(self.ref_anchors) 48 | 49 | def forward(self, xin, 
labels=None): 50 | """ 51 | In this 52 | Args: 53 | xin (torch.Tensor): input feature map whose size is :math:`(N, C, H, W)`, \ 54 | where N, C, H, W denote batchsize, channel width, height, width respectively. 55 | labels (torch.Tensor): label data whose size is :math:`(N, K, 5)`. \ 56 | N and K denote batchsize and number of labels. 57 | Each label consists of [class, xc, yc, w, h]: 58 | class (float): class index. 59 | xc, yc (float) : center of bbox whose values range from 0 to 1. 60 | w, h (float) : size of bbox whose values range from 0 to 1. 61 | Returns: 62 | loss (torch.Tensor): total loss - the target of backprop. 63 | loss_xy (torch.Tensor): x, y loss - calculated by binary cross entropy (BCE) \ 64 | with boxsize-dependent weights. 65 | loss_wh (torch.Tensor): w, h loss - calculated by l2 without size averaging and \ 66 | with boxsize-dependent weights. 67 | loss_obj (torch.Tensor): objectness loss - calculated by BCE. 68 | loss_cls (torch.Tensor): classification loss - calculated by BCE for each class. 69 | loss_l2 (torch.Tensor): total l2 loss - only for logging. 70 | """ 71 | 72 | wh_pred = self.guide_wh(xin) #Anchor guiding 73 | 74 | if xin.type() == 'torch.cuda.HalfTensor': #As DCN only support FP32 now, change the feature to float. 75 | wh_pred = wh_pred.float() 76 | if labels is not None: 77 | labels = labels.float() 78 | self.Feature_adaption = self.Feature_adaption.float() 79 | self.conv = self.conv.float() 80 | xin = xin.float() 81 | 82 | feature_adapted = self.Feature_adaption(xin, wh_pred) 83 | 84 | output = self.conv(feature_adapted) 85 | wh_pred = torch.exp(wh_pred) 86 | 87 | batchsize = output.shape[0] 88 | fsize = output.shape[2] 89 | image_size = fsize * self.stride 90 | n_ch = 5 + self.n_classes 91 | dtype = torch.cuda.FloatTensor if xin.is_cuda else torch.FloatTensor 92 | 93 | wh_pred = wh_pred.view(batchsize, self.n_anchors, 2 , fsize, fsize) 94 | wh_pred = wh_pred.permute(0, 1, 3, 4, 2).contiguous() 95 | 96 | output = output.view(batchsize, self.n_anchors, n_ch, fsize, fsize) 97 | output = output.permute(0,1,3,4,2).contiguous() 98 | 99 | x_shift = dtype(np.broadcast_to( 100 | np.arange(fsize, dtype=np.float32), output.shape[:4])) 101 | y_shift = dtype(np.broadcast_to( 102 | np.arange(fsize, dtype=np.float32).reshape(fsize, 1), output.shape[:4])) 103 | 104 | masked_anchors = np.array(self.masked_anchors) 105 | 106 | w_anchors = dtype(np.broadcast_to(np.reshape( 107 | masked_anchors[:, 0], (1, self.n_anchors-1, 1, 1)), [batchsize, self.n_anchors-1, fsize, fsize])) 108 | h_anchors = dtype(np.broadcast_to(np.reshape( 109 | masked_anchors[:, 1], (1, self.n_anchors-1, 1, 1)), [batchsize, self.n_anchors-1, fsize, fsize])) 110 | 111 | default_center = torch.zeros(batchsize, self.n_anchors, fsize, fsize, 2).type(dtype) 112 | 113 | pred_anchors = torch.cat((default_center, wh_pred), dim=-1).contiguous() 114 | 115 | anchors_based = pred_anchors[:, :self.n_anchors-1, :, :, :] #anchor branch 116 | anchors_free = pred_anchors[:, self.n_anchors-1, :, :, :] #anchor free branch 117 | anchors_based[...,2] *= w_anchors 118 | anchors_based[...,3] *= h_anchors 119 | anchors_free[...,2] *= self.stride*4 120 | anchors_free[...,3] *= self.stride*4 121 | pred_anchors[...,:2] = pred_anchors[...,:2].detach() 122 | 123 | if not self.training: 124 | 125 | pred = output.clone() 126 | pred[..., np.r_[:2, 4:n_ch]] = torch.sigmoid( 127 | pred[...,np.r_[:2, 4:n_ch]]) 128 | pred[...,0] += x_shift 129 | pred[...,1] += y_shift 130 | pred[...,:2] *= self.stride 131 | pred[...,2] = 
torch.exp(pred[...,2])*(pred_anchors[...,2]) 132 | pred[...,3] = torch.exp(pred[...,3])*(pred_anchors[...,3]) 133 | refined_pred = pred.view(batchsize, -1, n_ch) 134 | return refined_pred.data 135 | 136 | #training for anchor prediction 137 | if self.training: 138 | 139 | target = torch.zeros(batchsize, self.n_anchors, 140 | fsize, fsize, n_ch).type(dtype) 141 | l1_target = torch.zeros(batchsize, self.n_anchors, 142 | fsize, fsize, 4).type(dtype) 143 | tgt_scale = torch.zeros(batchsize, self.n_anchors, 144 | fsize, fsize, 4).type(dtype) 145 | obj_mask = torch.ones(batchsize, self.n_anchors, fsize, fsize).type(dtype) 146 | 147 | cls_mask = torch.zeros(batchsize, self.n_anchors, fsize, fsize, self.n_classes).type(dtype) 148 | coord_mask = torch.zeros(batchsize, self.n_anchors, fsize, fsize).type(dtype) 149 | anchor_mask = torch.zeros(batchsize, self.n_anchors, fsize, fsize).type(dtype) 150 | 151 | labels = labels.data 152 | mixup = labels.shape[2]>5 153 | if mixup: 154 | label_cut = labels[...,:5] 155 | else: 156 | label_cut = labels 157 | nlabel = (label_cut.sum(dim=2) > 0).sum(dim=1) # number of objects 158 | 159 | truth_x_all = labels[:, :, 1] * 1. 160 | truth_y_all = labels[:, :, 2] * 1. 161 | truth_w_all = labels[:, :, 3] * 1. 162 | truth_h_all = labels[:, :, 4] * 1. 163 | truth_i_all = (truth_x_all/image_size*fsize).to(torch.int16).cpu().numpy() 164 | truth_j_all = (truth_y_all/image_size*fsize).to(torch.int16).cpu().numpy() 165 | 166 | pred = output.clone() 167 | pred[..., np.r_[:2, 4:n_ch]] = torch.sigmoid( 168 | pred[...,np.r_[:2, 4:n_ch]]) 169 | pred[...,0] += x_shift 170 | pred[...,1] += y_shift 171 | pred[...,2] = torch.exp(pred[...,2])*(pred_anchors[...,2]) 172 | pred[...,3] = torch.exp(pred[...,3])*(pred_anchors[...,3]) 173 | pred[...,:2] *= self.stride 174 | 175 | pred_boxes = pred[...,:4].data 176 | for b in range(batchsize): 177 | n = int(nlabel[b]) 178 | if n == 0: 179 | continue 180 | 181 | truth_box = dtype(np.zeros((n, 4))) 182 | truth_box[:n, 2] = truth_w_all[b, :n] 183 | truth_box[:n, 3] = truth_h_all[b, :n] 184 | truth_i = truth_i_all[b, :n] 185 | truth_j = truth_j_all[b, :n] 186 | 187 | # calculate iou between truth and reference anchors 188 | anchor_ious_all = bboxes_iou(truth_box.cpu(), self.ref_anchors, xyxy=False) 189 | best_n_all = np.argmax(anchor_ious_all, axis=1) 190 | best_anchor_iou = anchor_ious_all[np.arange(anchor_ious_all.shape[0]),best_n_all] 191 | best_n = best_n_all % 3 192 | best_n_mask = ((best_n_all == self.anch_mask[0]) | ( 193 | best_n_all == self.anch_mask[1]) | (best_n_all == self.anch_mask[2])) 194 | 195 | truth_box[:n, 0] = truth_x_all[b, :n] 196 | truth_box[:n, 1] = truth_y_all[b, :n] 197 | pred_box = pred_boxes[b] 198 | pred_ious = bboxes_iou(pred_box.view(-1,4), 199 | truth_box, xyxy=False) 200 | pred_best_iou, _= pred_ious.max(dim=1) 201 | pred_best_iou = (pred_best_iou > self.ignore_thre) 202 | pred_best_iou = pred_best_iou.view(pred_box.shape[:3]) 203 | obj_mask[b]= ~pred_best_iou 204 | truth_box[:n, 0] = 0 205 | truth_box[:n, 1] = 0 206 | 207 | if sum(best_n_mask) == 0: 208 | continue 209 | for ti in range(best_n.shape[0]): 210 | if best_n_mask[ti] == 1: 211 | i, j = truth_i[ti], truth_j[ti] 212 | a = best_n[ti] 213 | free_iou = bboxes_iou(truth_box[ti].cpu().view(-1,4), 214 | pred_anchors[b, self.n_anchors-1, j, i, :4].data.cpu().view(-1,4),xyxy=False) #iou of pred anchor 215 | 216 | #choose the best anchor 217 | if free_iou > best_anchor_iou[ti]: 218 | aa = self.n_anchors-1 219 | else: 220 | aa = a 221 | 222 | cls_mask[b, aa, j, i, 
:] = 1 223 | coord_mask[b, aa, j, i] = 1 224 | 225 | anchor_mask[b, self.n_anchors-1, j, i] = 1 226 | anchor_mask[b, a, j, i] = 1 227 | 228 | obj_mask[b, aa, j, i]= 1 if not mixup else labels[b, ti, 5] 229 | 230 | target[b, a, j, i, 0] = truth_x_all[b, ti] 231 | target[b, a, j, i, 1] = truth_y_all[b, ti] 232 | target[b, a, j, i, 2] = truth_w_all[b, ti] 233 | target[b, a, j, i, 3] = truth_h_all[b, ti] 234 | 235 | target[b, self.n_anchors-1, j, i, 0] = truth_x_all[b, ti] 236 | target[b, self.n_anchors-1, j, i, 1] = truth_y_all[b, ti] 237 | target[b, self.n_anchors-1, j, i, 2] = truth_w_all[b, ti] 238 | target[b, self.n_anchors-1, j, i, 3] = truth_h_all[b, ti] 239 | 240 | l1_target[b, aa, j, i, 0] = truth_x_all[b, ti]/image_size *fsize - i*1.0 241 | l1_target[b, aa, j, i, 1] = truth_y_all[b, ti]/image_size *fsize - j*1.0 242 | l1_target[b, aa, j, i, 2] = torch.log(truth_w_all[b, ti]/\ 243 | (pred_anchors[b, aa, j, i, 2])+ 1e-12) 244 | l1_target[b, aa, j, i, 3] = torch.log(truth_h_all[b, ti]/\ 245 | (pred_anchors[b, aa, j, i, 3]) + 1e-12) 246 | target[b, aa, j, i, 4] = 1 247 | if self._label_smooth: 248 | smooth_delta = 1 249 | smooth_weight = 1. / self.n_classes 250 | target[b, aa, j, i, 5:]= smooth_weight* smooth_delta 251 | 252 | target[b, aa, j, i, 5 + labels[b, ti, 253 | 0].to(torch.int16)] = 1 - smooth_delta*smooth_weight 254 | else: 255 | target[b,aa, j, i, 5 + labels[b, ti, 256 | 0].to(torch.int16)] = 1 257 | 258 | tgt_scale[b, aa,j, i, :] = 2.0 - truth_w_all[b, ti]*truth_h_all[b, ti] / image_size/image_size 259 | 260 | 261 | # Anchor loss 262 | anchorcoord_mask = anchor_mask>0 263 | loss_anchor = self.iou_wh_loss(pred_anchors[...,:4][anchorcoord_mask], target[...,:4][anchorcoord_mask]).sum()/batchsize 264 | 265 | #Prediction loss 266 | coord_mask = coord_mask>0 267 | loss_iou = (tgt_scale[coord_mask][...,0]*\ 268 | self.iou_loss(pred[..., :4][coord_mask], target[..., :4][coord_mask])).sum() / batchsize 269 | tgt_scale = tgt_scale[...,:2] 270 | loss_xy = (tgt_scale*self.bcewithlog_loss(output[...,:2], l1_target[...,:2])).sum() / batchsize 271 | loss_wh = (tgt_scale*self.l1_loss(output[...,2:4], l1_target[...,2:4])).sum() / batchsize 272 | loss_l1 = loss_xy + loss_wh 273 | loss_obj = (obj_mask*(self.bcewithlog_loss(output[..., 4], target[..., 4]))).sum() / batchsize 274 | loss_cls = (cls_mask*(self.bcewithlog_loss(output[..., 5:], target[..., 5:]))).sum()/ batchsize 275 | 276 | loss = loss_anchor + loss_iou + loss_l1+ loss_obj + loss_cls 277 | 278 | return loss, loss_anchor, loss_iou, loss_l1, loss_obj, loss_cls 279 | 280 | -------------------------------------------------------------------------------- /dataset/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 
3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | """ 7 | 8 | import torch 9 | from torchvision import transforms 10 | import cv2 11 | import numpy as np 12 | import random 13 | import math 14 | from utils.utils import matrix_iou, visual 15 | 16 | #DEBUG = True 17 | DEBUG = False 18 | 19 | def _crop(image, boxes, labels, ratios = None): 20 | height, width, _ = image.shape 21 | 22 | if len(boxes)== 0: 23 | return image, boxes, labels, ratios 24 | 25 | while True: 26 | mode = random.choice(( 27 | None, 28 | (0.1, None), 29 | (0.3, None), 30 | (0.5, None), 31 | (0.7, None), 32 | (0.9, None), 33 | (None, None), 34 | )) 35 | 36 | if mode is None: 37 | return image, boxes, labels, ratios 38 | 39 | min_iou, max_iou = mode 40 | if min_iou is None: 41 | min_iou = float('-inf') 42 | if max_iou is None: 43 | max_iou = float('inf') 44 | 45 | for _ in range(50): 46 | scale = random.uniform(0.3,1.) 47 | min_ratio = max(0.5, scale*scale) 48 | max_ratio = min(2, 1. / scale / scale) 49 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 50 | w = int(scale * ratio * width) 51 | h = int((scale / ratio) * height) 52 | 53 | 54 | l = random.randrange(width - w) 55 | t = random.randrange(height - h) 56 | roi = np.array((l, t, l + w, t + h)) 57 | 58 | iou = matrix_iou(boxes, roi[np.newaxis]) 59 | 60 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 61 | continue 62 | 63 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 64 | 65 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 66 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 67 | .all(axis=1) 68 | boxes_t = boxes[mask].copy() 69 | labels_t = labels[mask].copy() 70 | if ratios is not None: 71 | ratios_t = ratios[mask].copy() 72 | else: 73 | ratios_t=None 74 | 75 | if len(boxes_t) == 0: 76 | continue 77 | 78 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 79 | boxes_t[:, :2] -= roi[:2] 80 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 81 | boxes_t[:, 2:] -= roi[:2] 82 | 83 | return image_t, boxes_t,labels_t, ratios_t 84 | 85 | 86 | def _distort(image): 87 | def _convert(image, alpha=1, beta=0): 88 | tmp = image.astype(float) * alpha + beta 89 | tmp[tmp < 0] = 0 90 | tmp[tmp > 255] = 255 91 | image[:] = tmp 92 | 93 | image = image.copy() 94 | 95 | if random.randrange(2): 96 | _convert(image, beta=random.uniform(-32, 32)) 97 | 98 | if random.randrange(2): 99 | _convert(image, alpha=random.uniform(0.5, 1.5)) 100 | 101 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 102 | 103 | if random.randrange(2): 104 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 105 | tmp %= 180 106 | image[:, :, 0] = tmp 107 | 108 | if random.randrange(2): 109 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 110 | 111 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 112 | 113 | return image 114 | 115 | 116 | def _expand(image, boxes,fill, p): 117 | if random.random() > p: 118 | return image, boxes 119 | 120 | height, width, depth = image.shape 121 | for _ in range(50): 122 | scale = random.uniform(1,4) 123 | 124 | min_ratio = max(0.5, 1./scale/scale) 125 | max_ratio = min(2, scale*scale) 126 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 127 | ws = scale*ratio 128 | hs = scale/ratio 129 | if ws < 1 or hs < 1: 130 | continue 131 | w = int(ws * width) 132 | h = int(hs * height) 133 | 134 | left = random.randint(0, w - width) 135 | top = random.randint(0, h - height) 136 | 137 | boxes_t = boxes.copy() 138 | boxes_t[:, :2] += (left, top) 139 | 
boxes_t[:, 2:] += (left, top) 140 | 141 | 142 | expand_image = np.empty( 143 | (h, w, depth), 144 | dtype=image.dtype) 145 | expand_image[:, :] = fill 146 | expand_image[top:top + height, left:left + width] = image 147 | image = expand_image 148 | 149 | return image, boxes_t 150 | 151 | 152 | def _mirror(image, boxes): 153 | _, width, _ = image.shape 154 | if random.randrange(2): 155 | image = image[:, ::-1] 156 | boxes = boxes.copy() 157 | boxes[:, 0::2] = width - boxes[:, 2::-2] 158 | return image, boxes 159 | 160 | 161 | def _random_affine(img, targets=None, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 162 | borderValue=(127.5, 127.5, 127.5)): 163 | # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) 164 | # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 165 | 166 | border = 0 # width of added border (optional) 167 | #height = max(img.shape[0], img.shape[1]) + border * 2 168 | height, width, _ = img.shape 169 | 170 | # Rotation and Scale 171 | R = np.eye(3) 172 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 173 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 174 | s = random.random() * (scale[1] - scale[0]) + scale[0] 175 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 176 | 177 | # Translation 178 | T = np.eye(3) 179 | T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 180 | T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 181 | 182 | # Shear 183 | S = np.eye(3) 184 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 185 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 186 | 187 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
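    # Note: M is a full affine homography composed right-to-left (rotation/scale first,
    # then translation, then shear); its last row stays (0, 0, 1), so the warpPerspective
    # call below is equivalent to warpAffine with M[:2]. The same M is reused further down
    # to warp the four corners of each target box via `xy @ M.T`.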
188 | imw = cv2.warpPerspective(img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, 189 | borderValue=borderValue) # BGR order borderValue 190 | 191 | # Return warped points also 192 | if targets is not None: 193 | if len(targets) > 0: 194 | n = targets.shape[0] 195 | points = targets[:, 0:4].copy() 196 | area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 197 | 198 | # warp points 199 | xy = np.ones((n * 4, 3)) 200 | xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 201 | xy = (xy @ M.T)[:, :2].reshape(n, 8) 202 | 203 | # create new boxes 204 | x = xy[:, [0, 2, 4, 6]] 205 | y = xy[:, [1, 3, 5, 7]] 206 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T 207 | 208 | # apply angle-based reduction 209 | radians = a * math.pi / 180 210 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 211 | x = (xy[:, 2] + xy[:, 0]) / 2 212 | y = (xy[:, 3] + xy[:, 1]) / 2 213 | w = (xy[:, 2] - xy[:, 0]) * reduction 214 | h = (xy[:, 3] - xy[:, 1]) * reduction 215 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T 216 | 217 | # reject warped points outside of image 218 | x1 = np.clip(xy[:,0], 0, width) 219 | y1 = np.clip(xy[:,1], 0, height) 220 | x2 = np.clip(xy[:,2], 0, width) 221 | y2 = np.clip(xy[:,3], 0, height) 222 | boxes = np.concatenate((x1, y1, x2, y2)).reshape(4, n).T 223 | 224 | return imw, boxes, M 225 | else: 226 | return imw 227 | 228 | def preproc_for_test(image, input_size, mean, std): 229 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 230 | interp_method = interp_methods[random.randrange(5)] 231 | image = cv2.resize(image, input_size,interpolation=interp_method) 232 | image = image.astype(np.float32) 233 | image = image[:,:,::-1] 234 | image /= 255. 235 | if mean is not None: 236 | image -= mean 237 | if std is not None: 238 | image /= std 239 | return image.transpose(2, 0, 1) 240 | 241 | 242 | class TrainTransform(object): 243 | 244 | def __init__(self, p=0.5, rgb_means=None, std = None,max_labels=50): 245 | self.means = rgb_means 246 | self.std = std 247 | self.p = p 248 | self.max_labels=max_labels 249 | 250 | def __call__(self, image, targets, input_dim): 251 | boxes = targets[:,:4].copy() 252 | labels = targets[:,4].copy() 253 | if targets.shape[1] > 5: 254 | mixup=True 255 | ratios = targets[:,-1].copy() 256 | ratios_o = targets[:,-1].copy() 257 | else: 258 | mixup=False 259 | ratios = None 260 | ratios_o = None 261 | lshape = 6 if mixup else 5 262 | if len(boxes) == 0: 263 | targets = np.zeros((self.max_labels,lshape),dtype=np.float32) 264 | image = preproc_for_test(image, input_dim, self.means, self.std) 265 | image = np.ascontiguousarray(image, dtype=np.float32) 266 | return torch.from_numpy(image), torch.from_numpy(targets) 267 | 268 | image_o = image.copy() 269 | targets_o = targets.copy() 270 | height_o, width_o, _ = image_o.shape 271 | boxes_o = targets_o[:,:4] 272 | labels_o = targets_o[:,4] 273 | b_x_o = (boxes_o[:, 2] + boxes_o[:, 0])*.5 274 | b_y_o = (boxes_o[:, 3] + boxes_o[:, 1])*.5 275 | b_w_o = (boxes_o[:, 2] - boxes_o[:, 0])*1. 276 | b_h_o = (boxes_o[:, 3] - boxes_o[:, 1])*1. 
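        # Write the boxes back in (cx, cy, w, h) order, then rescale them from the original
        # image resolution to the network input size (the same conversion is repeated for
        # the augmented boxes further below).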
277 | boxes_o[:,0] = b_x_o 278 | boxes_o[:,1] = b_y_o 279 | boxes_o[:,2] = b_w_o 280 | boxes_o[:,3] = b_h_o 281 | boxes_o[:, 0::2] /= width_o 282 | boxes_o[:, 1::2] /= height_o 283 | boxes_o[:, 0::2] *= input_dim[0] 284 | boxes_o[:, 1::2] *= input_dim[1] 285 | #labels_o = np.expand_dims(labels_o,1) 286 | #targets_o = np.hstack((boxes_o,labels_o)) 287 | #targets_o = np.hstack((labels_o,boxes_o)) 288 | 289 | image_t = _distort(image) 290 | if self.means is not None: 291 | fill = [m * 255 for m in self.means] 292 | fill = fill[::-1] 293 | else: 294 | fill = (127.5,127.5,127.5) 295 | image_t, boxes = _expand(image_t, boxes, fill, self.p) 296 | image_t, boxes, labels, ratios = _crop(image_t, boxes, labels, ratios) 297 | image_t, boxes = _mirror(image_t, boxes) 298 | 299 | if random.randrange(2): 300 | image_t, boxes, _ = _random_affine(image_t, boxes, borderValue=fill) 301 | 302 | height, width, _ = image_t.shape 303 | 304 | if DEBUG: 305 | image_t = np.ascontiguousarray(image_t, dtype=np.uint8) 306 | img = visual(image_t, boxes,labels) 307 | cv2.imshow('DEBUG', img) 308 | cv2.waitKey(0) 309 | 310 | image_t = preproc_for_test(image_t, input_dim, self.means, self.std) 311 | boxes = boxes.copy() 312 | b_x = (boxes[:, 2] + boxes[:, 0])*.5 313 | b_y = (boxes[:, 3] + boxes[:, 1])*.5 314 | b_w = (boxes[:, 2] - boxes[:, 0])*1. 315 | b_h = (boxes[:, 3] - boxes[:, 1])*1. 316 | boxes[:,0] = b_x 317 | boxes[:,1] = b_y 318 | boxes[:,2] = b_w 319 | boxes[:,3] = b_h 320 | boxes[:, 0::2] /= width 321 | boxes[:, 1::2] /= height 322 | boxes[:, 0::2] *= input_dim[0] 323 | boxes[:, 1::2] *= input_dim[1] 324 | mask_b= np.minimum(boxes[:,2], boxes[:,3]) > 6 325 | #mask_b= (boxes[:,2]*boxes[:,3]) > 32**2 326 | #mask_b= (boxes[:,2]*boxes[:,3]) > 48**2 327 | boxes_t = boxes[mask_b] 328 | labels_t = labels[mask_b].copy() 329 | if mixup: 330 | ratios_t = ratios[mask_b].copy() 331 | 332 | ''' 333 | if len(boxes_t)==0: 334 | targets = np.zeros((self.max_labels,lshape),dtype=np.float32) 335 | image = preproc_for_test(image_o, input_dim, self.means, self.std) 336 | image = np.ascontiguousarray(image, dtype=np.float32) 337 | return torch.from_numpy(image), torch.from_numpy(targets) 338 | ''' 339 | #if len(boxes_t)==0 or random.random() > 0.97: 340 | if len(boxes_t)==0: 341 | image_t = preproc_for_test(image_o, input_dim, self.means, self.std) 342 | boxes_t = boxes_o 343 | labels_t = labels_o 344 | ratios_t = ratios_o 345 | 346 | labels_t = np.expand_dims(labels_t,1) 347 | if mixup: 348 | ratios_t = np.expand_dims(ratios_t,1) 349 | targets_t = np.hstack((labels_t,boxes_t,ratios_t)) 350 | else: 351 | targets_t = np.hstack((labels_t,boxes_t)) 352 | padded_labels = np.zeros((self.max_labels,lshape)) 353 | padded_labels[range(len(targets_t))[:self.max_labels]] = targets_t[:self.max_labels] 354 | padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32) 355 | image_t = np.ascontiguousarray(image_t, dtype=np.float32) 356 | 357 | return torch.from_numpy(image_t), torch.from_numpy(padded_labels) 358 | 359 | 360 | 361 | class ValTransform(object): 362 | """Defines the transformations that should be applied to test PIL image 363 | for input into the network 364 | 365 | dimension -> tensorize -> color adj 366 | 367 | Arguments: 368 | resize (int): input dimension to SSD 369 | rgb_means ((int,int,int)): average RGB of the dataset 370 | (104,117,123) 371 | swap ((int,int,int)): final order of channels 372 | Returns: 373 | transform (transform) : callable transform to be applied to test/val 374 | data 375 | """ 376 | def 
__init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
377 |         self.means = rgb_means
378 |         self.swap = swap
379 |         self.std = std
380 | 
381 |     # assume input is cv2 img for now
382 |     def __call__(self, img, res, input_size):
383 | 
384 |         interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4]
385 |         interp_method = interp_methods[0]  # deterministic resize for evaluation
386 |         img = cv2.resize(np.array(img), input_size,
387 |                          interpolation=interp_method).astype(np.float32)
388 |         img = img[:, :, ::-1]  # BGR -> RGB before mean/std normalization
389 |         img /= 255.
390 |         if self.means is not None:
391 |             img -= self.means
392 |         if self.std is not None:
393 |             img /= self.std
394 |         img = img.transpose(self.swap)  # HWC -> CHW
395 |         img = np.ascontiguousarray(img, dtype=np.float32)
396 |         return torch.from_numpy(img), torch.zeros(1, 5)  # image tensor plus dummy (1, 5) label placeholder
397 | 
--------------------------------------------------------------------------------
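The two transform classes above are the bridge between the raw annotations and the YOLOv3 head: `TrainTransform` augments the image and returns a padded `(max_labels, 5)` target tensor of `[class, cx, cy, w, h]` rows (6 columns when mixup ratios are present) scaled to the network input size, while `ValTransform` only resizes, normalizes, and reorders channels. Below is a minimal usage sketch, not code from this repository: it assumes the repo root is on `PYTHONPATH`, and the synthetic image, the box coordinates, and the ImageNet-style normalization constants are illustrative placeholders rather than values mandated by this file.

# Minimal sketch (not part of the repo): exercising TrainTransform / ValTransform on a
# synthetic sample. Assumes the repo root is importable as a package root.
import numpy as np
from dataset.data_augment import TrainTransform, ValTransform

rgb_means, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)  # assumed normalization stats

# Synthetic BGR uint8 image and one box in [x1, y1, x2, y2, class] form.
img = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
targets = np.array([[100., 120., 360., 400., 7.]], dtype=np.float32)

train_tf = TrainTransform(p=0.5, rgb_means=rgb_means, std=std, max_labels=50)
img_t, labels_t = train_tf(img, targets, (416, 416))
print(img_t.shape, labels_t.shape)   # torch.Size([3, 416, 416]) torch.Size([50, 5])

val_tf = ValTransform(rgb_means=rgb_means, std=std)
img_v, _ = val_tf(img, None, (416, 416))
print(img_v.shape)                   # torch.Size([3, 416, 416])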