├── utils
│   ├── DCN
│   │   ├── make.sh
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   ├── deform_conv2d_func.py
│   │   │   └── modulated_deform_conv2d_func.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── modulated_deform_conv2d.py
│   │   │   └── deform_conv2d.py
│   │   ├── src
│   │   │   ├── vision.cpp
│   │   │   ├── cpu
│   │   │   │   ├── deform_conv2d_cpu.h
│   │   │   │   ├── deform_conv2d_cpu.cpp
│   │   │   │   ├── modulated_deform_conv2d_cpu.h
│   │   │   │   └── modulated_deform_conv2d_cpu.cpp
│   │   │   ├── cuda
│   │   │   │   ├── deform_conv2d_cuda.h
│   │   │   │   └── modulated_deform_conv2d_cuda.h
│   │   │   ├── deform_conv2d.h
│   │   │   └── modulated_deform_conv2d.h
│   │   ├── setup.py
│   │   └── deform_conv2d_naive.py
│   ├── __init__.py
│   ├── fp16_utils
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── fp16util.py
│   │   └── loss_scaler.py
│   ├── vis_utils.py
│   ├── utils.py
│   ├── distributed_util.py
│   ├── voc_evaluator.py
│   └── cocoapi_evaluator.py
├── dataset
│   ├── __init__.py
│   ├── mixupdetection.py
│   ├── cocodataset.py
│   ├── voc_eval.py
│   ├── dataloading.py
│   ├── vocdataset.py
│   └── data_augment.py
├── make.sh
├── doc
│   └── asff.png
├── example
│   └── test.jpg
├── .gitignore
├── config
│   ├── yolov3_mobile.cfg
│   └── yolov3_baseline.cfg
├── models
│   ├── utils_loss.py
│   ├── yolov3_baseline.py
│   ├── yolov3_asff.py
│   ├── yolov3_mobilev2.py
│   └── yolov3_head.py
├── demo.py
├── eval.py
└── README.md
/utils/DCN/make.sh: -------------------------------------------------------------------------------- 1 | python setup.py build install 2 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | cd utils/DCN 2 | 3 | python setup.py install 4 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | -------------------------------------------------------------------------------- /doc/asff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/ASFF/master/doc/asff.png -------------------------------------------------------------------------------- /example/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/ASFF/master/example/test.jpg -------------------------------------------------------------------------------- /utils/DCN/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv2d_func import DeformConv2dFunction 2 | from .modulated_deform_conv2d_func import ModulatedDeformConv2dFunction 3 | -------------------------------------------------------------------------------- /utils/DCN/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv2d import DeformConv2d, _DeformConv2d, DeformConv2dPack, DeformConv2dPackMore 2 | from .modulated_deform_conv2d import ModulatedDeformConv2d, _ModulatedDeformConv2d, ModulatedDeformConv2dPack 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.pyc 4 | 5 | # C 
extensions 6 | *.so 7 | *.o 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | 13 | *.swp 14 | 15 | weights/ 16 | log/ 17 | save/ 18 | trained_model/ 19 | dist/ 20 | *.egg-info/ 21 | 22 | -------------------------------------------------------------------------------- /config/yolov3_mobile.cfg: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: YOLOv3 3 | BACKBONE: mobile 4 | TRAIN: 5 | LR: 0.001 6 | MOMENTUM: 0.9 7 | DECAY: 0.0005 8 | BURN_IN: 5 9 | MAXEPOCH: 300 10 | COS: True 11 | SYBN: True 12 | MIX: True 13 | NO_MIXUP_EPOCHS: 30 14 | LABAL_SMOOTH: True 15 | BATCHSIZE: 8 16 | IMGSIZE: 416 17 | IGNORETHRE: 0.7 18 | RANDRESIZE: True 19 | TEST: 20 | CONFTHRE: 0.001 21 | NMSTHRE: 0.65 22 | -------------------------------------------------------------------------------- /config/yolov3_baseline.cfg: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: YOLOv3 3 | BACKBONE: darknet53 4 | TRAIN: 5 | LR: 0.001 6 | MOMENTUM: 0.9 7 | DECAY: 0.0005 8 | BURN_IN: 5 9 | MAXEPOCH: 300 10 | COS: True 11 | SYBN: True 12 | MIX: True 13 | NO_MIXUP_EPOCHS: 30 14 | LABAL_SMOOTH: True 15 | BATCHSIZE: 5 16 | IMGSIZE: 608 17 | IGNORETHRE: 0.7 18 | RANDRESIZE: True 19 | TEST: 20 | CONFTHRE: 0.01 21 | NMSTHRE: 0.65 22 | IMGSIZE: 608 23 | -------------------------------------------------------------------------------- /utils/fp16_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16util import ( 2 | BN_convert_float, 3 | network_to_half, 4 | prep_param_lists, 5 | model_grads_to_master_grads, 6 | master_params_to_model_params, 7 | tofp16, 8 | to_python_float, 9 | clip_grad_norm, 10 | convert_module, 11 | convert_network, 12 | FP16Model, 13 | ) 14 | 15 | from .fp16_optimizer import FP16_Optimizer 16 | from .loss_scaler import LossScaler, DynamicLossScaler 17 | -------------------------------------------------------------------------------- /utils/DCN/src/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "deform_conv2d.h" 3 | #include "modulated_deform_conv2d.h" 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("deform_conv2d_forward", &deform_conv2d_forward, "deform_conv2d_forward"); 7 | m.def("deform_conv2d_backward", &deform_conv2d_backward, "deform_conv2d_backward"); 8 | m.def("modulated_deform_conv2d_forward", &modulated_deform_conv2d_forward, "modulated_deform_conv2d_forward"); 9 | m.def("modulated_deform_conv2d_backward", &modulated_deform_conv2d_backward, "modulated_deform_conv2d_backward"); 10 | } 11 | -------------------------------------------------------------------------------- /utils/fp16_utils/README.md: -------------------------------------------------------------------------------- 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user. To use `FP16_Optimizer`, only two lines of one's Python model need to change. 
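A minimal sketch of that two-line change (the toy model, input, and loss-scale value are placeholders, not part of this repo; the wrapper is assumed to behave like the apex `FP16_Optimizer` documented in the links below):

```python
import torch
from utils.fp16_utils import FP16_Optimizer  # local copy; apex ships the same class

model = torch.nn.Linear(16, 4).cuda().half()                    # placeholder FP16 model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)  # change 1: wrap the optimizer

x = torch.randn(8, 16, device="cuda", dtype=torch.float16)
loss = model(x).float().sum()                                   # placeholder loss
optimizer.zero_grad()
optimizer.backward(loss)                                        # change 2: replaces loss.backward()
optimizer.step()
```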
2 | 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling) 4 | 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple) 6 | 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 8 | 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) 10 | 11 | 12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses. 13 | 14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management) 15 | 16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling. These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically. 17 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/deform_conv2d_cpu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | deform_conv2d_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const int kernel_h, 10 | const int kernel_w, 11 | const int stride_h, 12 | const int stride_w, 13 | const int pad_h, 14 | const int pad_w, 15 | const int dilation_h, 16 | const int dilation_w, 17 | const int group, 18 | const int deformable_group, 19 | const int im2col_step); 20 | 21 | std::vector 22 | deform_conv2d_cpu_backward(const at::Tensor &input, 23 | const at::Tensor &weight, 24 | const at::Tensor &bias, 25 | const at::Tensor &offset, 26 | const at::Tensor &grad_output, 27 | const int kernel_h, 28 | const int kernel_w, 29 | const int stride_h, 30 | const int stride_w, 31 | const int pad_h, 32 | const int pad_w, 33 | const int dilation_h, 34 | const int dilation_w, 35 | const int group, 36 | const int deformable_group, 37 | const int im2col_step); 38 | 39 | 40 | -------------------------------------------------------------------------------- /utils/DCN/src/cuda/deform_conv2d_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | deform_conv2d_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const int kernel_h, 10 | const int kernel_w, 11 | const int stride_h, 12 | const int stride_w, 13 | const int pad_h, 14 | const int pad_w, 15 | const int dilation_h, 16 | const int dilation_w, 17 | const int group, 18 | const int deformable_group, 19 | const int im2col_step); 20 | 21 | std::vector 22 | deform_conv2d_cuda_backward(const at::Tensor &input, 23 | const at::Tensor &weight, 24 | const at::Tensor &bias, 25 | const at::Tensor &offset, 26 | const at::Tensor &grad_output, 27 | const int kernel_h, 28 | const int kernel_w, 29 | const int stride_h, 30 | const int stride_w, 31 | const int pad_h, 32 | const int pad_w, 33 | const int dilation_h, 34 | const int dilation_w, 35 | const int group, 36 | const int deformable_group, 37 | const int 
im2col_step); 38 | 39 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/deform_conv2d_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | at::Tensor 8 | deform_conv2d_cpu_forward(const at::Tensor &input, 9 | const at::Tensor &weight, 10 | const at::Tensor &bias, 11 | const at::Tensor &offset, 12 | const int kernel_h, 13 | const int kernel_w, 14 | const int stride_h, 15 | const int stride_w, 16 | const int pad_h, 17 | const int pad_w, 18 | const int dilation_h, 19 | const int dilation_w, 20 | const int group, 21 | const int deformable_group, 22 | const int im2col_step) 23 | { 24 | AT_ERROR("Not implement on cpu"); 25 | } 26 | 27 | std::vector 28 | deform_conv2d_cpu_backward(const at::Tensor &input, 29 | const at::Tensor &weight, 30 | const at::Tensor &bias, 31 | const at::Tensor &offset, 32 | const at::Tensor &grad_output, 33 | const int kernel_h, 34 | const int kernel_w, 35 | const int stride_h, 36 | const int stride_w, 37 | const int pad_h, 38 | const int pad_w, 39 | const int dilation_h, 40 | const int dilation_w, 41 | const int group, 42 | const int deformable_group, 43 | const int im2col_step) 44 | { 45 | AT_ERROR("Not implement on cpu"); 46 | } 47 | 48 | -------------------------------------------------------------------------------- /utils/DCN/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import glob 5 | 6 | import torch 7 | 8 | from torch.utils.cpp_extension import CUDA_HOME 9 | from torch.utils.cpp_extension import CppExtension 10 | from torch.utils.cpp_extension import CUDAExtension 11 | 12 | from setuptools import find_packages 13 | from setuptools import setup 14 | 15 | requirements = ["torch", "torchvision"] 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "src") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | extra_compile_args = {"cxx": []} 28 | define_macros = [] 29 | 30 | if torch.cuda.is_available() and CUDA_HOME is not None: 31 | extension = CUDAExtension 32 | sources += source_cuda 33 | define_macros += [("WITH_CUDA", None)] 34 | extra_compile_args["nvcc"] = [ 35 | "-DCUDA_HAS_FP16=1", 36 | "-D__CUDA_NO_HALF_OPERATORS__", 37 | "-D__CUDA_NO_HALF_CONVERSIONS__", 38 | "-D__CUDA_NO_HALF2_OPERATORS__", 39 | ] 40 | else: 41 | raise NotImplementedError('Cuda is not availabel') 42 | 43 | sources = [os.path.join(extensions_dir, s) for s in sources] 44 | include_dirs = [extensions_dir] 45 | ext_modules = [ 46 | extension( 47 | "DCN", 48 | sources, 49 | include_dirs=include_dirs, 50 | define_macros=define_macros, 51 | extra_compile_args=extra_compile_args, 52 | ) 53 | ] 54 | return ext_modules 55 | 56 | setup( 57 | name="DCN", 58 | version="1.0", 59 | description="deformable convolutional networks", 60 | packages=find_packages(exclude=("configs", "tests",)), 61 | ext_modules=get_extensions(), 62 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 63 | ) 64 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/modulated_deform_conv2d_cpu.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | modulated_deform_conv2d_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int group, 19 | const int deformable_group, 20 | const int im2col_step); 21 | 22 | std::vector 23 | modulated_deform_conv2d_cpu_backward(const at::Tensor &input, 24 | const at::Tensor &weight, 25 | const at::Tensor &bias, 26 | const at::Tensor &offset, 27 | const at::Tensor &mask, 28 | const at::Tensor &grad_output, 29 | const int kernel_h, 30 | const int kernel_w, 31 | const int stride_h, 32 | const int stride_w, 33 | const int pad_h, 34 | const int pad_w, 35 | const int dilation_h, 36 | const int dilation_w, 37 | const int group, 38 | const int deformable_group, 39 | const int im2col_step); 40 | 41 | 42 | -------------------------------------------------------------------------------- /utils/DCN/src/cuda/modulated_deform_conv2d_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | modulated_deform_conv2d_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int group, 19 | const int deformable_group, 20 | const int im2col_step); 21 | 22 | std::vector 23 | modulated_deform_conv2d_cuda_backward(const at::Tensor &input, 24 | const at::Tensor &weight, 25 | const at::Tensor &bias, 26 | const at::Tensor &offset, 27 | const at::Tensor &mask, 28 | const at::Tensor &grad_output, 29 | const int kernel_h, 30 | const int kernel_w, 31 | const int stride_h, 32 | const int stride_w, 33 | const int pad_h, 34 | const int pad_w, 35 | const int dilation_h, 36 | const int dilation_w, 37 | const int group, 38 | const int deformable_group, 39 | const int im2col_step); 40 | 41 | -------------------------------------------------------------------------------- /models/utils_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | 7 | class IOUWH_loss(nn.Module): #used for anchor guiding 8 | def __init__(self, reduction='none'): 9 | super(IOUWH_loss, self).__init__() 10 | self.reduction = reduction 11 | 12 | def forward(self, pred, target): 13 | orig_shape = pred.shape 14 | pred = pred.view(-1,4) 15 | target = target.view(-1,4) 16 | target[:,:2] = 0 17 | tl = torch.max((target[:, :2]-pred[:,2:]/2), 18 | (target[:, :2] - target[:, 2:]/2)) 19 | 20 | br = torch.min((target[:, :2]+pred[:,2:]/2), 21 | (target[:, :2] + target[:, 2:]/2)) 22 | 23 | area_p = torch.prod(pred[:,2:], 1) 24 | area_g = torch.prod(target[:,2:], 1) 25 | 26 | en = (tl< br).type(tl.type()).prod(dim=1) 27 | area_i = torch.prod(br-tl, 1) * en 28 | U = area_p+area_g-area_i+ 1e-16 29 | iou= area_i / U 30 | 31 | loss = 1-iou**2 32 | if self.reduction =='mean': 33 | loss = loss.mean() 34 | elif 
self.reduction == 'sum': 35 | loss = loss.sum() 36 | 37 | return loss 38 | 39 | class IOUloss(nn.Module): 40 | def __init__(self, reduction='none'): 41 | super(IOUloss, self).__init__() 42 | self.reduction = reduction 43 | 44 | def forward(self, pred, target): 45 | orig_shape = pred.shape 46 | pred = pred.view(-1,4) 47 | target = target.view(-1,4) 48 | tl = torch.max((pred[:, :2]-pred[:,2:]/2), 49 | (target[:, :2] - target[:, 2:]/2)) 50 | br = torch.min((pred[:, :2]+pred[:,2:]/2), 51 | (target[:, :2] + target[:, 2:]/2)) 52 | 53 | area_p = torch.prod(pred[:,2:], 1) 54 | area_g = torch.prod(target[:,2:], 1) 55 | 56 | en = (tl< br).type(tl.type()).prod(dim=1) 57 | area_i = torch.prod(br-tl, 1) * en 58 | iou= (area_i) / (area_p+area_g-area_i+ 1e-16) 59 | 60 | loss = 1-iou**2 61 | if self.reduction =='mean': 62 | loss = loss.mean() 63 | elif self.reduction == 'sum': 64 | loss = loss.sum() 65 | 66 | return loss 67 | -------------------------------------------------------------------------------- /utils/DCN/src/cpu/modulated_deform_conv2d_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | at::Tensor 8 | modulated_deform_conv2d_cpu_forward(const at::Tensor &input, 9 | const at::Tensor &weight, 10 | const at::Tensor &bias, 11 | const at::Tensor &offset, 12 | const at::Tensor &mask, 13 | const int kernel_h, 14 | const int kernel_w, 15 | const int stride_h, 16 | const int stride_w, 17 | const int pad_h, 18 | const int pad_w, 19 | const int dilation_h, 20 | const int dilation_w, 21 | const int group, 22 | const int deformable_group, 23 | const int im2col_step) 24 | { 25 | AT_ERROR("Not implement on cpu"); 26 | } 27 | 28 | std::vector 29 | modulated_deform_conv2d_cpu_backward(const at::Tensor &input, 30 | const at::Tensor &weight, 31 | const at::Tensor &bias, 32 | const at::Tensor &offset, 33 | const at::Tensor &mask, 34 | const at::Tensor &grad_output, 35 | const int kernel_h, 36 | const int kernel_w, 37 | const int stride_h, 38 | const int stride_w, 39 | const int pad_h, 40 | const int pad_w, 41 | const int dilation_h, 42 | const int dilation_w, 43 | const int group, 44 | const int deformable_group, 45 | const int im2col_step) 46 | { 47 | AT_ERROR("Not implement on cpu"); 48 | } 49 | 50 | -------------------------------------------------------------------------------- /utils/DCN/functions/deform_conv2d_func.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import math 7 | import torch 8 | from torch import nn 9 | from torch.autograd import Function 10 | from torch.nn.modules.utils import _pair 11 | from torch.autograd.function import once_differentiable 12 | from apex import amp 13 | import DCN 14 | 15 | class DeformConv2dFunction(Function): 16 | @staticmethod 17 | @amp.float_function 18 | def forward(ctx, input, offset, weight, bias, 19 | stride, padding, dilation, group, deformable_groups, im2col_step): 20 | ctx.stride = _pair(stride) 21 | ctx.padding = _pair(padding) 22 | ctx.dilation = _pair(dilation) 23 | ctx.kernel_size = _pair(weight.shape[2:4]) 24 | ctx.group = group 25 | ctx.deformable_groups = deformable_groups 26 | ctx.im2col_step = im2col_step 27 | output = DCN.deform_conv2d_forward(input, weight, bias, 28 | offset, 29 | ctx.kernel_size[0], ctx.kernel_size[1], 30 | ctx.stride[0], ctx.stride[1], 31 | 
ctx.padding[0], ctx.padding[1], 32 | ctx.dilation[0], ctx.dilation[1], 33 | ctx.group, 34 | ctx.deformable_groups, 35 | ctx.im2col_step) 36 | ctx.save_for_backward(input, offset, weight, bias) 37 | return output 38 | 39 | @staticmethod 40 | @once_differentiable 41 | @amp.float_function 42 | def backward(ctx, grad_output): 43 | input, offset, weight, bias = ctx.saved_tensors 44 | grad_input, grad_offset, grad_weight, grad_bias = \ 45 | DCN.deform_conv2d_backward(input, weight, 46 | bias, 47 | offset, 48 | grad_output, 49 | ctx.kernel_size[0], ctx.kernel_size[1], 50 | ctx.stride[0], ctx.stride[1], 51 | ctx.padding[0], ctx.padding[1], 52 | ctx.dilation[0], ctx.dilation[1], 53 | ctx.group, 54 | ctx.deformable_groups, 55 | ctx.im2col_step) 56 | 57 | return grad_input, grad_offset, grad_weight, grad_bias,\ 58 | None, None, None, None, None, None 59 | -------------------------------------------------------------------------------- /utils/DCN/functions/modulated_deform_conv2d_func.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import math 7 | import torch 8 | from torch import nn 9 | from torch.autograd import Function 10 | from torch.nn.modules.utils import _pair 11 | from torch.autograd.function import once_differentiable 12 | 13 | import DCN 14 | 15 | class ModulatedDeformConv2dFunction(Function): 16 | @staticmethod 17 | def forward(ctx, input, offset, mask, weight, bias, 18 | stride, padding, dilation, groups, deformable_groups, im2col_step): 19 | ctx.stride = _pair(stride) 20 | ctx.padding = _pair(padding) 21 | ctx.dilation = _pair(dilation) 22 | ctx.kernel_size = _pair(weight.shape[2:4]) 23 | ctx.groups = groups 24 | ctx.deformable_groups = deformable_groups 25 | ctx.im2col_step = im2col_step 26 | output = DCN.modulated_deform_conv2d_forward(input, weight, bias, 27 | offset, mask, 28 | ctx.kernel_size[0], ctx.kernel_size[1], 29 | ctx.stride[0], ctx.stride[1], 30 | ctx.padding[0], ctx.padding[1], 31 | ctx.dilation[0], ctx.dilation[1], 32 | ctx.groups, 33 | ctx.deformable_groups, 34 | ctx.im2col_step) 35 | ctx.save_for_backward(input, offset, mask, weight, bias) 36 | return output 37 | 38 | @staticmethod 39 | @once_differentiable 40 | def backward(ctx, grad_output): 41 | input, offset, mask, weight, bias = ctx.saved_tensors 42 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ 43 | DCN.modulated_deform_conv2d_backward(input, weight, 44 | bias, 45 | offset, mask, 46 | grad_output, 47 | ctx.kernel_size[0], ctx.kernel_size[1], 48 | ctx.stride[0], ctx.stride[1], 49 | ctx.padding[0], ctx.padding[1], 50 | ctx.dilation[0], ctx.dilation[1], 51 | ctx.groups, 52 | ctx.deformable_groups, 53 | ctx.im2col_step) 54 | 55 | return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ 56 | None, None, None, None, None, None 57 | -------------------------------------------------------------------------------- /utils/DCN/src/deform_conv2d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/deform_conv2d_cpu.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/deform_conv2d_cuda.h" 7 | #endif 8 | 9 | 10 | at::Tensor 11 | deform_conv2d_forward(const at::Tensor &input, 12 | const at::Tensor &weight, 13 | const at::Tensor &bias, 14 | const at::Tensor &offset, 15 | const int kernel_h, 16 | const int kernel_w, 17 | const int stride_h, 18 | 
const int stride_w, 19 | const int pad_h, 20 | const int pad_w, 21 | const int dilation_h, 22 | const int dilation_w, 23 | const int group, 24 | const int deformable_group, 25 | const int im2col_step) 26 | { 27 | if (input.type().is_cuda()) 28 | { 29 | #ifdef WITH_CUDA 30 | return deform_conv2d_cuda_forward(input, weight, bias, offset, 31 | kernel_h, kernel_w, 32 | stride_h, stride_w, 33 | pad_h, pad_w, 34 | dilation_h, dilation_w, 35 | group, 36 | deformable_group, 37 | im2col_step); 38 | #else 39 | AT_ERROR("Not compiled with GPU support"); 40 | #endif 41 | } 42 | AT_ERROR("Not implemented on the CPU"); 43 | } 44 | 45 | std::vector 46 | deform_conv2d_backward(const at::Tensor &input, 47 | const at::Tensor &weight, 48 | const at::Tensor &bias, 49 | const at::Tensor &offset, 50 | const at::Tensor &grad_output, 51 | const int kernel_h, 52 | const int kernel_w, 53 | const int stride_h, 54 | const int stride_w, 55 | const int pad_h, 56 | const int pad_w, 57 | const int dilation_h, 58 | const int dilation_w, 59 | const int group, 60 | const int deformable_group, 61 | const int im2col_step) 62 | { 63 | if (input.type().is_cuda()) 64 | { 65 | #ifdef WITH_CUDA 66 | return deform_conv2d_cuda_backward(input, 67 | weight, 68 | bias, 69 | offset, 70 | grad_output, 71 | kernel_h, kernel_w, 72 | stride_h, stride_w, 73 | pad_h, pad_w, 74 | dilation_h, dilation_w, 75 | group, 76 | deformable_group, 77 | im2col_step); 78 | #else 79 | AT_ERROR("Not compiled with GPU support"); 80 | #endif 81 | } 82 | AT_ERROR("Not implemented on the CPU"); 83 | } 84 | 85 | -------------------------------------------------------------------------------- /utils/DCN/src/modulated_deform_conv2d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/modulated_deform_conv2d_cpu.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/modulated_deform_conv2d_cuda.h" 7 | #endif 8 | 9 | 10 | at::Tensor 11 | modulated_deform_conv2d_forward(const at::Tensor &input, 12 | const at::Tensor &weight, 13 | const at::Tensor &bias, 14 | const at::Tensor &offset, 15 | const at::Tensor &mask, 16 | const int kernel_h, 17 | const int kernel_w, 18 | const int stride_h, 19 | const int stride_w, 20 | const int pad_h, 21 | const int pad_w, 22 | const int dilation_h, 23 | const int dilation_w, 24 | const int group, 25 | const int deformable_group, 26 | const int im2col_step) 27 | { 28 | if (input.type().is_cuda()) 29 | { 30 | #ifdef WITH_CUDA 31 | return modulated_deform_conv2d_cuda_forward(input, weight, bias, offset, mask, 32 | kernel_h, kernel_w, 33 | stride_h, stride_w, 34 | pad_h, pad_w, 35 | dilation_h, dilation_w, 36 | group, 37 | deformable_group, 38 | im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | modulated_deform_conv2d_backward(const at::Tensor &input, 48 | const at::Tensor &weight, 49 | const at::Tensor &bias, 50 | const at::Tensor &offset, 51 | const at::Tensor &mask, 52 | const at::Tensor &grad_output, 53 | const int kernel_h, 54 | const int kernel_w, 55 | const int stride_h, 56 | const int stride_w, 57 | const int pad_h, 58 | const int pad_w, 59 | const int dilation_h, 60 | const int dilation_w, 61 | const int group, 62 | const int deformable_group, 63 | const int im2col_step) 64 | { 65 | if (input.type().is_cuda()) 66 | { 67 | #ifdef WITH_CUDA 68 | return modulated_deform_conv2d_cuda_backward(input, 69 | weight, 70 | bias, 71 | offset, 72 | mask, 73 | 
grad_output, 74 | kernel_h, kernel_w, 75 | stride_h, stride_w, 76 | pad_h, pad_w, 77 | dilation_h, dilation_w, 78 | group, 79 | deformable_group, 80 | im2col_step); 81 | #else 82 | AT_ERROR("Not compiled with GPU support"); 83 | #endif 84 | } 85 | AT_ERROR("Not implemented on the CPU"); 86 | } 87 | 88 | -------------------------------------------------------------------------------- /utils/vis_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import matplotlib 6 | 7 | matplotlib.use('AGG') 8 | 9 | import matplotlib.pyplot as plt 10 | import torch 11 | import cv2 12 | import math 13 | from skimage import transform 14 | 15 | def make_vis(dataset, index, img, fuse_weights, fused_fs): 16 | save_dir = 'vis_output/{}/{}'.format(dataset,index) 17 | os.makedirs(save_dir, exist_ok=True) 18 | 19 | for i in range(len(fuse_weights)): 20 | weights = fuse_weights[i].float().cpu().squeeze().numpy() 21 | max_v = weights.max() 22 | min_v = weights.min() 23 | for j in range(3): 24 | v = weights[j,:,:] 25 | save_name = os.path.join(save_dir, 'level_{}_weight_{}.png'.format(i+1,j+1)) 26 | add_heat(img, v, max_v, min_v, save=save_name) 27 | 28 | fused_f = fused_fs[i].float().cpu().squeeze().numpy() 29 | max_f = fused_f.max() 30 | min_f = fused_f.min() 31 | save_f_name = os.path.join(save_dir, 'fused_feature_level_{}.png'.format(i+1)) 32 | add_heat(img, fused_f, max_f, min_f, save=save_f_name) 33 | 34 | def make_pred_vis(dataset,index, img, class_names, bboxes, cls, scores): 35 | save_preddir = 'vis_output/{}/pred/'.format(dataset) 36 | os.makedirs(save_preddir, exist_ok=True) 37 | 38 | save_pred_name = os.path.join(save_preddir,'{}.png'.format(index)) 39 | 40 | bboxes = bboxes.numpy() 41 | scores = scores.numpy() 42 | cls_ids = cls.numpy() 43 | 44 | im = vis(img, bboxes, scores, cls_ids, class_names) 45 | 46 | cv2.imwrite(save_pred_name, im) 47 | 48 | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None, color=None): 49 | 50 | colors = torch.FloatTensor([[1,0,1],[0,0,1],[0,1,1],[0,1,0],[1,1,0],[1,0,0]]); 51 | def get_color(c, x, max_val): 52 | ratio = float(x)/max_val * 5 53 | i = int(math.floor(ratio)) 54 | j = int(math.ceil(ratio)) 55 | ratio = ratio - i 56 | r = (1-ratio) * colors[i][c] + ratio*colors[j][c] 57 | return int(r*255) 58 | 59 | width = img.shape[1] 60 | height = img.shape[0] 61 | for i in range(len(boxes)): 62 | box = boxes[i] 63 | cls_conf = scores[i] 64 | if cls_conf < conf: 65 | continue 66 | x1 = int(box[0]) 67 | y1 = int(box[1]) 68 | x2 = int(box[0]+box[2]) 69 | y2 = int(box[1]+box[3]) 70 | 71 | 72 | if color: 73 | rgb = color 74 | else: 75 | rgb = (255, 0, 0) 76 | if class_names is not None: 77 | cls_conf = scores[i] 78 | cls_id = int(cls_ids[i]) 79 | class_name = class_names[cls_id] 80 | classes = len(class_names) 81 | offset = cls_id * 123456 % classes 82 | red = get_color(2, offset, classes) 83 | green = get_color(1, offset, classes) 84 | blue = get_color(0, offset, classes) 85 | if color is None: 86 | rgb = (red, green, blue) 87 | img = cv2.putText(img, '%s: %.2f'%(class_name,cls_conf), (x1,y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, rgb, 1) 88 | img = cv2.rectangle(img, (x1,y1), (x2,y2), rgb, 1) 89 | return img 90 | 91 | def add_heat(image, heat_map, max_v, min_v, alpha=0.4, save=None, cmap='jet', axis='off'): 92 | height = image.shape[0] 93 | width = image.shape[1] 94 | 95 | # resize heat map 96 | heat_map_resized = transform.resize(heat_map, (height, width)) 97 | 98 
| # normalize heat map 99 | max_value = max_v 100 | min_value = min_v 101 | normalized_heat_map = (heat_map_resized - min_value) / (max_value - min_value) 102 | 103 | # display 104 | plt.imshow(image) 105 | plt.imshow(255 * normalized_heat_map, alpha=alpha, cmap=cmap) 106 | plt.axis(axis) 107 | 108 | if save is not None: 109 | plt.savefig(save, bbox_inches='tight', pad_inches=0) 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /utils/DCN/deform_conv2d_naive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import init 4 | import math 5 | import numpy as np 6 | from torch.nn.modules.module import Module 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import _pair 9 | 10 | class deform_conv2d_naive(Module): 11 | def __init__(self, in_channels, out_channels, 12 | kernel_size, stride, padding, dilation=1, groups=1, deformable_groups=1, bias=True): 13 | super(deform_conv2d_naive, self).__init__() 14 | self.in_channels = in_channels 15 | self.out_channels = out_channels 16 | self.kernel_size = _pair(kernel_size) 17 | self.stride = _pair(stride) 18 | self.padding = _pair(padding) 19 | self.dilation = _pair(dilation) 20 | self.groups = groups 21 | self.deformable_groups = deformable_groups 22 | self.use_bias = bias 23 | 24 | self.weight = nn.Parameter(torch.Tensor( 25 | out_channels, in_channels//groups, *self.kernel_size)) 26 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 27 | self.reset_parameters() 28 | if not self.use_bias: 29 | self.bias.requires_grad = False 30 | self.bias.data.zero_() 31 | 32 | def reset_parameters(self): 33 | n = self.in_channels 34 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 35 | if self.bias is not None: 36 | fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) 37 | bound = 1 / math.sqrt(fan_in) 38 | init.uniform_(self.bias, -bound, bound) 39 | 40 | def forward(self, input, offset): 41 | N = input.size(0) 42 | in_channels = self.in_channels 43 | out_channels = self.out_channels 44 | in_h = input.size(2) 45 | in_w = input.size(3) 46 | out_h = offset.size(2) 47 | out_w = offset.size(3) 48 | kernel_h = self.kernel_size[0] 49 | kernel_w = self.kernel_size[1] 50 | # [1, kernel_h * kernel_w, out_h, out_w, 2] 51 | mesh = self.compute_mesh_grid(in_h, in_w).cuda(input.get_device()) 52 | offset = offset.view(N, self.deformable_groups, kernel_h, kernel_w, 2, out_h, out_w) 53 | # [N * dg * kernel_h * kernel_w, out_h, out_w, 2] 54 | offset = offset.permute(0, 1, 2, 3, 5, 6, 4).contiguous().view(N * self.deformable_groups * kernel_h * kernel_w, out_h, out_w, 2) 55 | offset_x_normalize = (offset[:, :, :, 1]) / ((in_w - 1) * 1.0 / 2) 56 | offset_y_normalize = (offset[:, :, :, 0]) / ((in_h - 1) * 1.0 / 2) 57 | # [N * dg * kernel_h * kernel_w, out_h, out_w, 2] 58 | offset = torch.cat([offset_x_normalize[..., None], offset_y_normalize[..., None]], dim=3) 59 | # [N * dg * kernel_h * kernel_w, out_h, out_w, 2] 60 | grid = mesh.expand(N * self.deformable_groups, -1, -1, -1, -1).contiguous().view(-1, out_h, out_w, 2) + offset 61 | # [N * kernel_h * kernel_w * dg, in_channels/dg, in_h, in_w] 62 | input = input[:, None, ...].expand(-1, kernel_h * kernel_w, -1, -1, -1).contiguous().view( 63 | N * kernel_h * kernel_w * self.deformable_groups, in_channels // self.deformable_groups, in_h, in_w) 64 | sampled_feat = F.grid_sample(input, grid).view(N, kernel_h * kernel_w, in_channels, out_h, 
out_w).permute(2, 1, 0, 3, 4).contiguous().view(in_channels * kernel_h * kernel_w, -1) 65 | output_feat = torch.matmul(self.weight.view(self.weight.size(0), -1), sampled_feat).view(out_channels, N, out_h, out_w).permute(1,0,2,3) 66 | return output_feat 67 | 68 | def compute_mesh_grid(self, in_h, in_w): 69 | kernel_h, kernel_w = self.kernel_size 70 | stride_h, stride_w = self.stride 71 | dilation_h, dilation_w = self.dilation 72 | padding_h, padding_w = self.padding 73 | out_h = (in_h + 2 * padding_h - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1 74 | out_w = (in_w + 2 * padding_w - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1 75 | # [out_h, out_w] 76 | mesh_y, mesh_x = torch.meshgrid(torch.arange(out_h), torch.arange(out_w)) 77 | mesh_y = mesh_y * stride_h - padding_h 78 | mesh_x = mesh_x * stride_w - padding_w 79 | # [1, out_h, out_w] 80 | mesh_y = mesh_y.unsqueeze(0).float() 81 | mesh_x = mesh_x.unsqueeze(0).float() 82 | # [kernel_h, kernel_w] 83 | kernel_offset_y, kernel_offset_x = torch.meshgrid(torch.arange(kernel_h), torch.arange(kernel_w)) 84 | # [kernel_h * kernel_w, 1, 1] 85 | kernel_offset_y = kernel_offset_y.float().view(kernel_h * kernel_w, 1, 1) * dilation_h 86 | kernel_offset_x = kernel_offset_x.float().view(kernel_h * kernel_w, 1, 1) * dilation_w 87 | # [kernel_h * kernel_w, out_h, out_w] 88 | mesh_y = mesh_y + kernel_offset_y 89 | mesh_x = mesh_x + kernel_offset_x 90 | mesh_y = (mesh_y - (in_h - 1) / 2.) / ((in_h - 1) / 2.) 91 | mesh_x = (mesh_x - (in_w - 1) / 2.) / ((in_w - 1) / 2.) 92 | mesh = torch.cat([mesh_x[None, ..., None], mesh_y[None, ..., None]], dim=4) 93 | return mesh 94 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | from dataset.vocdataset import VOC_CLASSES 3 | from dataset.cocodataset import COCO_CLASSES 4 | from dataset.data_augment import ValTransform 5 | from utils.vis_utils import vis 6 | 7 | import os 8 | import sys 9 | import argparse 10 | import yaml 11 | import cv2 12 | cv2.setNumThreads(0) 13 | 14 | import torch 15 | from torch.autograd import Variable 16 | import time 17 | 18 | ######## unlimit the resource in some dockers or cloud machines ####### 19 | #import resource 20 | #rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 21 | #resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 22 | 23 | 24 | def parse_args(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--cfg', type=str, default='config/yolov3_baseline.cfg', 27 | help='config file. see readme') 28 | parser.add_argument('-d', '--dataset', type=str, default='COCO') 29 | parser.add_argument('-i', '--img', type=str, default='example/test.jpg',) 30 | parser.add_argument('-c', '--checkpoint', type=str, 31 | help='pytorch checkpoint file path') 32 | parser.add_argument('-s', '--test_size', type=int, default=416) 33 | parser.add_argument('--half', dest='half', action='store_true', default=False, 34 | help='FP16 training') 35 | parser.add_argument('--rfb', dest='rfb', action='store_true', default=False, 36 | help='Use rfb block') 37 | parser.add_argument('--asff', dest='asff', action='store_true', default=False, 38 | help='Use ASFF module for yolov3') 39 | parser.add_argument('--use_cuda', type=bool, default=True) 40 | return parser.parse_args() 41 | 42 | def demo(): 43 | """ 44 | YOLOv3 demo. See README for details. 45 | """ 46 | args = parse_args() 47 | print("Setting Arguments.. 
: ", args) 48 | 49 | cuda = torch.cuda.is_available() and args.use_cuda 50 | 51 | # Parse config settings 52 | with open(args.cfg, 'r') as f: 53 | cfg = yaml.safe_load(f) 54 | 55 | print("successfully loaded config file: ", cfg) 56 | 57 | backbone=cfg['MODEL']['BACKBONE'] 58 | test_size = (args.test_size,args.test_size) 59 | 60 | if args.dataset == 'COCO': 61 | class_names = COCO_CLASSES 62 | num_class=80 63 | elif args.dataset == 'VOC': 64 | class_names = VOC_CLASSES 65 | num_class=20 66 | else: 67 | raise Exception("Only support COCO or VOC model now!") 68 | 69 | # Initiate model 70 | if args.asff: 71 | if backbone == 'mobile': 72 | from models.yolov3_mobilev2 import YOLOv3 73 | print("For mobilenet, we currently don't support dropblock, rfb and FeatureAdaption") 74 | else: 75 | from models.yolov3_asff import YOLOv3 76 | print('Training YOLOv3 with ASFF!') 77 | model = YOLOv3(num_classes = num_class, rfb=args.rfb, asff=args.asff) 78 | else: 79 | if backbone == 'mobile': 80 | from models.yolov3_mobilev2 import YOLOv3 81 | else: 82 | from models.yolov3_baseline import YOLOv3 83 | print('Training YOLOv3 strong baseline!') 84 | model = YOLOv3(num_classes = num_class, rfb=args.rfb) 85 | 86 | 87 | if args.checkpoint: 88 | print("loading pytorch ckpt...", args.checkpoint) 89 | cpu_device = torch.device("cpu") 90 | ckpt = torch.load(args.checkpoint, map_location=cpu_device) 91 | #model.load_state_dict(ckpt,strict=False) 92 | model.load_state_dict(ckpt) 93 | if cuda: 94 | print("using cuda") 95 | torch.backends.cudnn.benchmark = True 96 | device = torch.device("cuda") 97 | model = model.to(device) 98 | 99 | if args.half: 100 | model = model.half() 101 | 102 | model = model.eval() 103 | dtype = torch.float16 if args.half else torch.float32 104 | 105 | #load img 106 | transform = ValTransform(rgb_means=(0.485, 0.456, 0.406), std=(0.229,0.224,0.225)) 107 | im = cv2.imread(args.img) 108 | height, width, _ = im.shape 109 | ori_im = im.copy() 110 | im_input, _ = transform(im, None, test_size) 111 | if cuda: 112 | im_input = im_input.to(device) 113 | 114 | im_input = Variable(im_input.type(dtype).unsqueeze(0)) 115 | outputs= model(im_input) 116 | outputs = postprocess(outputs, num_class, 0.01, 0.65) 117 | 118 | outputs = outputs[0].cpu().data 119 | bboxes = outputs[:, 0:4] 120 | bboxes[:, 0::2] *= width / test_size[0] 121 | bboxes[:, 1::2] *= height / test_size[1] 122 | bboxes[:, 2] = bboxes[:,2] - bboxes[:,0] 123 | bboxes[:, 3] = bboxes[:,3] - bboxes[:,1] 124 | cls = outputs[:, 6] 125 | scores = outputs[:, 4]* outputs[:,5] 126 | 127 | pred_im=vis(ori_im, bboxes.numpy(), scores.numpy(), cls.numpy(), conf=0.6, class_names=class_names) 128 | cv2.imshow('Detection', pred_im) 129 | cv2.waitKey(0) 130 | cv2.destroyAllWindows() 131 | 132 | sys.exit(0) 133 | 134 | 135 | if __name__ == '__main__': 136 | demo() 137 | -------------------------------------------------------------------------------- /utils/DCN/modules/modulated_deform_conv2d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import torch 7 | import math 8 | from torch import nn 9 | from torch.nn import init 10 | from torch.nn.modules.utils import _pair 11 | 12 | from ..functions.modulated_deform_conv2d_func import ModulatedDeformConv2dFunction 13 | 14 | class ModulatedDeformConv2d(nn.Module): 15 | 16 | def __init__(self, in_channels, out_channels, 17 | 
kernel_size, stride, padding, dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True): 18 | super(ModulatedDeformConv2d, self).__init__() 19 | 20 | if in_channels % groups != 0: 21 | raise ValueError('in_channels {} must be divisible by groups {}'.format(in_channels, groups)) 22 | if out_channels % groups != 0: 23 | raise ValueError('out_channels {} must be divisible by groups {}'.format(out_channels, groups)) 24 | 25 | self.in_channels = in_channels 26 | self.out_channels = out_channels 27 | self.kernel_size = _pair(kernel_size) 28 | self.stride = _pair(stride) 29 | self.padding = _pair(padding) 30 | self.dilation = _pair(dilation) 31 | self.groups = groups 32 | self.deformable_groups = deformable_groups 33 | self.im2col_step = im2col_step 34 | self.use_bias = bias 35 | 36 | self.weight = nn.Parameter(torch.Tensor( 37 | out_channels, in_channels//groups, *self.kernel_size)) 38 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 39 | self.reset_parameters() 40 | if not self.use_bias: 41 | self.bias.requires_grad = False 42 | 43 | def reset_parameters(self): 44 | n = self.in_channels 45 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 46 | if self.bias is not None: 47 | fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) 48 | bound = 1 / math.sqrt(fan_in) 49 | init.uniform_(self.bias, -bound, bound) 50 | 51 | def forward(self, input, offset, mask): 52 | assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 53 | offset.shape[1] 54 | assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 55 | mask.shape[1] 56 | return ModulatedDeformConv2dFunction.apply(input, offset, mask, 57 | self.weight, 58 | self.bias, 59 | self.stride, 60 | self.padding, 61 | self.dilation, 62 | self.groups, 63 | self.deformable_groups, 64 | self.im2col_step) 65 | 66 | _ModulatedDeformConv2d = ModulatedDeformConv2dFunction.apply 67 | 68 | class ModulatedDeformConv2dPack(ModulatedDeformConv2d): 69 | 70 | def __init__(self, in_channels, out_channels, 71 | kernel_size, stride, padding, 72 | dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True, lr_mult=0.1): 73 | super(ModulatedDeformConv2dPack, self).__init__(in_channels, out_channels, 74 | kernel_size, stride, padding, dilation, groups, deformable_groups, im2col_step, bias) 75 | 76 | out_channels = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] 77 | self.conv_offset_mask = nn.Conv2d(self.in_channels, 78 | out_channels, 79 | kernel_size=self.kernel_size, 80 | stride=self.stride, 81 | padding=self.padding, 82 | bias=True) 83 | self.conv_offset_mask.lr_mult = lr_mult 84 | self.conv_offset_mask.inited = True 85 | self.init_offset() 86 | 87 | def init_offset(self): 88 | self.conv_offset_mask.weight.data.zero_() 89 | self.conv_offset_mask.bias.data.zero_() 90 | 91 | def forward(self, input): 92 | out = self.conv_offset_mask(input) 93 | o1, o2, mask = torch.chunk(out, 3, dim=1) 94 | offset = torch.cat((o1, o2), dim=1) 95 | mask = torch.sigmoid(mask) 96 | return ModulatedDeformConv2dFunction.apply(input, offset, mask, 97 | self.weight, 98 | self.bias, 99 | self.stride, 100 | self.padding, 101 | self.dilation, 102 | self.groups, 103 | self.deformable_groups, 104 | self.im2col_step) 105 | 106 | -------------------------------------------------------------------------------- /dataset/mixupdetection.py: -------------------------------------------------------------------------------- 1 | """Mixup detection dataset wrapper.""" 2 | from __future__ import 
absolute_import 3 | import numpy as np 4 | import torch 5 | #from mxnet.gluon.data import Dataset 6 | from .dataloading import Dataset 7 | 8 | 9 | class MixupDetection(Dataset): 10 | """Detection dataset wrapper that performs mixup for normal dataset. 11 | Parameters 12 | ---------- 13 | dataset : mx.gluon.data.Dataset 14 | Gluon dataset object. 15 | mixup : callable random generator, e.g. np.random.uniform 16 | A random mixup ratio sampler, preferably a random generator from numpy.random 17 | A random float will be sampled each time with mixup(*args). 18 | Use None to disable. 19 | *args : list 20 | Additional arguments for mixup random sampler. 21 | """ 22 | def __init__(self, dataset, mixup=None, preproc=None, *args): 23 | super().__init__(dataset.input_dim) 24 | self._dataset = dataset 25 | self.preproc = preproc 26 | self._mixup = mixup 27 | self._mixup_args = args 28 | 29 | def set_mixup(self, mixup=None, *args): 30 | """Set mixup random sampler, use None to disable. 31 | Parameters 32 | ---------- 33 | mixup : callable random generator, e.g. np.random.uniform 34 | A random mixup ratio sampler, preferably a random generator from numpy.random 35 | A random float will be sampled each time with mixup(*args) 36 | *args : list 37 | Additional arguments for mixup random sampler. 38 | """ 39 | self._mixup = mixup 40 | self._mixup_args = args 41 | 42 | def __len__(self): 43 | return len(self._dataset) 44 | 45 | @Dataset.resize_getitem 46 | def __getitem__(self, idx): 47 | self._dataset._input_dim = self.input_dim 48 | # first image 49 | img1, label1, _, _= self._dataset.pull_item(idx) 50 | lambd = 1 51 | 52 | # draw a random lambda ratio from distribution 53 | if self._mixup is not None: 54 | lambd = max(0, min(1, self._mixup(*self._mixup_args))) 55 | 56 | if lambd >= 1: 57 | weights1 = np.ones((label1.shape[0], 1)) 58 | label1 = np.hstack((label1, weights1)) 59 | height, width, _ = img1.shape 60 | img_info = (width, height) 61 | if self.preproc is not None: 62 | img_o, target_o = self.preproc(img1, label1, self.input_dim) 63 | return img_o, target_o, img_info, idx 64 | 65 | # second image 66 | idx2 = int(np.random.choice(np.delete(np.arange(len(self)), idx))) 67 | img2, label2, _, _ = self._dataset.pull_item(idx2) 68 | 69 | # mixup two images 70 | height = max(img1.shape[0], img2.shape[0]) 71 | width = max(img1.shape[1], img2.shape[1]) 72 | mix_img = np.zeros((height, width, 3),dtype=np.float32) 73 | mix_img[:img1.shape[0], :img1.shape[1], :] = img1.astype(np.float32) * lambd 74 | mix_img[:img2.shape[0], :img2.shape[1], :] += img2.astype(np.float32) * (1. - lambd) 75 | mix_img = mix_img.astype(np.uint8) 76 | 77 | y1 = np.hstack((label1, np.full((label1.shape[0], 1), lambd))) 78 | y2 = np.hstack((label2, np.full((label2.shape[0], 1), 1. 
- lambd))) 79 | mix_label = np.vstack((y1, y2)) 80 | if self.preproc is not None: 81 | mix_img, padded_labels = self.preproc(mix_img, mix_label, self.input_dim) 82 | 83 | img_info = (width, height) 84 | 85 | return mix_img, padded_labels, img_info , idx 86 | 87 | def pull_item(self, idx): 88 | self._dataset._input_dim = self.input_dim 89 | # first image 90 | img1, label1, _, _= self._dataset.pull_item(idx) 91 | lambd = 1 92 | 93 | # draw a random lambda ratio from distribution 94 | if self._mixup is not None: 95 | lambd = max(0, min(1, self._mixup(*self._mixup_args))) 96 | 97 | if lambd >= 1: 98 | weights1 = np.ones((label1.shape[0], 1)) 99 | label1 = np.hstack((label1, weights1)) 100 | height, width, _ = img1.shape 101 | img_info = (width, height) 102 | if self.preproc is not None: 103 | img_o, target_o = self.preproc(img1, label1, self.input_dim) 104 | return img_o, target_o, img_info, idx 105 | 106 | # second image 107 | idx2 = int(np.random.choice(np.delete(np.arange(len(self)), idx))) 108 | img2, label2 = self._dataset.pull_item(idx2) 109 | 110 | # mixup two images 111 | height = max(img1.shape[0], img2.shape[0]) 112 | width = max(img1.shape[1], img2.shape[1]) 113 | mix_img = np.zeros((height, width, 3),dtype=np.float32) 114 | mix_img[:img1.shape[0], :img1.shape[1], :] = img1.astype(np.float32) * lambd 115 | mix_img[:img2.shape[0], :img2.shape[1], :] += img2.astype(np.float32) * (1. - lambd) 116 | mix_img = mix_img.astype(np.uint8) 117 | 118 | y1 = np.hstack((label1, np.full((label1.shape[0], 1), lambd))) 119 | y2 = np.hstack((label2, np.full((label2.shape[0], 1), 1. - lambd))) 120 | mix_label = np.vstack((y1, y2)) 121 | if self.preproc is not None: 122 | mix_img, padded_labels = self.preproc(mix_img, mix_label, self.input_dim) 123 | 124 | img_info = (width, height) 125 | return mix_img, padded_labels, img_info , idx 126 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import torchvision 4 | import numpy as np 5 | import cv2 6 | 7 | def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): 8 | """ 9 | Postprocess for the output of YOLO model 10 | perform box transformation, specify the class for each detection, 11 | and perform class-wise non-maximum suppression. 12 | Args: 13 | prediction (torch tensor): The shape is :math:`(N, B, 4)`. 14 | :math:`N` is the number of predictions, 15 | :math:`B` the number of boxes. The last axis consists of 16 | :math:`xc, yc, w, h` where `xc` and `yc` represent a center 17 | of a bounding box. 18 | num_classes (int): 19 | number of dataset classes. 20 | conf_thre (float): 21 | confidence threshold ranging from 0 to 1, 22 | which is defined in the config file. 23 | nms_thre (float): 24 | IoU threshold of non-max suppression ranging from 0 to 1. 
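        Example (illustrative sketch; mirrors the call in demo.py — `model` and
        `imgs` are placeholder names)::

            raw = model(imgs)  # (batch, n_boxes, 5 + num_classes), boxes as (xc, yc, w, h)
            dets = postprocess(raw, num_classes=80, conf_thre=0.01, nms_thre=0.65)
            # dets[i] is None or a tensor of rows (x1, y1, x2, y2, obj_conf, class_conf, class_pred)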
25 | 26 | Returns: 27 | output (list of torch tensor): 28 | 29 | """ 30 | box_corner = prediction.new(prediction.shape) 31 | box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 32 | box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 33 | box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 34 | box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 35 | prediction[:, :, :4] = box_corner[:, :, :4] 36 | 37 | output = [None for _ in range(len(prediction))] 38 | for i, image_pred in enumerate(prediction): 39 | 40 | # If none are remaining => process next image 41 | if not image_pred.size(0): 42 | continue 43 | # Get score and class with highest confidence 44 | class_conf, class_pred = torch.max( 45 | image_pred[:, 5:5 + num_classes], 1, keepdim=True) 46 | 47 | conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() 48 | # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) 49 | detections = torch.cat( 50 | (image_pred[:, :5], class_conf, class_pred.float()), 1) 51 | detections = detections[conf_mask] 52 | if not detections.size(0): 53 | continue 54 | 55 | # Iterate through all predicted classes 56 | unique_labels = detections[:, -1].unique() 57 | 58 | for c in unique_labels: 59 | # Get the detections with the particular class 60 | detections_class = detections[detections[:, -1] == c] 61 | nms_out_index = torchvision.ops.nms( 62 | detections_class[:, :4], detections_class[:, 4]*detections_class[:, 5], nms_thre) 63 | detections_class = detections_class[nms_out_index] 64 | if output[i] is None: 65 | output[i] = detections_class 66 | else: 67 | output[i] = torch.cat((output[i], detections_class)) 68 | 69 | return output 70 | 71 | 72 | def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): 73 | """Calculate the Intersection of Unions (IoUs) between bounding boxes. 74 | IoU is calculated as a ratio of area of the intersection 75 | and area of the union. 76 | 77 | Args: 78 | bbox_a (array): An array whose shape is :math:`(N, 4)`. 79 | :math:`N` is the number of bounding boxes. 80 | The dtype should be :obj:`numpy.float32`. 81 | bbox_b (array): An array similar to :obj:`bbox_a`, 82 | whose shape is :math:`(K, 4)`. 83 | The dtype should be :obj:`numpy.float32`. 84 | Returns: 85 | array: 86 | An array whose shape is :math:`(N, K)`. \ 87 | An element at index :math:`(n, k)` contains IoUs between \ 88 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ 89 | box in :obj:`bbox_b`. 
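        Example (illustrative; `boxes_a` and `boxes_b` are placeholder torch tensors)::

            # boxes_a: (N, 4), boxes_b: (K, 4), both in (x1, y1, x2, y2) form
            ious = bboxes_iou(boxes_a, boxes_b, xyxy=True)  # -> (N, K) pairwise IoU matrix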
90 | 91 | from: https://github.com/chainer/chainercv 92 | """ 93 | if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: 94 | raise IndexError 95 | 96 | if xyxy: 97 | tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) 98 | br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) 99 | area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) 100 | area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) 101 | else: 102 | tl = torch.max((bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), 103 | (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2)) 104 | br = torch.min((bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), 105 | (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2)) 106 | 107 | area_a = torch.prod(bboxes_a[:, 2:], 1) 108 | area_b = torch.prod(bboxes_b[:, 2:], 1) 109 | en = (tl < br).type(tl.type()).prod(dim=2) 110 | area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) 111 | return area_i / (area_a[:, None] + area_b - area_i) 112 | 113 | 114 | def matrix_iou(a,b): 115 | """ 116 | return iou of a and b, numpy version for data augenmentation 117 | """ 118 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 119 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 120 | 121 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 122 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 123 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 124 | return area_i / (area_a[:, np.newaxis] + area_b - area_i+1e-12) 125 | 126 | def visual(img, boxes, scores): 127 | 128 | COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)] 129 | FONT = cv2.FONT_HERSHEY_SIMPLEX 130 | for i in range(boxes.shape[0]): 131 | 132 | cv2.rectangle(img, (int(boxes[i][0]),int(boxes[i][1])),(int(boxes[i][2]),int(boxes[i][3])),COLORS[i%3],2) 133 | cv2.putText(img, 'Object: %.2f'%scores[i],(int(boxes[i][0])-3,int(boxes[i][1])-5), FONT, 134 | 0.4, (0,0,0),2) 135 | 136 | return img 137 | 138 | 139 | -------------------------------------------------------------------------------- /utils/distributed_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import tempfile 4 | import time 5 | 6 | import torch 7 | 8 | 9 | def get_world_size(): 10 | if not torch.distributed.is_initialized(): 11 | return 1 12 | return torch.distributed.get_world_size() 13 | 14 | 15 | def get_rank(): 16 | if not torch.distributed.is_initialized(): 17 | return 0 18 | return torch.distributed.get_rank() 19 | 20 | 21 | def is_main_process(): 22 | if not torch.distributed.is_initialized(): 23 | return True 24 | return torch.distributed.get_rank() == 0 25 | 26 | 27 | def synchronize(): 28 | """ 29 | Helper function to synchronize between multiple processes when 30 | using distributed training 31 | """ 32 | if not torch.distributed.is_initialized(): 33 | return 34 | world_size = torch.distributed.get_world_size() 35 | rank = torch.distributed.get_rank() 36 | if world_size == 1: 37 | return 38 | 39 | def _send_and_wait(r): 40 | if rank == r: 41 | tensor = torch.tensor(0, device="cuda") 42 | else: 43 | tensor = torch.tensor(1, device="cuda") 44 | torch.distributed.broadcast(tensor, r) 45 | while tensor.item() == 1: 46 | time.sleep(1) 47 | 48 | _send_and_wait(0) 49 | # now sync on the main process 50 | _send_and_wait(1) 51 | 52 | 53 | def _encode(encoded_data, data): 54 | # gets a byte representation for the data 55 | encoded_bytes = pickle.dumps(data) 56 | # convert this byte string into a byte tensor 57 | storage = torch.ByteStorage.from_buffer(encoded_bytes) 58 | tensor = torch.ByteTensor(storage).to("cuda") 59 | # 
encoding: first byte is the size and then rest is the data 60 | s = tensor.numel() 61 | assert s <= 255, "Can't encode data greater than 255 bytes" 62 | # put the encoded data in encoded_data 63 | encoded_data[0] = s 64 | encoded_data[1: (s + 1)] = tensor 65 | 66 | 67 | def _decode(encoded_data): 68 | size = encoded_data[0] 69 | encoded_tensor = encoded_data[1: (size + 1)].to("cpu") 70 | return pickle.loads(bytearray(encoded_tensor.tolist())) 71 | 72 | 73 | # TODO try to use tensor in shared-memory instead of serializing to disk 74 | # this involves getting the all_gather to work 75 | def scatter_gather(data): 76 | """ 77 | This function gathers data from multiple processes, and returns them 78 | in a list, as they were obtained from each process. 79 | This function is useful for retrieving data from multiple processes, 80 | when launching the code with torch.distributed.launch 81 | Note: this function is slow and should not be used in tight loops, i.e., 82 | do not use it in the training loop. 83 | Arguments: 84 | data: the object to be gathered from multiple processes. 85 | It must be serializable 86 | Returns: 87 | result (list): a list with as many elements as there are processes, 88 | where each element i in the list corresponds to the data that was 89 | gathered from the process of rank i. 90 | """ 91 | # strategy: the main process creates a temporary directory, and communicates 92 | # the location of the temporary directory to all other processes. 93 | # each process will then serialize the data to the folder defined by 94 | # the main process, and then the main process reads all of the serialized 95 | # files and returns them in a list 96 | if not torch.distributed.is_initialized(): 97 | return [data] 98 | synchronize() 99 | # get rank of the current process 100 | rank = torch.distributed.get_rank() 101 | 102 | # the data to communicate should be small 103 | data_to_communicate = torch.empty(256, dtype=torch.uint8, device="cuda") 104 | if rank == 0: 105 | # manually creates a temporary directory, that needs to be cleaned 106 | # afterwards 107 | tmp_dir = tempfile.mkdtemp() 108 | _encode(data_to_communicate, tmp_dir) 109 | 110 | synchronize() 111 | # the main process (rank=0) communicates the data to all processes 112 | torch.distributed.broadcast(data_to_communicate, 0) 113 | 114 | # get the data that was communicated 115 | tmp_dir = _decode(data_to_communicate) 116 | 117 | # each process serializes to a different file 118 | file_template = "file{}.pth" 119 | tmp_file = os.path.join(tmp_dir, file_template.format(rank)) 120 | torch.save(data, tmp_file) 121 | 122 | # synchronize before loading the data 123 | synchronize() 124 | 125 | # only the master process returns the data 126 | if rank == 0: 127 | data_list = [] 128 | world_size = torch.distributed.get_world_size() 129 | for r in range(world_size): 130 | file_path = os.path.join(tmp_dir, file_template.format(r)) 131 | d = torch.load(file_path) 132 | data_list.append(d) 133 | # cleanup 134 | os.remove(file_path) 135 | # cleanup 136 | os.rmdir(tmp_dir) 137 | return data_list 138 | 139 | 140 | def reduce_loss_dict(loss_dict): 141 | """ 142 | Reduce the loss dictionary from all processes so that process with rank 143 | 0 has the averaged results. Returns a dict with the same fields as 144 | loss_dict, after reduction. 
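A usage sketch (assumes torch.distributed is initialized; model, imgs and labels are placeholders for a YOLOv3 model and one training batch):

    loss_dict = model(imgs, targets=labels)   # dict of scalar loss tensors, e.g. 'losses', 'conf_losses'
    reduced = reduce_loss_dict(loss_dict)
    if is_main_process():
        print({k: v.item() for k, v in reduced.items()})   # values averaged over all ranks on rank 0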
145 | """ 146 | world_size = get_world_size() 147 | if world_size < 2: 148 | return loss_dict 149 | with torch.no_grad(): 150 | loss_names = [] 151 | all_losses = [] 152 | for k in sorted(loss_dict.keys()): 153 | loss_names.append(k) 154 | all_losses.append(loss_dict[k]) 155 | all_losses = torch.stack(all_losses, dim=0) 156 | torch.distributed.reduce(all_losses, dst=0) 157 | if torch.distributed.get_rank() == 0: 158 | # only main process gets accumulated, so only divide by 159 | # world_size in this case 160 | all_losses /= world_size 161 | reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} 162 | return reduced_losses 163 | -------------------------------------------------------------------------------- /dataset/cocodataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | import torch 5 | from .dataloading import Dataset 6 | import cv2 7 | from pycocotools.coco import COCO 8 | 9 | from utils.utils import * 10 | 11 | COCO_CLASSES=( 12 | 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 13 | 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 14 | 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 15 | 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 16 | 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 17 | 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 18 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 19 | 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 20 | 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 21 | 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 22 | 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 23 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 24 | 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 25 | 26 | 27 | class COCODataset(Dataset): 28 | """ 29 | COCO dataset class. 30 | """ 31 | def __init__(self, data_dir='data/COCO', json_file='instances_train2017.json', 32 | name='train2017', img_size=(416,416), preproc=None, debug=False, voc=False): 33 | """ 34 | COCO dataset initialization. Annotation data are read into memory by COCO API. 35 | Args: 36 | data_dir (str): dataset root directory 37 | json_file (str): COCO json file name 38 | name (str): COCO data name (e.g. 
'train2017' or 'val2017') 39 | img_size (int): target image size after pre-processing 40 | preproc: data augmentation strategy 41 | debug (bool): if True, only one data id is selected from the dataset 42 | """ 43 | super().__init__(img_size) 44 | self.data_dir = data_dir 45 | self.json_file = json_file 46 | self.voc = voc 47 | if voc: 48 | self.coco = COCO(self.data_dir+'VOC2007/Annotations/'+self.json_file) 49 | else: 50 | self.coco = COCO(self.data_dir+'annotations/'+self.json_file) 51 | self.ids = self.coco.getImgIds() 52 | if debug: 53 | self.ids = self.ids[1:2] 54 | print("debug mode...", self.ids) 55 | self.class_ids = sorted(self.coco.getCatIds()) 56 | cats = self.coco.loadCats(self.coco.getCatIds()) 57 | self._classes = tuple([c['name'] for c in cats]) 58 | self.name = name 59 | self.max_labels = 50 60 | self.img_size = img_size 61 | self.preproc = preproc 62 | 63 | def __len__(self): 64 | return len(self.ids) 65 | 66 | def pull_item(self, index): 67 | 68 | id_ = self.ids[index] 69 | 70 | im_ann = self.coco.loadImgs(id_)[0] 71 | width = im_ann['width'] 72 | height = im_ann['height'] 73 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) 74 | annotations = self.coco.loadAnns(anno_ids) 75 | 76 | # load image and preprocess 77 | img_file = os.path.join(self.data_dir, 'images', self.name, 78 | #'COCO_'+self.name+'_'+'{:012}'.format(id_) + '.jpg') 79 | '{:012}'.format(id_) + '.jpg') 80 | 81 | if self.voc: 82 | file_name = im_ann['file_name'] 83 | img_file = os.path.join(self.data_dir, 'VOC2007', 'JPEGImages', 84 | file_name) 85 | 86 | img = cv2.imread(img_file) 87 | 88 | if self.json_file == 'instances_val5k.json' and img is None: 89 | img_file = os.path.join(self.data_dir, 'images', 'train2017', 90 | '{:012}'.format(id_) + '.jpg') 91 | img = cv2.imread(img_file) 92 | assert img is not None 93 | 94 | #img, info_img = preprocess(img, self.input_dim[0]) 95 | 96 | # load labels 97 | valid_objs = [] 98 | for obj in annotations: 99 | x1 = np.max((0, obj['bbox'][0])) 100 | y1 = np.max((0, obj['bbox'][1])) 101 | x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) 102 | y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) 103 | if obj['area'] > 0 and x2 >= x1 and y2 >= y1: 104 | obj['clean_bbox'] = [x1, y1, x2, y2] 105 | valid_objs.append(obj) 106 | objs = valid_objs 107 | num_objs = len(objs) 108 | 109 | res = np.zeros((num_objs, 5)) 110 | 111 | for ix, obj in enumerate(objs): 112 | cls = self.class_ids.index(obj['category_id']) 113 | res[ix, 0:4] = obj['clean_bbox'] 114 | res[ix, 4] = cls 115 | 116 | img_info = (width, height) 117 | 118 | return img, res, img_info, id_ 119 | 120 | @Dataset.resize_getitem 121 | def __getitem__(self, index): 122 | """ 123 | One image / label pair for the given index is picked up \ 124 | and pre-processed. 125 | Args: 126 | index (int): data index 127 | Returns: 128 | img (numpy.ndarray): pre-processed image 129 | padded_labels (torch.Tensor): pre-processed label data. \ 130 | The shape is :math:`[self.max_labels, 5]`. \ 131 | each label consists of [class, xc, yc, w, h]: 132 | class (float): class index. 133 | xc, yc (float) : center of bbox whose values range from 0 to 1. 134 | w, h (float) : size of bbox whose values range from 0 to 1. 135 | info_img : tuple of h, w, nh, nw, dx, dy. 136 | h, w (int): original shape of the image 137 | nh, nw (int): shape of the resized image without padding 138 | dx, dy (int): pad size 139 | id_ (int): same as the input index. Used for evaluation. 
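A construction sketch (illustrative; the paths must exist locally, and the ValTransform arguments follow the evaluators in utils/):

    from dataset.data_augment import ValTransform
    dataset = COCODataset(data_dir='data/COCO/', json_file='instances_val2017.json',
                          name='val2017', img_size=(416, 416),
                          preproc=ValTransform(rgb_means=(0.485, 0.456, 0.406),
                                               std=(0.229, 0.224, 0.225)))
    img, target, img_info, id_ = dataset[0]   # target is only defined when a preproc is supplied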
140 | """ 141 | img, res, img_info, id_ = self.pull_item(index) 142 | 143 | if self.preproc is not None: 144 | img, target = self.preproc(img, res, self.input_dim) 145 | 146 | 147 | return img, target, img_info, id_ 148 | -------------------------------------------------------------------------------- /utils/DCN/modules/deform_conv2d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import torch 7 | import math 8 | from torch import nn 9 | from torch.nn import init 10 | from torch.nn.modules.utils import _pair 11 | 12 | from ..functions.deform_conv2d_func import DeformConv2dFunction 13 | 14 | class DeformConv2d(nn.Module): 15 | 16 | def __init__(self, in_channels, out_channels, 17 | kernel_size, stride, padding, dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True): 18 | super(DeformConv2d, self).__init__() 19 | 20 | if in_channels % groups != 0: 21 | raise ValueError('in_channels {} must be divisible by groups {}'.format(in_channels, groups)) 22 | if out_channels % groups != 0: 23 | raise ValueError('out_channels {} must be divisible by groups {}'.format(out_channels, groups)) 24 | 25 | self.in_channels = in_channels 26 | self.out_channels = out_channels 27 | self.kernel_size = _pair(kernel_size) 28 | self.stride = _pair(stride) 29 | self.padding = _pair(padding) 30 | self.dilation = _pair(dilation) 31 | self.groups = groups 32 | self.deformable_groups = deformable_groups 33 | self.im2col_step = im2col_step 34 | self.use_bias = bias 35 | 36 | self.weight = nn.Parameter(torch.Tensor( 37 | out_channels, in_channels//groups, *self.kernel_size)) 38 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 39 | self.reset_parameters() 40 | if not self.use_bias: 41 | self.bias.requires_grad = False 42 | self.bias.data.zero_() 43 | 44 | def reset_parameters(self): 45 | n = self.in_channels 46 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 47 | if self.bias is not None: 48 | fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) 49 | bound = 1 / math.sqrt(fan_in) 50 | init.uniform_(self.bias, -bound, bound) 51 | 52 | def forward(self, input, offset): 53 | assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 54 | offset.shape[1] 55 | return DeformConv2dFunction.apply(input, offset, 56 | self.weight, 57 | self.bias, 58 | self.stride, 59 | self.padding, 60 | self.dilation, 61 | self.groups, 62 | self.deformable_groups, 63 | self.im2col_step) 64 | 65 | _DeformConv2d = DeformConv2dFunction.apply 66 | 67 | class DeformConv2dPack(DeformConv2d): 68 | 69 | def __init__(self, in_channels, out_channels, 70 | kernel_size, stride, padding, 71 | dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True, lr_mult=0.1): 72 | super(DeformConv2dPack, self).__init__(in_channels, out_channels, 73 | kernel_size, stride, padding, dilation, groups, deformable_groups, im2col_step, bias) 74 | 75 | out_channels = self.deformable_groups * 2 * self.kernel_size[0] * self.kernel_size[1] 76 | self.conv_offset = nn.Conv2d(self.in_channels, 77 | out_channels, 78 | kernel_size=self.kernel_size, 79 | stride=self.stride, 80 | padding=self.padding, 81 | bias=True) 82 | self.conv_offset.lr_mult = lr_mult 83 | self.conv_offset.inited = True 84 | self.init_offset() 85 | 86 | def init_offset(self): 87 | self.conv_offset.weight.data.zero_() 88 | self.conv_offset.bias.data.zero_() 89 | 90 
| def forward(self, input): 91 | offset = self.conv_offset(input) 92 | return DeformConv2dFunction.apply(input, offset, 93 | self.weight, 94 | self.bias, 95 | self.stride, 96 | self.padding, 97 | self.dilation, 98 | self.groups, 99 | self.deformable_groups, 100 | self.im2col_step) 101 | 102 | 103 | class DeformConv2dPackMore(DeformConv2d): 104 | 105 | def __init__(self, in_channels, out_channels, 106 | kernel_size, stride, padding, 107 | dilation=1, groups=1, deformable_groups=1, im2col_step=64, bias=True, lr_mult=0.1): 108 | super(DeformConv2dPackMore, self).__init__(in_channels, out_channels, 109 | kernel_size, stride, padding, dilation, groups, deformable_groups, im2col_step, bias) 110 | 111 | out_channels = self.deformable_groups * 2 * self.kernel_size[0] * self.kernel_size[1] 112 | self.conv_offset = nn.Sequential( 113 | nn.Conv2d(self.in_channels, self.in_channels//4, kernel_size=1, bias=False), 114 | nn.BatchNorm2d(self.in_channels//4), 115 | nn.ReLU(inplace=True), 116 | nn.Conv2d(self.in_channels//4, out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, bias=True) 117 | ) 118 | self.conv_offset[-1].lr_mult = lr_mult 119 | self.conv_offset[-1].inited = True 120 | self.init_offset() 121 | 122 | def init_offset(self): 123 | self.conv_offset[-1].weight.data.zero_() 124 | self.conv_offset[-1].bias.data.zero_() 125 | 126 | def forward(self, input): 127 | offset = self.conv_offset(input) 128 | return DeformConv2dFunction.apply(input, offset, 129 | self.weight, 130 | self.bias, 131 | self.stride, 132 | self.padding, 133 | self.dilation, 134 | self.groups, 135 | self.deformable_groups, 136 | self.im2col_step) 137 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | from utils.cocoapi_evaluator import COCOAPIEvaluator 3 | from utils.voc_evaluator import VOCEvaluator 4 | from utils import distributed_util 5 | from utils.distributed_util import reduce_loss_dict 6 | from dataset.cocodataset import * 7 | from dataset.vocdataset import * 8 | from dataset.data_augment import TrainTransform 9 | from dataset.dataloading import * 10 | 11 | import os 12 | import sys 13 | import argparse 14 | import yaml 15 | import random 16 | import math 17 | import cv2 18 | cv2.setNumThreads(0) 19 | 20 | import torch 21 | import torch.nn.init as init 22 | from torch.autograd import Variable 23 | import torch.distributed as dist 24 | import time 25 | 26 | import apex 27 | 28 | ######## unlimit the resource in some dockers or cloud machines ####### 29 | #import resource 30 | #rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 31 | #resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 32 | 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--cfg', type=str, default='config/yolov3_baseline.cfg', 37 | help='config file. 
see README') 38 | parser.add_argument('-d', '--dataset', type=str, 39 | default='COCO', help='COCO or VOC dataset') 40 | parser.add_argument('--n_cpu', type=int, default=4, 41 | help='number of workers') 42 | parser.add_argument('--distributed', dest='distributed', action='store_true', default=False, 43 | help='distributed evaluation') 44 | parser.add_argument('--local_rank', type=int, 45 | default=0, help='local_rank') 46 | parser.add_argument('--ngpu', type=int, default=10, 47 | help='number of GPUs') 48 | parser.add_argument('-c', '--checkpoint', type=str, 49 | help='pytorch checkpoint file path') 50 | parser.add_argument('-s', '--test_size', type=int, default=416) 51 | parser.add_argument('--testset', dest='testset', action='store_true', default=False, 52 | help='test set evaluation') 53 | parser.add_argument('--half', dest='half', action='store_true', default=False, 54 | help='FP16 inference') 55 | parser.add_argument('--rfb', dest='rfb', action='store_true', default=False, 56 | help='Use rfb block') 57 | parser.add_argument('--asff', dest='asff', action='store_true', default=False, 58 | help='Use ASFF module for yolov3') 59 | parser.add_argument('--vis', dest='vis', action='store_true', default=False, 60 | help='visualize fusion weight and detection results') 61 | parser.add_argument('--use_cuda', type=bool, default=True) 62 | parser.add_argument('--debug', action='store_true', default=False, 63 | help='debug mode where only one image is evaluated') 64 | return parser.parse_args() 65 | 66 | def eval(): 67 | """ 68 | YOLOv3 evaluator. See README for details. 69 | """ 70 | args = parse_args() 71 | print("Setting Arguments.. : ", args) 72 | 73 | cuda = torch.cuda.is_available() and args.use_cuda 74 | 75 | if args.distributed: 76 | torch.cuda.set_device(args.local_rank) 77 | torch.distributed.init_process_group(backend="nccl", init_method="env://") 78 | 79 | 80 | # Parse config settings 81 | with open(args.cfg, 'r') as f: 82 | cfg = yaml.safe_load(f) 83 | 84 | print("successfully loaded config file: ", cfg) 85 | 86 | backbone=cfg['MODEL']['BACKBONE'] 87 | test_size = (args.test_size,args.test_size) 88 | 89 | if args.dataset == 'COCO': 90 | evaluator = COCOAPIEvaluator( 91 | data_dir='data/COCO/', 92 | img_size=test_size, 93 | confthre=0.001, 94 | nmsthre=0.65, 95 | testset=args.testset, 96 | vis=args.vis) 97 | 98 | num_class=80 99 | 100 | elif args.dataset == 'VOC': 101 | ''' 102 | # COCO style evaluation, you have to convert xml annotation files into a json file.
103 | evaluator = COCOAPIEvaluator( 104 | data_dir='data/VOC/', 105 | img_size=test_size, 106 | confthre=cfg['TEST']['CONFTHRE'], 107 | nmsthre=cfg['TEST']['NMSTHRE'], 108 | testset=args.testset, 109 | voc = True) 110 | ''' 111 | evaluator = VOCEvaluator( 112 | data_dir='data/VOC/', 113 | img_size=test_size, 114 | confthre=0.001, 115 | nmsthre=0.65, 116 | vis=args.vis) 117 | num_class=20 118 | # Initiate model 119 | if args.asff: 120 | if backbone == 'mobile': 121 | from models.yolov3_mobilev2 import YOLOv3 122 | print("For mobilenet, we currently don't support dropblock, rfb and FeatureAdaption") 123 | else: 124 | from models.yolov3_asff import YOLOv3 125 | print('Training YOLOv3 with ASFF!') 126 | model = YOLOv3(num_classes = num_class, rfb=args.rfb, vis=args.vis, asff=args.asff) 127 | else: 128 | if backbone == 'mobile': 129 | from models.yolov3_mobilev2 import YOLOv3 130 | else: 131 | from models.yolov3_baseline import YOLOv3 132 | print('Training YOLOv3 strong baseline!') 133 | if args.vis: 134 | print('Visualization is not supported for YOLOv3 baseline model') 135 | args.vis = False 136 | model = YOLOv3(num_classes = num_class, rfb=args.rfb) 137 | 138 | save_to_disk = (not args.distributed) or distributed_util.get_rank() == 0 139 | 140 | if args.checkpoint: 141 | print("loading pytorch ckpt...", args.checkpoint) 142 | cpu_device = torch.device("cpu") 143 | ckpt = torch.load(args.checkpoint, map_location=cpu_device) 144 | #model.load_state_dict(ckpt,strict=False) 145 | model.load_state_dict(ckpt) 146 | if cuda: 147 | print("using cuda") 148 | torch.backends.cudnn.benchmark = True 149 | device = torch.device("cuda") 150 | model = model.to(device) 151 | 152 | if args.half: 153 | model = model.half() 154 | 155 | if args.ngpu > 1: 156 | if args.distributed: 157 | model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True) 158 | #model = apex.parallel.DistributedDataParallel(model) 159 | else: 160 | model = nn.DataParallel(model) 161 | 162 | dtype = torch.float16 if args.half else torch.float32 163 | 164 | if args.distributed: 165 | distributed_util.synchronize() 166 | 167 | ap50_95, ap50 = evaluator.evaluate(model, args.half, args.distributed) 168 | 169 | if args.distributed: 170 | distributed_util.synchronize() 171 | sys.exit(0) 172 | 173 | 174 | if __name__ == '__main__': 175 | eval() 176 | -------------------------------------------------------------------------------- /models/yolov3_baseline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import defaultdict 5 | from .network_blocks import * 6 | from .yolov3_head import YOLOv3Head 7 | 8 | def create_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb): 9 | """ 10 | Build yolov3 layer modules. 11 | Args: 12 | ignore_thre (float): used in YOLOLayer. 13 | Returns: 14 | mlist (ModuleList): YOLOv3 module list. 
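A quick sanity check (counts read off the index comments below):

    mlist = create_yolov3_modules(num_classes=80, ignore_thre=0.7,
                                  label_smooth=False, rfb=False)
    len(mlist)    # 38 modules (#0-#37); the three YOLOv3Head layers sit at indices 19, 28 and 37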
15 | """ 16 | # DarkNet53 17 | mlist = nn.ModuleList() 18 | mlist.append(add_conv(in_ch=3, out_ch=32, ksize=3, stride=1)) #0 19 | mlist.append(add_conv(in_ch=32, out_ch=64, ksize=3, stride=2)) #1 20 | mlist.append(resblock(ch=64)) #2 21 | mlist.append(add_conv(in_ch=64, out_ch=128, ksize=3, stride=2)) #3 22 | mlist.append(resblock(ch=128, nblocks=2)) #4 23 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=2)) #5 24 | mlist.append(resblock(ch=256, nblocks=8)) # shortcut 1 from here #6 25 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=2)) #7 26 | mlist.append(resblock(ch=512, nblocks=8)) # shortcut 2 from here #8 27 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=2)) #9 28 | mlist.append(resblock(ch=1024, nblocks=4)) #10 29 | 30 | # YOLOv3 31 | mlist.append(resblock(ch=1024, nblocks=1, shortcut=False)) #11 32 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #12 33 | #SPP Layer 34 | mlist.append(SPPLayer()) #13 35 | 36 | mlist.append(add_conv(in_ch=2048, out_ch=512, ksize=1, stride=1)) #14 37 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1)) #15 38 | mlist.append(DropBlock(block_size=1, keep_prob=1.0)) #16 39 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #17 40 | # 1st yolo branch 41 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1)) #18 42 | mlist.append( 43 | YOLOv3Head(anch_mask=[6, 7, 8], n_classes=num_classes, stride=32, in_ch=1024, 44 | ignore_thre=ignore_thre,label_smooth = label_smooth, rfb=rfb)) #19 45 | 46 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #20 47 | mlist.append(upsample(scale_factor=2, mode='nearest')) #21 48 | mlist.append(add_conv(in_ch=768, out_ch=256, ksize=1, stride=1)) #22 49 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1)) #23 50 | mlist.append(DropBlock(block_size=1, keep_prob=1.0)) #24 51 | mlist.append(resblock(ch=512, nblocks=1, shortcut=False)) #25 52 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #26 53 | # 2nd yolo branch 54 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1)) #27 55 | mlist.append( 56 | YOLOv3Head(anch_mask=[3, 4, 5], n_classes=num_classes, stride=16, in_ch=512, 57 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb)) #28 58 | 59 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #29 60 | mlist.append(upsample(scale_factor=2, mode='nearest')) #30 61 | mlist.append(add_conv(in_ch=384, out_ch=128, ksize=1, stride=1)) #31 62 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #32 63 | mlist.append(DropBlock(block_size=1, keep_prob=1.0)) #33 64 | mlist.append(resblock(ch=256, nblocks=1, shortcut=False)) #34 65 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #35 66 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #36 67 | mlist.append( 68 | YOLOv3Head(anch_mask=[0, 1, 2], n_classes=num_classes, stride=8, in_ch=256, 69 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb)) #37 70 | 71 | return mlist 72 | 73 | 74 | class YOLOv3(nn.Module): 75 | """ 76 | YOLOv3 model module. The module list is defined by create_yolov3_modules function. \ 77 | The network returns loss values from three YOLO layers during training \ 78 | and detection results during test. 
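A forward-pass sketch (batch size, image size and the labels tensor of shape (N, 50, 5) are illustrative assumptions):

    model = YOLOv3(num_classes=80)
    dets = model(torch.randn(2, 3, 416, 416))                       # no targets: concatenated detections
    loss_dict = model(torch.randn(2, 3, 416, 416), targets=labels)  # with targets: dict of summed loss terms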
79 | """ 80 | def __init__(self, num_classes = 80, ignore_thre=0.7, label_smooth = False, rfb=False): 81 | 82 | super(YOLOv3, self).__init__() 83 | self.module_list = create_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb) 84 | 85 | def forward(self, x, targets=None, epoch=0): 86 | 87 | train = targets is not None 88 | output = [] 89 | anchor_losses= [] 90 | iou_losses = [] 91 | l1_losses = [] 92 | conf_losses = [] 93 | cls_losses = [] 94 | route_layers = [] 95 | for i, module in enumerate(self.module_list): 96 | 97 | # yolo layers 98 | if i in [19, 28, 37]: 99 | if train: 100 | x, anchor_loss, iou_loss, l1_loss, conf_loss, cls_loss = module(x, targets) 101 | anchor_losses.append(anchor_loss) 102 | iou_losses.append(iou_loss) 103 | l1_losses.append(l1_loss) 104 | conf_losses.append(conf_loss) 105 | cls_losses.append(cls_loss) 106 | else: 107 | x = module(x) 108 | 109 | output.append(x) 110 | else: 111 | x = module(x) 112 | 113 | # route layers 114 | if i in [6, 8, 17, 26]: 115 | route_layers.append(x) 116 | if i == 19: 117 | x = route_layers[2] 118 | if i == 28: # yolo 2nd 119 | x = route_layers[3] 120 | if i == 21: 121 | x = torch.cat((x, route_layers[1]), 1) 122 | if i == 30: 123 | x = torch.cat((x, route_layers[0]), 1) 124 | 125 | if train: 126 | losses = torch.stack(output, 0).unsqueeze(0).sum(1,keepdim=True) 127 | anchor_losses = torch.stack(anchor_losses, 0).unsqueeze(0).sum(1,keepdim=True) 128 | iou_losses = torch.stack(iou_losses, 0).unsqueeze(0).sum(1,keepdim=True) 129 | l1_losses = torch.stack(l1_losses, 0).unsqueeze(0).sum(1,keepdim=True) 130 | conf_losses = torch.stack(conf_losses, 0).unsqueeze(0).sum(1,keepdim=True) 131 | cls_losses = torch.stack(cls_losses, 0).unsqueeze(0).sum(1,keepdim=True) 132 | loss_dict = dict( 133 | losses = losses, 134 | anchor_losses = anchor_losses, 135 | iou_losses = iou_losses, 136 | l1_losses = l1_losses, 137 | conf_losses = conf_losses, 138 | cls_losses = cls_losses, 139 | ) 140 | return loss_dict 141 | else: 142 | return torch.cat(output, 1) 143 | 144 | -------------------------------------------------------------------------------- /utils/fp16_utils/fp16util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 5 | 6 | 7 | class tofp16(nn.Module): 8 | """ 9 | Utility module that implements:: 10 | 11 | def forward(self, input): 12 | return input.half() 13 | """ 14 | 15 | def __init__(self): 16 | super(tofp16, self).__init__() 17 | 18 | def forward(self, input): 19 | return input.half() 20 | 21 | 22 | def BN_convert_float(module): 23 | """ 24 | Utility function for network_to_half(). 25 | 26 | Retained for legacy purposes. 27 | """ 28 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 29 | module.float() 30 | for child in module.children(): 31 | BN_convert_float(child) 32 | return module 33 | 34 | 35 | def network_to_half(network): 36 | """ 37 | Convert model to half precision in a batchnorm-safe way. 38 | 39 | Retained for legacy purposes. It is recommended to use FP16Model. 40 | """ 41 | return nn.Sequential(tofp16(), BN_convert_float(network.half())) 42 | 43 | 44 | def convert_module(module, dtype): 45 | """ 46 | Converts a module's immediate parameters and buffers to dtype. 
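A two-line sketch:

    conv = torch.nn.Conv2d(3, 16, kernel_size=3)
    convert_module(conv, torch.half)   # weight/bias (and any floating-point buffers) become float16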
47 | """ 48 | for param in module.parameters(recurse=False): 49 | if param is not None: 50 | if param.data.dtype.is_floating_point: 51 | param.data = param.data.to(dtype=dtype) 52 | if param._grad is not None and param._grad.data.dtype.is_floating_point: 53 | param._grad.data = param._grad.data.to(dtype=dtype) 54 | 55 | for buf in module.buffers(recurse=False): 56 | if buf is not None and buf.data.dtype.is_floating_point: 57 | buf.data = buf.data.to(dtype=dtype) 58 | 59 | 60 | def convert_network(network, dtype): 61 | """ 62 | Converts a network's parameters and buffers to dtype. 63 | """ 64 | for module in network.modules(): 65 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 66 | continue 67 | convert_module(module, dtype) 68 | return network 69 | 70 | 71 | class FP16Model(nn.Module): 72 | """ 73 | Convert model to half precision in a batchnorm-safe way. 74 | """ 75 | 76 | def __init__(self, network): 77 | super(FP16Model, self).__init__() 78 | self.network = convert_network(network, dtype=torch.half) 79 | 80 | def forward(self, *inputs): 81 | inputs = tuple(t.half() for t in inputs) 82 | return self.network(*inputs) 83 | 84 | 85 | def backwards_debug_hook(grad): 86 | raise RuntimeError("master_params recieved a gradient in the backward pass!") 87 | 88 | def prep_param_lists(model, flat_master=False): 89 | """ 90 | Creates a list of FP32 master parameters for a given model, as in 91 | `Training Neural Networks with Mixed Precision: Real Examples`_. 92 | 93 | Args: 94 | model (torch.nn.Module): Existing Pytorch model 95 | flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. 96 | Returns: 97 | A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. 98 | 99 | Example:: 100 | 101 | model_params, master_params = prep_param_lists(model) 102 | 103 | .. warning:: 104 | Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. 105 | 106 | .. _`Training Neural Networks with Mixed Precision: Real Examples`: 107 | http://on-demand.gputechconf.com/gtc/2018/video/S81012/ 108 | """ 109 | model_params = [param for param in model.parameters() if param.requires_grad] 110 | 111 | if flat_master: 112 | # Give the user some more useful error messages 113 | try: 114 | # flatten_dense_tensors returns a contiguous flat array. 115 | # http://pytorch.org/docs/master/_modules/torch/_utils.html 116 | master_params = _flatten_dense_tensors([param.data for param in model_params]).float() 117 | except: 118 | print("Error in prep_param_lists: model may contain a mixture of parameters " 119 | "of different types. 
Use flat_master=False, or use F16_Optimizer.") 120 | raise 121 | master_params = torch.nn.Parameter(master_params) 122 | master_params.requires_grad = True 123 | # master_params.register_hook(backwards_debug_hook) 124 | if master_params.grad is None: 125 | master_params.grad = master_params.new(*master_params.size()) 126 | return model_params, [master_params] 127 | else: 128 | master_params = [param.clone().float().detach() for param in model_params] 129 | for param in master_params: 130 | param.requires_grad = True 131 | return model_params, master_params 132 | 133 | 134 | def model_grads_to_master_grads(model_params, master_params, flat_master=False): 135 | """ 136 | Copy model gradients to master gradients. 137 | 138 | Args: 139 | model_params: List of model parameters created by :func:`prep_param_lists`. 140 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. 141 | """ 142 | if flat_master: 143 | # The flattening may incur one more deep copy than is necessary. 144 | master_params[0].grad.data.copy_( 145 | _flatten_dense_tensors([p.grad.data for p in model_params])) 146 | else: 147 | for model, master in zip(model_params, master_params): 148 | if model.grad is not None: 149 | if master.grad is None: 150 | master.grad = Variable(master.data.new(*master.data.size())) 151 | master.grad.data.copy_(model.grad.data) 152 | else: 153 | master.grad = None 154 | 155 | 156 | def master_params_to_model_params(model_params, master_params, flat_master=False): 157 | """ 158 | Copy master parameters to model parameters. 159 | 160 | Args: 161 | model_params: List of model parameters created by :func:`prep_param_lists`. 162 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. 
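One mixed-precision update step sketched with these helpers (assumes the optimizer was built over master_params):

    model_params, master_params = prep_param_lists(model)
    # ... fp16 forward/backward ...
    model_grads_to_master_grads(model_params, master_params)
    optimizer.step()                                              # update the FP32 master copies
    master_params_to_model_params(model_params, master_params)    # copy back into the fp16 model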
163 | """ 164 | if flat_master: 165 | for model, master in zip(model_params, 166 | _unflatten_dense_tensors(master_params[0].data, model_params)): 167 | model.data.copy_(master) 168 | else: 169 | for model, master in zip(model_params, master_params): 170 | model.data.copy_(master.data) 171 | 172 | # Backward compatibility fixes 173 | 174 | def to_python_float(t): 175 | if hasattr(t, 'item'): 176 | return t.item() 177 | else: 178 | return t[0] 179 | 180 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 181 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 182 | if TORCH_MAJOR == 0 and TORCH_MINOR <= 4: 183 | clip_grad_norm = torch.nn.utils.clip_grad_norm 184 | else: 185 | clip_grad_norm = torch.nn.utils.clip_grad_norm_ 186 | -------------------------------------------------------------------------------- /dataset/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | 13 | 14 | def parse_rec(filename): 15 | """ Parse a PASCAL VOC xml file """ 16 | tree = ET.parse(filename) 17 | objects = [] 18 | for obj in tree.findall('object'): 19 | obj_struct = {} 20 | obj_struct['name'] = obj.find('name').text 21 | obj_struct['pose'] = obj.find('pose').text 22 | obj_struct['truncated'] = int(obj.find('truncated').text) 23 | obj_struct['difficult'] = int(obj.find('difficult').text) 24 | bbox = obj.find('bndbox') 25 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 26 | int(bbox.find('ymin').text), 27 | int(bbox.find('xmax').text), 28 | int(bbox.find('ymax').text)] 29 | objects.append(obj_struct) 30 | 31 | return objects 32 | 33 | 34 | 35 | def voc_ap(rec, prec, use_07_metric=False): 36 | """ ap = voc_ap(rec, prec, [use_07_metric]) 37 | Compute VOC AP given precision and recall. 38 | If use_07_metric is true, uses the 39 | VOC 07 11 point method (default:False). 40 | """ 41 | if use_07_metric: 42 | # 11 point metric 43 | ap = 0. 44 | for t in np.arange(0., 1.1, 0.1): 45 | if np.sum(rec >= t) == 0: 46 | p = 0 47 | else: 48 | p = np.max(prec[rec >= t]) 49 | ap = ap + p / 11. 50 | else: 51 | # correct AP calculation 52 | # first append sentinel values at the end 53 | mrec = np.concatenate(([0.], rec, [1.])) 54 | mpre = np.concatenate(([0.], prec, [0.])) 55 | 56 | # compute the precision envelope 57 | for i in range(mpre.size - 1, 0, -1): 58 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 59 | 60 | # to calculate area under PR curve, look for points 61 | # where X axis (recall) changes value 62 | i = np.where(mrec[1:] != mrec[:-1])[0] 63 | 64 | # and sum (\Delta recall) * prec 65 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 66 | return ap 67 | 68 | def voc_eval(detpath, 69 | annopath, 70 | imagesetfile, 71 | classname, 72 | cachedir, 73 | ovthresh=0.5, 74 | use_07_metric=False): 75 | """rec, prec, ap = voc_eval(detpath, 76 | annopath, 77 | imagesetfile, 78 | classname, 79 | [ovthresh], 80 | [use_07_metric]) 81 | 82 | Top level function that does the PASCAL VOC evaluation. 83 | 84 | detpath: Path to detections 85 | detpath.format(classname) should produce the detection results file. 86 | annopath: Path to annotations 87 | annopath.format(imagename) should be the xml annotations file. 
88 | imagesetfile: Text file containing the list of images, one image per line. 89 | classname: Category name (duh) 90 | cachedir: Directory for caching the annotations 91 | [ovthresh]: Overlap threshold (default = 0.5) 92 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 93 | (default False) 94 | """ 95 | # assumes detections are in detpath.format(classname) 96 | # assumes annotations are in annopath.format(imagename) 97 | # assumes imagesetfile is a text file with each line an image name 98 | # cachedir caches the annotations in a pickle file 99 | 100 | # first load gt 101 | if not os.path.isdir(cachedir): 102 | os.mkdir(cachedir) 103 | cachefile = os.path.join(cachedir, 'annots.pkl') 104 | # read list of images 105 | with open(imagesetfile, 'r') as f: 106 | lines = f.readlines() 107 | imagenames = [x.strip() for x in lines] 108 | 109 | if not os.path.isfile(cachefile): 110 | # load annots 111 | recs = {} 112 | for i, imagename in enumerate(imagenames): 113 | recs[imagename] = parse_rec(annopath.format(imagename)) 114 | if i % 100 == 0: 115 | print('Reading annotation for {:d}/{:d}'.format( 116 | i + 1, len(imagenames))) 117 | # save 118 | print('Saving cached annotations to {:s}'.format(cachefile)) 119 | with open(cachefile, 'wb') as f: 120 | pickle.dump(recs, f) 121 | else: 122 | # load 123 | with open(cachefile, 'rb') as f: 124 | recs = pickle.load(f) 125 | 126 | # extract gt objects for this class 127 | class_recs = {} 128 | npos = 0 129 | for imagename in imagenames: 130 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 131 | bbox = np.array([x['bbox'] for x in R]) 132 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 133 | det = [False] * len(R) 134 | npos = npos + sum(~difficult) 135 | class_recs[imagename] = {'bbox': bbox, 136 | 'difficult': difficult, 137 | 'det': det} 138 | 139 | # read dets 140 | detfile = detpath.format(classname) 141 | with open(detfile, 'r') as f: 142 | lines = f.readlines() 143 | 144 | if len(lines) == 0: 145 | return 0, 0, 0 146 | 147 | splitlines = [x.strip().split(' ') for x in lines] 148 | image_ids = [x[0] for x in splitlines] 149 | confidence = np.array([float(x[1]) for x in splitlines]) 150 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 151 | 152 | # sort by confidence 153 | sorted_ind = np.argsort(-confidence) 154 | sorted_scores = np.sort(-confidence) 155 | BB = BB[sorted_ind, :] 156 | image_ids = [image_ids[x] for x in sorted_ind] 157 | 158 | # go down dets and mark TPs and FPs 159 | nd = len(image_ids) 160 | tp = np.zeros(nd) 161 | fp = np.zeros(nd) 162 | for d in range(nd): 163 | R = class_recs[image_ids[d]] 164 | bb = BB[d, :].astype(float) 165 | ovmax = -np.inf 166 | BBGT = R['bbox'].astype(float) 167 | 168 | if BBGT.size > 0: 169 | # compute overlaps 170 | # intersection 171 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 172 | iymin = np.maximum(BBGT[:, 1], bb[1]) 173 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 174 | iymax = np.minimum(BBGT[:, 3], bb[3]) 175 | iw = np.maximum(ixmax - ixmin + 1., 0.) 176 | ih = np.maximum(iymax - iymin + 1., 0.) 177 | inters = iw * ih 178 | 179 | # union 180 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 181 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 182 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 183 | 184 | overlaps = inters / uni 185 | ovmax = np.max(overlaps) 186 | jmax = np.argmax(overlaps) 187 | 188 | if ovmax > ovthresh: 189 | if not R['difficult'][jmax]: 190 | if not R['det'][jmax]: 191 | tp[d] = 1. 
192 | R['det'][jmax] = 1 193 | else: 194 | fp[d] = 1. 195 | else: 196 | fp[d] = 1. 197 | 198 | # compute precision recall 199 | fp = np.cumsum(fp) 200 | tp = np.cumsum(tp) 201 | rec = tp / float(npos) 202 | # avoid divide by zero in case the first detection matches a difficult 203 | # ground truth 204 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 205 | ap = voc_ap(rec, prec, use_07_metric) 206 | 207 | return rec, prec, ap 208 | -------------------------------------------------------------------------------- /utils/fp16_utils/loss_scaler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # item() is a recent addition, so this helps with backward compatibility. 4 | def to_python_float(t): 5 | if hasattr(t, 'item'): 6 | return t.item() 7 | else: 8 | return t[0] 9 | 10 | class LossScaler: 11 | """ 12 | Class that manages a static loss scale. This class is intended to interact with 13 | :class:`FP16_Optimizer`, and should not be directly manipulated by the user. 14 | 15 | Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to 16 | :class:`FP16_Optimizer`'s constructor. 17 | 18 | Args: 19 | scale (float, optional, default=1.0): The loss scale. 20 | """ 21 | 22 | def __init__(self, scale=1): 23 | self.cur_scale = scale 24 | 25 | # `params` is a list / generator of torch.Variable 26 | def has_overflow(self, params): 27 | return False 28 | 29 | # `x` is a torch.Tensor 30 | def _has_inf_or_nan(x): 31 | return False 32 | 33 | def update_scale(self, overflow): 34 | pass 35 | 36 | @property 37 | def loss_scale(self): 38 | return self.cur_scale 39 | 40 | def scale_gradient(self, module, grad_in, grad_out): 41 | return tuple(self.loss_scale * g for g in grad_in) 42 | 43 | def backward(self, loss, retain_graph=False): 44 | scaled_loss = loss*self.loss_scale 45 | scaled_loss.backward(retain_graph=retain_graph) 46 | 47 | class DynamicLossScaler: 48 | """ 49 | Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` 50 | indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of 51 | :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` 52 | operates, because the default options can be changed using the 53 | the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. 54 | 55 | Loss scaling is designed to combat the problem of underflowing gradients encountered at long 56 | times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss 57 | scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are 58 | encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has 59 | occurred. 60 | :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, 61 | and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. 62 | If a certain number of iterations occur without overflowing gradients detected, 63 | :class:`DynamicLossScaler` increases the loss scale once more. 64 | In this way :class:`DynamicLossScaler` attempts to "ride the edge" of 65 | always using the highest loss scale possible without incurring overflow. 66 | 67 | Args: 68 | init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` 69 | scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. 
If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. 70 | scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 71 | """ 72 | 73 | def __init__(self, 74 | init_scale=2**32, 75 | scale_factor=2., 76 | scale_window=1000): 77 | self.cur_scale = init_scale 78 | self.cur_iter = 0 79 | self.last_overflow_iter = -1 80 | self.scale_factor = scale_factor 81 | self.scale_window = scale_window 82 | 83 | # `params` is a list / generator of torch.Variable 84 | def has_overflow(self, params): 85 | for p in params: 86 | if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): 87 | return True 88 | 89 | return False 90 | 91 | # `x` is a torch.Tensor 92 | def _has_inf_or_nan(x): 93 | try: 94 | # if x is half, the .float() incurs an additional deep copy, but it's necessary if 95 | # Pytorch's .sum() creates a one-element tensor of the same type as x 96 | # (which is true for some recent version of pytorch). 97 | cpu_sum = float(x.float().sum()) 98 | # More efficient version that can be used if .sum() returns a Python scalar 99 | # cpu_sum = float(x.sum()) 100 | except RuntimeError as instance: 101 | # We want to check if inst is actually an overflow exception. 102 | # RuntimeError could come from a different error. 103 | # If so, we still want the exception to propagate. 104 | if "value cannot be converted" not in instance.args[0]: 105 | raise 106 | return True 107 | else: 108 | if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: 109 | return True 110 | return False 111 | 112 | # `overflow` is boolean indicating whether the gradient overflowed 113 | def update_scale(self, overflow): 114 | if overflow: 115 | # self.cur_scale /= self.scale_factor 116 | self.cur_scale = max(self.cur_scale/self.scale_factor, 1) 117 | self.last_overflow_iter = self.cur_iter 118 | else: 119 | if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: 120 | self.cur_scale *= self.scale_factor 121 | self.cur_iter += 1 122 | 123 | @property 124 | def loss_scale(self): 125 | return self.cur_scale 126 | 127 | def scale_gradient(self, module, grad_in, grad_out): 128 | return tuple(self.loss_scale * g for g in grad_in) 129 | 130 | def backward(self, loss, retain_graph=False): 131 | scaled_loss = loss*self.loss_scale 132 | scaled_loss.backward(retain_graph=retain_graph) 133 | 134 | ############################################################## 135 | # Example usage below here -- assuming it's in a separate file 136 | ############################################################## 137 | """ 138 | TO-DO separate out into an example. 139 | if __name__ == "__main__": 140 | import torch 141 | from torch.autograd import Variable 142 | from dynamic_loss_scaler import DynamicLossScaler 143 | 144 | # N is batch size; D_in is input dimension; 145 | # H is hidden dimension; D_out is output dimension. 146 | N, D_in, H, D_out = 64, 1000, 100, 10 147 | 148 | # Create random Tensors to hold inputs and outputs, and wrap them in Variables. 
149 | x = Variable(torch.randn(N, D_in), requires_grad=False) 150 | y = Variable(torch.randn(N, D_out), requires_grad=False) 151 | 152 | w1 = Variable(torch.randn(D_in, H), requires_grad=True) 153 | w2 = Variable(torch.randn(H, D_out), requires_grad=True) 154 | parameters = [w1, w2] 155 | 156 | learning_rate = 1e-6 157 | optimizer = torch.optim.SGD(parameters, lr=learning_rate) 158 | loss_scaler = DynamicLossScaler() 159 | 160 | for t in range(500): 161 | y_pred = x.mm(w1).clamp(min=0).mm(w2) 162 | loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale 163 | print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) 164 | print('Iter {} scaled loss: {}'.format(t, loss.data[0])) 165 | print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) 166 | 167 | # Run backprop 168 | optimizer.zero_grad() 169 | loss.backward() 170 | 171 | # Check for overflow 172 | has_overflow = DynamicLossScaler.has_overflow(parameters) 173 | 174 | # If no overflow, unscale grad and update as usual 175 | if not has_overflow: 176 | for param in parameters: 177 | param.grad.data.mul_(1. / loss_scaler.loss_scale) 178 | optimizer.step() 179 | # Otherwise, don't do anything -- ie, skip iteration 180 | else: 181 | print('OVERFLOW!') 182 | 183 | # Update loss scale for next iteration 184 | loss_scaler.update_scale(has_overflow) 185 | 186 | """ 187 | -------------------------------------------------------------------------------- /models/yolov3_asff.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .network_blocks import * 5 | from .yolov3_head import YOLOv3Head 6 | 7 | from collections import defaultdict 8 | 9 | def build_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb): 10 | """ 11 | Build yolov3 layer modules. 12 | Args: 13 | ignore_thre (float): used in YOLOLayer. 14 | Returns: 15 | mlist (ModuleList): YOLOv3 module list. 
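Note that, unlike the baseline builder, this list stops after the feature pyramid; the ASFF fusion blocks and YOLO heads are attached separately in YOLOv3.__init__ below. A sketch:

    mlist = build_yolov3_modules(num_classes=80, ignore_thre=0.7,
                                 label_smooth=False, rfb=False)
    len(mlist)    # 33 modules (#0-#32), none of them a YOLOv3Head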
16 | """ 17 | # DarkNet53 18 | mlist = nn.ModuleList() 19 | mlist.append(add_conv(in_ch=3, out_ch=32, ksize=3, stride=1)) #0 20 | mlist.append(add_conv(in_ch=32, out_ch=64, ksize=3, stride=2)) #1 21 | mlist.append(resblock(ch=64)) #2 22 | mlist.append(add_conv(in_ch=64, out_ch=128, ksize=3, stride=2)) #3 23 | mlist.append(resblock(ch=128, nblocks=2)) #4 24 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=2)) #5 25 | mlist.append(resblock(ch=256, nblocks=8)) # shortcut 1 from here #6 26 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=2)) #7 27 | mlist.append(resblock(ch=512, nblocks=8)) # shortcut 2 from here #8 28 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=2)) #9 29 | mlist.append(resblock(ch=1024, nblocks=4)) #10 30 | 31 | # YOLOv3 32 | mlist.append(resblock(ch=1024, nblocks=1, shortcut=False)) #11 33 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #12 34 | #SPP Layer 35 | mlist.append(SPPLayer()) #13 36 | 37 | mlist.append(add_conv(in_ch=2048, out_ch=512, ksize=1, stride=1)) #14 38 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1)) #15 39 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #16 40 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1)) #17 41 | 42 | # 1st yolo branch 43 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #18 44 | mlist.append(upsample(scale_factor=2, mode='nearest')) #19 45 | mlist.append(add_conv(in_ch=768, out_ch=256, ksize=1, stride=1)) #20 46 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1)) #21 47 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #22 48 | mlist.append(resblock(ch=512, nblocks=1, shortcut=False)) #23 49 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1)) #24 50 | # 2nd yolo branch 51 | 52 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #25 53 | mlist.append(upsample(scale_factor=2, mode='nearest')) #26 54 | mlist.append(add_conv(in_ch=384, out_ch=128, ksize=1, stride=1)) #27 55 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #28 56 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #29 57 | mlist.append(resblock(ch=256, nblocks=1, shortcut=False)) #30 58 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1)) #31 59 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1)) #32 60 | 61 | return mlist 62 | 63 | 64 | class YOLOv3(nn.Module): 65 | """ 66 | YOLOv3 model module. The module list is defined by create_yolov3_modules function. \ 67 | The network returns loss values from three YOLO layers during training \ 68 | and detection results during test. 69 | """ 70 | def __init__(self, num_classes = 80, ignore_thre=0.7, label_smooth = False, rfb=False, vis=False, asff=False): 71 | """ 72 | Initialization of YOLOv3 class. 73 | Args: 74 | ignore_thre (float): used in YOLOLayer. 
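A construction sketch (the three return values with vis=True mirror forward() below):

    model = YOLOv3(num_classes=80, rfb=False, vis=True)
    dets, fuse_weights, fused_features = model(torch.randn(1, 3, 608, 608))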
75 | """ 76 | super(YOLOv3, self).__init__() 77 | self.module_list = build_yolov3_modules(num_classes, ignore_thre, label_smooth, rfb) 78 | 79 | 80 | self.level_0_fusion = ASFF(level=0,rfb=rfb,vis=vis) 81 | 82 | self.level_0_header = YOLOv3Head(anch_mask=[6, 7, 8], n_classes=num_classes, stride=32, in_ch=1024, 83 | ignore_thre=ignore_thre,label_smooth = label_smooth, rfb=rfb) 84 | 85 | self.level_1_fusion = ASFF(level=1,rfb=rfb,vis=vis) 86 | 87 | self.level_1_header = YOLOv3Head(anch_mask=[3, 4, 5], n_classes=num_classes, stride=16, in_ch=512, 88 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb) 89 | 90 | self.level_2_fusion = ASFF(level=2,rfb=rfb,vis=vis) 91 | 92 | self.level_2_header = YOLOv3Head(anch_mask=[0, 1, 2], n_classes=num_classes, stride=8, in_ch=256, 93 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb) 94 | self.vis=vis 95 | 96 | def forward(self, x, targets=None, epoch=0): 97 | """ 98 | Forward path of YOLOv3. 99 | Args: 100 | x (torch.Tensor) : input data whose shape is :math:`(N, C, H, W)`, \ 101 | where N, C are batchsize and num. of channels. 102 | targets (torch.Tensor) : label array whose shape is :math:`(N, 50, 5)` 103 | 104 | Returns: 105 | training: 106 | output (torch.Tensor): loss tensor for backpropagation. 107 | test: 108 | output (torch.Tensor): concatenated detection results. 109 | """ 110 | 111 | train = targets is not None 112 | output = [] 113 | anchor_losses= [] 114 | iou_losses = [] 115 | l1_losses = [] 116 | conf_losses = [] 117 | cls_losses = [] 118 | route_layers = [] 119 | if self.vis: 120 | fuse_wegihts = [] 121 | fuse_fs = [] 122 | 123 | for i, module in enumerate(self.module_list): 124 | 125 | # yolo layers 126 | x = module(x) 127 | 128 | # route layers 129 | if i in [6, 8, 17, 24, 32]: 130 | route_layers.append(x) 131 | if i == 19: 132 | x = torch.cat((x, route_layers[1]), 1) 133 | if i == 26: 134 | x = torch.cat((x, route_layers[0]), 1) 135 | 136 | 137 | for l in range(3): 138 | fusion = getattr(self, 'level_{}_fusion'.format(l)) 139 | header = getattr(self, 'level_{}_header'.format(l)) 140 | 141 | if self.vis: 142 | fused, weight, fuse_f = fusion(route_layers[2],route_layers[3],route_layers[4]) 143 | fuse_wegihts.append(weight) 144 | fuse_fs.append(fuse_f) 145 | else: 146 | fused = fusion(route_layers[2],route_layers[3],route_layers[4]) 147 | 148 | if train: 149 | x, anchor_loss, iou_loss, l1_loss, conf_loss, cls_loss = header(fused, targets) 150 | anchor_losses.append(anchor_loss) 151 | iou_losses.append(iou_loss) 152 | l1_losses.append(l1_loss) 153 | conf_losses.append(conf_loss) 154 | cls_losses.append(cls_loss) 155 | else: 156 | x = header(fused) 157 | 158 | output.append(x) 159 | 160 | if train: 161 | losses = torch.stack(output, 0).unsqueeze(0).sum(1,keepdim=True) 162 | anchor_losses = torch.stack(anchor_losses, 0).unsqueeze(0).sum(1,keepdim=True) 163 | iou_losses = torch.stack(iou_losses, 0).unsqueeze(0).sum(1,keepdim=True) 164 | l1_losses = torch.stack(l1_losses, 0).unsqueeze(0).sum(1,keepdim=True) 165 | conf_losses = torch.stack(conf_losses, 0).unsqueeze(0).sum(1,keepdim=True) 166 | cls_losses = torch.stack(cls_losses, 0).unsqueeze(0).sum(1,keepdim=True) 167 | loss_dict = dict( 168 | losses = losses, 169 | anchor_losses = anchor_losses, 170 | iou_losses = iou_losses, 171 | l1_losses = l1_losses, 172 | conf_losses = conf_losses, 173 | cls_losses = cls_losses, 174 | ) 175 | return loss_dict 176 | else: 177 | if self.vis: 178 | return torch.cat(output, 1), fuse_wegihts, fuse_fs 179 | else: 180 | return 
torch.cat(output, 1) 181 | 182 | -------------------------------------------------------------------------------- /utils/voc_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | import sys 4 | from tqdm import tqdm 5 | 6 | from pycocotools.cocoeval import COCOeval 7 | from torch.autograd import Variable 8 | 9 | from dataset.vocdataset import * 10 | from dataset.data_augment import ValTransform 11 | from utils.utils import * 12 | from utils import distributed_util 13 | from utils.vis_utils import make_vis, make_pred_vis 14 | 15 | import time 16 | 17 | #DEBUG = True 18 | DEBUG = False 19 | 20 | def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): 21 | all_predictions = distributed_util.scatter_gather(predictions_per_gpu) 22 | if not distributed_util.is_main_process(): 23 | return 24 | # merge the list of dicts 25 | predictions = {} 26 | for p in all_predictions: 27 | predictions.update(p) 28 | # convert a dict where the key is the index in a list 29 | image_ids = list(sorted(predictions.keys())) 30 | if len(image_ids) != image_ids[-1] + 1: 31 | print('num_imgs: ',len(image_ids)) 32 | print('last img_id: ',image_ids[-1]) 33 | print( 34 | "Number of images that were gathered from multiple processes is not " 35 | "a contiguous set. Some images might be missing from the evaluation" 36 | ) 37 | 38 | # convert to a list 39 | predictions = [predictions[i] for i in image_ids] 40 | return predictions 41 | 42 | 43 | class VOCEvaluator(): 44 | """ 45 | COCO AP Evaluation class. 46 | All the data in the val2017 dataset are processed \ 47 | and evaluated by COCO API. 48 | """ 49 | def __init__(self, data_dir, img_size, confthre, nmsthre,vis=False): 50 | """ 51 | Args: 52 | data_dir (str): dataset root directory 53 | img_size (int): image size after preprocess. images are resized \ 54 | to squares whose shape is (img_size, img_size). 55 | confthre (float): 56 | confidence threshold ranging from 0 to 1, \ 57 | which is defined in the config file. 58 | nmsthre (float): 59 | IoU threshold of non-max supression ranging from 0 to 1. 60 | """ 61 | test_sets = [('2007', 'test'),] 62 | self.dataset = VOCDetection( 63 | root=data_dir, 64 | image_sets = test_sets, 65 | input_dim=img_size, 66 | preproc = ValTransform(rgb_means=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)),) 67 | self.num_images = len(self.dataset) 68 | self.dataloader = torch.utils.data.DataLoader( 69 | self.dataset, batch_size=1, shuffle=False, num_workers=0) 70 | self.img_size = img_size 71 | self.confthre = confthre 72 | self.nmsthre = nmsthre 73 | self.vis=vis 74 | 75 | def evaluate(self, model, half=False): 76 | """ 77 | COCO average precision (AP) Evaluation. Iterate inference on the test dataset 78 | and the results are evaluated by COCO API. 
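A call sketch (thresholds follow eval.py; model is an already-loaded network):

    evaluator = VOCEvaluator(data_dir='data/VOC/', img_size=(416, 416),
                             confthre=0.001, nmsthre=0.65)
    mAP50, mAP70 = evaluator.evaluate(model, half=False)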
79 | Args: 80 | model : model object 81 | Returns: 82 | ap50_95 (float) : calculated COCO AP for IoU=50:95 83 | ap50 (float) : calculated COCO AP for IoU=50 84 | """ 85 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 86 | model = model.module 87 | model.eval() 88 | cuda = torch.cuda.is_available() 89 | if half: 90 | Tensor = torch.cuda.HalfTensor if cuda else torch.HalfTensor 91 | else: 92 | Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor 93 | 94 | ids = [] 95 | data_dict = [] 96 | dataiterator = iter(self.dataloader) 97 | img_num = 0 98 | indices = list(range(self.num_images)) 99 | dis_indices = indices[distributed_util.get_rank()::distributed_util.get_world_size()] 100 | progress_bar = tqdm if distributed_util.is_main_process() else iter 101 | num_classes = 20 102 | predictions = {} 103 | 104 | if distributed_util.is_main_process(): 105 | inference_time=0 106 | nms_time=0 107 | n_samples=len(dis_indices) 108 | 109 | for i in progress_bar(dis_indices): 110 | img, _, info_img, id_ = self.dataset[i] # load a batch 111 | info_img = [float(info) for info in info_img] 112 | ids.append(id_) 113 | with torch.no_grad(): 114 | img = Variable(img.type(Tensor).unsqueeze(0)) 115 | 116 | if distributed_util.is_main_process() and i > 9: 117 | start=time.time() 118 | 119 | if self.vis: 120 | outputs,fuse_weights,fused_f = model(img) 121 | else: 122 | outputs = model(img) 123 | 124 | if distributed_util.is_main_process() and i > 9: 125 | infer_end=time.time() 126 | inference_time += (infer_end-start) 127 | 128 | outputs = postprocess( 129 | outputs, 20, self.confthre, self.nmsthre) 130 | 131 | 132 | if distributed_util.is_main_process() and i > 9: 133 | nms_end=time.time() 134 | nms_time +=(nms_end-infer_end) 135 | 136 | if outputs[0] is None: 137 | predictions[i] = (None, None, None) 138 | continue 139 | outputs = outputs[0].cpu().data 140 | 141 | bboxes = outputs[:, 0:4] 142 | bboxes[:, 0::2] *= info_img[0] / self.img_size[0] 143 | bboxes[:, 1::2] *= info_img[1] / self.img_size[1] 144 | cls = outputs[:, 6] 145 | scores = outputs[:, 4]* outputs[:,5] 146 | predictions[i] = (bboxes, cls, scores) 147 | 148 | if self.vis: 149 | o_img,_,_,_ = self.dataset.pull_item(i) 150 | make_vis('VOC', i, o_img, fuse_weights, fused_f) 151 | class_names = self.dataset._classes 152 | 153 | bbox = bboxes.clone() 154 | bbox[:, 2] = bbox[:,2] - bbox[:,0] 155 | bbox[:, 3] = bbox[:,3] - bbox[:,1] 156 | 157 | make_pred_vis('VOC', i, o_img, class_names, bbox, cls, scores) 158 | 159 | if DEBUG and distributed_util.is_main_process(): 160 | o_img,_,_,_ = self.dataset.pull_item(i) 161 | class_names = self.dataset._classes 162 | bbox = bboxes.clone() 163 | bbox[:, 2] = bbox[:,2] - bbox[:,0] 164 | bbox[:, 3] = bbox[:,3] - bbox[:,1] 165 | make_pred_vis('VOC', i, o_img, class_names, bbox, cls, scores) 166 | 167 | distributed_util.synchronize() 168 | predictions = _accumulate_predictions_from_multiple_gpus(predictions) 169 | if not distributed_util.is_main_process(): 170 | return 0, 0 171 | 172 | 173 | print('Main process Evaluating...') 174 | 175 | a_infer_time = 1000*inference_time / (n_samples-10) 176 | a_nms_time= 1000*nms_time / (n_samples-10) 177 | 178 | print('Average forward time: %.2f ms, Average NMS time: %.2f ms, Average inference time: %.2f ms' %(a_infer_time, \ 179 | a_nms_time, (a_infer_time+a_nms_time))) 180 | 181 | all_boxes = [[[] for _ in range(self.num_images)] 182 | for _ in range(num_classes)] 183 | for img_num in range(self.num_images): 184 | bboxes, cls, scores = 
predictions[img_num] 185 | if bboxes is None: 186 | for j in range(num_classes): 187 | all_boxes[j][img_num] = np.empty([0,5],dtype=np.float32) 188 | continue 189 | for j in range(num_classes): 190 | mask_c = (cls == j) 191 | if sum(mask_c) ==0: 192 | all_boxes[j][img_num] = np.empty([0,5],dtype=np.float32) 193 | continue 194 | 195 | c_dets = torch.cat((bboxes, scores.unsqueeze(1)),dim=1) 196 | all_boxes[j][img_num] = c_dets[mask_c].numpy() 197 | 198 | sys.stdout.write('im_eval: {:d}/{:d} \r'.format(img_num+1, self.num_images)) 199 | sys.stdout.flush() 200 | 201 | with tempfile.TemporaryDirectory() as tempdir: 202 | mAP50, mAP70 = self.dataset.evaluate_detections(all_boxes, tempdir) 203 | return mAP50,mAP70 204 | 205 | -------------------------------------------------------------------------------- /utils/cocoapi_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | import sys 4 | from tqdm import tqdm 5 | 6 | from pycocotools.cocoeval import COCOeval 7 | from torch.autograd import Variable 8 | 9 | from dataset.cocodataset import * 10 | from dataset.data_augment import ValTransform 11 | from utils.utils import * 12 | from utils import distributed_util 13 | from utils.vis_utils import make_vis, make_pred_vis 14 | import time 15 | import apex 16 | 17 | DEBUG =False 18 | 19 | def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): 20 | all_predictions = distributed_util.scatter_gather(predictions_per_gpu) 21 | if not distributed_util.is_main_process(): 22 | return 23 | # merge the list of dicts 24 | predictions = [] 25 | for p in all_predictions: 26 | for a in p: 27 | predictions.append(a) 28 | 29 | return predictions 30 | 31 | class COCOAPIEvaluator(): 32 | """ 33 | COCO AP Evaluation class. 34 | All the data in the val2017 dataset are processed \ 35 | and evaluated by COCO API. 36 | """ 37 | def __init__(self, data_dir, img_size, confthre, nmsthre, testset=False, voc=False, vis=False): 38 | """ 39 | Args: 40 | data_dir (str): dataset root directory 41 | img_size (int): image size after preprocess. images are resized \ 42 | to squares whose shape is (img_size, img_size). 43 | confthre (float): 44 | confidence threshold ranging from 0 to 1, \ 45 | which is defined in the config file. 46 | nmsthre (float): 47 | IoU threshold of non-max supression ranging from 0 to 1. 48 | """ 49 | json_f = 'instances_val2017.json' 50 | name='val2017' 51 | if testset: 52 | json_f = 'image_info_test-dev2017.json' 53 | name='test2017' 54 | if voc: 55 | json_f = 'pascal_test2007.json' 56 | 57 | self.testset= testset 58 | self.dataset = COCODataset(data_dir=data_dir, 59 | img_size=img_size, 60 | json_file=json_f, 61 | preproc = ValTransform(rgb_means=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)), 62 | name=name, 63 | voc = voc) 64 | 65 | self.num_images = len(self.dataset) 66 | self.dataloader = torch.utils.data.DataLoader( 67 | self.dataset, batch_size=1, shuffle=False, num_workers=0) 68 | self.img_size = img_size 69 | self.confthre = confthre 70 | self.nmsthre = nmsthre 71 | self.voc = voc 72 | self.vis = vis 73 | 74 | def evaluate(self, model, half=False, distributed=False): 75 | """ 76 | COCO average precision (AP) Evaluation. Iterate inference on the test dataset 77 | and the results are evaluated by COCO API. 
78 | Args: 79 | model : model object 80 | Returns: 81 | ap50_95 (float) : calculated COCO AP for IoU=50:95 82 | ap50 (float) : calculated COCO AP for IoU=50 83 | """ 84 | if isinstance(model, apex.parallel.DistributedDataParallel): 85 | model = model.module 86 | distributed=True 87 | 88 | model=model.eval() 89 | cuda = torch.cuda.is_available() 90 | if half: 91 | Tensor = torch.cuda.HalfTensor if cuda else torch.HalfTensor 92 | else: 93 | Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor 94 | ids = [] 95 | data_dict = [] 96 | img_num = 0 97 | 98 | indices = list(range(self.num_images)) 99 | if distributed: 100 | dis_indices = indices[distributed_util.get_rank()::distributed_util.get_world_size()] 101 | else: 102 | dis_indices = indices 103 | progress_bar = tqdm if distributed_util.is_main_process() else iter 104 | num_classes = 80 if not self.voc else 20 105 | 106 | inference_time=0 107 | nms_time=0 108 | n_samples=len(dis_indices)-10 109 | 110 | for k, i in enumerate(progress_bar(dis_indices)): 111 | img, _, info_img, id_ = self.dataset[i] # load a batch 112 | info_img = [float(info) for info in info_img] 113 | id_ = int(id_) 114 | ids.append(id_) 115 | with torch.no_grad(): 116 | img = Variable(img.type(Tensor).unsqueeze(0)) 117 | if k > 9: 118 | start=time.time() 119 | 120 | if self.vis: 121 | outputs,fuse_weights,fused_f = model(img) 122 | else: 123 | outputs = model(img) 124 | 125 | if k > 9: 126 | infer_end=time.time() 127 | inference_time += (infer_end-start) 128 | 129 | outputs = postprocess( 130 | outputs, num_classes, self.confthre, self.nmsthre) 131 | 132 | if k > 9: 133 | nms_end=time.time() 134 | nms_time +=(nms_end-infer_end) 135 | 136 | if outputs[0] is None: 137 | continue 138 | outputs = outputs[0].cpu().data 139 | 140 | bboxes = outputs[:, 0:4] 141 | bboxes[:, 0::2] *= info_img[0] / self.img_size[0] 142 | bboxes[:, 1::2] *= info_img[1] / self.img_size[1] 143 | bboxes[:, 2] = bboxes[:,2] - bboxes[:,0] 144 | bboxes[:, 3] = bboxes[:,3] - bboxes[:,1] 145 | cls = outputs[:, 6] 146 | scores = outputs[:, 4]* outputs[:,5] 147 | for ind in range(bboxes.shape[0]): 148 | label = self.dataset.class_ids[int(cls[ind])] 149 | A = {"image_id": id_, "category_id": label, "bbox": bboxes[ind].numpy().tolist(), 150 | "score": scores[ind].numpy().item(), "segmentation": []} # COCO json format 151 | data_dict.append(A) 152 | 153 | if self.vis: 154 | o_img,_,_,_ = self.dataset.pull_item(i) 155 | make_vis('COCO', i, o_img, fuse_weights, fused_f) 156 | class_names = self.dataset._classes 157 | make_pred_vis('COCO', i, o_img, class_names, bboxes, cls, scores) 158 | 159 | if DEBUG and distributed_util.is_main_process(): 160 | o_img,_ = self.dataset.pull_item(i) 161 | class_names = self.dataset._classes 162 | make_pred_vis('COCO', i, o_img, class_names, bboxes, cls, scores) 163 | 164 | if distributed: 165 | distributed_util.synchronize() 166 | data_dict = _accumulate_predictions_from_multiple_gpus(data_dict) 167 | inference_time = torch.FloatTensor(1).type(Tensor).fill_(inference_time) 168 | nms_time = torch.FloatTensor(1).type(Tensor).fill_(nms_time) 169 | n_samples = torch.LongTensor(1).type(Tensor).fill_(n_samples) 170 | distributed_util.synchronize() 171 | torch.distributed.reduce(inference_time, dst=0) 172 | torch.distributed.reduce(nms_time, dst=0) 173 | torch.distributed.reduce(n_samples, dst=0) 174 | inference_time = inference_time.item() 175 | nms_time = nms_time.item() 176 | n_samples = n_samples.item() 177 | 178 | if not distributed_util.is_main_process(): 179 | return 0, 0 
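        # NOTE (main-process-only path): only the main process gets past the early
        # return above. It averages the reduced timing counters over n_samples
        # (the first 10 warm-up iterations of each rank are already excluded),
        # dumps the gathered detections in data_dict to a COCO-format JSON file,
        # and scores them with pycocotools' COCOeval; stats[0] is AP@[0.5:0.95]
        # and stats[1] is AP@0.5, which are the two values this method returns.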
180 | 181 | 182 | print('Main process Evaluating...') 183 | 184 | annType = ['segm', 'bbox', 'keypoints'] 185 | a_infer_time = 1000*inference_time / (n_samples) 186 | a_nms_time= 1000*nms_time / (n_samples) 187 | 188 | print('Average forward time: %.2f ms, Average NMS time: %.2f ms, Average inference time: %.2f ms' %(a_infer_time, \ 189 | a_nms_time, (a_infer_time+a_nms_time))) 190 | 191 | # Evaluate the Dt (detection) json comparing with the ground truth 192 | if len(data_dict) > 0: 193 | cocoGt = self.dataset.coco 194 | # workaround: temporarily write data to json file because pycocotools can't process dict in py36. 195 | if self.testset: 196 | json.dump(data_dict, open('yolov3_2017.json', 'w')) 197 | cocoDt = cocoGt.loadRes('yolov3_2017.json') 198 | else: 199 | _, tmp = tempfile.mkstemp() 200 | json.dump(data_dict, open(tmp, 'w')) 201 | cocoDt = cocoGt.loadRes(tmp) 202 | cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1]) 203 | cocoEval.evaluate() 204 | cocoEval.accumulate() 205 | cocoEval.summarize() 206 | return cocoEval.stats[0], cocoEval.stats[1] 207 | else: 208 | return 0, 0 209 | 210 | -------------------------------------------------------------------------------- /models/yolov3_mobilev2.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from .network_blocks import * 3 | from .yolov3_head import YOLOv3Head 4 | 5 | 6 | def create_yolov3_mobilenet_v2(num_classes, width_mult=1.0, inverted_residual_setting=None, round_nearest=8): 7 | """ 8 | MobileNet V2 main class 9 | 10 | Args: 11 | num_classes (int): Number of classes 12 | width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount 13 | inverted_residual_setting: Network structure 14 | round_nearest (int): Round the number of channels in each layer to be a multiple of this number 15 | Set to 1 to turn off rounding 16 | """ 17 | block = InvertedResidual 18 | input_channel = 32 19 | last_channel = 1280 20 | 21 | if inverted_residual_setting is None: 22 | inverted_residual_setting = [ 23 | # t, c, n, s 24 | [1, 16, 1, 1], 25 | [6, 24, 2, 2], 26 | [6, 32, 3, 2], 27 | [6, 64, 4, 2], 28 | [6, 96, 3, 1], 29 | [6, 160, 3, 2], 30 | [6, 320, 1, 1], 31 | ] 32 | 33 | # only check the first element, assuming user knows t,c,n,s are required 34 | if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: 35 | raise ValueError("inverted_residual_setting should be non-empty " 36 | "or a 4-element list, got {}".format(inverted_residual_setting)) 37 | 38 | # building first layer 39 | input_channel = make_divisible(input_channel * width_mult, round_nearest) 40 | last_channel = make_divisible(last_channel * max(1.0, width_mult), round_nearest) 41 | mlist = nn.ModuleList() 42 | mlist.append(ConvBNReLU(3, input_channel, stride=2)) 43 | # building inverted residual blocks 44 | for t, c, n, s in inverted_residual_setting: 45 | output_channel =make_divisible(c * width_mult, round_nearest) 46 | for i in range(n): 47 | stride = s if i == 0 else 1 48 | mlist.append(block(input_channel, output_channel, stride, expand_ratio=t)) 49 | input_channel = output_channel 50 | # building last several layers 51 | mlist.append(ConvBNReLU(input_channel, last_channel, kernel_size=1)) #18 52 | 53 | # YOLOv3 54 | mlist.append(ressepblock(last_channel, 1024, in_ch=512, shortcut=False)) #19 55 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1,leaky=False)) #20 56 | # SPP Layer 57 | mlist.append(SPPLayer()) #21 58 | 59 | 
mlist.append(add_conv(in_ch=2048, out_ch=512, ksize=1, stride=1, leaky=False)) #22 60 | mlist.append(add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1,leaky=False)) #23 61 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #24 62 | mlist.append(add_conv(in_ch=1024, out_ch=512, ksize=1, stride=1, leaky=False)) #25 (17) 63 | 64 | # 1st yolo branch 65 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1, leaky=False)) #26 66 | mlist.append(upsample(scale_factor=2, mode='nearest')) #27 67 | mlist.append(add_conv(in_ch=352, out_ch=256, ksize=1, stride=1,leaky=False)) #28 68 | mlist.append(add_conv(in_ch=256, out_ch=512, ksize=3, stride=1,leaky=False)) #29 69 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #30 70 | mlist.append(ressepblock(512, 512, in_ch=256,shortcut=False)) #31 71 | mlist.append(add_conv(in_ch=512, out_ch=256, ksize=1, stride=1,leaky=False)) #32 72 | # 2nd yolo branch 73 | 74 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1,leaky=False)) #33 75 | mlist.append(upsample(scale_factor=2, mode='nearest')) #34 76 | mlist.append(add_conv(in_ch=160, out_ch=128, ksize=1, stride=1,leaky=False)) #35 77 | mlist.append(add_conv(in_ch=128, out_ch=256, ksize=3, stride=1,leaky=False)) #36 78 | mlist.append(DropBlock(block_size=1, keep_prob=1)) #37 79 | mlist.append(ressepblock(256, 256, in_ch=128,shortcut=False)) #38 80 | mlist.append(add_conv(in_ch=256, out_ch=128, ksize=1, stride=1,leaky=False)) #39 81 | 82 | return mlist 83 | 84 | 85 | class YOLOv3(nn.Module): 86 | """ 87 | YOLOv3 model module. The module list is defined by create_yolov3_modules function. \ 88 | The network returns loss values from three YOLO layers during training \ 89 | and detection results during test. 90 | """ 91 | def __init__(self, num_classes = 80, ignore_thre=0.7, label_smooth = False, rfb=False, vis=False, asff=False): 92 | """ 93 | Initialization of YOLOv3 class. 94 | Args: 95 | ignore_thre (float): used in YOLOLayer. 96 | """ 97 | super(YOLOv3, self).__init__() 98 | self.module_list = create_yolov3_mobilenet_v2(num_classes) 99 | 100 | if asff: 101 | self.level_0_conv =ASFFmobile(level=0,rfb=rfb,vis=vis) 102 | else: 103 | self.level_0_conv =add_conv(in_ch=512, out_ch=1024, ksize=3, stride=1,leaky=False) 104 | 105 | self.level_0_header = YOLOv3Head(anch_mask=[6, 7, 8], n_classes=num_classes, stride=32, in_ch=1024, 106 | ignore_thre=ignore_thre,label_smooth = label_smooth, rfb=rfb, sep=True) 107 | 108 | if asff: 109 | self.level_1_conv =ASFFmobile(level=1,rfb=rfb,vis=vis) 110 | else: 111 | self.level_1_conv =add_conv(in_ch=256, out_ch=512, ksize=3, stride=1,leaky=False) 112 | 113 | self.level_1_header = YOLOv3Head(anch_mask=[3, 4, 5], n_classes=num_classes, stride=16, in_ch=512, 114 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb, sep=True) 115 | 116 | if asff: 117 | self.level_2_conv =ASFFmobile(level=2,rfb=rfb,vis=vis) 118 | else: 119 | self.level_2_conv =add_conv(in_ch=128, out_ch=256, ksize=3, stride=1,leaky=False) 120 | 121 | self.level_2_header = YOLOv3Head(anch_mask=[0, 1, 2], n_classes=num_classes, stride=8, in_ch=256, 122 | ignore_thre=ignore_thre, label_smooth = label_smooth, rfb=rfb, sep=True) 123 | self.asff = asff 124 | 125 | def forward(self, x, targets=None, epoch=0): 126 | """ 127 | Forward path of YOLOv3. 128 | Args: 129 | x (torch.Tensor) : input data whose shape is :math:`(N, C, H, W)`, \ 130 | where N, C are batchsize and num. of channels. 
131 | targets (torch.Tensor) : label array whose shape is :math:`(N, 50, 5)` 132 | 133 | Returns: 134 | training: 135 | output (torch.Tensor): loss tensor for backpropagation. 136 | test: 137 | output (torch.Tensor): concatenated detection results. 138 | """ 139 | 140 | train = targets is not None 141 | output = [] 142 | anchor_losses= [] 143 | iou_losses = [] 144 | l1_losses = [] 145 | conf_losses = [] 146 | cls_losses = [] 147 | route_layers = [] 148 | 149 | for i, module in enumerate(self.module_list): 150 | 151 | # yolo layers 152 | x = module(x) 153 | 154 | # route layers 155 | if i in [6, 13, 25, 32, 39]: 156 | route_layers.append(x) 157 | if i == 27: 158 | x = torch.cat((x, route_layers[1]), 1) 159 | if i == 34: 160 | x = torch.cat((x, route_layers[0]), 1) 161 | 162 | 163 | for l in range(3): 164 | conver = getattr(self, 'level_{}_conv'.format(l)) 165 | header = getattr(self, 'level_{}_header'.format(l)) 166 | if self.asff: 167 | f_conv= conver(route_layers[2],route_layers[3],route_layers[4]) 168 | else: 169 | f_conv = conver(route_layers[l+2]) 170 | if train: 171 | x, anchor_loss, iou_loss, l1_loss, conf_loss, cls_loss = header(f_conv, targets) 172 | anchor_losses.append(anchor_loss) 173 | iou_losses.append(iou_loss) 174 | l1_losses.append(l1_loss) 175 | conf_losses.append(conf_loss) 176 | cls_losses.append(cls_loss) 177 | else: 178 | x = header(f_conv) 179 | 180 | output.append(x) 181 | 182 | if train: 183 | losses = torch.stack(output, 0).unsqueeze(0).sum(1,keepdim=True) 184 | anchor_losses = torch.stack(anchor_losses, 0).unsqueeze(0).sum(1,keepdim=True) 185 | iou_losses = torch.stack(iou_losses, 0).unsqueeze(0).sum(1,keepdim=True) 186 | l1_losses = torch.stack(l1_losses, 0).unsqueeze(0).sum(1,keepdim=True) 187 | conf_losses = torch.stack(conf_losses, 0).unsqueeze(0).sum(1,keepdim=True) 188 | cls_losses = torch.stack(cls_losses, 0).unsqueeze(0).sum(1,keepdim=True) 189 | loss_dict = dict( 190 | losses = losses, 191 | anchor_losses = anchor_losses, 192 | iou_losses = iou_losses, 193 | l1_losses = l1_losses, 194 | conf_losses = conf_losses, 195 | cls_losses = cls_losses, 196 | ) 197 | return loss_dict 198 | else: 199 | return torch.cat(output, 1) 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Spatial Fusion for Single-Shot Object Detection 2 | 3 | By Songtao Liu, Di Huang, Yunhong Wang 4 | 5 | ### Introduction 6 | In this work, we propose a novel and data driven strategy for pyramidal feature fusion, referred to as adaptively spatial feature fusion (ASFF). It learns the way to spatially filter conflictive information to suppress the inconsistency, thus improving the scale-invariance of features, and introduces nearly free inference overhead. For more details, please refer to our [arXiv paper](https://arxiv.org/abs/1911.09516). 7 | 8 | 9 | 10 | ### Updates: 11 | - Add MobileNet V2! 12 | * The previous models actually are all trained with the wrong anchor setting, we fix the error on mobileNet model. 13 | * We currently not support rfb, dropblock and Feature Adaption for mobileNet V2. 14 | * FP16 training for mobileNet is not working now. I didn't figure it out. 15 | * FP16 testing for mobileNet drops about 0.2 mAP. 
16 | 17 | - Add a demo.py file 18 | 19 | - Faster NMS (adopt official implementation) 20 | 21 | ### COCO 22 | 23 | | System | *test-dev mAP* | **Time** (V100) | **Time** (2080ti)| 24 | |:-------|:-----:|:-------:|:-------:| 25 | | [YOLOv3 608](http://pjreddie.com/darknet/yolo/) | 33.0 | 20ms| 26ms| 26 | | YOLOv3 608+ [BoFs](https://arxiv.org/abs/1902.04103) | 37.0 | 20ms | 26ms| 27 | | YOLOv3 608 (our baseline) | **38.8** | 20ms | 26ms| 28 | | YOLOv3 608+ ASFF | **40.6** | 22ms | 30ms| 29 | | YOLOv3 608+ ASFF\* | **42.4** | 22ms | 30ms| 30 | | YOLOv3 800+ ASFF\* | **43.9** | 34ms | 38ms| 31 | | YOLOv3 MobileNetV1 416 + [BoFs](https://arxiv.org/abs/1902.04103)| 28.6 | - | 22 ms| 32 | | YOLOv3 MobileNetV2 416 (our baseline) | 29.0 | - | 22 ms| 33 | | YOLOv3 MobileNetV2 416 +ASFF | **30.6** | - | 24 ms| 34 | 35 | 36 | ### Citing 37 | Please cite our paper in your publications if it helps your research: 38 | 39 | @article{liu2019asff, 40 | title = {Learning Spatial Fusion for Single-Shot Object Detection}, 41 | author = {Songtao Liu, Di Huang and Yunhong Wang}, 42 | booktitle = {arxiv preprint arXiv:1911.09516}, 43 | year = {2019} 44 | } 45 | 46 | ### Contents 47 | 1. [Installation](#installation) 48 | 2. [Datasets](#datasets) 49 | 3. [Training](#training) 50 | 4. [Evaluation](#evaluation) 51 | 5. [Models](#models) 52 | 53 | ## Installation 54 | - Install [PyTorch-1.3.1](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 55 | - Clone this repository. 56 | * Note: We currently only support PyTorch-1.0.0+ and Python 3+. 57 | - Compile the DCN layer (ported from [DCNv2 implementation](https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0)): 58 | ```Shell 59 | ./make.sh 60 | ``` 61 | 62 | ### Prerequisites 63 | - We also use [apex](https://github.com/NVIDIA/apex), numpy, opencv, tqdm, pyyaml, matplotlib, scikit-image... 64 | * Note: We use apex for distributed training and synchronized batch normalization. For FP16 training, since the current apex version have some [issues](https://github.com/NVIDIA/apex/issues/318), we use the old version of FP16_Optimizer, and split the code in ./utils/fp_utils. 65 | 66 | - We also support tensorboard if you have installed it. 67 | 68 | ### Demo 69 | 70 | ```Shell 71 | python demo.py -i /path/to/your/image \ 72 | --cfg config/yolov3_baseline.cfg -d COCO \ 73 | --checkpoint /path/to/you/weights --half --asff --rfb -s 608 74 | ``` 75 | - Note: 76 | * -i, --img: image path. 77 | * --cfg: config files. 78 | * -d: choose datasets, COCO or VOC. 79 | * -c, --checkpoint: pretrained weights. 80 | * --half: FP16 testing. 81 | * -s: evaluation image size, from 320 to 608 as in YOLOv3. 82 | 83 | 84 | ## Datasets 85 | Note: We currently only support [COCO](http://mscoco.org/) and [VOC](http://host.robots.ox.ac.uk/pascal/VOC/). 86 | To make things easy, we provide simple COCO and VOC dataset loader that inherits `torch.utils.data.Dataset` making it fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html). 87 | 88 | Moreover, we also implement the Mix-up strategy in [BoFs](https://arxiv.org/abs/1902.04103) and distributed random resizing in YOLov3. 89 | ### COCO Dataset 90 | Install the MS COCO dataset at /path/to/coco from [official website](http://mscoco.org/), default is ./data/COCO, and a soft-link is recommended. 
91 | ``` 92 | ln -s /path/to/coco ./data/COCO 93 | ``` 94 | 95 | It should have this basic structure 96 | ```Shell 97 | $COCO/ 98 | $COCO/annotations/ 99 | $COCO/images/ 100 | $COCO/images/test2017/ 101 | $COCO/images/train2017/ 102 | $COCO/images/val2017/ 103 | ``` 104 | The current COCO dataset has released new *train2017* and *val2017* sets, and we defaultly train our model on *train2017* and evaluate on *val2017*. 105 | 106 | ### VOC Dataset 107 | Install the VOC dataset as ./data/VOC. We also recommend a soft-link: 108 | ``` 109 | ln -s /path/to/VOCdevkit ./data/VOC 110 | ``` 111 | 112 | ## Training 113 | 114 | - First download the mix-up pretrained [Darknet-53](https://arxiv.org/abs/1902.04103) PyTorch base network weights at: https://drive.google.com/open?id=1phqyYhV1K9KZLQZH1kENTAPprLBmymfP 115 | or from our [BaiduYun Driver](https://pan.baidu.com/s/19PaXl6p9vXHG2ZuGqtfLOg) 116 | 117 | - For MobileNetV2, we use the pytorch official [weights](https://drive.google.com/open?id=1LwMd9lK6YqGM8Yjf_ClBT2MG1-PHgUGa) (change the key name to fit our code), or from our [BaiduYun Driver](https://pan.baidu.com/s/12eScI6YNBvkVX0286cMEZA) 118 | 119 | - By default, we assume you have downloaded the file in the `ASFF/weights` dir: 120 | 121 | - Since random resizing consumes much more GPU memory, we implement FP16 training with an old version of apex. 122 | 123 | - We currently **ONLY** test the code with distributed training on multiple GPUs (10 2080ti or 4 Tesla V100). 124 | 125 | - To train YOLOv3 baseline (ours) using the train script simply specify the parameters listed in `main.py` as a flag or manually change them on config/yolov3_baseline.cfg: 126 | ```Shell 127 | python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} main.py \ 128 | --cfg config/yolov3_baseline.cfg -d COCO --tfboard --distributed --ngpu 10 \ 129 | --checkpoint weights/darknet53_feature_mx.pth --start_epoch 0 --half --log_dir log/COCO -s 608 130 | ``` 131 | - Note: 132 | * --cfg: config files. 133 | * --tfboard: use tensorboard. 134 | * --distributed: distributed training (we only test the code with distributed training) 135 | * -d: choose datasets, COCO or VOC. 136 | * --ngpu: number of GPUs. 137 | * -c, --checkpoint: pretrained weights or resume weights. You can pick-up training from a checkpoint by specifying the path as one of the training parameters (again, see `main.py` for options) 138 | 139 | * --start_epoch: used for resume training. 140 | * --half: FP16 training. 141 | * --log_dir: log dir for tensorboard. 142 | * -s: evaluation image size, from 320 to 608 as in YOLOv3. 143 | 144 | - To train YOLOv3 with ASFF or ASFF\*, you only need add some addional flags: 145 | ```Shell 146 | python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} main.py \ 147 | --cfg config/yolov3_baseline.cfg -d COCO --tfboard --distributed --ngpu 10 \ 148 | --checkpoint weights/darknet53_feature_mx.pth --start_epoch 0 --half --asff --rfb --dropblock \ 149 | --log_dir log/COCO_ASFF -s 608 150 | ``` 151 | - Note: 152 | * --asff: add ASFF module on YOLOv3. 153 | * --rfb: use [RFB](https://github.com/ruinmessi/RFBNet) moduel on ASFF. 154 | * --dropblock: use [DropBlock](https://arxiv.org/abs/1810.12890). 
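- To resume an interrupted run, reuse the same command but point `-c/--checkpoint` at the weights saved by the trainer and set `--start_epoch` to the epoch you stopped at. A minimal sketch (the checkpoint path and epoch number below are placeholders for your own run):
```Shell
python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} main.py \
        --cfg config/yolov3_baseline.cfg -d COCO --tfboard --distributed --ngpu 10 \
        --checkpoint /path/to/your/saved_checkpoint.pth --start_epoch 100 --half --asff --rfb --dropblock \
        --log_dir log/COCO_ASFF -s 608
```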
155 | 156 | ## Evaluation 157 | To evaluate a trained network, you can use the following command: 158 | 159 | ```Shell 160 | python -m torch.distributed.launch --nproc_per_node=10 --master_port=${RANDOM+10000} eval.py \ 161 | --cfg config/yolov3_baseline.cfg -d COCO --distributed --ngpu 10 \ 162 | --checkpoint /path/to/you/weights --half --asff --rfb -s 608 163 | ``` 164 | - Note: 165 | * --vis: Visualization of ASFF. 166 | * --testset: evaluate on COCO *test-dev*. 167 | * -s: evaluation image size. 168 | 169 | By default, it will directly output the mAP results on COCO *val2017* or VOC *test 2007*. 170 | 171 | ## Models 172 | * yolov3 mobilenetv2 (ours)[weights](https://drive.google.com/open?id=1XGXJPXHIroimEuW8oujbInNapuEDALOB) [baiduYun](https://pan.baidu.com/s/100TivomBLDTRZSA1pkGiNA) [training tfboard log](https://pan.baidu.com/s/1P_00LAUvV-VOzxqoIxC_Yw) 173 | 174 | * yolov3 mobilenetv2 +asff [weights](https://drive.google.com/open?id=1cC-xGoaw3Wu5hYd3iXEq6xrAn4U_dW-w) [baiduYun](https://pan.baidu.com/s/1JxX8mYkljk1ap2s4zpLrSg) [training tfboard log](https://pan.baidu.com/s/1R2YL9uZ9baQWR6aht0qVlQ) 175 | 176 | * yolov3_baseline (ours) [weights](https://drive.google.com/open?id=1RbjUQbNxl4cEbk-6jFkFnOHRukJY5EQk) [baiduYun](https://pan.baidu.com/s/131JhlaOBbeL9l4tqiJO9yA) [training tfboard log](https://pan.baidu.com/s/1GcpVnq7mhIsrk8zrJ9FF2g) 177 | 178 | * yolov3_asff [weights](https://drive.google.com/open?id=1Dyf8ZEga_VT2O3_c5nrFJA5uON1aSJK-) [baiduYun](https://pan.baidu.com/s/1a-eQZ0kDpsnUooD4RtRdxg) [training tfboard log](https://pan.baidu.com/s/1MeMkAWwv1SFsVbvsTpj_xQ) 179 | 180 | * yolov3_asff\* (320-608) [weights](https://drive.google.com/open?id=1N668Za8OBbJbUStYde0ml9SZdM7tabXy) [baiduYun](https://pan.baidu.com/s/1d9hOQBj20HCy51qWbonxMQ) 181 | 182 | * yolov3_asff\* (480-800) [weights](https://drive.google.com/open?id=18N4_nNVqYbjawerEHQnwJGPcRvcLOe06) [baiduYun](https://pan.baidu.com/s/1HERhiP4vmUekxxm5KQrX8g) 183 | 184 | 185 | -------------------------------------------------------------------------------- /dataset/dataloading.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | from functools import wraps 4 | import torch 5 | from torch.utils.data.dataset import Dataset as torchDataset 6 | from torch.utils.data.sampler import BatchSampler as torchBatchSampler 7 | from torch.utils.data.dataloader import DataLoader as torchDataLoader 8 | from torch.utils.data.dataloader import default_collate 9 | 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class Dataset(torchDataset): 15 | """ This class is a subclass of the base :class:`torch.utils.data.Dataset`, 16 | that enables on the fly resizing of the ``input_dim`` with a :class:`lightnet.data.DataLoader`. 17 | 18 | Args: 19 | input_dimension (tuple): (width,height) tuple with default dimensions of the network 20 | """ 21 | def __init__(self, input_dimension): 22 | super().__init__() 23 | self.__input_dim = input_dimension[:2] 24 | 25 | @property 26 | def input_dim(self): 27 | """ Dimension that can be used by transforms to set the correct image size, etc. 28 | This allows transforms to have a single source of truth for the input dimension of the network. 
29 | 30 | Return: 31 | list: Tuple containing the current width,height 32 | """ 33 | if hasattr(self, '_input_dim'): 34 | return self._input_dim 35 | return self.__input_dim 36 | 37 | @staticmethod 38 | def resize_getitem(getitem_fn): 39 | """ Decorator method that needs to be used around the ``__getitem__`` method. |br| 40 | This decorator enables the on the fly resizing of the ``input_dim`` with our :class:`~lightnet.data.DataLoader` class. 41 | 42 | Example: 43 | >>> class CustomSet(ln.data.Dataset): 44 | ... def __len__(self): 45 | ... return 10 46 | ... @ln.data.Dataset.resize_getitem 47 | ... def __getitem__(self, index): 48 | ... # Should return (image, anno) but here we return input_dim 49 | ... return self.input_dim 50 | >>> data = CustomSet((200,200)) 51 | >>> data[0] 52 | (200, 200) 53 | >>> data[(480,320), 0] 54 | (480, 320) 55 | """ 56 | @wraps(getitem_fn) 57 | def wrapper(self, index): 58 | if not isinstance(index, int): 59 | has_dim = True 60 | self._input_dim = index[0] 61 | index = index[1] 62 | else: 63 | has_dim = False 64 | 65 | ret_val = getitem_fn(self, index) 66 | 67 | if has_dim: 68 | del self._input_dim 69 | 70 | return ret_val 71 | 72 | return wrapper 73 | 74 | 75 | class DataLoader(torchDataLoader): 76 | """ Lightnet dataloader that enables on the fly resizing of the images. 77 | See :class:`torch.utils.data.DataLoader` for more information on the arguments. 78 | 79 | Note: 80 | This dataloader only works with :class:`lightnet.data.Dataset` based datasets. 81 | 82 | Example: 83 | >>> class CustomSet(ln.data.Dataset): 84 | ... def __len__(self): 85 | ... return 4 86 | ... @ln.data.Dataset.resize_getitem 87 | ... def __getitem__(self, index): 88 | ... # Should return (image, anno) but here we return (input_dim,) 89 | ... return (self.input_dim,) 90 | >>> dl = ln.data.DataLoader( 91 | ... CustomSet((200,200)), 92 | ... batch_size = 2, 93 | ... collate_fn = ln.data.list_collate # We want the data to be grouped as a list 94 | ... ) 95 | >>> dl.dataset.input_dim # Default input_dim 96 | (200, 200) 97 | >>> for d in dl: 98 | ... d 99 | [[(200, 200), (200, 200)]] 100 | [[(200, 200), (200, 200)]] 101 | >>> dl.change_input_dim(320, random_range=None) 102 | (320, 320) 103 | >>> for d in dl: 104 | ... d 105 | [[(320, 320), (320, 320)]] 106 | [[(320, 320), (320, 320)]] 107 | >>> dl.change_input_dim((480, 320), random_range=None) 108 | (480, 320) 109 | >>> for d in dl: 110 | ... 
d 111 | [[(480, 320), (480, 320)]] 112 | [[(480, 320), (480, 320)]] 113 | """ 114 | def __init__(self, *args, **kwargs): 115 | super().__init__(*args, **kwargs) 116 | self.__initialized = False 117 | shuffle = False 118 | batch_sampler = None 119 | if len(args) > 5: 120 | shuffle = args[2] 121 | sampler = args[3] 122 | batch_sampler = args[4] 123 | elif len(args) > 4: 124 | shuffle = args[2] 125 | sampler = args[3] 126 | if 'batch_sampler' in kwargs: 127 | batch_sampler = kwargs['batch_sampler'] 128 | elif len(args) > 3: 129 | shuffle = args[2] 130 | if 'sampler' in kwargs: 131 | sampler = kwargs['sampler'] 132 | if 'batch_sampler' in kwargs: 133 | batch_sampler = kwargs['batch_sampler'] 134 | else: 135 | if 'shuffle' in kwargs: 136 | shuffle = kwargs['shuffle'] 137 | if 'sampler' in kwargs: 138 | sampler = kwargs['sampler'] 139 | if 'batch_sampler' in kwargs: 140 | batch_sampler = kwargs['batch_sampler'] 141 | 142 | # Use custom BatchSampler 143 | if batch_sampler is None: 144 | if sampler is None: 145 | if shuffle: 146 | sampler = torch.utils.data.sampler.RandomSampler(self.dataset) 147 | #sampler = torch.utils.data.DistributedSampler(self.dataset) 148 | else: 149 | sampler = torch.utils.data.sampler.SequentialSampler(self.dataset) 150 | batch_sampler = YoloBatchSampler(sampler, self.batch_size, self.drop_last, input_dimension=self.dataset.input_dim) 151 | #batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations = 152 | 153 | self.batch_sampler = batch_sampler 154 | 155 | self.__initialized = True 156 | 157 | def change_input_dim(self, multiple=32, random_range=(10, 19)): 158 | """ This function will compute a new size and update it on the next mini_batch. 159 | 160 | Args: 161 | multiple (int or tuple, optional): value (or values) to multiply the randomly generated range by; Default **32** 162 | random_range (tuple, optional): This (min, max) tuple sets the range for the randomisation; Default **(10, 19)** 163 | 164 | Return: 165 | tuple: width, height tuple with new dimension 166 | 167 | Note: 168 | The new size is generated as follows: |br| 169 | First we compute a random integer inside ``[random_range]``. 170 | We then multiply that number with the ``multiple`` argument, which gives our final new input size. |br| 171 | If ``multiple`` is an integer we generate a square size. If you give a tuple of **(width, height)**, 172 | the size is computed as :math:`rng * multiple[0], rng * multiple[1]`. 173 | 174 | Note: 175 | You can set the ``random_range`` argument to **None** to set an exact size of multiply. |br| 176 | See the example above for how this works. 177 | """ 178 | if random_range is None: 179 | size = 1 180 | else: 181 | size = random.randint(*random_range) 182 | 183 | if isinstance(multiple, int): 184 | size = (size * multiple, size * multiple) 185 | else: 186 | size = (size * multiple[0], size * multiple[1]) 187 | 188 | self.batch_sampler.new_input_dim = size 189 | 190 | return size 191 | 192 | 193 | class YoloBatchSampler(torchBatchSampler): 194 | """ This batch sampler will generate mini-batches of (dim, index) tuples from another sampler. 195 | It works just like the :class:`torch.utils.data.sampler.BatchSampler`, but it will prepend a dimension, 196 | whilst ensuring it stays the same across one mini-batch. 
197 | """ 198 | def __init__(self, *args, input_dimension=None, **kwargs): 199 | super().__init__(*args, **kwargs) 200 | self.input_dim = input_dimension 201 | self.new_input_dim = None 202 | 203 | def __iter__(self): 204 | self.__set_input_dim() 205 | for batch in super().__iter__(): 206 | yield [(self.input_dim, idx) for idx in batch] 207 | self.__set_input_dim() 208 | 209 | def __set_input_dim(self): 210 | """ This function randomly changes the the input dimension of the dataset. """ 211 | if self.new_input_dim is not None: 212 | log.info(f'Resizing network {self.new_input_dim[:2]}') 213 | self.input_dim = (self.new_input_dim[0], self.new_input_dim[1]) 214 | self.new_input_dim = None 215 | 216 | class IterationBasedBatchSampler(torchBatchSampler): 217 | """ 218 | Wraps a BatchSampler, resampling from it until 219 | a specified number of iterations have been sampled 220 | """ 221 | 222 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 223 | self.batch_sampler = batch_sampler 224 | self.num_iterations = num_iterations 225 | self.start_iter = start_iter 226 | 227 | def __iter__(self): 228 | iteration = self.start_iter 229 | while iteration <= self.num_iterations: 230 | # if the underlying sampler has a set_epoch method, like 231 | # DistributedSampler, used for making each process see 232 | # a different split of the dataset, then set it 233 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 234 | self.batch_sampler.sampler.set_epoch(iteration) 235 | for batch in self.batch_sampler: 236 | iteration += 1 237 | if iteration > self.num_iterations: 238 | break 239 | yield batch 240 | 241 | def __len__(self): 242 | return self.num_iterations 243 | 244 | def list_collate(batch): 245 | """ Function that collates lists or tuples together into one list (of lists/tuples). 246 | Use this as the collate function in a Dataloader, if you want to have a list of items as an output, as opposed to tensors (eg. Brambox.boxes). 
247 | """ 248 | items = list(zip(*batch)) 249 | 250 | for i in range(len(items)): 251 | if isinstance(items[i][0], (list, tuple)): 252 | items[i] = list(items[i]) 253 | else: 254 | items[i] = default_collate(items[i]) 255 | 256 | return items 257 | 258 | -------------------------------------------------------------------------------- /dataset/vocdataset.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import pickle 11 | import os.path 12 | import sys 13 | import torch 14 | import torch.utils.data as data 15 | import torchvision.transforms as transforms 16 | import cv2 17 | import numpy as np 18 | from .voc_eval import voc_eval 19 | from .dataloading import Dataset 20 | if sys.version_info[0] == 2: 21 | import xml.etree.cElementTree as ET 22 | else: 23 | import xml.etree.ElementTree as ET 24 | 25 | 26 | #VOC_CLASSES = ( '__background__', # always index 0 27 | VOC_CLASSES = ( 28 | 'aeroplane', 'bicycle', 'bird', 'boat', 29 | 'bottle', 'bus', 'car', 'cat', 'chair', 30 | 'cow', 'diningtable', 'dog', 'horse', 31 | 'motorbike', 'person', 'pottedplant', 32 | 'sheep', 'sofa', 'train', 'tvmonitor') 33 | 34 | # for making bounding boxes pretty 35 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 36 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 37 | 38 | 39 | 40 | class AnnotationTransform(object): 41 | 42 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 43 | Initilized with a dictionary lookup of classnames to indexes 44 | 45 | Arguments: 46 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 47 | (default: alphabetic indexing of VOC's 20 classes) 48 | keep_difficult (bool, optional): keep difficult instances or not 49 | (default: False) 50 | height (int): height 51 | width (int): width 52 | """ 53 | 54 | def __init__(self, class_to_ind=None, keep_difficult=True): 55 | self.class_to_ind = class_to_ind or dict( 56 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 57 | self.keep_difficult = keep_difficult 58 | 59 | def __call__(self, target): 60 | """ 61 | Arguments: 62 | target (annotation) : the target annotation to be made usable 63 | will be an ET.Element 64 | Returns: 65 | a list containing lists of bounding boxes [bbox coords, class name] 66 | """ 67 | res = np.empty((0,5)) 68 | for obj in target.iter('object'): 69 | difficult = int(obj.find('difficult').text) == 1 70 | if not self.keep_difficult and difficult: 71 | continue 72 | name = obj.find('name').text.lower().strip() 73 | bbox = obj.find('bndbox') 74 | 75 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 76 | bndbox = [] 77 | for i, pt in enumerate(pts): 78 | cur_pt = int(bbox.find(pt).text) - 1 79 | # scale height or width 80 | #cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 81 | bndbox.append(cur_pt) 82 | label_idx = self.class_to_ind[name] 83 | bndbox.append(label_idx) 84 | res = np.vstack((res,bndbox)) # [xmin, ymin, xmax, ymax, label_ind] 85 | # img_id = target.find('filename').text[:-4] 86 | 87 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 88 | 89 | 90 | class VOCDetection(Dataset): 91 | 92 | """VOC Detection Dataset Object 93 | 94 | input is image, target is annotation 95 | 96 | Arguments: 97 | root (string): filepath to VOCdevkit folder. 
98 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 99 | transform (callable, optional): transformation to perform on the 100 | input image 101 | target_transform (callable, optional): transformation to perform on the 102 | target `annotation` 103 | (eg: take in caption string, return tensor of word indices) 104 | dataset_name (string, optional): which dataset to load 105 | (default: 'VOC2007') 106 | """ 107 | 108 | def __init__(self, root, image_sets, preproc=None, target_transform=AnnotationTransform(), input_dim=(416,416), 109 | dataset_name='VOC0712'): 110 | super().__init__(input_dim) 111 | self.root = root 112 | self.image_set = image_sets 113 | self.preproc = preproc 114 | self.target_transform = target_transform 115 | self.name = dataset_name 116 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 117 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 118 | self._classes=VOC_CLASSES 119 | self.ids = list() 120 | for (year, name) in image_sets: 121 | self._year = year 122 | rootpath = os.path.join(self.root, 'VOC' + year) 123 | for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 124 | self.ids.append((rootpath, line.strip())) 125 | 126 | @Dataset.resize_getitem 127 | def __getitem__(self, index): 128 | img_id = self.ids[index] 129 | target = ET.parse(self._annopath % img_id).getroot() 130 | img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 131 | #img = Image.open(self._imgpath % img_id).convert('RGB') 132 | 133 | height, width, _ = img.shape 134 | 135 | if self.target_transform is not None: 136 | target = self.target_transform(target) 137 | 138 | 139 | if self.preproc is not None: 140 | img, target = self.preproc(img, target, self.input_dim) 141 | #print(img.size()) 142 | 143 | img_info = (width, height) 144 | 145 | return img, target, img_info, img_id 146 | 147 | def __len__(self): 148 | return len(self.ids) 149 | 150 | def pull_image(self, index): 151 | '''Returns the original image object at index in PIL form 152 | 153 | Note: not using self.__getitem__(), as any transformations passed in 154 | could mess up this functionality. 155 | 156 | Argument: 157 | index (int): index of img to show 158 | Return: 159 | PIL img 160 | ''' 161 | img_id = self.ids[index] 162 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 163 | 164 | def pull_anno(self, index): 165 | '''Returns the original annotation of image at index 166 | 167 | Note: not using self.__getitem__(), as any transformations passed in 168 | could mess up this functionality. 169 | 170 | Argument: 171 | index (int): index of img to get annotation of 172 | Return: 173 | list: [img_id, [(label, bbox coords),...]] 174 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 175 | ''' 176 | img_id = self.ids[index] 177 | anno = ET.parse(self._annopath % img_id).getroot() 178 | gt = self.target_transform(anno, 1, 1) 179 | return img_id[1], gt 180 | 181 | def pull_item(self, index): 182 | '''Returns the original image and target at an index for mixup 183 | 184 | Note: not using self.__getitem__(), as any transformations passed in 185 | could mess up this functionality. 
186 | 187 | Argument: 188 | index (int): index of img to show 189 | Return: 190 | img, target 191 | ''' 192 | img_id = self.ids[index] 193 | target = ET.parse(self._annopath % img_id).getroot() 194 | img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 195 | 196 | height, width, _ = img.shape 197 | 198 | img_info = (width, height) 199 | if self.target_transform is not None: 200 | target = self.target_transform(target) 201 | 202 | return img, target, img_info, img_id 203 | 204 | def evaluate_detections(self, all_boxes, output_dir=None): 205 | """ 206 | all_boxes is a list of length number-of-classes. 207 | Each list element is a list of length number-of-images. 208 | Each of those list elements is either an empty list [] 209 | or a numpy array of detection. 210 | 211 | all_boxes[class][image] = [] or np.array of shape #dets x 5 212 | """ 213 | self._write_voc_results_file(all_boxes) 214 | IouTh = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) 215 | mAPs = [] 216 | for iou in IouTh: 217 | mAP = self._do_python_eval(output_dir,iou) 218 | mAPs.append(mAP) 219 | 220 | print('--------------------------------------------------------------') 221 | print('map_5095:', np.mean(mAPs)) 222 | print('map_50:', mAPs[0]) 223 | print('--------------------------------------------------------------') 224 | return np.mean(mAPs), mAPs[0] 225 | 226 | def _get_voc_results_file_template(self): 227 | filename = 'comp4_det_test' + '_{:s}.txt' 228 | filedir = os.path.join( 229 | self.root, 'results', 'VOC' + self._year, 'Main') 230 | if not os.path.exists(filedir): 231 | os.makedirs(filedir) 232 | path = os.path.join(filedir, filename) 233 | return path 234 | 235 | def _write_voc_results_file(self, all_boxes): 236 | for cls_ind, cls in enumerate(VOC_CLASSES): 237 | cls_ind = cls_ind 238 | if cls == '__background__': 239 | continue 240 | print('Writing {} VOC results file'.format(cls)) 241 | filename = self._get_voc_results_file_template().format(cls) 242 | with open(filename, 'wt') as f: 243 | for im_ind, index in enumerate(self.ids): 244 | index = index[1] 245 | dets = all_boxes[cls_ind][im_ind] 246 | if dets == []: 247 | continue 248 | for k in range(dets.shape[0]): 249 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 250 | format(index, dets[k, -1], 251 | dets[k, 0] + 1, dets[k, 1] + 1, 252 | dets[k, 2] + 1, dets[k, 3] + 1)) 253 | 254 | def _do_python_eval(self, output_dir='output', iou = 0.5): 255 | rootpath = os.path.join(self.root, 'VOC' + self._year) 256 | name = self.image_set[0][1] 257 | annopath = os.path.join( 258 | rootpath, 259 | 'Annotations', 260 | '{:s}.xml') 261 | imagesetfile = os.path.join( 262 | rootpath, 263 | 'ImageSets', 264 | 'Main', 265 | name+'.txt') 266 | cachedir = os.path.join(self.root, 'annotations_cache', 'VOC'+self._year, name) 267 | if not os.path.exists(cachedir): 268 | os.makedirs(cachedir) 269 | aps = [] 270 | # The PASCAL VOC metric changed in 2010 271 | use_07_metric = True if int(self._year) < 2010 else False 272 | print('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) 273 | if output_dir is not None and not os.path.isdir(output_dir): 274 | os.mkdir(output_dir) 275 | for i, cls in enumerate(VOC_CLASSES): 276 | 277 | if cls == '__background__': 278 | continue 279 | 280 | filename = self._get_voc_results_file_template().format(cls) 281 | rec, prec, ap = voc_eval( 282 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=iou, 283 | use_07_metric=use_07_metric) 284 | aps += [ap] 285 | if iou == 0.5: 286 | print('AP for {} = {:.4f}'.format(cls, ap)) 287 | if output_dir is not None: 288 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 289 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 290 | if iou ==0.5: 291 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 292 | print('~~~~~~~~') 293 | print('Results:') 294 | for ap in aps: 295 | print('{:.3f}'.format(ap)) 296 | print('{:.3f}'.format(np.mean(aps))) 297 | print('~~~~~~~~') 298 | print('') 299 | print('--------------------------------------------------------------') 300 | print('Results computed with the **unofficial** Python eval code.') 301 | print('Results should be very close to the official MATLAB eval code.') 302 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 303 | print('-- Thanks, The Management') 304 | print('--------------------------------------------------------------') 305 | 306 | return np.mean(aps) 307 | -------------------------------------------------------------------------------- /models/yolov3_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from utils.utils import bboxes_iou 6 | import numpy as np 7 | from .utils_loss import * 8 | from .network_blocks import * 9 | 10 | class YOLOv3Head(nn.Module): 11 | def __init__(self, anch_mask, n_classes, stride, in_ch=1024, ignore_thre=0.7, label_smooth = False, rfb=False, sep=False): 12 | super(YOLOv3Head, self).__init__() 13 | self.anchors = [ 14 | (10, 13), (16, 30), (33, 23), 15 | (30, 61), (62, 45), (42, 119), 16 | (116, 90), (156, 198), (121, 240) ] 17 | if sep: 18 | self.anchors = [ 19 | (10, 13), (16, 30), (33, 23), 20 | (30, 61), (62, 45), (42, 119), 21 | (116, 90), (156, 198), (373, 326)] 22 | 23 | self.anch_mask = anch_mask 24 | self.n_anchors = 4 25 | self.n_classes = n_classes 26 | self.guide_wh = nn.Conv2d(in_channels=in_ch, 27 | out_channels=2*self.n_anchors, kernel_size=1, stride=1, padding=0) 28 | self.Feature_adaption=FeatureAdaption(in_ch, in_ch, self.n_anchors, rfb, sep) 29 | 30 | self.conv = nn.Conv2d(in_channels=in_ch, 31 | out_channels=self.n_anchors*(self.n_classes+5), kernel_size=1, stride=1, padding=0) 32 | self.ignore_thre = ignore_thre 33 | self.l1_loss = nn.L1Loss(reduction='none') 34 | #self.smooth_l1_loss = nn.SmoothL1Loss(reduction='none') 35 | self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction='none') 36 | self.bce_loss = nn.BCELoss(reduction='none') 37 | self.iou_loss = IOUloss(reduction='none') 38 | self.iou_wh_loss = IOUWH_loss(reduction='none') 39 | self.stride = stride 40 | self._label_smooth = label_smooth 41 | 42 | self.all_anchors_grid = self.anchors 43 | self.masked_anchors = [self.all_anchors_grid[i] 44 | for i in self.anch_mask] 45 | self.ref_anchors = np.zeros((len(self.all_anchors_grid), 4)) 46 | self.ref_anchors[:, 2:] = np.array(self.all_anchors_grid) 47 | self.ref_anchors = torch.FloatTensor(self.ref_anchors) 48 | 49 | def forward(self, xin, 
labels=None): 50 | """ 51 | In this 52 | Args: 53 | xin (torch.Tensor): input feature map whose size is :math:`(N, C, H, W)`, \ 54 | where N, C, H, W denote batchsize, channel width, height, width respectively. 55 | labels (torch.Tensor): label data whose size is :math:`(N, K, 5)`. \ 56 | N and K denote batchsize and number of labels. 57 | Each label consists of [class, xc, yc, w, h]: 58 | class (float): class index. 59 | xc, yc (float) : center of bbox whose values range from 0 to 1. 60 | w, h (float) : size of bbox whose values range from 0 to 1. 61 | Returns: 62 | loss (torch.Tensor): total loss - the target of backprop. 63 | loss_xy (torch.Tensor): x, y loss - calculated by binary cross entropy (BCE) \ 64 | with boxsize-dependent weights. 65 | loss_wh (torch.Tensor): w, h loss - calculated by l2 without size averaging and \ 66 | with boxsize-dependent weights. 67 | loss_obj (torch.Tensor): objectness loss - calculated by BCE. 68 | loss_cls (torch.Tensor): classification loss - calculated by BCE for each class. 69 | loss_l2 (torch.Tensor): total l2 loss - only for logging. 70 | """ 71 | 72 | wh_pred = self.guide_wh(xin) #Anchor guiding 73 | 74 | if xin.type() == 'torch.cuda.HalfTensor': #As DCN only support FP32 now, change the feature to float. 75 | wh_pred = wh_pred.float() 76 | if labels is not None: 77 | labels = labels.float() 78 | self.Feature_adaption = self.Feature_adaption.float() 79 | self.conv = self.conv.float() 80 | xin = xin.float() 81 | 82 | feature_adapted = self.Feature_adaption(xin, wh_pred) 83 | 84 | output = self.conv(feature_adapted) 85 | wh_pred = torch.exp(wh_pred) 86 | 87 | batchsize = output.shape[0] 88 | fsize = output.shape[2] 89 | image_size = fsize * self.stride 90 | n_ch = 5 + self.n_classes 91 | dtype = torch.cuda.FloatTensor if xin.is_cuda else torch.FloatTensor 92 | 93 | wh_pred = wh_pred.view(batchsize, self.n_anchors, 2 , fsize, fsize) 94 | wh_pred = wh_pred.permute(0, 1, 3, 4, 2).contiguous() 95 | 96 | output = output.view(batchsize, self.n_anchors, n_ch, fsize, fsize) 97 | output = output.permute(0,1,3,4,2).contiguous() 98 | 99 | x_shift = dtype(np.broadcast_to( 100 | np.arange(fsize, dtype=np.float32), output.shape[:4])) 101 | y_shift = dtype(np.broadcast_to( 102 | np.arange(fsize, dtype=np.float32).reshape(fsize, 1), output.shape[:4])) 103 | 104 | masked_anchors = np.array(self.masked_anchors) 105 | 106 | w_anchors = dtype(np.broadcast_to(np.reshape( 107 | masked_anchors[:, 0], (1, self.n_anchors-1, 1, 1)), [batchsize, self.n_anchors-1, fsize, fsize])) 108 | h_anchors = dtype(np.broadcast_to(np.reshape( 109 | masked_anchors[:, 1], (1, self.n_anchors-1, 1, 1)), [batchsize, self.n_anchors-1, fsize, fsize])) 110 | 111 | default_center = torch.zeros(batchsize, self.n_anchors, fsize, fsize, 2).type(dtype) 112 | 113 | pred_anchors = torch.cat((default_center, wh_pred), dim=-1).contiguous() 114 | 115 | anchors_based = pred_anchors[:, :self.n_anchors-1, :, :, :] #anchor branch 116 | anchors_free = pred_anchors[:, self.n_anchors-1, :, :, :] #anchor free branch 117 | anchors_based[...,2] *= w_anchors 118 | anchors_based[...,3] *= h_anchors 119 | anchors_free[...,2] *= self.stride*4 120 | anchors_free[...,3] *= self.stride*4 121 | pred_anchors[...,:2] = pred_anchors[...,:2].detach() 122 | 123 | if not self.training: 124 | 125 | pred = output.clone() 126 | pred[..., np.r_[:2, 4:n_ch]] = torch.sigmoid( 127 | pred[...,np.r_[:2, 4:n_ch]]) 128 | pred[...,0] += x_shift 129 | pred[...,1] += y_shift 130 | pred[...,:2] *= self.stride 131 | pred[...,2] = 
torch.exp(pred[...,2])*(pred_anchors[...,2]) 132 | pred[...,3] = torch.exp(pred[...,3])*(pred_anchors[...,3]) 133 | refined_pred = pred.view(batchsize, -1, n_ch) 134 | return refined_pred.data 135 | 136 | #training for anchor prediction 137 | if self.training: 138 | 139 | target = torch.zeros(batchsize, self.n_anchors, 140 | fsize, fsize, n_ch).type(dtype) 141 | l1_target = torch.zeros(batchsize, self.n_anchors, 142 | fsize, fsize, 4).type(dtype) 143 | tgt_scale = torch.zeros(batchsize, self.n_anchors, 144 | fsize, fsize, 4).type(dtype) 145 | obj_mask = torch.ones(batchsize, self.n_anchors, fsize, fsize).type(dtype) 146 | 147 | cls_mask = torch.zeros(batchsize, self.n_anchors, fsize, fsize, self.n_classes).type(dtype) 148 | coord_mask = torch.zeros(batchsize, self.n_anchors, fsize, fsize).type(dtype) 149 | anchor_mask = torch.zeros(batchsize, self.n_anchors, fsize, fsize).type(dtype) 150 | 151 | labels = labels.data 152 | mixup = labels.shape[2]>5 153 | if mixup: 154 | label_cut = labels[...,:5] 155 | else: 156 | label_cut = labels 157 | nlabel = (label_cut.sum(dim=2) > 0).sum(dim=1) # number of objects 158 | 159 | truth_x_all = labels[:, :, 1] * 1. 160 | truth_y_all = labels[:, :, 2] * 1. 161 | truth_w_all = labels[:, :, 3] * 1. 162 | truth_h_all = labels[:, :, 4] * 1. 163 | truth_i_all = (truth_x_all/image_size*fsize).to(torch.int16).cpu().numpy() 164 | truth_j_all = (truth_y_all/image_size*fsize).to(torch.int16).cpu().numpy() 165 | 166 | pred = output.clone() 167 | pred[..., np.r_[:2, 4:n_ch]] = torch.sigmoid( 168 | pred[...,np.r_[:2, 4:n_ch]]) 169 | pred[...,0] += x_shift 170 | pred[...,1] += y_shift 171 | pred[...,2] = torch.exp(pred[...,2])*(pred_anchors[...,2]) 172 | pred[...,3] = torch.exp(pred[...,3])*(pred_anchors[...,3]) 173 | pred[...,:2] *= self.stride 174 | 175 | pred_boxes = pred[...,:4].data 176 | for b in range(batchsize): 177 | n = int(nlabel[b]) 178 | if n == 0: 179 | continue 180 | 181 | truth_box = dtype(np.zeros((n, 4))) 182 | truth_box[:n, 2] = truth_w_all[b, :n] 183 | truth_box[:n, 3] = truth_h_all[b, :n] 184 | truth_i = truth_i_all[b, :n] 185 | truth_j = truth_j_all[b, :n] 186 | 187 | # calculate iou between truth and reference anchors 188 | anchor_ious_all = bboxes_iou(truth_box.cpu(), self.ref_anchors, xyxy=False) 189 | best_n_all = np.argmax(anchor_ious_all, axis=1) 190 | best_anchor_iou = anchor_ious_all[np.arange(anchor_ious_all.shape[0]),best_n_all] 191 | best_n = best_n_all % 3 192 | best_n_mask = ((best_n_all == self.anch_mask[0]) | ( 193 | best_n_all == self.anch_mask[1]) | (best_n_all == self.anch_mask[2])) 194 | 195 | truth_box[:n, 0] = truth_x_all[b, :n] 196 | truth_box[:n, 1] = truth_y_all[b, :n] 197 | pred_box = pred_boxes[b] 198 | pred_ious = bboxes_iou(pred_box.view(-1,4), 199 | truth_box, xyxy=False) 200 | pred_best_iou, _= pred_ious.max(dim=1) 201 | pred_best_iou = (pred_best_iou > self.ignore_thre) 202 | pred_best_iou = pred_best_iou.view(pred_box.shape[:3]) 203 | obj_mask[b]= ~pred_best_iou 204 | truth_box[:n, 0] = 0 205 | truth_box[:n, 1] = 0 206 | 207 | if sum(best_n_mask) == 0: 208 | continue 209 | for ti in range(best_n.shape[0]): 210 | if best_n_mask[ti] == 1: 211 | i, j = truth_i[ti], truth_j[ti] 212 | a = best_n[ti] 213 | free_iou = bboxes_iou(truth_box[ti].cpu().view(-1,4), 214 | pred_anchors[b, self.n_anchors-1, j, i, :4].data.cpu().view(-1,4),xyxy=False) #iou of pred anchor 215 | 216 | #choose the best anchor 217 | if free_iou > best_anchor_iou[ti]: 218 | aa = self.n_anchors-1 219 | else: 220 | aa = a 221 | 222 | cls_mask[b, aa, j, i, 
:] = 1 223 | coord_mask[b, aa, j, i] = 1 224 | 225 | anchor_mask[b, self.n_anchors-1, j, i] = 1 226 | anchor_mask[b, a, j, i] = 1 227 | 228 | obj_mask[b, aa, j, i]= 1 if not mixup else labels[b, ti, 5] 229 | 230 | target[b, a, j, i, 0] = truth_x_all[b, ti] 231 | target[b, a, j, i, 1] = truth_y_all[b, ti] 232 | target[b, a, j, i, 2] = truth_w_all[b, ti] 233 | target[b, a, j, i, 3] = truth_h_all[b, ti] 234 | 235 | target[b, self.n_anchors-1, j, i, 0] = truth_x_all[b, ti] 236 | target[b, self.n_anchors-1, j, i, 1] = truth_y_all[b, ti] 237 | target[b, self.n_anchors-1, j, i, 2] = truth_w_all[b, ti] 238 | target[b, self.n_anchors-1, j, i, 3] = truth_h_all[b, ti] 239 | 240 | l1_target[b, aa, j, i, 0] = truth_x_all[b, ti]/image_size *fsize - i*1.0 241 | l1_target[b, aa, j, i, 1] = truth_y_all[b, ti]/image_size *fsize - j*1.0 242 | l1_target[b, aa, j, i, 2] = torch.log(truth_w_all[b, ti]/\ 243 | (pred_anchors[b, aa, j, i, 2])+ 1e-12) 244 | l1_target[b, aa, j, i, 3] = torch.log(truth_h_all[b, ti]/\ 245 | (pred_anchors[b, aa, j, i, 3]) + 1e-12) 246 | target[b, aa, j, i, 4] = 1 247 | if self._label_smooth: 248 | smooth_delta = 1 249 | smooth_weight = 1. / self.n_classes 250 | target[b, aa, j, i, 5:]= smooth_weight* smooth_delta 251 | 252 | target[b, aa, j, i, 5 + labels[b, ti, 253 | 0].to(torch.int16)] = 1 - smooth_delta*smooth_weight 254 | else: 255 | target[b,aa, j, i, 5 + labels[b, ti, 256 | 0].to(torch.int16)] = 1 257 | 258 | tgt_scale[b, aa,j, i, :] = 2.0 - truth_w_all[b, ti]*truth_h_all[b, ti] / image_size/image_size 259 | 260 | 261 | # Anchor loss 262 | anchorcoord_mask = anchor_mask>0 263 | loss_anchor = self.iou_wh_loss(pred_anchors[...,:4][anchorcoord_mask], target[...,:4][anchorcoord_mask]).sum()/batchsize 264 | 265 | #Prediction loss 266 | coord_mask = coord_mask>0 267 | loss_iou = (tgt_scale[coord_mask][...,0]*\ 268 | self.iou_loss(pred[..., :4][coord_mask], target[..., :4][coord_mask])).sum() / batchsize 269 | tgt_scale = tgt_scale[...,:2] 270 | loss_xy = (tgt_scale*self.bcewithlog_loss(output[...,:2], l1_target[...,:2])).sum() / batchsize 271 | loss_wh = (tgt_scale*self.l1_loss(output[...,2:4], l1_target[...,2:4])).sum() / batchsize 272 | loss_l1 = loss_xy + loss_wh 273 | loss_obj = (obj_mask*(self.bcewithlog_loss(output[..., 4], target[..., 4]))).sum() / batchsize 274 | loss_cls = (cls_mask*(self.bcewithlog_loss(output[..., 5:], target[..., 5:]))).sum()/ batchsize 275 | 276 | loss = loss_anchor + loss_iou + loss_l1+ loss_obj + loss_cls 277 | 278 | return loss, loss_anchor, loss_iou, loss_l1, loss_obj, loss_cls 279 | 280 | -------------------------------------------------------------------------------- /dataset/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 
3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | """ 7 | 8 | import torch 9 | from torchvision import transforms 10 | import cv2 11 | import numpy as np 12 | import random 13 | import math 14 | from utils.utils import matrix_iou, visual 15 | 16 | #DEBUG = True 17 | DEBUG = False 18 | 19 | def _crop(image, boxes, labels, ratios = None): 20 | height, width, _ = image.shape 21 | 22 | if len(boxes)== 0: 23 | return image, boxes, labels, ratios 24 | 25 | while True: 26 | mode = random.choice(( 27 | None, 28 | (0.1, None), 29 | (0.3, None), 30 | (0.5, None), 31 | (0.7, None), 32 | (0.9, None), 33 | (None, None), 34 | )) 35 | 36 | if mode is None: 37 | return image, boxes, labels, ratios 38 | 39 | min_iou, max_iou = mode 40 | if min_iou is None: 41 | min_iou = float('-inf') 42 | if max_iou is None: 43 | max_iou = float('inf') 44 | 45 | for _ in range(50): 46 | scale = random.uniform(0.3,1.) 47 | min_ratio = max(0.5, scale*scale) 48 | max_ratio = min(2, 1. / scale / scale) 49 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 50 | w = int(scale * ratio * width) 51 | h = int((scale / ratio) * height) 52 | 53 | 54 | l = random.randrange(width - w) 55 | t = random.randrange(height - h) 56 | roi = np.array((l, t, l + w, t + h)) 57 | 58 | iou = matrix_iou(boxes, roi[np.newaxis]) 59 | 60 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 61 | continue 62 | 63 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 64 | 65 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 66 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 67 | .all(axis=1) 68 | boxes_t = boxes[mask].copy() 69 | labels_t = labels[mask].copy() 70 | if ratios is not None: 71 | ratios_t = ratios[mask].copy() 72 | else: 73 | ratios_t=None 74 | 75 | if len(boxes_t) == 0: 76 | continue 77 | 78 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 79 | boxes_t[:, :2] -= roi[:2] 80 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 81 | boxes_t[:, 2:] -= roi[:2] 82 | 83 | return image_t, boxes_t,labels_t, ratios_t 84 | 85 | 86 | def _distort(image): 87 | def _convert(image, alpha=1, beta=0): 88 | tmp = image.astype(float) * alpha + beta 89 | tmp[tmp < 0] = 0 90 | tmp[tmp > 255] = 255 91 | image[:] = tmp 92 | 93 | image = image.copy() 94 | 95 | if random.randrange(2): 96 | _convert(image, beta=random.uniform(-32, 32)) 97 | 98 | if random.randrange(2): 99 | _convert(image, alpha=random.uniform(0.5, 1.5)) 100 | 101 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 102 | 103 | if random.randrange(2): 104 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 105 | tmp %= 180 106 | image[:, :, 0] = tmp 107 | 108 | if random.randrange(2): 109 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 110 | 111 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 112 | 113 | return image 114 | 115 | 116 | def _expand(image, boxes,fill, p): 117 | if random.random() > p: 118 | return image, boxes 119 | 120 | height, width, depth = image.shape 121 | for _ in range(50): 122 | scale = random.uniform(1,4) 123 | 124 | min_ratio = max(0.5, 1./scale/scale) 125 | max_ratio = min(2, scale*scale) 126 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 127 | ws = scale*ratio 128 | hs = scale/ratio 129 | if ws < 1 or hs < 1: 130 | continue 131 | w = int(ws * width) 132 | h = int(hs * height) 133 | 134 | left = random.randint(0, w - width) 135 | top = random.randint(0, h - height) 136 | 137 | boxes_t = boxes.copy() 138 | boxes_t[:, :2] += (left, top) 139 | 
boxes_t[:, 2:] += (left, top) 140 | 141 | 142 | expand_image = np.empty( 143 | (h, w, depth), 144 | dtype=image.dtype) 145 | expand_image[:, :] = fill 146 | expand_image[top:top + height, left:left + width] = image 147 | image = expand_image 148 | 149 | return image, boxes_t 150 | 151 | 152 | def _mirror(image, boxes): 153 | _, width, _ = image.shape 154 | if random.randrange(2): 155 | image = image[:, ::-1] 156 | boxes = boxes.copy() 157 | boxes[:, 0::2] = width - boxes[:, 2::-2] 158 | return image, boxes 159 | 160 | 161 | def _random_affine(img, targets=None, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 162 | borderValue=(127.5, 127.5, 127.5)): 163 | # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) 164 | # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 165 | 166 | border = 0 # width of added border (optional) 167 | #height = max(img.shape[0], img.shape[1]) + border * 2 168 | height, width, _ = img.shape 169 | 170 | # Rotation and Scale 171 | R = np.eye(3) 172 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 173 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 174 | s = random.random() * (scale[1] - scale[0]) + scale[0] 175 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 176 | 177 | # Translation 178 | T = np.eye(3) 179 | T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 180 | T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 181 | 182 | # Shear 183 | S = np.eye(3) 184 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 185 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 186 | 187 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
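    # Note: M is a full affine homography composed right-to-left (rotation/scale first,
    # then translation, then shear); its last row stays (0, 0, 1), so the warpPerspective
    # call below is equivalent to warpAffine with M[:2]. The same M is reused further down
    # to warp the four corners of each target box via `xy @ M.T`.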
188 | imw = cv2.warpPerspective(img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, 189 | borderValue=borderValue) # BGR order borderValue 190 | 191 | # Return warped points also 192 | if targets is not None: 193 | if len(targets) > 0: 194 | n = targets.shape[0] 195 | points = targets[:, 0:4].copy() 196 | area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 197 | 198 | # warp points 199 | xy = np.ones((n * 4, 3)) 200 | xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 201 | xy = (xy @ M.T)[:, :2].reshape(n, 8) 202 | 203 | # create new boxes 204 | x = xy[:, [0, 2, 4, 6]] 205 | y = xy[:, [1, 3, 5, 7]] 206 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T 207 | 208 | # apply angle-based reduction 209 | radians = a * math.pi / 180 210 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 211 | x = (xy[:, 2] + xy[:, 0]) / 2 212 | y = (xy[:, 3] + xy[:, 1]) / 2 213 | w = (xy[:, 2] - xy[:, 0]) * reduction 214 | h = (xy[:, 3] - xy[:, 1]) * reduction 215 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T 216 | 217 | # reject warped points outside of image 218 | x1 = np.clip(xy[:,0], 0, width) 219 | y1 = np.clip(xy[:,1], 0, height) 220 | x2 = np.clip(xy[:,2], 0, width) 221 | y2 = np.clip(xy[:,3], 0, height) 222 | boxes = np.concatenate((x1, y1, x2, y2)).reshape(4, n).T 223 | 224 | return imw, boxes, M 225 | else: 226 | return imw 227 | 228 | def preproc_for_test(image, input_size, mean, std): 229 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 230 | interp_method = interp_methods[random.randrange(5)] 231 | image = cv2.resize(image, input_size,interpolation=interp_method) 232 | image = image.astype(np.float32) 233 | image = image[:,:,::-1] 234 | image /= 255. 235 | if mean is not None: 236 | image -= mean 237 | if std is not None: 238 | image /= std 239 | return image.transpose(2, 0, 1) 240 | 241 | 242 | class TrainTransform(object): 243 | 244 | def __init__(self, p=0.5, rgb_means=None, std = None,max_labels=50): 245 | self.means = rgb_means 246 | self.std = std 247 | self.p = p 248 | self.max_labels=max_labels 249 | 250 | def __call__(self, image, targets, input_dim): 251 | boxes = targets[:,:4].copy() 252 | labels = targets[:,4].copy() 253 | if targets.shape[1] > 5: 254 | mixup=True 255 | ratios = targets[:,-1].copy() 256 | ratios_o = targets[:,-1].copy() 257 | else: 258 | mixup=False 259 | ratios = None 260 | ratios_o = None 261 | lshape = 6 if mixup else 5 262 | if len(boxes) == 0: 263 | targets = np.zeros((self.max_labels,lshape),dtype=np.float32) 264 | image = preproc_for_test(image, input_dim, self.means, self.std) 265 | image = np.ascontiguousarray(image, dtype=np.float32) 266 | return torch.from_numpy(image), torch.from_numpy(targets) 267 | 268 | image_o = image.copy() 269 | targets_o = targets.copy() 270 | height_o, width_o, _ = image_o.shape 271 | boxes_o = targets_o[:,:4] 272 | labels_o = targets_o[:,4] 273 | b_x_o = (boxes_o[:, 2] + boxes_o[:, 0])*.5 274 | b_y_o = (boxes_o[:, 3] + boxes_o[:, 1])*.5 275 | b_w_o = (boxes_o[:, 2] - boxes_o[:, 0])*1. 276 | b_h_o = (boxes_o[:, 3] - boxes_o[:, 1])*1. 
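        # Write the boxes back in (cx, cy, w, h) order, then rescale them from the original
        # image resolution to the network input size (the same conversion is repeated for
        # the augmented boxes further below).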
277 | boxes_o[:,0] = b_x_o 278 | boxes_o[:,1] = b_y_o 279 | boxes_o[:,2] = b_w_o 280 | boxes_o[:,3] = b_h_o 281 | boxes_o[:, 0::2] /= width_o 282 | boxes_o[:, 1::2] /= height_o 283 | boxes_o[:, 0::2] *= input_dim[0] 284 | boxes_o[:, 1::2] *= input_dim[1] 285 | #labels_o = np.expand_dims(labels_o,1) 286 | #targets_o = np.hstack((boxes_o,labels_o)) 287 | #targets_o = np.hstack((labels_o,boxes_o)) 288 | 289 | image_t = _distort(image) 290 | if self.means is not None: 291 | fill = [m * 255 for m in self.means] 292 | fill = fill[::-1] 293 | else: 294 | fill = (127.5,127.5,127.5) 295 | image_t, boxes = _expand(image_t, boxes, fill, self.p) 296 | image_t, boxes, labels, ratios = _crop(image_t, boxes, labels, ratios) 297 | image_t, boxes = _mirror(image_t, boxes) 298 | 299 | if random.randrange(2): 300 | image_t, boxes, _ = _random_affine(image_t, boxes, borderValue=fill) 301 | 302 | height, width, _ = image_t.shape 303 | 304 | if DEBUG: 305 | image_t = np.ascontiguousarray(image_t, dtype=np.uint8) 306 | img = visual(image_t, boxes,labels) 307 | cv2.imshow('DEBUG', img) 308 | cv2.waitKey(0) 309 | 310 | image_t = preproc_for_test(image_t, input_dim, self.means, self.std) 311 | boxes = boxes.copy() 312 | b_x = (boxes[:, 2] + boxes[:, 0])*.5 313 | b_y = (boxes[:, 3] + boxes[:, 1])*.5 314 | b_w = (boxes[:, 2] - boxes[:, 0])*1. 315 | b_h = (boxes[:, 3] - boxes[:, 1])*1. 316 | boxes[:,0] = b_x 317 | boxes[:,1] = b_y 318 | boxes[:,2] = b_w 319 | boxes[:,3] = b_h 320 | boxes[:, 0::2] /= width 321 | boxes[:, 1::2] /= height 322 | boxes[:, 0::2] *= input_dim[0] 323 | boxes[:, 1::2] *= input_dim[1] 324 | mask_b= np.minimum(boxes[:,2], boxes[:,3]) > 6 325 | #mask_b= (boxes[:,2]*boxes[:,3]) > 32**2 326 | #mask_b= (boxes[:,2]*boxes[:,3]) > 48**2 327 | boxes_t = boxes[mask_b] 328 | labels_t = labels[mask_b].copy() 329 | if mixup: 330 | ratios_t = ratios[mask_b].copy() 331 | 332 | ''' 333 | if len(boxes_t)==0: 334 | targets = np.zeros((self.max_labels,lshape),dtype=np.float32) 335 | image = preproc_for_test(image_o, input_dim, self.means, self.std) 336 | image = np.ascontiguousarray(image, dtype=np.float32) 337 | return torch.from_numpy(image), torch.from_numpy(targets) 338 | ''' 339 | #if len(boxes_t)==0 or random.random() > 0.97: 340 | if len(boxes_t)==0: 341 | image_t = preproc_for_test(image_o, input_dim, self.means, self.std) 342 | boxes_t = boxes_o 343 | labels_t = labels_o 344 | ratios_t = ratios_o 345 | 346 | labels_t = np.expand_dims(labels_t,1) 347 | if mixup: 348 | ratios_t = np.expand_dims(ratios_t,1) 349 | targets_t = np.hstack((labels_t,boxes_t,ratios_t)) 350 | else: 351 | targets_t = np.hstack((labels_t,boxes_t)) 352 | padded_labels = np.zeros((self.max_labels,lshape)) 353 | padded_labels[range(len(targets_t))[:self.max_labels]] = targets_t[:self.max_labels] 354 | padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32) 355 | image_t = np.ascontiguousarray(image_t, dtype=np.float32) 356 | 357 | return torch.from_numpy(image_t), torch.from_numpy(padded_labels) 358 | 359 | 360 | 361 | class ValTransform(object): 362 | """Defines the transformations that should be applied to test PIL image 363 | for input into the network 364 | 365 | dimension -> tensorize -> color adj 366 | 367 | Arguments: 368 | resize (int): input dimension to SSD 369 | rgb_means ((int,int,int)): average RGB of the dataset 370 | (104,117,123) 371 | swap ((int,int,int)): final order of channels 372 | Returns: 373 | transform (transform) : callable transform to be applied to test/val 374 | data 375 | """ 376 | def 
__init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
377 |         self.means = rgb_means
378 |         self.swap = swap
379 |         self.std = std
380 | 
381 |     # assume input is cv2 img for now
382 |     def __call__(self, img, res, input_size):
383 | 
384 |         interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4]
385 |         interp_method = interp_methods[0]  # deterministic resize for evaluation
386 |         img = cv2.resize(np.array(img), input_size,
387 |                          interpolation=interp_method).astype(np.float32)
388 |         img = img[:, :, ::-1]  # BGR -> RGB before mean/std normalization
389 |         img /= 255.
390 |         if self.means is not None:
391 |             img -= self.means
392 |         if self.std is not None:
393 |             img /= self.std
394 |         img = img.transpose(self.swap)  # HWC -> CHW
395 |         img = np.ascontiguousarray(img, dtype=np.float32)
396 |         return torch.from_numpy(img), torch.zeros(1, 5)  # image tensor plus dummy (1, 5) label placeholder
397 | 
--------------------------------------------------------------------------------
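The two transform classes above are the bridge between the raw annotations and the YOLOv3 head: `TrainTransform` augments the image and returns a padded `(max_labels, 5)` target tensor of `[class, cx, cy, w, h]` rows (6 columns when mixup ratios are present) scaled to the network input size, while `ValTransform` only resizes, normalizes, and reorders channels. Below is a minimal usage sketch, not code from this repository: it assumes the repo root is on `PYTHONPATH`, and the synthetic image, the box coordinates, and the ImageNet-style normalization constants are illustrative placeholders rather than values mandated by this file.

# Minimal sketch (not part of the repo): exercising TrainTransform / ValTransform on a
# synthetic sample. Assumes the repo root is importable as a package root.
import numpy as np
from dataset.data_augment import TrainTransform, ValTransform

rgb_means, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)  # assumed normalization stats

# Synthetic BGR uint8 image and one box in [x1, y1, x2, y2, class] form.
img = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
targets = np.array([[100., 120., 360., 400., 7.]], dtype=np.float32)

train_tf = TrainTransform(p=0.5, rgb_means=rgb_means, std=std, max_labels=50)
img_t, labels_t = train_tf(img, targets, (416, 416))
print(img_t.shape, labels_t.shape)   # torch.Size([3, 416, 416]) torch.Size([50, 5])

val_tf = ValTransform(rgb_means=rgb_means, std=std)
img_v, _ = val_tf(img, None, (416, 416))
print(img_v.shape)                   # torch.Size([3, 416, 416])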