├── utils ├── __init__.py ├── nms │ ├── __init__.py │ ├── gpu_nms.hpp │ ├── py_cpu_nms.py │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── cpu_nms.pyx ├── pycocotools │ ├── __init__.py │ ├── maskApi.h │ ├── mask.py │ ├── maskApi.c │ └── _mask.pyx ├── nms_wrapper.py ├── timer.py ├── build.py └── box_utils.py ├── layers ├── __init__.py ├── functions │ ├── __init__.py │ ├── prior_box.py │ └── detection.py └── modules │ ├── __init__.py │ ├── l2norm.py │ ├── multibox_loss.py │ └── refine_multibox_loss.py ├── make.sh ├── data ├── __init__.py ├── scripts │ ├── VOC2012.sh │ └── VOC2007.sh ├── config.py ├── voc0712_aug.py ├── voc_eval.py ├── data_augment.py ├── coco.py ├── voc0712.py └── augmentations.py ├── coco_voc.txt ├── LICENSE ├── .gitignore ├── README.md ├── demo.py └── models ├── base_models.py ├── mobilenet.py ├── misc.py ├── FSSD_vgg_FPN.py └── FSSD_Mob_FPN.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | 8 | cd .. 
9 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .multibox_loss import MultiBoxLoss 2 | from .refine_multibox_loss import RefineMultiBoxLoss 3 | from .l2norm import L2Norm 4 | 5 | __all__ = ['MultiBoxLoss','L2Norm'] 6 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # from .voc import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES 2 | from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES 3 | from .coco import COCODetection 4 | from .data_augment import * 5 | from .config import * 6 | -------------------------------------------------------------------------------- /coco_voc.txt: -------------------------------------------------------------------------------- 1 | 0,0,background 2 | 5,1,aeroplane 3 | 2,2,bicycle 4 | 15,3,bird 5 | 9,4,boat 6 | 40,5,bottle 7 | 6,6,bus 8 | 3,7,car 9 | 16,8,cat 10 | 57,9,chair 11 | 20,10,cow 12 | 61,11,diningtable 13 | 17,12,dog 14 | 18,13,horse 15 | 4,14,motorbike 16 | 1,15,person 17 | 59,16,pottedplant 18 | 19,17,sheep 19 | 58,18,sofa 20 | 7,19,train 21 | 63,20,tvmonitor -------------------------------------------------------------------------------- /layers/modules/l2norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self,n_channels, scale): 9 | super(L2Norm,self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = scale or None 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant(self.weight,self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps 21 | x /= norm 22 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 23 | return out 24 | -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 
20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | #curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" 39 | -------------------------------------------------------------------------------- /utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | 12 | # def nms(dets, thresh, force_cpu=False): 13 | # """Dispatch to either CPU or GPU NMS implementations.""" 14 | # 15 | # if dets.shape[0] == 0: 16 | # return [] 17 | # if cfg.USE_GPU_NMS and not force_cpu: 18 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 19 | # else: 20 | # return cpu_nms(dets, thresh) 21 | 22 | 23 | def nms(dets, thresh, force_cpu=False): 24 | """Dispatch to either CPU or GPU NMS implementations.""" 25 | 26 | if dets.shape[0] == 0: 27 | return [] 28 | if force_cpu: 29 | #return cpu_soft_nms(dets, thresh, method = 0) 30 | return cpu_nms(dets, thresh) 31 | return gpu_nms(dets, thresh) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Fanbinqi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! 
-d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | #echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | #curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | #echo "Downloading VOC2007 test data ..." 27 | #curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | #echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" 43 | -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- 
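For orientation, here is a minimal sketch of how these NMS routines are driven. A `dets` array is an N x 5 `float32` array with rows `[x1, y1, x2, y2, score]`, and the return value is the list of row indices to keep (this is how `demo.py` calls `utils.nms_wrapper.nms`, with an IoU threshold of 0.45). The sketch uses the pure-Python `py_cpu_nms` baseline so it runs without compiling the Cython/CUDA extensions; the box coordinates are made up for illustration.

```python
import numpy as np

from utils.nms.py_cpu_nms import py_cpu_nms  # pure-Python baseline, no compilation required

# Three detections: two heavily overlapping boxes and one separate box.
# Columns are [x1, y1, x2, y2, score]; the values are illustrative only.
dets = np.array([
    [ 10.,  10., 100., 100., 0.95],
    [ 12.,  12.,  98.,  99., 0.80],   # overlaps the first box with a lower score -> suppressed
    [200., 200., 260., 270., 0.60],
], dtype=np.float32)

keep = py_cpu_nms(dets, 0.45)  # 0.45 is the IoU threshold used in demo.py
print(keep)                    # -> [0, 2]
print(dets[keep])              # the surviving detections
```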
/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | def tic(self): 21 | # using time.time instead of time.clock because time time.clock 22 | # does not normalize for multithreading 23 | self.start_time = time.time() 24 | 25 | def toc(self, average=True): 26 | self.diff = time.time() - self.start_time 27 | self.total_time += self.diff 28 | self.calls += 1 29 | self.average_time = self.total_time / self.calls 30 | if average: 31 | return self.average_time 32 | else: 33 | return self.diff 34 | 35 | def clear(self): 36 | self.total_time = 0. 37 | self.calls = 0 38 | self.start_time = 0. 39 | self.diff = 0. 40 | self.average_time = 0. 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | from itertools import product as product 2 | from math import sqrt as sqrt 3 | 4 | import torch 5 | 6 | if torch.cuda.is_available(): 7 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 8 | 9 | 10 | class PriorBox(object): 11 | """Compute priorbox coordinates in center-offset form for each source 12 | feature map. 13 | Note: 14 | This 'layer' has changed between versions of the original SSD 15 | paper, so we include both versions, but note v2 is the most tested and most 16 | recent version of the paper. 
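    For a concrete sense of the output size: with the VOC_300 config from data/config.py
    (feature maps [38, 19, 10, 5, 3, 1] and 4, 6, 6, 6, 4, 4 priors per location),
    forward() returns 38^2*4 + 19^2*6 + 10^2*6 + 5^2*6 + 3^2*4 + 1^2*4 = 8732 priors,
    each stored as [cx, cy, w, h] relative to the image (clamped to [0, 1] when clip is set).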
17 | 18 | """ 19 | 20 | def __init__(self, cfg): 21 | super(PriorBox, self).__init__() 22 | self.image_size = cfg['min_dim'] 23 | # number of priors for feature map location (either 4 or 6) 24 | self.num_priors = len(cfg['aspect_ratios']) 25 | self.variance = cfg['variance'] or [0.1] 26 | self.feature_maps = cfg['feature_maps'] 27 | self.min_sizes = cfg['min_sizes'] 28 | self.max_sizes = cfg['max_sizes'] 29 | self.steps = cfg['steps'] 30 | self.aspect_ratios = cfg['aspect_ratios'] 31 | self.clip = cfg['clip'] 32 | for v in self.variance: 33 | if v <= 0: 34 | raise ValueError('Variances must be greater than 0') 35 | 36 | def forward(self): 37 | mean = [] 38 | for k, f in enumerate(self.feature_maps): 39 | for i, j in product(range(f), repeat=2): 40 | f_k = self.image_size / self.steps[k] 41 | cx = (j + 0.5) / f_k 42 | cy = (i + 0.5) / f_k 43 | 44 | s_k = self.min_sizes[k] / self.image_size 45 | mean += [cx, cy, s_k, s_k] 46 | 47 | # aspect_ratio: 1 48 | # rel size: sqrt(s_k * s_(k+1)) 49 | if self.max_sizes: 50 | s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size)) 51 | mean += [cx, cy, s_k_prime, s_k_prime] 52 | 53 | # rest of aspect ratios 54 | for ar in self.aspect_ratios[k]: 55 | mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 56 | mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 57 | 58 | # back to torch land 59 | output = torch.Tensor(mean).view(-1, 4) 60 | if self.clip: 61 | output.clamp_(max=1, min=0) 62 | return output 63 | -------------------------------------------------------------------------------- /utils/pycocotools/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. 
*/ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## FFBNet 2 | FFBNET : LIGHTWEIGHT BACKBONE FOR OBJECT DETECTION BASED FEATURE FUSION BLOCK 3 | 4 | ## Our paper has been accepted by IEEE ICIP 2019 for presentation. 5 | 6 | ### VOC2007 Test 7 | | System | *mAP* | **FPS** (1080Ti) | 8 | | :--------------------------------------- | :------: | :-----------------------: | 9 | | Mob-SSD | 68 | 190 | 10 | | Tiny-Yolo v3 | 61.3 | 220 | 11 | | Pelee | 70.9 | - | 12 | | SSD | 77.2 | 160 | 13 | | STDN | 78.1 | 41 | 14 | | FSSD | 78.8 | 150 | 15 | | RefineDet | 80.0 | - | 16 | | FFBNet | 73.54 | 185 | 17 | | VGG-FFB | 80.2 | 142 | 18 | 19 | ## Installation 20 | - Install [PyTorch 0.3.1](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 21 | - Clone this repository. It is largely based on [lzx1413/PytorchSSD](https://github.com/lzx1413/PytorchSSD); many thanks to its author. 22 | 23 | - Compile the nms and coco tools: 24 | ```Shell 25 | ./make.sh 26 | ``` 27 | 28 | ## Datasets 29 | 30 | ### VOC Dataset 31 | ##### Download VOC2007 trainval & test 32 | 33 | ```Shell 34 | # specify a directory for the dataset to be downloaded into; the default is ~/data/ 35 | sh data/scripts/VOC2007.sh # 36 | ``` 37 | 38 | ##### Download VOC2012 trainval 39 | 40 | ```Shell 41 | # specify a directory for the dataset to be downloaded into; the default is ~/data/ 42 | sh data/scripts/VOC2012.sh # 43 | ``` 44 | 45 | ## Training 46 | - First download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights from [BaiduYun Driver](https://pan.baidu.com/s/1nzOgaL8mAPex8_HLU4mb8Q) (password: `mu59`). 47 | - The MobileNet backbone is described in the [paper](https://arxiv.org/abs/1704.04861); its weight file is available from [BaiduYun Driver](https://pan.baidu.com/s/1LXq3p6IOoQ6YJMY0xhRkLQ) (password: `f7oe`). 48 | 49 | ```Shell 50 | # Put vgg16_reducedfc.pth and mobilenet_1.pth in a new folder named weights, then run 51 | python train_test_mob.py or python train_test_vgg.py 52 | ``` 53 | ### Personal advice: when using MobileNet v1 to train on the VOC datasets, use a higher learning rate at the beginning; convergence may be better.
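### Demo
To sanity-check a trained model, `demo.py` (as shipped) reads every image in `./image1/`, loads weights from `./weights/FSSD_VGG.pth`, and draws detections with a 0.7 score threshold. A minimal run therefore looks like:

```Shell
mkdir -p image1 weights
# copy a few test images into image1/ and a trained model to weights/FSSD_VGG.pth
python demo.py
```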
54 | 55 | If you are interested in this paper or interested in lightweight detectors, please QQ me (374873360) 56 | -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | # gets home dir cross platform 4 | import cv2 5 | cv2.setNumThreads(0) # pytorch issue 1355: possible deadlock in dataloader 6 | # note: if you used our download scripts, this should be right 7 | VOCroot = '/home/zdh1901/data/VOCdevkit' # path to VOCdevkit root dir 8 | 9 | COCOroot = '/home/zdh1901/data/coco' 10 | 11 | # RFB CONFIGS 12 | VOC_300 = { 13 | 'feature_maps': [38, 19, 10, 5, 3, 1], 14 | #'feature_maps': [1, 3, 5, 10, 19, 38], 15 | 16 | 'min_dim': 300, 17 | 18 | 'steps': [8, 16, 32, 64, 100, 300], 19 | #'steps': [300, 100, 64, 32, 16, 8], 20 | 21 | 'min_sizes': [30, 60, 111, 162, 213, 264], 22 | 23 | 'max_sizes': [60, 111, 162, 213, 264, 315], 24 | # 'min_sizes': [264, 213, 163, 111, 60, 30], 25 | # 26 | # 'max_sizes': [315, 264, 213, 163, 111, 60], 27 | 28 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 29 | #'aspect_ratios' : [[2], [2], [2, 3], [2, 3], [2, 3], [2]], 30 | 31 | 'variance': [0.1, 0.2], 32 | 33 | 'clip': True, 34 | } 35 | 36 | VOC_512 = { 37 | 'feature_maps': [38, 19, 10, 5, 3, 1], 38 | 39 | 'min_dim': 512, 40 | 41 | 'steps': [14, 27, 51, 102, 170, 512], 42 | 43 | 'min_sizes': [35.84, 76.8, 153.6, 230.4, 307.2, 384.0], 44 | 45 | 'max_sizes': [76.8, 153.6, 230.4, 307.2, 384.0, 460.8], 46 | 47 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 48 | 49 | 'variance': [0.1, 0.2], 50 | 51 | 'clip': True, 52 | } 53 | 54 | COCO_300 = { 55 | 'feature_maps': [38, 19, 10, 5, 3, 1], 56 | 57 | 'min_dim': 300, 58 | 59 | 'steps': [8, 16, 32, 64, 100, 300], 60 | 61 | 'min_sizes': [21, 45, 99, 153, 207, 261], 62 | 63 | 'max_sizes': [45, 99, 153, 207, 261, 315], 64 | 65 | 'aspect_ratios': [[2, 3], [2, 3], [2, 3], [2, 3], [2], [2]], 66 | 67 | 'variance': [0.1, 0.2], 68 | 69 | 'clip': True, 70 | } 71 | 72 | COCO_512 = { 73 | 'feature_maps': [64, 32, 16, 8, 4, 2, 1], 74 | 75 | 'min_dim': 512, 76 | 77 | 'steps': [8, 16, 32, 64, 128, 256, 512], 78 | 79 | 'min_sizes': [20.48, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8], 80 | 81 | 'max_sizes': [51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], 82 | 83 | 'aspect_ratios': [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]], 84 | 85 | 'variance': [0.1, 0.2], 86 | 87 | 'clip': True, 88 | } 89 | 90 | COCO_mobile_300 = { 91 | 'feature_maps': [19, 10, 5, 3, 2, 1], 92 | 93 | 'min_dim': 300, 94 | 95 | 'steps': [16, 32, 64, 100, 150, 300], 96 | 97 | 'min_sizes': [45, 90, 135, 180, 225, 270], 98 | 99 | 'max_sizes': [90, 135, 180, 225, 270, 315], 100 | 101 | 'aspect_ratios': [[2, 3], [2, 3], [2, 3], [2, 3], [2], [2]], 102 | 103 | 'variance': [0.1, 0.2], 104 | 105 | 'clip': True, 106 | } 107 | 108 | VOC_320 = { 109 | 'feature_maps': [40, 20, 10, 5], 110 | 111 | 'min_dim': 320, 112 | 113 | 'steps': [8, 16, 32, 64], 114 | 115 | 'min_sizes': [32, 64, 128, 256], 116 | 117 | 'max_sizes': [], 118 | 119 | 'aspect_ratios': [[2], [2], [2], [2]], 120 | 121 | 'variance': [0.1, 0.2], 122 | 123 | 'clip': True, 124 | } 125 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | 4 | from utils.box_utils import decode, center_size 5 
| 6 | 7 | class Detect(Function): 8 | """At test time, Detect is the final layer of SSD. Decode location preds, 9 | apply non-maximum suppression to location predictions based on conf 10 | scores and threshold to a top_k number of output predictions for both 11 | confidence score and locations. 12 | """ 13 | 14 | def __init__(self, num_classes, bkg_label, cfg, object_score=0): 15 | self.num_classes = num_classes 16 | self.background_label = bkg_label 17 | self.object_score = object_score 18 | # self.thresh = thresh 19 | 20 | # Parameters used in nms. 21 | self.variance = cfg['variance'] 22 | 23 | def forward(self, predictions, prior, arm_data=None): 24 | """ 25 | Args: 26 | loc_data: (tensor) Loc preds from loc layers 27 | Shape: [batch,num_priors*4] 28 | conf_data: (tensor) Shape: Conf preds from conf layers 29 | Shape: [batch*num_priors,num_classes] 30 | prior_data: (tensor) Prior boxes and variances from priorbox layers 31 | Shape: [1,num_priors,4] 32 | """ 33 | 34 | loc, conf = predictions 35 | loc_data = loc.data 36 | conf_data = conf.data 37 | prior_data = prior.data 38 | num = loc_data.size(0) # batch size 39 | if arm_data: 40 | arm_loc, arm_conf = arm_data 41 | arm_loc_data = arm_loc.data 42 | arm_conf_data = arm_conf.data 43 | arm_object_conf = arm_conf_data[:, 1:] 44 | no_object_index = arm_object_conf <= self.object_score 45 | conf_data[no_object_index.expand_as(conf_data)] = 0 46 | 47 | self.num_priors = prior_data.size(0) 48 | self.boxes = torch.zeros(num, self.num_priors, 4) 49 | self.scores = torch.zeros(num, self.num_priors, self.num_classes) 50 | 51 | if num == 1: 52 | # size batch x num_classes x num_priors 53 | conf_preds = conf_data.unsqueeze(0) 54 | 55 | else: 56 | conf_preds = conf_data.view(num, self.num_priors, 57 | self.num_classes) 58 | self.boxes.expand(num, self.num_priors, 4) 59 | self.scores.expand(num, self.num_priors, self.num_classes) 60 | # Decode predictions into bboxes. 
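        # decode() comes from utils/box_utils (not included in this listing). Under the
        # standard SSD parameterization -- which this code is assumed to follow -- it maps
        # each predicted offset l and prior p (center-size form [cx, cy, w, h]) to a box:
        #   cx = p_cx + l_cx * variance[0] * p_w,    cy = p_cy + l_cy * variance[0] * p_h
        #   w  = p_w * exp(l_w * variance[1]),       h  = p_h * exp(l_h * variance[1])
        # and then converts [cx, cy, w, h] to corner form [x1, y1, x2, y2].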
61 | for i in range(num): 62 | if arm_data: 63 | default = decode(arm_loc_data[i], prior_data, self.variance) 64 | default = center_size(default) 65 | else: 66 | default = prior_data 67 | decoded_boxes = decode(loc_data[i], default, self.variance) 68 | # For each class, perform nms 69 | conf_scores = conf_preds[i].clone() 70 | ''' 71 | c_mask = conf_scores.gt(self.thresh) 72 | decoded_boxes = decoded_boxes[c_mask] 73 | conf_scores = conf_scores[c_mask] 74 | ''' 75 | 76 | self.boxes[i] = decoded_boxes 77 | self.scores[i] = conf_scores 78 | 79 | return self.boxes, self.scores 80 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import os 4 | import cv2 5 | import pickle 6 | import argparse 7 | import torch 8 | import torch.nn as nn 9 | import torch.backends.cudnn as cudnn 10 | import torchvision.transforms as transforms 11 | import numpy as np 12 | from torch.autograd import Variable 13 | from data import VOCroot,COCOroot 14 | from data import AnnotationTransform, COCODetection, VOCDetection, BaseTransform, VOC_300,VOC_512,COCO_300,COCO_512, COCO_mobile_300 15 | from models.FSSD_vgg_FPN import build_net 16 | import torch.utils.data as data 17 | from layers.functions import Detect,PriorBox 18 | from utils.nms_wrapper import nms 19 | from utils.timer import Timer 20 | from matplotlib import pyplot as plt 21 | 22 | CLASSES = ('__background__', 23 | 'aeroplane', 'bicycle', 'bird', 'boat', 24 | 'bottle', 'bus', 'car', 'cat', 'chair', 25 | 'cow', 'diningtable', 'dog', 'horse', 26 | 'motorbike', 'person', 'pottedplant', 27 | 'sheep', 'sofa', 'train', 'tvmonitor') 28 | 29 | def test_net(net,img,name,detector,transform,priors,top_k=200,thresh=0.01): 30 | 31 | scale = torch.Tensor([img.shape[1], img.shape[0], 32 | img.shape[1], img.shape[0]]) 33 | #cv2.imshow('ori.jpg',img) 34 | #cv2.waitKey(2) 35 | # with torch.no_grad(): 36 | # x = transform(img).unsqueeze(0) 37 | # x = x.cuda() 38 | # scale = scale.cuda() 39 | x = Variable(transform(img).unsqueeze(0), volatile=True) 40 | x = x.cuda() 41 | scale = scale.cuda() 42 | 43 | out = net(x,test=True) 44 | boxes, scores = detector.forward(out, priors) 45 | boxes = boxes[0] 46 | scores = scores[0] 47 | a = [] 48 | boxes *= scale 49 | boxes = boxes.cpu().numpy() 50 | scores = scores.cpu().numpy() 51 | 52 | flag = True 53 | for j in range(1, 21): 54 | inds = np.where(scores[:, j] > thresh)[0] 55 | if len(inds) == 0: 56 | #print ("%s class" %str(j)) 57 | continue 58 | c_bboxes = boxes[inds] 59 | c_scores = scores[inds, j] 60 | c_dets = np.hstack((c_bboxes, c_scores[:, np.newaxis])).astype( 61 | np.float32, copy=False) 62 | keep = nms(c_dets, 0.45, force_cpu=True) 63 | c_dets = c_dets[keep, :] 64 | cls = np.ones(c_dets.shape[0])*j 65 | c_dets = np.column_stack((c_dets,cls)) 66 | if flag: 67 | result = c_dets 68 | flag = False 69 | else: 70 | result = np.vstack((result,c_dets)) 71 | 72 | a = list(result) 73 | #a.append(result) 74 | rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 75 | colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() 76 | plt.imshow(rgb_image) 77 | currentAxis = plt.gca() 78 | 79 | for (x1,y1,x2,y2,s,cls) in a: 80 | x1 = int(x1) 81 | y1 = int(y1) 82 | x2 = int(x2) 83 | y2 = int(y2) 84 | cls = int(cls) 85 | title = "%s:%.2f" % (CLASSES[int(cls)], s) 86 | coords = (x1,y1), x2-x1+1, y2-y1+1 87 | color = colors[cls] 88 | currentAxis.add_patch(plt.Rectangle(*coords, 
fill=False, edgecolor=color, linewidth=2)) 89 | currentAxis.text(x1, y1, title, bbox={'facecolor': color, 'alpha': 0.5}) 90 | plt.axis('off') 91 | plt.savefig(name.split('.')[0]+'.eps',format='eps',bbox_inches = 'tight') 92 | plt.show() 93 | 94 | if __name__ == "__main__": 95 | Image = os.listdir('image1/') 96 | 97 | for img_name in Image: 98 | img = cv2.imread("image1/"+img_name) 99 | model = './weights/FSSD_VGG.pth' 100 | net = build_net(300, 21) 101 | state_dict = torch.load(model) 102 | from collections import OrderedDict 103 | new_state_dict = OrderedDict() 104 | for k, v in state_dict.items(): 105 | head = k[:7] 106 | if head == 'module.': 107 | name = k[7:] # remove `module.` 108 | else: 109 | name = k 110 | new_state_dict[name] = v 111 | net.load_state_dict(new_state_dict) 112 | net.eval() 113 | net = net.cuda() 114 | cudnn.benchmark = True 115 | print("Finished loading model") 116 | transform = BaseTransform(300, (104, 117, 123)) 117 | detector = Detect(21, 0, VOC_300) 118 | priorbox = PriorBox(VOC_300) 119 | # with torch.no_grad(): 120 | # priors = priorbox.forward() 121 | # priors = priors.cuda() 122 | priors = Variable(priorbox.forward(), volatile=True) 123 | priors = priors.cuda() 124 | test_net(net, img, img_name, detector, transform, priors,top_k=200, thresh=0.7) -------------------------------------------------------------------------------- /models/base_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def vgg(cfg, i, batch_norm=False): 6 | layers = [] 7 | in_channels = i 8 | for v in cfg: 9 | if v == 'M': 10 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 11 | elif v == 'C': 12 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 13 | else: 14 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 15 | if batch_norm: 16 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 17 | else: 18 | layers += [conv2d, nn.ReLU(inplace=True)] 19 | in_channels = v 20 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 21 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 22 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 23 | layers += [pool5, conv6, 24 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 25 | return layers 26 | 27 | 28 | vgg_base = { 29 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 30 | 512, 512, 512], 31 | '512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 32 | 512, 512, 512], 33 | } 34 | 35 | 36 | class BasicConv(nn.Module): 37 | 38 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 39 | bn=True, bias=False): 40 | super(BasicConv, self).__init__() 41 | self.out_channels = out_planes 42 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, 43 | dilation=dilation, groups=groups, bias=bias) 44 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 45 | self.relu = nn.ReLU(inplace=True) if relu else None 46 | 47 | def forward(self, x): 48 | x = self.conv(x) 49 | if self.bn is not None: 50 | x = self.bn(x) 51 | if self.relu is not None: 52 | x = self.relu(x) 53 | return x 54 | 55 | 56 | class BasicRFB_a(nn.Module): 57 | 58 | def __init__(self, in_planes, out_planes, stride=1, scale=0.1): 59 | super(BasicRFB_a, self).__init__() 60 | self.scale = scale 61 | self.out_channels = out_planes 62 | inter_planes = 
in_planes // 4 63 | 64 | self.branch0 = nn.Sequential( 65 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 66 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=1, relu=False) 67 | ) 68 | self.branch1 = nn.Sequential( 69 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 70 | BasicConv(inter_planes, inter_planes, kernel_size=(3, 1), stride=1, padding=(1, 0)), 71 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=3, dilation=3, relu=False) 72 | ) 73 | self.branch2 = nn.Sequential( 74 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 75 | BasicConv(inter_planes, inter_planes, kernel_size=(1, 3), stride=stride, padding=(0, 1)), 76 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=3, dilation=3, relu=False) 77 | ) 78 | ''' 79 | self.branch3 = nn.Sequential( 80 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 81 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=1), 82 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=3, dilation=3, relu=False) 83 | ) 84 | ''' 85 | self.branch3 = nn.Sequential( 86 | BasicConv(in_planes, inter_planes // 2, kernel_size=1, stride=1), 87 | BasicConv(inter_planes // 2, (inter_planes // 4) * 3, kernel_size=(1, 3), stride=1, padding=(0, 1)), 88 | BasicConv((inter_planes // 4) * 3, inter_planes, kernel_size=(3, 1), stride=stride, padding=(1, 0)), 89 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=5, dilation=5, relu=False) 90 | ) 91 | 92 | self.ConvLinear = BasicConv(4 * inter_planes, out_planes, kernel_size=1, stride=1, relu=False) 93 | self.shortcut = BasicConv(in_planes, out_planes, kernel_size=1, stride=stride, relu=False) 94 | self.relu = nn.ReLU(inplace=False) 95 | 96 | def forward(self, x): 97 | x0 = self.branch0(x) 98 | x1 = self.branch1(x) 99 | x2 = self.branch2(x) 100 | x3 = self.branch3(x) 101 | 102 | out = torch.cat((x0, x1, x2, x3), 1) 103 | out = self.ConvLinear(out) 104 | short = self.shortcut(x) 105 | out = out * self.scale + short 106 | out = self.relu(out) 107 | 108 | return out 109 | -------------------------------------------------------------------------------- /utils/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | #import pycocotools._mask as _mask 4 | from . import _mask 5 | 6 | # Interface for manipulating masks stored in RLE format. 7 | # 8 | # RLE is a simple yet efficient format for storing binary masks. RLE 9 | # first divides a vector (or vectorized image) into a series of piecewise 10 | # constant regions and then for each piece simply stores the length of 11 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 12 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 13 | # (note that the odd counts are always the numbers of zeros). Instead of 14 | # storing the counts directly, additional compression is achieved with a 15 | # variable bitrate representation based on a common scheme called LEB128. 16 | # 17 | # Compression is greatest given large piecewise constant regions. 18 | # Specifically, the size of the RLE is proportional to the number of 19 | # *boundaries* in M (or for an image the number of boundaries in the y 20 | # direction). Assuming fairly simple shapes, the RLE representation is 21 | # O(sqrt(n)) where n is number of pixels in the object. 
Hence space usage 22 | # is substantially lower, especially for large simple objects (large n). 23 | # 24 | # Many common operations on masks can be computed directly using the RLE 25 | # (without need for decoding). This includes computations such as area, 26 | # union, intersection, etc. All of these operations are linear in the 27 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 28 | # of the object. Computing these operations on the original mask is O(n). 29 | # Thus, using the RLE can result in substantial computational savings. 30 | # 31 | # The following API functions are defined: 32 | # encode - Encode binary masks using RLE. 33 | # decode - Decode binary masks encoded via RLE. 34 | # merge - Compute union or intersection of encoded masks. 35 | # iou - Compute intersection over union between masks. 36 | # area - Compute area of encoded masks. 37 | # toBbox - Get bounding boxes surrounding encoded masks. 38 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 39 | # 40 | # Usage: 41 | # Rs = encode( masks ) 42 | # masks = decode( Rs ) 43 | # R = merge( Rs, intersect=false ) 44 | # o = iou( dt, gt, iscrowd ) 45 | # a = area( Rs ) 46 | # bbs = toBbox( Rs ) 47 | # Rs = frPyObjects( [pyObjects], h, w ) 48 | # 49 | # In the API the following formats are used: 50 | # Rs - [dict] Run-length encoding of binary masks 51 | # R - dict Run-length encoding of binary mask 52 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 53 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 54 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 55 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 56 | # dt,gt - May be either bounding boxes or encoded masks 57 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 58 | # 59 | # Finally, a note about the intersection over union (iou) computation. 60 | # The standard iou of a ground truth (gt) and detected (dt) object is 61 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 62 | # For "crowd" regions, we use a modified criteria. If a gt object is 63 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 64 | # Choosing gt' in the crowd gt that best matches the dt can be done using 65 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 66 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 67 | # For crowd gt regions we use this modified criteria above for the iou. 68 | # 69 | # To compile run "python setup.py build_ext --inplace" 70 | # Please do not contact us for help with compiling. 71 | # 72 | # Microsoft COCO Toolbox. version 2.0 73 | # Data, paper, and tutorials available at: http://mscoco.org/ 74 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
75 | # Licensed under the Simplified BSD License [see coco/license.txt] 76 | 77 | iou = _mask.iou 78 | merge = _mask.merge 79 | frPyObjects = _mask.frPyObjects 80 | 81 | def encode(bimask): 82 | if len(bimask.shape) == 3: 83 | return _mask.encode(bimask) 84 | elif len(bimask.shape) == 2: 85 | h, w = bimask.shape 86 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 87 | 88 | def decode(rleObjs): 89 | if type(rleObjs) == list: 90 | return _mask.decode(rleObjs) 91 | else: 92 | return _mask.decode([rleObjs])[:,:,0] 93 | 94 | def area(rleObjs): 95 | if type(rleObjs) == list: 96 | return _mask.area(rleObjs) 97 | else: 98 | return _mask.area([rleObjs])[0] 99 | 100 | def toBbox(rleObjs): 101 | if type(rleObjs) == list: 102 | return _mask.toBbox(rleObjs) 103 | else: 104 | return _mask.toBbox([rleObjs])[0] 105 | -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from utils.box_utils import match, log_sum_exp 6 | GPU = False 7 | if torch.cuda.is_available(): 8 | GPU = True 9 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 10 | 11 | 12 | class MultiBoxLoss(nn.Module): 13 | """SSD Weighted Loss Function 14 | Compute Targets: 15 | 1) Produce Confidence Target Indices by matching ground truth boxes 16 | with (default) 'priorboxes' that have jaccard index > threshold parameter 17 | (default threshold: 0.5). 18 | 2) Produce localization target by 'encoding' variance into offsets of ground 19 | truth boxes and their matched 'priorboxes'. 20 | 3) Hard negative mining to filter the excessive number of negative examples 21 | that comes with using a large number of default bounding boxes. 22 | (default negative:positive ratio 3:1) 23 | Objective Loss: 24 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 25 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 26 | weighted by α which is set to 1 by cross val. 27 | Args: 28 | c: class confidences, 29 | l: predicted boxes, 30 | g: ground truth boxes 31 | N: number of matched default boxes 32 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 33 | """ 34 | 35 | 36 | def __init__(self, num_classes,overlap_thresh,prior_for_matching,bkg_label,neg_mining,neg_pos,neg_overlap,encode_target): 37 | super(MultiBoxLoss, self).__init__() 38 | self.num_classes = num_classes 39 | self.threshold = overlap_thresh 40 | self.background_label = bkg_label 41 | self.encode_target = encode_target 42 | self.use_prior_for_matching = prior_for_matching 43 | self.do_neg_mining = neg_mining 44 | self.negpos_ratio = neg_pos 45 | self.neg_overlap = neg_overlap 46 | self.variance = [0.1,0.2] 47 | 48 | def forward(self, predictions, priors, targets): 49 | """Multibox Loss 50 | Args: 51 | predictions (tuple): A tuple containing loc preds, conf preds, 52 | and prior boxes from SSD net. 53 | conf shape: torch.size(batch_size,num_priors,num_classes) 54 | loc shape: torch.size(batch_size,num_priors,4) 55 | priors shape: torch.size(num_priors,4) 56 | 57 | ground_truth (tensor): Ground truth boxes and labels for a batch, 58 | shape: [batch_size,num_objs,5] (last idx is the label). 
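            Returns:
                loss_l, loss_c: the Smooth L1 localization loss and the cross-entropy
                confidence loss, each already divided by N, the number of matched
                (positive) priors in the batch (see the normalization at the end of
                this method).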
59 | """ 60 | 61 | loc_data, conf_data = predictions 62 | priors = priors 63 | num = loc_data.size(0) 64 | num_priors = (priors.size(0)) 65 | num_classes = self.num_classes 66 | 67 | # match priors (default boxes) and ground truth boxes 68 | loc_t = torch.Tensor(num, num_priors, 4) 69 | conf_t = torch.LongTensor(num, num_priors) 70 | for idx in range(num): 71 | truths = targets[idx][:,:-1].data 72 | labels = targets[idx][:,-1].data 73 | defaults = priors.data 74 | match(self.threshold,truths,defaults,self.variance,labels,loc_t,conf_t,idx) 75 | if GPU: 76 | loc_t = loc_t.cuda() 77 | conf_t = conf_t.cuda() 78 | # wrap targets 79 | loc_t = Variable(loc_t, requires_grad=False) 80 | conf_t = Variable(conf_t,requires_grad=False) 81 | 82 | pos = conf_t > 0 83 | 84 | # Localization Loss (Smooth L1) 85 | # Shape: [batch,num_priors,4] 86 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 87 | loc_p = loc_data[pos_idx].view(-1,4) 88 | loc_t = loc_t[pos_idx].view(-1,4) 89 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 90 | 91 | # Compute max conf across batch for hard negative mining 92 | batch_conf = conf_data.view(-1,self.num_classes) 93 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1,1)) 94 | 95 | # Hard Negative Mining 96 | loss_c[pos] = 0 # filter out pos boxes for now 97 | loss_c = loss_c.view(num, -1) 98 | _,loss_idx = loss_c.sort(1, descending=True) 99 | _,idx_rank = loss_idx.sort(1) 100 | num_pos = pos.long().sum(1,keepdim=True) 101 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 102 | neg = idx_rank < num_neg.expand_as(idx_rank) 103 | 104 | # Confidence Loss Including Positive and Negative Examples 105 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 106 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 107 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes) 108 | targets_weighted = conf_t[(pos+neg).gt(0)] 109 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 110 | 111 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 112 | 113 | N = num_pos.data.sum() 114 | loss_l/=N 115 | loss_c/=N 116 | return loss_l,loss_c 117 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * 
(b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, 
ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | 70 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 71 | cdef unsigned int N = boxes.shape[0] 72 | cdef float iw, ih, box_area 73 | cdef float ua 74 | cdef int pos = 0 75 | cdef float maxscore = 0 76 | cdef int maxpos = 0 77 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 78 | 79 | for i in range(N): 80 | maxscore = boxes[i, 4] 81 | maxpos = i 82 | 83 | tx1 = boxes[i,0] 84 | ty1 = boxes[i,1] 85 | tx2 = boxes[i,2] 86 | ty2 = boxes[i,3] 87 | ts = boxes[i,4] 88 | 89 | pos = i + 1 90 | # get max box 91 | while pos < N: 92 | if maxscore < boxes[pos, 4]: 93 | maxscore = boxes[pos, 4] 94 | maxpos = pos 95 | pos = pos + 1 96 | 97 | # add max box as a detection 98 | boxes[i,0] = boxes[maxpos,0] 99 | boxes[i,1] = boxes[maxpos,1] 100 | boxes[i,2] = boxes[maxpos,2] 101 | boxes[i,3] = boxes[maxpos,3] 102 | boxes[i,4] = boxes[maxpos,4] 103 | 104 | # swap ith box with position of max box 105 | boxes[maxpos,0] = tx1 106 | boxes[maxpos,1] = ty1 107 | boxes[maxpos,2] = tx2 108 | boxes[maxpos,3] = ty2 109 | boxes[maxpos,4] = ts 110 | 111 | tx1 = boxes[i,0] 112 | ty1 = boxes[i,1] 113 | tx2 = boxes[i,2] 114 | ty2 = boxes[i,3] 115 | ts = boxes[i,4] 116 | 117 | pos = i + 1 118 | # NMS iterations, note that N changes if detection boxes fall below threshold 119 | while pos < N: 120 | x1 = boxes[pos, 0] 121 | y1 = boxes[pos, 1] 122 | x2 = boxes[pos, 2] 123 | y2 = boxes[pos, 3] 124 | s = boxes[pos, 4] 125 | 126 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 127 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 128 | if iw > 0: 129 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 130 | if ih > 0: 131 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 132 | ov = iw * ih / ua #iou between max box and detection box 133 | 134 | if method == 1: # linear 135 | if ov > Nt: 136 | weight = 1 - ov 137 | else: 138 | weight = 1 139 | elif method == 2: # gaussian 140 | weight = np.exp(-(ov * ov)/sigma) 141 | else: # original NMS 142 | if ov > Nt: 143 | weight = 0 144 | else: 145 | weight = 1 146 | 147 | boxes[pos, 4] = weight*boxes[pos, 4] 148 | 149 | # if box score falls below threshold, discard the box by swapping with last box 150 | # update N 151 | if boxes[pos, 4] < threshold: 152 | boxes[pos,0] = boxes[N-1, 0] 153 | boxes[pos,1] = boxes[N-1, 1] 154 | boxes[pos,2] = boxes[N-1, 2] 155 | boxes[pos,3] = boxes[N-1, 3] 156 | boxes[pos,4] = boxes[N-1, 4] 157 | N = N - 1 158 | pos = pos - 1 159 | 160 | pos = pos + 1 161 | 162 | keep = [i for i in range(N)] 163 | return keep 164 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from 
utils.box_utils import match,refine_match, log_sum_exp,decode 7 | GPU = False 8 | if torch.cuda.is_available(): 9 | GPU = True 10 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 11 | 12 | 13 | class RefineMultiBoxLoss(nn.Module): 14 | """SSD Weighted Loss Function 15 | Compute Targets: 16 | 1) Produce Confidence Target Indices by matching ground truth boxes 17 | with (default) 'priorboxes' that have jaccard index > threshold parameter 18 | (default threshold: 0.5). 19 | 2) Produce localization target by 'encoding' variance into offsets of ground 20 | truth boxes and their matched 'priorboxes'. 21 | 3) Hard negative mining to filter the excessive number of negative examples 22 | that comes with using a large number of default bounding boxes. 23 | (default negative:positive ratio 3:1) 24 | Objective Loss: 25 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 26 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 27 | weighted by α which is set to 1 by cross val. 28 | Args: 29 | c: class confidences, 30 | l: predicted boxes, 31 | g: ground truth boxes 32 | N: number of matched default boxes 33 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 34 | """ 35 | 36 | 37 | def __init__(self, num_classes,overlap_thresh,prior_for_matching,bkg_label,neg_mining,neg_pos,neg_overlap,encode_target,object_score = 0): 38 | super(RefineMultiBoxLoss, self).__init__() 39 | self.num_classes = num_classes 40 | self.threshold = overlap_thresh 41 | self.background_label = bkg_label 42 | self.encode_target = encode_target 43 | self.use_prior_for_matching = prior_for_matching 44 | self.do_neg_mining = neg_mining 45 | self.negpos_ratio = neg_pos 46 | self.neg_overlap = neg_overlap 47 | self.object_score = object_score 48 | self.variance = [0.1,0.2] 49 | 50 | def forward(self, odm_data,priors, targets,arm_data = None,filter_object = False): 51 | """Multibox Loss 52 | Args: 53 | predictions (tuple): A tuple containing loc preds, conf preds, 54 | and prior boxes from SSD net. 55 | conf shape: torch.size(batch_size,num_priors,num_classes) 56 | loc shape: torch.size(batch_size,num_priors,4) 57 | priors shape: torch.size(num_priors,4) 58 | 59 | ground_truth (tensor): Ground truth boxes and labels for a batch, 60 | shape: [batch_size,num_objs,5] (last idx is the label). 
61 | arm_data (tuple): arm branch containg arm_loc and arm_conf 62 | filter_object: whether filter out the prediction according to the arm conf score 63 | """ 64 | 65 | loc_data,conf_data = odm_data 66 | if arm_data: 67 | arm_loc,arm_conf = arm_data 68 | priors = priors.data 69 | num = loc_data.size(0) 70 | num_priors = (priors.size(0)) 71 | 72 | # match priors (default boxes) and ground truth boxes 73 | loc_t = torch.Tensor(num, num_priors, 4) 74 | conf_t = torch.LongTensor(num, num_priors) 75 | for idx in range(num): 76 | truths = targets[idx][:,:-1].data 77 | labels = targets[idx][:,-1].data 78 | #for object detection 79 | if self.num_classes == 2: 80 | labels = labels > 0 81 | if arm_data: 82 | refine_match(self.threshold,truths,priors,self.variance,labels,loc_t,conf_t,idx,arm_loc[idx].data) 83 | else: 84 | match(self.threshold,truths,priors,self.variance,labels,loc_t,conf_t,idx) 85 | if GPU: 86 | loc_t = loc_t.cuda() 87 | conf_t = conf_t.cuda() 88 | # wrap targets 89 | loc_t = Variable(loc_t, requires_grad=False) 90 | conf_t = Variable(conf_t,requires_grad=False) 91 | if arm_data and filter_object: 92 | arm_conf_data = arm_conf.data[:,:,1] 93 | pos = conf_t > 0 94 | object_score_index = arm_conf_data <= self.object_score 95 | pos[object_score_index] = 0 96 | 97 | else: 98 | pos = conf_t > 0 99 | 100 | # Localization Loss (Smooth L1) 101 | # Shape: [batch,num_priors,4] 102 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 103 | loc_p = loc_data[pos_idx].view(-1,4) 104 | loc_t = loc_t[pos_idx].view(-1,4) 105 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 106 | 107 | # Compute max conf across batch for hard negative mining 108 | batch_conf = conf_data.view(-1,self.num_classes) 109 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1,1)) 110 | 111 | # Hard Negative Mining 112 | loss_c[pos] = 0 # filter out pos boxes for now 113 | loss_c = loss_c.view(num, -1) 114 | _,loss_idx = loss_c.sort(1, descending=True) 115 | _,idx_rank = loss_idx.sort(1) 116 | num_pos = pos.long().sum(1,keepdim=True) 117 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 118 | neg = idx_rank < num_neg.expand_as(idx_rank) 119 | 120 | # Confidence Loss Including Positive and Negative Examples 121 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 122 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 123 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes) 124 | targets_weighted = conf_t[(pos+neg).gt(0)] 125 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 126 | 127 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 128 | N = num_pos.data.sum() 129 | loss_l/=N 130 | loss_c/=N 131 | return loss_l,loss_c 132 | -------------------------------------------------------------------------------- /utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom 
http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 44 | if nvcc is None: 45 | raise EnvironmentError('The nvcc binary could not be ' 46 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 47 | home = os.path.dirname(os.path.dirname(nvcc)) 48 | 49 | cudaconfig = {'home': home, 'nvcc': nvcc, 50 | 'include': pjoin(home, 'include'), 51 | 'lib64': pjoin(home, 'lib64')} 52 | for k, v in cudaconfig.items(): 53 | if not os.path.exists(v): 54 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 55 | 56 | return cudaconfig 57 | 58 | 59 | CUDA = locate_cuda() 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | 72 | If you subclass UnixCCompiler, it's not trivial to get your subclass 73 | injected in, and still have the right customizations (i.e. 74 | distutils.sysconfig.customize_compiler) run on it. So instead of going 75 | the OO route, I have this. Note, it's kindof like a wierd functional 76 | subclassing going on.""" 77 | 78 | # tell the compiler it can processes .cu 79 | self.src_extensions.append('.cu') 80 | 81 | # save references to the default compiler_so and _comple methods 82 | default_compiler_so = self.compiler_so 83 | super = self._compile 84 | 85 | # now redefine the _compile method. This gets executed for each 86 | # object but distutils doesn't have the ability to change compilers 87 | # based on source extension: we add it. 
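# A minimal sketch of the per-source dispatch implemented by _compile below (an assumed
# reading of the trick, not a distutils API): extra_compile_args is passed as a dict keyed
# by compiler name, and the wrapper picks the matching list for each source file, e.g.
#   postargs = extra_postargs['nvcc'] if src.endswith('.cu') else extra_postargs['gcc']
# while temporarily swapping compiler_so to CUDA['nvcc'] for the .cu sources and restoring
# the default compiler afterwards.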
88 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 89 | print(extra_postargs) 90 | if os.path.splitext(src)[1] == '.cu': 91 | # use the cuda for .cu files 92 | self.set_executable('compiler_so', CUDA['nvcc']) 93 | # use only a subset of the extra_postargs, which are 1-1 translated 94 | # from the extra_compile_args in the Extension class 95 | postargs = extra_postargs['nvcc'] 96 | else: 97 | postargs = extra_postargs['gcc'] 98 | 99 | super(obj, src, ext, cc_args, postargs, pp_opts) 100 | # reset the default compiler_so, which we might have changed for cuda 101 | self.compiler_so = default_compiler_so 102 | 103 | # inject our redefined _compile method into the class 104 | self._compile = _compile 105 | 106 | 107 | # run the customize_compiler 108 | class custom_build_ext(build_ext): 109 | def build_extensions(self): 110 | customize_compiler_for_nvcc(self.compiler) 111 | build_ext.build_extensions(self) 112 | 113 | 114 | ext_modules = [ 115 | Extension( 116 | "nms.cpu_nms", 117 | ["nms/cpu_nms.pyx"], 118 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 119 | include_dirs=[numpy_include] 120 | ), 121 | Extension('nms.gpu_nms', 122 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 123 | library_dirs=[CUDA['lib64']], 124 | libraries=['cudart'], 125 | language='c++', 126 | runtime_library_dirs=[CUDA['lib64']], 127 | # this syntax is specific to this build system 128 | # we're only going to use certain compiler args with nvcc and not with gcc 129 | # the implementation of this trick is in customize_compiler() below 130 | extra_compile_args={'gcc': ["-Wno-unused-function"], 131 | 'nvcc': ['-arch=sm_52', 132 | '--ptxas-options=-v', 133 | '-c', 134 | '--compiler-options', 135 | "'-fPIC'"]}, 136 | include_dirs=[numpy_include, CUDA['include']] 137 | ), 138 | Extension( 139 | 'pycocotools._mask', 140 | sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], 141 | include_dirs=[numpy_include, 'pycocotools'], 142 | extra_compile_args={ 143 | 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, 144 | ), 145 | ] 146 | 147 | setup( 148 | name='mot_utils', 149 | ext_modules=ext_modules, 150 | # inject our custom trigger 151 | cmdclass={'build_ext': custom_build_ext}, 152 | ) 153 | -------------------------------------------------------------------------------- /models/mobilenet.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | """ 4 | Creates a MobileNet Model as defined in: 5 | Andrew G. Howard Menglong Zhu Bo Chen, et.al. (2017). 6 | MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. 
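# A rough cost note (an added sketch, not part of the original header): replacing a dense
# k x k convolution with the depthwise-separable pair used by DepthWiseBlock below
# (a k x k Conv2d with groups=in_channels followed by a 1x1 pointwise Conv2d) cuts the
# multiply count by roughly a factor of 1/C_out + 1/k**2, i.e. about 8-9x for k = 3.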
7 | (c) Yang Lu 8 | """ 9 | import math 10 | import torch.nn as nn 11 | import torch 12 | 13 | __all__ = ['DepthWiseBlock', 'mobilenet', 'mobilenet_2', 'mobilenet_1', 'mobilenet_075', 'mobilenet_05', 14 | 'mobilenet_025'] 15 | 16 | class SELayer(nn.Module): 17 | def __init__(self, channel, reduction=16): 18 | super(SELayer, self).__init__() 19 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 20 | self.fc = nn.Sequential( 21 | nn.Linear(channel, channel // reduction, bias=False), 22 | nn.ReLU(inplace=True), 23 | nn.Linear(channel // reduction, channel, bias=False), 24 | nn.Sigmoid() 25 | ) 26 | 27 | def forward(self, x): 28 | b, c, _, _ = x.size() 29 | y = self.avg_pool(x).view(b,c) 30 | y = self.fc(y).view(b, c, 1, 1) 31 | return x * y.expand_as(x) 32 | 33 | class DepthWiseBlock(nn.Module): 34 | def __init__(self, inplanes, planes, stride=1, padding=1): 35 | super(DepthWiseBlock, self).__init__() 36 | inplanes, planes = int(inplanes), int(planes) 37 | self.conv_dw = nn.Conv2d(inplanes, inplanes, kernel_size=3, padding=padding, stride=stride, groups=inplanes, 38 | bias=False) 39 | self.bn_dw = nn.BatchNorm2d(inplanes) 40 | self.conv_sep = nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False) 41 | self.bn_sep = nn.BatchNorm2d(planes) 42 | self.relu = nn.ReLU(inplace=True) 43 | #self.se = SELayer(inplanes) 44 | 45 | def forward(self, x): 46 | out = self.conv_dw(x) 47 | out = self.bn_dw(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv_sep(out) 51 | out = self.bn_sep(out) 52 | out = self.relu(out) 53 | #out = self.se(out) 54 | 55 | return out 56 | 57 | 58 | class MobileNet(nn.Module): 59 | def __init__(self, widen_factor=1.0, num_classes=1000): 60 | """ Constructor 61 | Args: 62 | widen_factor: config of widen_factor 63 | num_classes: number of classes 64 | """ 65 | super(MobileNet, self).__init__() 66 | 67 | block = DepthWiseBlock 68 | self.conv1 = nn.Conv2d(3, int(32 * widen_factor), kernel_size=3, stride=2, padding=1, bias=False) 69 | self.bn1 = nn.BatchNorm2d(int(32 * widen_factor)) 70 | self.relu = nn.ReLU(inplace=True) 71 | 72 | self.dw2_1 = block(32 * widen_factor, 64 * widen_factor) 73 | self.dw2_2 = block(64 * widen_factor, 128 * widen_factor, stride=2) 74 | 75 | self.dw3_1 = block(128 * widen_factor, 128 * widen_factor) 76 | self.dw3_2 = block(128 * widen_factor, 256 * widen_factor, stride=2) 77 | 78 | self.dw4_1 = block(256 * widen_factor, 256 * widen_factor) 79 | self.dw4_2 = block(256 * widen_factor, 512 * widen_factor, stride=2) 80 | 81 | self.dw5_1 = block(512 * widen_factor, 512 * widen_factor) 82 | self.dw5_2 = block(512 * widen_factor, 512 * widen_factor) 83 | self.dw5_3 = block(512 * widen_factor, 512 * widen_factor) 84 | self.dw5_4 = block(512 * widen_factor, 512 * widen_factor) 85 | self.dw5_5 = block(512 * widen_factor, 512 * widen_factor) 86 | self.dw5_6 = block(512 * widen_factor, 1024 * widen_factor, stride=2) 87 | 88 | self.dw6 = block(1024 * widen_factor, 1024 * widen_factor) 89 | 90 | self.avgpool = nn.AdaptiveAvgPool2d(1) 91 | self.fc = nn.Linear(int(1024 * widen_factor), num_classes) 92 | 93 | for m in self.modules(): 94 | if isinstance(m, nn.Conv2d): 95 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 96 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 97 | elif isinstance(m, nn.BatchNorm2d): 98 | m.weight.data.fill_(1) 99 | m.bias.data.zero_() 100 | 101 | def forward(self, x): 102 | x = self.conv1(x) 103 | x = self.bn1(x) 104 | x = self.relu(x) 105 | 106 | x = self.dw2_1(x) 107 | x = self.dw2_2(x) 108 | x = self.dw3_1(x) 109 | x = self.dw3_2(x) 110 | x0 = self.dw4_1(x) 111 | #print(x0.size(),"layer4_1") 112 | x = self.dw4_2(x0) 113 | x = self.dw5_1(x) 114 | x = self.dw5_2(x) 115 | x = self.dw5_3(x) 116 | x = self.dw5_4(x) 117 | x1 = self.dw5_5(x) 118 | #print(x1.size(), "layer5_5") 119 | x = self.dw5_6(x1) 120 | x2 = self.dw6(x) 121 | #print(x2.size(), "layer6_1") 122 | return x0, x1, x2 123 | 124 | 125 | def mobilenet(widen_factor=1.0, num_classes=1000): 126 | """ 127 | Construct MobileNet. 128 | """ 129 | model = MobileNet(widen_factor=widen_factor, num_classes=num_classes) 130 | return model 131 | 132 | 133 | def mobilenet_2(): 134 | """ 135 | Construct MobileNet. 136 | """ 137 | model = MobileNet(widen_factor=2.0, num_classes=1000) 138 | return model 139 | 140 | 141 | def mobilenet_1(): 142 | """ 143 | Construct MobileNet. 144 | """ 145 | model = MobileNet(widen_factor=1.0, num_classes=1000) 146 | return model 147 | 148 | 149 | def mobilenet_075(): 150 | """ 151 | Construct MobileNet. 152 | """ 153 | model = MobileNet(widen_factor=0.75, num_classes=1000) 154 | return model 155 | 156 | 157 | def mobilenet_05(): 158 | """ 159 | Construct MobileNet. 160 | """ 161 | model = MobileNet(widen_factor=0.5, num_classes=1000) 162 | return model 163 | 164 | 165 | def mobilenet_025(): 166 | """ 167 | Construct MobileNet. 168 | """ 169 | model = MobileNet(widen_factor=0.25, num_classes=1000) 170 | return model 171 | 172 | 173 | # if __name__ == '__main__': 174 | # mobilenet = mobilenet_1() 175 | # print(mobilenet) 176 | # print(mobilenet.state_dict().keys()) 177 | 178 | # from torch.autograd import Variable 179 | # 180 | # input = Variable(torch.randn(1, 3, 300, 300)) 181 | # 182 | # model = mobilenet() 183 | # print(model) 184 | # 185 | # output = model(input) 186 | # print(output.size()) 187 | -------------------------------------------------------------------------------- /data/voc0712_aug.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | #from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | VOC_CLASSES = ( # always index 0 21 | 'aeroplane', 'bicycle', 'bird', 'boat', 22 | 'bottle', 'bus', 'car', 'cat', 'chair', 23 | 'cow', 'diningtable', 'dog', 'horse', 24 | 'motorbike', 'person', 'pottedplant', 25 | 'sheep', 'sofa', 'train', 'tvmonitor') 26 | 27 | # note: if you used our download scripts, this should be right 28 | #VOC_ROOT = osp.join(HOME, "data/VOCdevkit/") 29 | 30 | 31 | class VOCAnnotationTransform(object): 32 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 33 | Initilized with a dictionary lookup of classnames to indexes 34 | 35 | Arguments: 36 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 37 | (default: alphabetic indexing of VOC's 20 classes) 38 | keep_difficult (bool, optional): keep 
difficult instances or not 39 | (default: False) 40 | height (int): height 41 | width (int): width 42 | """ 43 | 44 | def __init__(self, class_to_ind=None, keep_difficult=False): 45 | self.class_to_ind = class_to_ind or dict( 46 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 47 | self.keep_difficult = keep_difficult 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Arguments: 52 | target (annotation) : the target annotation to be made usable 53 | will be an ET.Element 54 | Returns: 55 | a list containing lists of bounding boxes [bbox coords, class name] 56 | """ 57 | res = [] 58 | for obj in target.iter('object'): 59 | difficult = int(obj.find('difficult').text) == 1 60 | if not self.keep_difficult and difficult: 61 | continue 62 | name = obj.find('name').text.lower().strip() 63 | bbox = obj.find('bndbox') 64 | 65 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 66 | bndbox = [] 67 | for i, pt in enumerate(pts): 68 | cur_pt = int(bbox.find(pt).text) - 1 69 | # scale height or width 70 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 71 | bndbox.append(cur_pt) 72 | label_idx = self.class_to_ind[name] 73 | bndbox.append(label_idx) 74 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 75 | # img_id = target.find('filename').text[:-4] 76 | 77 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 78 | 79 | 80 | class VOCDetection(data.Dataset): 81 | """VOC Detection Dataset Object 82 | 83 | input is image, target is annotation 84 | 85 | Arguments: 86 | root (string): filepath to VOCdevkit folder. 87 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 88 | transform (callable, optional): transformation to perform on the 89 | input image 90 | target_transform (callable, optional): transformation to perform on the 91 | target `annotation` 92 | (eg: take in caption string, return tensor of word indices) 93 | dataset_name (string, optional): which dataset to load 94 | (default: 'VOC2007') 95 | """ 96 | 97 | def __init__(self, root, 98 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 99 | transform=None, target_transform=VOCAnnotationTransform(), 100 | dataset_name='VOC0712'): 101 | self.root = root 102 | self.image_set = image_sets 103 | self.transform = transform 104 | self.target_transform = target_transform 105 | self.name = dataset_name 106 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 107 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 108 | self.ids = list() 109 | for (year, name) in image_sets: 110 | rootpath = osp.join(self.root, 'VOC' + year) 111 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 112 | self.ids.append((rootpath, line.strip())) 113 | 114 | def __getitem__(self, index): 115 | im, gt, h, w = self.pull_item(index) 116 | 117 | return im, gt 118 | 119 | def __len__(self): 120 | return len(self.ids) 121 | 122 | def pull_item(self, index): 123 | img_id = self.ids[index] 124 | 125 | target = ET.parse(self._annopath % img_id).getroot() 126 | img = cv2.imread(self._imgpath % img_id) 127 | height, width, channels = img.shape 128 | 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | 132 | if self.transform is not None: 133 | target = np.array(target) 134 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | # img = img.transpose(2, 0, 1) 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), 
target, height, width 140 | # return torch.from_numpy(img), target, height, width 141 | 142 | def pull_image(self, index): 143 | '''Returns the original image object at index in PIL form 144 | 145 | Note: not using self.__getitem__(), as any transformations passed in 146 | could mess up this functionality. 147 | 148 | Argument: 149 | index (int): index of img to show 150 | Return: 151 | PIL img 152 | ''' 153 | img_id = self.ids[index] 154 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | anno = ET.parse(self._annopath % img_id).getroot() 170 | gt = self.target_transform(anno, 1, 1) 171 | return img_id[1], gt 172 | 173 | def pull_tensor(self, index): 174 | '''Returns the original image at an index in tensor form 175 | 176 | Note: not using self.__getitem__(), as any transformations passed in 177 | could mess up this functionality. 178 | 179 | Argument: 180 | index (int): index of img to show 181 | Return: 182 | tensorized version of img, squeezed 183 | ''' 184 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 185 | -------------------------------------------------------------------------------- /data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import pickle 8 | import xml.etree.ElementTree as ET 9 | 10 | import numpy as np 11 | import os 12 | 13 | 14 | def parse_rec(filename): 15 | """ Parse a PASCAL VOC xml file """ 16 | tree = ET.parse(filename) 17 | objects = [] 18 | for obj in tree.findall('object'): 19 | obj_struct = {} 20 | obj_struct['name'] = obj.find('name').text 21 | obj_struct['pose'] = obj.find('pose').text 22 | obj_struct['truncated'] = int(obj.find('truncated').text) 23 | obj_struct['difficult'] = int(obj.find('difficult').text) 24 | bbox = obj.find('bndbox') 25 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 26 | int(bbox.find('ymin').text), 27 | int(bbox.find('xmax').text), 28 | int(bbox.find('ymax').text)] 29 | objects.append(obj_struct) 30 | 31 | return objects 32 | 33 | 34 | def voc_ap(rec, prec, use_07_metric=False): 35 | """ ap = voc_ap(rec, prec, [use_07_metric]) 36 | Compute VOC AP given precision and recall. 37 | If use_07_metric is true, uses the 38 | VOC 07 11 point method (default:False). 39 | """ 40 | if use_07_metric: 41 | # 11 point metric 42 | ap = 0. 43 | for t in np.arange(0., 1.1, 0.1): 44 | if np.sum(rec >= t) == 0: 45 | p = 0 46 | else: 47 | p = np.max(prec[rec >= t]) 48 | ap = ap + p / 11. 
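# A small worked example of the 11-point rule above (illustrative numbers, not from the repo):
# with prec = [1.0, 1.0, 0.5] at rec = [0.1, 0.5, 1.0], the maximum precision is 1.0 for
# t = 0.0 ... 0.5 and 0.5 for t = 0.6 ... 1.0, giving ap = (6*1.0 + 5*0.5)/11 ~= 0.77.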
49 | else: 50 | # correct AP calculation 51 | # first append sentinel values at the end 52 | mrec = np.concatenate(([0.], rec, [1.])) 53 | mpre = np.concatenate(([0.], prec, [0.])) 54 | 55 | # compute the precision envelope 56 | for i in range(mpre.size - 1, 0, -1): 57 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 58 | 59 | # to calculate area under PR curve, look for points 60 | # where X axis (recall) changes value 61 | i = np.where(mrec[1:] != mrec[:-1])[0] 62 | 63 | # and sum (\Delta recall) * prec 64 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 65 | return ap 66 | 67 | 68 | def voc_eval(detpath, 69 | annopath, 70 | imagesetfile, 71 | classname, 72 | cachedir, 73 | ovthresh=0.5, 74 | use_07_metric=False): 75 | """rec, prec, ap = voc_eval(detpath, 76 | annopath, 77 | imagesetfile, 78 | classname, 79 | [ovthresh], 80 | [use_07_metric]) 81 | 82 | Top level function that does the PASCAL VOC evaluation. 83 | 84 | detpath: Path to detections 85 | detpath.format(classname) should produce the detection results file. 86 | annopath: Path to annotations 87 | annopath.format(imagename) should be the xml annotations file. 88 | imagesetfile: Text file containing the list of images, one image per line. 89 | classname: Category name (duh) 90 | cachedir: Directory for caching the annotations 91 | [ovthresh]: Overlap threshold (default = 0.5) 92 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 93 | (default False) 94 | """ 95 | # assumes detections are in detpath.format(classname) 96 | # assumes annotations are in annopath.format(imagename) 97 | # assumes imagesetfile is a text file with each line an image name 98 | # cachedir caches the annotations in a pickle file 99 | 100 | # first load gt 101 | if not os.path.isdir(cachedir): 102 | os.mkdir(cachedir) 103 | cachefile = os.path.join(cachedir, 'annots.pkl') 104 | # read list of images 105 | with open(imagesetfile, 'r') as f: 106 | lines = f.readlines() 107 | imagenames = [x.strip() for x in lines] 108 | 109 | if not os.path.isfile(cachefile): 110 | # load annots 111 | recs = {} 112 | for i, imagename in enumerate(imagenames): 113 | recs[imagename] = parse_rec(annopath.format(imagename)) 114 | if i % 100 == 0: 115 | print('Reading annotation for {:d}/{:d}'.format( 116 | i + 1, len(imagenames))) 117 | # save 118 | print('Saving cached annotations to {:s}'.format(cachefile)) 119 | with open(cachefile, 'wb') as f: 120 | pickle.dump(recs, f) 121 | else: 122 | # load 123 | with open(cachefile, 'rb') as f: 124 | recs = pickle.load(f) 125 | 126 | # extract gt objects for this class 127 | class_recs = {} 128 | npos = 0 129 | for imagename in imagenames: 130 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 131 | bbox = np.array([x['bbox'] for x in R]) 132 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 133 | det = [False] * len(R) 134 | npos = npos + sum(~difficult) 135 | class_recs[imagename] = {'bbox': bbox, 136 | 'difficult': difficult, 137 | 'det': det} 138 | 139 | # read dets 140 | detfile = detpath.format(classname) 141 | with open(detfile, 'r') as f: 142 | lines = f.readlines() 143 | 144 | splitlines = [x.strip().split(' ') for x in lines] 145 | image_ids = [x[0] for x in splitlines] 146 | confidence = np.array([float(x[1]) for x in splitlines]) 147 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 148 | 149 | # sort by confidence 150 | sorted_ind = np.argsort(-confidence) 151 | sorted_scores = np.sort(-confidence) 152 | BB = BB[sorted_ind, :] 153 | image_ids = [image_ids[x] 
for x in sorted_ind] 154 | 155 | # go down dets and mark TPs and FPs 156 | nd = len(image_ids) 157 | tp = np.zeros(nd) 158 | fp = np.zeros(nd) 159 | for d in range(nd): 160 | R = class_recs[image_ids[d]] 161 | bb = BB[d, :].astype(float) 162 | ovmax = -np.inf 163 | BBGT = R['bbox'].astype(float) 164 | 165 | if BBGT.size > 0: 166 | # compute overlaps 167 | # intersection 168 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 169 | iymin = np.maximum(BBGT[:, 1], bb[1]) 170 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 171 | iymax = np.minimum(BBGT[:, 3], bb[3]) 172 | iw = np.maximum(ixmax - ixmin + 1., 0.) 173 | ih = np.maximum(iymax - iymin + 1., 0.) 174 | inters = iw * ih 175 | 176 | # union 177 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 178 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 179 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 180 | 181 | overlaps = inters / uni 182 | ovmax = np.max(overlaps) 183 | jmax = np.argmax(overlaps) 184 | 185 | if ovmax > ovthresh: 186 | if not R['difficult'][jmax]: 187 | if not R['det'][jmax]: 188 | tp[d] = 1. 189 | R['det'][jmax] = 1 190 | else: 191 | fp[d] = 1. 192 | else: 193 | fp[d] = 1. 194 | 195 | # compute precision recall 196 | fp = np.cumsum(fp) 197 | tp = np.cumsum(tp) 198 | rec = tp / float(npos) 199 | # avoid divide by zero in case the first detection matches a difficult 200 | # ground truth 201 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 202 | ap = voc_ap(rec, prec, use_07_metric) 203 | 204 | return rec, prec, ap 205 | -------------------------------------------------------------------------------- /models/misc.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import shutil 4 | import pickle as pkl 5 | import time 6 | import numpy as np 7 | import hashlib 8 | 9 | from IPython import embed 10 | 11 | class Logger(object): 12 | def __init__(self): 13 | self._logger = None 14 | 15 | def init(self, logdir, name='log'): 16 | if self._logger is None: 17 | import logging 18 | if not os.path.exists(logdir): 19 | os.makedirs(logdir) 20 | log_file = os.path.join(logdir, name) 21 | if os.path.exists(log_file): 22 | os.remove(log_file) 23 | self._logger = logging.getLogger() 24 | self._logger.setLevel('INFO') 25 | fh = logging.FileHandler(log_file) 26 | ch = logging.StreamHandler() 27 | self._logger.addHandler(fh) 28 | self._logger.addHandler(ch) 29 | 30 | def info(self, str_info): 31 | self.init('/tmp', 'tmp.log') 32 | self._logger.info(str_info) 33 | logger = Logger() 34 | 35 | print = logger.info 36 | def ensure_dir(path, erase=False): 37 | if os.path.exists(path) and erase: 38 | print("Removing old folder {}".format(path)) 39 | shutil.rmtree(path) 40 | if not os.path.exists(path): 41 | print("Creating folder {}".format(path)) 42 | os.makedirs(path) 43 | 44 | def load_pickle(path): 45 | begin_st = time.time() 46 | with open(path, 'rb') as f: 47 | print("Loading pickle object from {}".format(path)) 48 | v = pkl.load(f) 49 | print("=> Done ({:.4f} s)".format(time.time() - begin_st)) 50 | return v 51 | 52 | def dump_pickle(obj, path): 53 | with open(path, 'wb') as f: 54 | print("Dumping pickle object to {}".format(path)) 55 | pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) 56 | 57 | def auto_select_gpu(mem_bound=500, utility_bound=0, gpus=(0, 1, 2, 3, 4, 5, 6, 7), num_gpu=1, selected_gpus=None): 58 | import sys 59 | import os 60 | import subprocess 61 | import re 62 | import time 63 | import numpy as np 64 | if 'CUDA_VISIBLE_DEVCIES' in os.environ: 65 | sys.exit(0) 66 | if selected_gpus is 
None: 67 | mem_trace = [] 68 | utility_trace = [] 69 | for i in range(5): # sample 5 times 70 | info = subprocess.check_output('nvidia-smi', shell=True).decode('utf-8') 71 | mem = [int(s[:-5]) for s in re.compile('\d+MiB\s/').findall(info)] 72 | utility = [int(re.compile('\d+').findall(s)[0]) for s in re.compile('\d+%\s+Default').findall(info)] 73 | mem_trace.append(mem) 74 | utility_trace.append(utility) 75 | time.sleep(0.1) 76 | mem = np.mean(mem_trace, axis=0) 77 | utility = np.mean(utility_trace, axis=0) 78 | assert(len(mem) == len(utility)) 79 | nGPU = len(utility) 80 | ideal_gpus = [i for i in range(nGPU) if mem[i] <= mem_bound and utility[i] <= utility_bound and i in gpus] 81 | 82 | if len(ideal_gpus) < num_gpu: 83 | print("No sufficient resource, available: {}, require {} gpu".format(ideal_gpus, num_gpu)) 84 | sys.exit(0) 85 | else: 86 | selected_gpus = list(map(str, ideal_gpus[:num_gpu])) 87 | else: 88 | selected_gpus = selected_gpus.split(',') 89 | 90 | print("Setting GPU: {}".format(selected_gpus)) 91 | os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(selected_gpus) 92 | return selected_gpus 93 | 94 | def expand_user(path): 95 | return os.path.abspath(os.path.expanduser(path)) 96 | 97 | def model_snapshot(model, new_file, old_file=None, verbose=False): 98 | from collections import OrderedDict 99 | import torch 100 | if isinstance(model, torch.nn.DataParallel): 101 | model = model.module 102 | if old_file and os.path.exists(expand_user(old_file)): 103 | if verbose: 104 | print("Removing old model {}".format(expand_user(old_file))) 105 | os.remove(expand_user(old_file)) 106 | if verbose: 107 | print("Saving model to {}".format(expand_user(new_file))) 108 | 109 | state_dict = OrderedDict() 110 | for k, v in model.state_dict().items(): 111 | if v.is_cuda: 112 | v = v.cpu() 113 | state_dict[k] = v 114 | torch.save(state_dict, expand_user(new_file)) 115 | 116 | 117 | def load_lmdb(lmdb_file, n_records=None): 118 | import lmdb 119 | import numpy as np 120 | lmdb_file = expand_user(lmdb_file) 121 | if os.path.exists(lmdb_file): 122 | data = [] 123 | env = lmdb.open(lmdb_file, readonly=True, max_readers=512) 124 | with env.begin() as txn: 125 | cursor = txn.cursor() 126 | begin_st = time.time() 127 | print("Loading lmdb file {} into memory".format(lmdb_file)) 128 | for key, value in cursor: 129 | _, target, _ = key.decode('ascii').split(':') 130 | target = int(target) 131 | img = cv2.imdecode(np.fromstring(value, np.uint8), cv2.IMREAD_COLOR) 132 | data.append((img, target)) 133 | if n_records is not None and len(data) >= n_records: 134 | break 135 | env.close() 136 | print("=> Done ({:.4f} s)".format(time.time() - begin_st)) 137 | return data 138 | else: 139 | print("Not found lmdb file".format(lmdb_file)) 140 | 141 | def str2img(str_b): 142 | return cv2.imdecode(np.fromstring(str_b, np.uint8), cv2.IMREAD_COLOR) 143 | 144 | def img2str(img): 145 | return cv2.imencode('.jpg', img)[1].tostring() 146 | 147 | def md5(s): 148 | m = hashlib.md5() 149 | m.update(s) 150 | return m.hexdigest() 151 | 152 | def eval_model(model, ds, n_sample=None, ngpu=1, is_imagenet=False): 153 | import tqdm 154 | import torch 155 | from torch import nn 156 | from torch.autograd import Variable 157 | 158 | class ModelWrapper(nn.Module): 159 | def __init__(self, model): 160 | super(ModelWrapper, self).__init__() 161 | self.model = model 162 | self.mean = [0.485, 0.456, 0.406] 163 | self.std = [0.229, 0.224, 0.225] 164 | 165 | def forward(self, input): 166 | input.data.div_(255.) 
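# The three sub_/div_ lines below apply the usual ImageNet per-channel normalisation after
# the division by 255 above; a minimal equivalent sketch (assuming a float CHW batch x):
#   mean = torch.FloatTensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
#   std = torch.FloatTensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
#   x = (x / 255. - mean) / std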
167 | input.data[:, 0, :, :].sub_(self.mean[0]).div_(self.std[0]) 168 | input.data[:, 1, :, :].sub_(self.mean[1]).div_(self.std[1]) 169 | input.data[:, 2, :, :].sub_(self.mean[2]).div_(self.std[2]) 170 | return self.model(input) 171 | 172 | correct1, correct5 = 0, 0 173 | n_passed = 0 174 | if is_imagenet: 175 | model = ModelWrapper(model) 176 | model = model.eval() 177 | model = torch.nn.DataParallel(model, device_ids=range(ngpu)).cuda() 178 | 179 | n_sample = len(ds) if n_sample is None else n_sample 180 | for idx, (data, target) in enumerate(tqdm.tqdm(ds, total=n_sample)): 181 | n_passed += len(data) 182 | data = Variable(torch.FloatTensor(data)).cuda() 183 | indx_target = torch.LongTensor(target) 184 | output = model(data) 185 | bs = output.size(0) 186 | idx_pred = output.data.sort(1, descending=True)[1] 187 | 188 | idx_gt1 = indx_target.expand(1, bs).transpose_(0, 1) 189 | idx_gt5 = idx_gt1.expand(bs, 5) 190 | 191 | correct1 += idx_pred[:, :1].cpu().eq(idx_gt1).sum() 192 | correct5 += idx_pred[:, :5].cpu().eq(idx_gt5).sum() 193 | 194 | if idx >= n_sample - 1: 195 | break 196 | 197 | acc1 = correct1 * 1.0 / n_passed 198 | acc5 = correct5 * 1.0 / n_passed 199 | return acc1, acc5 200 | 201 | def load_state_dict(model, model_urls, model_root): 202 | from torch.utils import model_zoo 203 | from torch import nn 204 | import re 205 | from collections import OrderedDict 206 | own_state_old = model.state_dict() 207 | own_state = OrderedDict() # remove all 'group' string 208 | for k, v in own_state_old.items(): 209 | k = re.sub('group\d+\.', '', k) 210 | own_state[k] = v 211 | 212 | state_dict = model_zoo.load_url(model_urls, model_root) 213 | 214 | for name, param in state_dict.items(): 215 | if name not in own_state: 216 | print(own_state.keys()) 217 | raise KeyError('unexpected key "{}" in state_dict' 218 | .format(name)) 219 | if isinstance(param, nn.Parameter): 220 | # backwards compatibility for serialized parameters 221 | param = param.data 222 | own_state[name].copy_(param) 223 | 224 | missing = set(own_state.keys()) - set(state_dict.keys()) 225 | if len(missing) > 0: 226 | raise KeyError('missing keys in state_dict: "{}"'.format(missing)) -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | 7 | TODO: implement data_augment for training 8 | 9 | Ellis Brown, Max deGroot 10 | """ 11 | 12 | import math 13 | 14 | import cv2 15 | import numpy as np 16 | import random 17 | import torch 18 | 19 | from utils.box_utils import matrix_iou 20 | 21 | 22 | # import torch_transforms 23 | 24 | def _crop(image, boxes, labels): 25 | height, width, _ = image.shape 26 | 27 | if len(boxes) == 0: 28 | return image, boxes, labels 29 | 30 | while True: 31 | mode = random.choice(( 32 | None, 33 | (0.1, None), 34 | (0.3, None), 35 | (0.5, None), 36 | (0.7, None), 37 | (0.9, None), 38 | (None, None), 39 | )) 40 | 41 | if mode is None: 42 | return image, boxes, labels 43 | 44 | min_iou, max_iou = mode 45 | if min_iou is None: 46 | min_iou = float('-inf') 47 | if max_iou is None: 48 | max_iou = float('inf') 49 | 50 | for _ in range(50): 51 | scale = random.uniform(0.3, 1.) 52 | min_ratio = max(0.5, scale * scale) 53 | max_ratio = min(2, 1. 
/ scale / scale) 54 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 55 | w = int(scale * ratio * width) 56 | h = int((scale / ratio) * height) 57 | 58 | l = random.randrange(width - w) 59 | t = random.randrange(height - h) 60 | roi = np.array((l, t, l + w, t + h)) 61 | 62 | iou = matrix_iou(boxes, roi[np.newaxis]) 63 | 64 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 65 | continue 66 | 67 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 68 | 69 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 70 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 71 | .all(axis=1) 72 | boxes_t = boxes[mask].copy() 73 | labels_t = labels[mask].copy() 74 | if len(boxes_t) == 0: 75 | continue 76 | 77 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 78 | boxes_t[:, :2] -= roi[:2] 79 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 80 | boxes_t[:, 2:] -= roi[:2] 81 | 82 | return image_t, boxes_t, labels_t 83 | 84 | 85 | def _distort(image): 86 | def _convert(image, alpha=1, beta=0): 87 | tmp = image.astype(float) * alpha + beta 88 | tmp[tmp < 0] = 0 89 | tmp[tmp > 255] = 255 90 | image[:] = tmp 91 | 92 | image = image.copy() 93 | 94 | if random.randrange(2): 95 | _convert(image, beta=random.uniform(-32, 32)) 96 | 97 | if random.randrange(2): 98 | _convert(image, alpha=random.uniform(0.5, 1.5)) 99 | 100 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 101 | 102 | if random.randrange(2): 103 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 104 | tmp %= 180 105 | image[:, :, 0] = tmp 106 | 107 | if random.randrange(2): 108 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 109 | 110 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 111 | 112 | return image 113 | 114 | 115 | def _expand(image, boxes, fill, p): 116 | if random.random() > p: 117 | return image, boxes 118 | 119 | height, width, depth = image.shape 120 | for _ in range(50): 121 | scale = random.uniform(1, 4) 122 | 123 | min_ratio = max(0.5, 1. 
/ scale / scale) 124 | max_ratio = min(2, scale * scale) 125 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 126 | ws = scale * ratio 127 | hs = scale / ratio 128 | if ws < 1 or hs < 1: 129 | continue 130 | w = int(ws * width) 131 | h = int(hs * height) 132 | 133 | left = random.randint(0, w - width) 134 | top = random.randint(0, h - height) 135 | 136 | boxes_t = boxes.copy() 137 | boxes_t[:, :2] += (left, top) 138 | boxes_t[:, 2:] += (left, top) 139 | 140 | expand_image = np.empty( 141 | (h, w, depth), 142 | dtype=image.dtype) 143 | expand_image[:, :] = fill 144 | expand_image[top:top + height, left:left + width] = image 145 | image = expand_image 146 | 147 | return image, boxes_t 148 | 149 | 150 | def _mirror(image, boxes): 151 | _, width, _ = image.shape 152 | if random.randrange(2): 153 | image = image[:, ::-1] 154 | boxes = boxes.copy() 155 | boxes[:, 0::2] = width - boxes[:, 2::-2] 156 | return image, boxes 157 | 158 | 159 | def preproc_for_test(image, insize, mean, std=(1, 1, 1)): 160 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 161 | interp_method = interp_methods[random.randrange(5)] 162 | image = cv2.resize(image, (insize, insize), interpolation=interp_method) 163 | image = image.astype(np.float32) 164 | image -= mean 165 | image /= std 166 | return image.transpose(2, 0, 1) 167 | 168 | 169 | class preproc(object): 170 | 171 | def __init__(self, resize, rgb_means, rgb_std=(1, 1, 1), p=0.2): 172 | self.means = rgb_means 173 | self.std = rgb_std 174 | self.resize = resize 175 | self.p = p 176 | 177 | def __call__(self, image, targets): 178 | boxes = targets[:, :-1].copy() 179 | labels = targets[:, -1].copy() 180 | if len(boxes) == 0: 181 | # boxes = np.empty((0, 4)) 182 | targets = np.zeros((1, 5)) 183 | image = preproc_for_test(image, self.resize, self.means, self.std) 184 | return torch.from_numpy(image), targets 185 | 186 | image_o = image.copy() 187 | targets_o = targets.copy() 188 | height_o, width_o, _ = image_o.shape 189 | boxes_o = targets_o[:, :-1] 190 | labels_o = targets_o[:, -1] 191 | boxes_o[:, 0::2] /= width_o 192 | boxes_o[:, 1::2] /= height_o 193 | labels_o = np.expand_dims(labels_o, 1) 194 | targets_o = np.hstack((boxes_o, labels_o)) 195 | 196 | image_t, boxes, labels = _crop(image, boxes, labels) 197 | image_t = _distort(image_t) 198 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 199 | image_t, boxes = _mirror(image_t, boxes) 200 | # image_t, boxes = _mirror(image, boxes) 201 | 202 | height, width, _ = image_t.shape 203 | image_t = preproc_for_test(image_t, self.resize, self.means, self.std) 204 | boxes = boxes.copy() 205 | boxes[:, 0::2] /= width 206 | boxes[:, 1::2] /= height 207 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 208 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 
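# The mask below keeps only boxes whose shorter side, after the normalisation to [0, 1]
# just above, exceeds 1% of the image size; e.g. a 2-px-wide box in a 300-px image has
# b_w of roughly 0.007 and is dropped together with its label.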
209 | mask_b = np.minimum(b_w, b_h) > 0.01 210 | boxes_t = boxes[mask_b] 211 | labels_t = labels[mask_b].copy() 212 | 213 | if len(boxes_t) == 0: 214 | image = preproc_for_test(image_o, self.resize, self.means, self.std) 215 | return torch.from_numpy(image), targets_o 216 | 217 | labels_t = np.expand_dims(labels_t, 1) 218 | targets_t = np.hstack((boxes_t, labels_t)) 219 | 220 | return torch.from_numpy(image_t), targets_t 221 | 222 | 223 | class BaseTransform(object): 224 | """Defines the transformations that should be applied to test PIL image 225 | for input into the network 226 | 227 | dimension -> tensorize -> color adj 228 | 229 | Arguments: 230 | resize (int): input dimension to SSD 231 | rgb_means ((int,int,int)): average RGB of the dataset 232 | (104,117,123) 233 | rgb_std: std of the dataset 234 | swap ((int,int,int)): final order of channels 235 | Returns: 236 | transform (transform) : callable transform to be applied to test/val 237 | data 238 | """ 239 | 240 | def __init__(self, resize, rgb_means, rgb_std=(1, 1, 1), swap=(2, 0, 1)): 241 | self.means = rgb_means 242 | self.resize = resize 243 | self.std = rgb_std 244 | self.swap = swap 245 | 246 | # assume input is cv2 img for now 247 | def __call__(self, img): 248 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 249 | interp_method = interp_methods[0] 250 | img = cv2.resize(np.array(img), (self.resize, 251 | self.resize), interpolation=interp_method).astype(np.float32) 252 | img -= self.means 253 | img /= self.std 254 | img = img.transpose(self.swap) 255 | return torch.from_numpy(img) 256 | -------------------------------------------------------------------------------- /utils/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? 
a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && athr) keep[j]=0; 105 | } 106 | } 107 | } 108 | 109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { 110 | double h, w, i, u, ga, da; siz g, d; int crowd; 111 | for( g=0; gthr) keep[j]=0; 129 | } 130 | } 131 | } 132 | 133 | void rleToBbox( const RLE *R, BB bb, siz n ) { 134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); 173 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 174 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 175 | if(dx>=dy) for( d=0; d<=dx; d++ ) { 176 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 177 | } else for( d=0; d<=dy; d++ ) { 178 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 179 | } 180 | } 181 | /* get points along y-boundary and downsample */ 182 | free(x); free(y); k=m; m=0; double xd, yd; 183 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 184 | for( j=1; jw-1 ) continue; 187 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 189 | x[m]=(int) xd; y[m]=(int) yd; m++; 190 | } 191 | /* compute rle encoding given y-boundary points */ 192 | k=m; a=malloc(sizeof(uint)*(k+1)); 193 | for( j=0; j0) b[m++]=a[j++]; else { 199 | j++; if(jm, p=0; long x; int more; 206 | char *s=malloc(sizeof(char)*m*6); 207 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 209 | while( more ) { 210 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 211 | if(more) c |= 0x20; c+=48; s[p++]=c; 212 | } 213 | } 214 | s[p]=0; return s; 215 | } 216 | 217 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 218 | siz m=0, p=0, k; long x; int more; uint *cnts; 219 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 220 | while( s[p] ) { 221 | x=0; k=0; more=1; 222 | while( more ) { 223 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 224 | more = c & 0x20; p++; k++; 225 | if(!more && (c & 0x10)) x |= -1 << 5*k; 226 | } 227 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 228 | } 229 | rleInit(R,h,w,m,cnts); free(cnts); 230 | } 231 | -------------------------------------------------------------------------------- /utils/pycocotools/_mask.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language = c 2 | # distutils: sources = ../common/maskApi.c 3 | 4 | #************************************************************************** 5 | # Microsoft COCO Toolbox. version 2.0 6 | # Data, paper, and tutorials available at: http://mscoco.org/ 7 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
8 | # Licensed under the Simplified BSD License [see coco/license.txt] 9 | #************************************************************************** 10 | 11 | __author__ = 'tsungyi' 12 | 13 | import sys 14 | PYTHON_VERSION = sys.version_info[0] 15 | 16 | # import both Python-level and C-level symbols of Numpy 17 | # the API uses Numpy to interface C and Python 18 | import numpy as np 19 | cimport numpy as np 20 | from libc.stdlib cimport malloc, free 21 | 22 | # intialized Numpy. must do. 23 | np.import_array() 24 | 25 | # import numpy C function 26 | # we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management 27 | cdef extern from "numpy/arrayobject.h": 28 | void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) 29 | 30 | # Declare the prototype of the C functions in MaskApi.h 31 | cdef extern from "maskApi.h": 32 | ctypedef unsigned int uint 33 | ctypedef unsigned long siz 34 | ctypedef unsigned char byte 35 | ctypedef double* BB 36 | ctypedef struct RLE: 37 | siz h, 38 | siz w, 39 | siz m, 40 | uint* cnts, 41 | void rlesInit( RLE **R, siz n ) 42 | void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) 43 | void rleDecode( const RLE *R, byte *mask, siz n ) 44 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) 45 | void rleArea( const RLE *R, siz n, uint *a ) 46 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) 47 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) 48 | void rleToBbox( const RLE *R, BB bb, siz n ) 49 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) 50 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) 51 | char* rleToString( const RLE *R ) 52 | void rleFrString( RLE *R, char *s, siz h, siz w ) 53 | 54 | # python class to wrap RLE array in C 55 | # the class handles the memory allocation and deallocation 56 | cdef class RLEs: 57 | cdef RLE *_R 58 | cdef siz _n 59 | 60 | def __cinit__(self, siz n =0): 61 | rlesInit(&self._R, n) 62 | self._n = n 63 | 64 | # free the RLE array here 65 | def __dealloc__(self): 66 | if self._R is not NULL: 67 | for i in range(self._n): 68 | free(self._R[i].cnts) 69 | free(self._R) 70 | def __getattr__(self, key): 71 | if key == 'n': 72 | return self._n 73 | raise AttributeError(key) 74 | 75 | # python class to wrap Mask array in C 76 | # the class handles the memory allocation and deallocation 77 | cdef class Masks: 78 | cdef byte *_mask 79 | cdef siz _h 80 | cdef siz _w 81 | cdef siz _n 82 | 83 | def __cinit__(self, h, w, n): 84 | self._mask = malloc(h*w*n* sizeof(byte)) 85 | self._h = h 86 | self._w = w 87 | self._n = n 88 | # def __dealloc__(self): 89 | # the memory management of _mask has been passed to np.ndarray 90 | # it doesn't need to be freed here 91 | 92 | # called when passing into np.array() and return an np.ndarray in column-major order 93 | def __array__(self): 94 | cdef np.npy_intp shape[1] 95 | shape[0] = self._h*self._w*self._n 96 | # Create a 1D array, and reshape it to fortran/Matlab column-major array 97 | ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') 98 | # The _mask allocated by Masks is now handled by ndarray 99 | PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) 100 | return ndarray 101 | 102 | # internal conversion from Python RLEs object to compressed RLE format 103 | def _toString(RLEs Rs): 104 | cdef siz n = Rs.n 105 | cdef bytes py_string 106 | cdef char* c_string 107 | objs = [] 108 | for i in range(n): 109 | c_string = rleToString( 
&Rs._R[i] ) 110 | py_string = c_string 111 | objs.append({ 112 | 'size': [Rs._R[i].h, Rs._R[i].w], 113 | 'counts': py_string 114 | }) 115 | free(c_string) 116 | return objs 117 | 118 | # internal conversion from compressed RLE format to Python RLEs object 119 | def _frString(rleObjs): 120 | cdef siz n = len(rleObjs) 121 | Rs = RLEs(n) 122 | cdef bytes py_string 123 | cdef char* c_string 124 | for i, obj in enumerate(rleObjs): 125 | if PYTHON_VERSION == 2: 126 | py_string = str(obj['counts']).encode('utf8') 127 | elif PYTHON_VERSION == 3: 128 | py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] 129 | else: 130 | raise Exception('Python version must be 2 or 3') 131 | c_string = py_string 132 | rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) 133 | return Rs 134 | 135 | # encode mask to RLEs objects 136 | # list of RLE string can be generated by RLEs member function 137 | def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): 138 | h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] 139 | cdef RLEs Rs = RLEs(n) 140 | rleEncode(Rs._R,mask.data,h,w,n) 141 | objs = _toString(Rs) 142 | return objs 143 | 144 | # decode mask from compressed list of RLE string or RLEs object 145 | def decode(rleObjs): 146 | cdef RLEs Rs = _frString(rleObjs) 147 | h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n 148 | masks = Masks(h, w, n) 149 | rleDecode(Rs._R, masks._mask, n); 150 | return np.array(masks) 151 | 152 | def merge(rleObjs, intersect=0): 153 | cdef RLEs Rs = _frString(rleObjs) 154 | cdef RLEs R = RLEs(1) 155 | rleMerge(Rs._R, R._R, Rs._n, intersect) 156 | obj = _toString(R)[0] 157 | return obj 158 | 159 | def area(rleObjs): 160 | cdef RLEs Rs = _frString(rleObjs) 161 | cdef uint* _a = malloc(Rs._n* sizeof(uint)) 162 | rleArea(Rs._R, Rs._n, _a) 163 | cdef np.npy_intp shape[1] 164 | shape[0] = Rs._n 165 | a = np.array((Rs._n, ), dtype=np.uint8) 166 | a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) 167 | PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) 168 | return a 169 | 170 | # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 171 | def iou( dt, gt, pyiscrowd ): 172 | def _preproc(objs): 173 | if len(objs) == 0: 174 | return objs 175 | if type(objs) == np.ndarray: 176 | if len(objs.shape) == 1: 177 | objs = objs.reshape((objs[0], 1)) 178 | # check if it's Nx4 bbox 179 | if not len(objs.shape) == 2 or not objs.shape[1] == 4: 180 | raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') 181 | objs = objs.astype(np.double) 182 | elif type(objs) == list: 183 | # check if list is in box format and convert it to np.ndarray 184 | isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) 185 | isrle = np.all(np.array([type(obj) == dict for obj in objs])) 186 | if isbox: 187 | objs = np.array(objs, dtype=np.double) 188 | if len(objs.shape) == 1: 189 | objs = objs.reshape((1,objs.shape[0])) 190 | elif isrle: 191 | objs = _frString(objs) 192 | else: 193 | raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') 194 | else: 195 | raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') 196 | return objs 197 | def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 198 | rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) 199 | def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 200 | bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) 201 | def _len(obj): 202 | cdef siz N = 0 203 | if type(obj) == RLEs: 204 | N = obj.n 205 | elif len(obj)==0: 206 | pass 207 | elif type(obj) == np.ndarray: 208 | N = obj.shape[0] 209 | return N 210 | # convert iscrowd to numpy array 211 | cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) 212 | # simple type checking 213 | cdef siz m, n 214 | dt = _preproc(dt) 215 | gt = _preproc(gt) 216 | m = _len(dt) 217 | n = _len(gt) 218 | if m == 0 or n == 0: 219 | return [] 220 | if not type(dt) == type(gt): 221 | raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') 222 | 223 | # define local variables 224 | cdef double* _iou = 0 225 | cdef np.npy_intp shape[1] 226 | # check type and assign iou function 227 | if type(dt) == RLEs: 228 | _iouFun = _rleIou 229 | elif type(dt) == np.ndarray: 230 | _iouFun = _bbIou 231 | else: 232 | raise Exception('input data type not allowed.') 233 | _iou = malloc(m*n* sizeof(double)) 234 | iou = np.zeros((m*n, ), dtype=np.double) 235 | shape[0] = m*n 236 | iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) 237 | PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) 238 | _iouFun(dt, gt, iscrowd, m, n, iou) 239 | return iou.reshape((m,n), order='F') 240 | 241 | def toBbox( rleObjs ): 242 | cdef RLEs Rs = _frString(rleObjs) 243 | cdef siz n = Rs.n 244 | cdef BB _bb = malloc(4*n* sizeof(double)) 245 | rleToBbox( Rs._R, _bb, n ) 246 | cdef np.npy_intp shape[1] 247 | shape[0] = 4*n 248 | bb = np.array((1,4*n), dtype=np.double) 249 | bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) 250 | PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) 251 | return bb 252 | 253 | def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): 254 | cdef siz n = bb.shape[0] 255 | Rs = RLEs(n) 256 | rleFrBbox( Rs._R, bb.data, h, w, n ) 257 | objs = _toString(Rs) 258 | return objs 259 | 260 | def frPoly( poly, siz h, siz w ): 261 | cdef np.ndarray[np.double_t, ndim=1] np_poly 262 | n = len(poly) 263 | Rs = RLEs(n) 264 | for i, p in enumerate(poly): 265 | np_poly = np.array(p, dtype=np.double, order='F') 266 | rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) 267 | objs = _toString(Rs) 268 | return objs 269 | 270 | def frUncompressedRLE(ucRles, siz h, siz w): 271 | cdef np.ndarray[np.uint32_t, ndim=1] cnts 272 | cdef RLE R 273 | cdef uint *data 274 | n = len(ucRles) 275 | objs = [] 276 | for i in range(n): 277 | Rs = RLEs(1) 278 | cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) 279 | # time for malloc can be saved here but it's fine 280 | data = malloc(len(cnts)* sizeof(uint)) 281 | for j in range(len(cnts)): 282 | data[j] = cnts[j] 283 | R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) 284 | Rs._R[0] = R 285 | objs.append(_toString(Rs)[0]) 286 | return objs 287 | 288 | def frPyObjects(pyobj, h, w): 289 | # encode rle from a list of python objects 290 | if type(pyobj) == np.ndarray: 291 | objs = frBbox(pyobj, h, w) 292 | elif type(pyobj) == 
list and len(pyobj[0]) == 4: 293 | objs = frBbox(pyobj, h, w) 294 | elif type(pyobj) == list and len(pyobj[0]) > 4: 295 | objs = frPoly(pyobj, h, w) 296 | elif type(pyobj) == list and type(pyobj[0]) == dict \ 297 | and 'counts' in pyobj[0] and 'size' in pyobj[0]: 298 | objs = frUncompressedRLE(pyobj, h, w) 299 | # encode rle from single python object 300 | elif type(pyobj) == list and len(pyobj) == 4: 301 | objs = frBbox([pyobj], h, w)[0] 302 | elif type(pyobj) == list and len(pyobj) > 4: 303 | objs = frPoly([pyobj], h, w)[0] 304 | elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: 305 | objs = frUncompressedRLE([pyobj], h, w)[0] 306 | else: 307 | raise Exception('input type is not supported.') 308 | return objs 309 | -------------------------------------------------------------------------------- /models/FSSD_vgg_FPN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .base_models import vgg, vgg_base 7 | 8 | 9 | class BasicConv(nn.Module): 10 | 11 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 12 | bn=False, bias=True, up_size=0): 13 | super(BasicConv, self).__init__() 14 | self.out_channels = out_planes 15 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, 16 | dilation=dilation, groups=groups, bias=bias) 17 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 18 | self.relu = nn.ReLU(inplace=True) if relu else None 19 | self.up_size = up_size 20 | self.up_sample = nn.Upsample(size=(up_size, up_size), mode='bilinear') if up_size != 0 else None 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | if self.bn is not None: 25 | x = self.bn(x) 26 | if self.relu is not None: 27 | x = self.relu(x) 28 | if self.up_size > 0: 29 | x = self.up_sample(x) 30 | return x 31 | 32 | 33 | class FSSD(nn.Module): 34 | """Single Shot Multibox Architecture 35 | The network is composed of a base VGG network followed by the 36 | added multibox conv layers. Each multibox layer branches into 37 | 1) conv2d for class conf scores 38 | 2) conv2d for localization predictions 39 | 3) associated priorbox layer to produce default bounding 40 | boxes specific to the layer's feature map size. 41 | See: https://arxiv.org/pdf/1712.00960.pdf or more details. 
42 | Args: 43 | base: VGG16 layers for input, size of either 300 or 500 44 | extras: extra layers that feed to multibox loc and conf layers 45 | head: "multibox head" consists of loc and conf conv layers 46 | """ 47 | 48 | def __init__(self, base, extras, ft_module, pyramid_ext, head, num_classes, size): 49 | super(FSSD, self).__init__() 50 | self.num_classes = num_classes 51 | # TODO: implement __call__ in PriorBox 52 | self.size = size 53 | 54 | # SSD network 55 | self.base = nn.ModuleList(base) 56 | self.extras = nn.ModuleList(extras) 57 | self.ft_module = nn.ModuleList(ft_module) 58 | self.pyramid_ext = nn.ModuleList(pyramid_ext) 59 | self.fea_bn = nn.BatchNorm2d(256 * len(self.ft_module), affine=True) 60 | 61 | self.loc = nn.ModuleList(head[0]) 62 | self.conf = nn.ModuleList(head[1]) 63 | 64 | self.softmax = nn.Softmax() 65 | 66 | self.conv_cat0 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 67 | self.upsample0 = nn.Upsample(size=(3, 3), mode='bilinear') 68 | 69 | self.conv_cat1 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 70 | self.upsample1 = nn.Upsample(size=(5, 5), mode='bilinear') 71 | 72 | self.conv_cat2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 73 | self.upsample2 = nn.Upsample(size=(10, 10), mode='bilinear') 74 | 75 | self.conv_cat3 = nn.Conv2d(768, 512, kernel_size=1, padding=0, stride=1) 76 | self.upsample3 = nn.Upsample(size=(19, 19), mode='bilinear') 77 | 78 | self.conv_cat4 = nn.Conv2d(1024, 512, kernel_size=1, padding=0, stride=1) 79 | self.upsample4 = nn.Upsample(size=(38, 38), mode='bilinear') 80 | 81 | def forward(self, x, test=False): 82 | """Applies network layers and ops on input image(s) x. 83 | Args: 84 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 85 | Return: 86 | Depending on phase: 87 | test: 88 | Variable(tensor) of output class label predictions, 89 | confidence score, and corresponding location predictions for 90 | each object detected. 
Shape: [batch,topk,7] 91 | train: 92 | list of concat outputs from: 93 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 94 | 2: localization layers, Shape: [batch,num_priors*4] 95 | 3: priorbox layers, Shape: [2,num_priors*4] 96 | """ 97 | source_features = list() 98 | transformed_features = list() 99 | loc = list() 100 | conf = list() 101 | 102 | # apply vgg up to conv4_3 relu 103 | for k in range(23): 104 | x = self.base[k](x) 105 | 106 | source_features.append(x) 107 | 108 | # apply vgg up to fc7 109 | for k in range(23, len(self.base)): 110 | x = self.base[k](x) 111 | source_features.append(x) 112 | 113 | # apply extra layers and cache source layer outputs 114 | for k, v in enumerate(self.extras): 115 | x = F.relu(v(x), inplace=True) 116 | source_features.append(x) 117 | assert len(self.ft_module) == len(source_features) 118 | for k, v in enumerate(self.ft_module): 119 | transformed_features.append(v(source_features[k])) 120 | concat_fea = torch.cat(transformed_features, 1) 121 | x = self.fea_bn(concat_fea) 122 | pyramid_fea = list() 123 | for k, v in enumerate(self.pyramid_ext): 124 | x = v(x) 125 | pyramid_fea.append(x) 126 | 127 | # ----------this block is to downsample the 1*1 layer to 3*3, and concat with the original 3*3 layer, like Dense connection 128 | fpn_0 = list() 129 | detect_5 = pyramid_fea[5] 130 | detect_4 = pyramid_fea[4] 131 | detect_5_4 = self.upsample0(detect_5) 132 | fpn_0.append(detect_4) 133 | fpn_0.append(detect_5_4) 134 | detect_4 = torch.cat(fpn_0, 1) 135 | detect_4 = self.conv_cat0(detect_4) 136 | pyramid_fea[4] = detect_4 137 | pyramid_fea[5] = detect_5 138 | 139 | # ----------this block is to downsample the 3*3 layer to 5*5, and concat with the original 5*5 layer, like Dense connection 140 | fpn_1 = list() 141 | detect_3 = pyramid_fea[3] 142 | detect_4_3 = self.upsample1(detect_4) 143 | fpn_1.append(detect_3) 144 | fpn_1.append(detect_4_3) 145 | detect_3 = torch.cat(fpn_1, 1) 146 | detect_3 = self.conv_cat1(detect_3) 147 | pyramid_fea[3] = detect_3 148 | 149 | # ----------this block is to downsample the 5*5 layer to 10*10, and concat with the original 10*10 layer, like Dense connection 150 | fpn_2 = list() 151 | detect_2 = pyramid_fea[2] 152 | detect_3_2 = self.upsample2(detect_3) 153 | fpn_2.append(detect_2) 154 | fpn_2.append(detect_3_2) 155 | detect_2 = torch.cat(fpn_2, 1) 156 | detect_2 = self.conv_cat2(detect_2) 157 | pyramid_fea[2] = detect_2 158 | 159 | # ----------this block is to downsample the 10*10 layer to 19*19, and concat with the original 19*19 layer, like Dense connection 160 | fpn_3 = list() 161 | detect_1 = pyramid_fea[1] 162 | detect_2_1 = self.upsample3(detect_2) 163 | fpn_3.append(detect_1) 164 | fpn_3.append(detect_2_1) 165 | detect_1 = torch.cat(fpn_3, 1) 166 | detect_1 = self.conv_cat3(detect_1) 167 | pyramid_fea[1] = detect_1 168 | 169 | # ----------this block is to downsample the 19*19 layer to 38*38, and concat with the original 38*38 layer, like Dense connection 170 | fpn_4 = list() 171 | detect_0 = pyramid_fea[0] 172 | detect_1_0 = self.upsample4(detect_1) 173 | fpn_4.append(detect_0) 174 | fpn_4.append(detect_1_0) 175 | detect_0 = torch.cat(fpn_4, 1) 176 | detect_0 = self.conv_cat4(detect_0) 177 | pyramid_fea[0] = detect_0 178 | 179 | # apply multibox head to source layers 180 | for (x, l, c) in zip(pyramid_fea, self.loc, self.conf): 181 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 182 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 183 | 184 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 
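# Added note (not in the original source): after the permute above, each per-level head output o has shape [batch, H, W, anchors*k];
# view(o.size(0), -1) flattens it to [batch, H*W*anchors*k], so this cat and the matching one for conf below give
# loc: [batch, num_priors*4] and conf: [batch, num_priors*num_classes].
# As a rough sanity check, assuming the usual 300-input feature-map sizes 38, 19, 10, 5, 3, 1 and mbox [4, 6, 6, 6, 4, 4],
# num_priors = 5776 + 2166 + 600 + 150 + 36 + 4 = 8732.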
185 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 186 | if test: 187 | output = ( 188 | loc.view(loc.size(0), -1, 4), # loc preds 189 | self.softmax(conf.view(-1, self.num_classes)), # conf preds 190 | ) 191 | else: 192 | output = ( 193 | loc.view(loc.size(0), -1, 4), 194 | conf.view(conf.size(0), -1, self.num_classes), 195 | ) 196 | return output 197 | 198 | def load_weights(self, base_file): 199 | other, ext = os.path.splitext(base_file) 200 | if ext == '.pkl' or ext == '.pth': 201 | print('Loading weights into state dict...') 202 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 203 | print('Finished!') 204 | else: 205 | print('Sorry only .pth and .pkl files supported.') 206 | 207 | 208 | def add_extras(cfg, i, batch_norm=False): 209 | # Extra layers added to VGG for feature scaling 210 | layers = [] 211 | in_channels = i 212 | flag = False 213 | for k, v in enumerate(cfg): 214 | if in_channels != 'S': 215 | if v == 'S': 216 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 217 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 218 | else: 219 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 220 | flag = not flag 221 | in_channels = v 222 | return layers 223 | 224 | 225 | def feature_transform_module(vgg, extral, size): 226 | if size == 300: 227 | up_size = 38 228 | elif size == 512: 229 | up_size = 64 230 | 231 | layers = [] 232 | # conv4_3 233 | layers += [BasicConv(vgg[24].out_channels, 256, kernel_size=1, padding=0)] 234 | # fc_7 235 | layers += [BasicConv(vgg[-2].out_channels, 256, kernel_size=1, padding=0, up_size=up_size)] 236 | layers += [BasicConv(extral[-1].out_channels, 256, kernel_size=1, padding=0, up_size=up_size)] 237 | return vgg, extral, layers 238 | 239 | 240 | def pyramid_feature_extractor(size): 241 | if size == 300: 242 | layers = [BasicConv(256 * 3, 512, kernel_size=3, stride=1, padding=1), 243 | BasicConv(512, 512, kernel_size=3, stride=2, padding=1), \ 244 | BasicConv(512, 256, kernel_size=3, stride=2, padding=1), 245 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 246 | BasicConv(256, 256, kernel_size=3, stride=1, padding=0), 247 | BasicConv(256, 256, kernel_size=3, stride=1, padding=0)] 248 | elif size == 512: 249 | layers = [BasicConv(256 * 3, 512, kernel_size=3, stride=1, padding=1), 250 | BasicConv(512, 512, kernel_size=3, stride=2, padding=1), \ 251 | BasicConv(512, 256, kernel_size=3, stride=2, padding=1), 252 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 253 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), 254 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 255 | BasicConv(256, 256, kernel_size=4, padding=1, stride=1)] 256 | return layers 257 | 258 | 259 | def multibox(fea_channels, cfg, num_classes): 260 | loc_layers = [] 261 | conf_layers = [] 262 | assert len(fea_channels) == len(cfg) 263 | for i, fea_channel in enumerate(fea_channels): 264 | loc_layers += [nn.Conv2d(fea_channel, cfg[i] * 4, kernel_size=3, padding=1)] 265 | conf_layers += [nn.Conv2d(fea_channel, cfg[i] * num_classes, kernel_size=3, padding=1)] 266 | return (loc_layers, conf_layers) 267 | 268 | 269 | extras = { 270 | '300': [256, 512, 128, 'S', 256], 271 | '512': [256, 512, 128, 'S', 256], 272 | } 273 | mbox = { 274 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 275 | '512': [6, 6, 6, 6, 6, 4, 4], 276 | } 277 | fea_channels = { 278 | '300': [512, 512, 256, 256, 256, 256], 279 | '512': [512, 512, 256, 256, 256, 256, 256]} 280 | 281 | 282 | def 
build_net(size=300, num_classes=21): 283 | if size != 300 and size != 512: 284 | print("Error: Sorry only FSSD300 and FSSD512 is supported currently!") 285 | return 286 | 287 | return FSSD(*feature_transform_module(vgg(vgg_base[str(size)], 3), add_extras(extras[str(size)], 1024), size=size), 288 | pyramid_ext=pyramid_feature_extractor(size), 289 | head=multibox(fea_channels[str(size)], mbox[str(size)], num_classes), num_classes=num_classes, 290 | size=size) -------------------------------------------------------------------------------- /models/FSSD_Mob_FPN.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import os 4 | import torch 5 | import torch.nn as nn 6 | from utils.timer import Timer 7 | sys.path.append('./') 8 | from models.mobilenet import mobilenet_1 9 | import time 10 | from utils.timer import Timer 11 | 12 | class BasicConv(nn.Module): 13 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 14 | bn=False, bias=True, up_size=0): 15 | super(BasicConv, self).__init__() 16 | self.out_channels = out_planes 17 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, 18 | dilation=dilation, groups=groups, bias=bias) 19 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 20 | self.relu = nn.ReLU(inplace=True) if relu else None 21 | self.up_size = up_size 22 | self.up_sample = nn.Upsample(size=(up_size, up_size), mode='bilinear') if up_size != 0 else None 23 | 24 | def forward(self, x): 25 | x = self.conv(x) 26 | if self.bn is not None: 27 | x = self.bn(x) 28 | if self.relu is not None: 29 | x = self.relu(x) 30 | if self.up_size > 0: 31 | x = self.up_sample(x) 32 | return x 33 | 34 | class FSSD(nn.Module): 35 | """Single Shot Multibox Architecture 36 | The network is composed of a base VGG network followed by the 37 | added multibox conv layers. Each multibox layer branches into 38 | 1) conv2d for class conf scores 39 | 2) conv2d for localization predictions 40 | 3) associated priorbox layer to produce default bounding 41 | boxes specific to the layer's feature map size. 42 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
43 | 44 | Args: 45 | phase: (string) Can be "test" or "train" 46 | base: VGG16 layers for input, size of either 300 or 500 47 | extras: extra layers that feed to multibox loc and conf layers 48 | head: "multibox head" consists of loc and conf conv layers 49 | """ 50 | 51 | def __init__(self, size, head, ft_module, pyramid_ext, num_classes): 52 | super(FSSD, self).__init__() 53 | self.num_classes = num_classes 54 | # TODO: implement __call__ in PriorBox 55 | self.size = size 56 | 57 | # SSD network 58 | self.base = mobilenet_1() 59 | # Layer learns to scale the l2 normalized features from conv4_3 60 | self.ft_module = nn.ModuleList(ft_module) 61 | self.pyramid_ext = nn.ModuleList(pyramid_ext) 62 | 63 | self.loc = nn.ModuleList(head[0]) 64 | self.conf = nn.ModuleList(head[1]) 65 | #self.fea_bn = nn.BatchNorm2d(256, affine=True) 66 | self.fea_bn = nn.BatchNorm2d(256 * len(self.ft_module), affine=True) 67 | self.softmax = nn.Softmax() 68 | 69 | self.conv_cat0 = nn.Conv2d(256, 128, kernel_size=1, padding=0, stride=1) 70 | self.upsample0 = nn.Upsample(size=(3, 3), mode='bilinear') 71 | 72 | self.conv_cat1 = nn.Conv2d(384, 256, kernel_size=1, padding=0, stride=1) 73 | self.upsample1 = nn.Upsample(size=(5, 5), mode='bilinear') 74 | 75 | self.conv_cat2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 76 | self.upsample2 = nn.Upsample(size=(10, 10), mode='bilinear') 77 | 78 | self.conv_cat3 = nn.Conv2d(768, 512, kernel_size=1, padding=0, stride=1) 79 | self.upsample3 = nn.Upsample(size=(19, 19), mode='bilinear') 80 | 81 | self.conv_cat4 = nn.Conv2d(1024, 512, kernel_size=1, padding=0, stride=1) 82 | self.upsample4 = nn.Upsample(size=(38, 38), mode='bilinear') 83 | 84 | 85 | self.time = time 86 | self.timer = Timer 87 | 88 | def forward(self, x, test=False): 89 | """Applies network layers and ops on input image(s) x. 90 | 91 | Args: 92 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 93 | 94 | Return: 95 | Depending on phase: 96 | test: 97 | Variable(tensor) of output class label predictions, 98 | confidence score, and corresponding location predictions for 99 | each object detected. 
Shape: [batch,topk,7] 100 | 101 | train: 102 | list of concat outputs from: 103 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 104 | 2: localization layers, Shape: [batch,num_priors*4] 105 | 3: priorbox layers, Shape: [2,num_priors*4] 106 | """ 107 | source_features = list() 108 | transformed_features = list() 109 | loc = list() 110 | conf = list() 111 | 112 | base_out = self.base(x) 113 | 114 | source_features.append(base_out[0]) # mobilenet 4_1 115 | source_features.append(base_out[1]) # mobilent_5_5 116 | source_features.append(base_out[2]) # mobilenet 6_1 117 | 118 | assert len(self.ft_module) == len(source_features) 119 | for k, v in enumerate(self.ft_module): 120 | transformed_features.append(v(source_features[k])) 121 | concat_fea = torch.cat(transformed_features, 1) 122 | x = self.fea_bn(concat_fea) 123 | fea_bn = x 124 | 125 | # the six detect layers 126 | pyramid_fea = list() 127 | for k, v in enumerate(self.pyramid_ext): 128 | x = v(x) 129 | pyramid_fea.append(x) 130 | 131 | 132 | #----------this block is to downsample the 1*1 layer to 3*3, and concat with the original 3*3 layer, like Dense connection 133 | fpn_0 = list() 134 | detect_5 = pyramid_fea[5] 135 | detect_4 = pyramid_fea[4] 136 | detect_5_4 = self.upsample0(detect_5) 137 | fpn_0.append(detect_4) 138 | fpn_0.append(detect_5_4) 139 | detect_4 = torch.cat(fpn_0, 1) 140 | detect_4 = self.conv_cat0(detect_4) 141 | pyramid_fea[4] = detect_4 142 | pyramid_fea[5] = detect_5 143 | 144 | #----------this block is to downsample the 3*3 layer to 5*5, and concat with the original 5*5 layer, like Dense connection 145 | fpn_1 = list() 146 | detect_3 = pyramid_fea[3] 147 | detect_4_3 = self.upsample1(detect_4) 148 | fpn_1.append(detect_3) 149 | fpn_1.append(detect_4_3) 150 | detect_3 = torch.cat(fpn_1, 1) 151 | detect_3 = self.conv_cat1(detect_3) 152 | pyramid_fea[3] = detect_3 153 | 154 | 155 | #----------this block is to downsample the 5*5 layer to 10*10, and concat with the original 10*10 layer, like Dense connection 156 | fpn_2 = list() 157 | detect_2 = pyramid_fea[2] 158 | detect_3_2 = self.upsample2(detect_3) 159 | fpn_2.append(detect_2) 160 | fpn_2.append(detect_3_2) 161 | detect_2 = torch.cat(fpn_2, 1) 162 | detect_2 = self.conv_cat2(detect_2) 163 | pyramid_fea[2] = detect_2 164 | 165 | 166 | #----------this block is to downsample the 10*10 layer to 19*19, and concat with the original 19*19 layer, like Dense connection 167 | fpn_3 = list() 168 | detect_1 = pyramid_fea[1] 169 | detect_2_1 = self.upsample3(detect_2) 170 | fpn_3.append(detect_1) 171 | fpn_3.append(detect_2_1) 172 | detect_1 = torch.cat(fpn_3, 1) 173 | detect_1 = self.conv_cat3(detect_1) 174 | pyramid_fea[1] = detect_1 175 | 176 | 177 | #----------this block is to downsample the 19*19 layer to 38*38, and concat with the original 38*38 layer, like Dense connection 178 | fpn_4 = list() 179 | detect_0 = pyramid_fea[0] 180 | detect_1_0 = self.upsample4(detect_1) 181 | fpn_4.append(detect_0) 182 | fpn_4.append(detect_1_0) 183 | detect_0 = torch.cat(fpn_4, 1) 184 | detect_0 = self.conv_cat4(detect_0) 185 | pyramid_fea[0] = detect_0 186 | 187 | 188 | # apply multibox head to source layers 189 | for (x, l, c) in zip(pyramid_fea, self.loc, self.conf): 190 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 191 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 192 | 193 | 194 | #every detect layer's cls and reg 195 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 196 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 197 | 198 | if 
test: 199 | output = ( 200 | loc.view(loc.size(0), -1, 4), # loc preds 201 | self.softmax(conf.view(-1, self.num_classes)), # conf preds 202 | ) 203 | features = () 204 | else: 205 | output = ( 206 | loc.view(loc.size(0), -1, 4), 207 | conf.view(conf.size(0), -1, self.num_classes), 208 | ) 209 | features = ( 210 | fea_bn 211 | ) 212 | return output 213 | 214 | def load_weights(self, base_file): 215 | other, ext = os.path.splitext(base_file) 216 | if ext == '.pkl' or ext == '.pth': 217 | print('Loading weights into state dict...') 218 | state_dict = torch.load(base_file, map_location=lambda storage, loc: storage) 219 | from collections import OrderedDict 220 | new_state_dict = OrderedDict() 221 | for k, v in state_dict.items(): 222 | head = k[:7] 223 | if head == 'module.': 224 | name = k[7:] # remove `module.` 225 | else: 226 | name = k 227 | new_state_dict[name] = v 228 | self.base.load_state_dict(new_state_dict) 229 | print('Finished!') 230 | 231 | else: 232 | print('Sorry only .pth and .pkl files supported.') 233 | 234 | from models.smooth_scale_transfer import * 235 | 236 | def feature_transform_module(scale_factor): 237 | layers = [] 238 | # conv4_1 239 | layers += [BasicConv(int(256 * scale_factor), 256, kernel_size=1, padding=0)] 240 | #layers += [down_sample(int(256 * scale_factor), 256)] 241 | # conv5_5 242 | layers += [BasicConv(int(512 * scale_factor), 256, kernel_size=1, padding=0, up_size=38)] 243 | #layers += [BasicConv(int(512 * scale_factor), 256, kernel_size=3, padding=1, stride=2)] 244 | # conv6_mpo1 245 | layers += [BasicConv(int(1024 * scale_factor), 256, kernel_size=1, padding=0, up_size=38)] 246 | #layers += [BasicConv(int(1024 * scale_factor), 256, kernel_size=1, padding=0)] 247 | return layers 248 | 249 | 250 | 251 | def pyramid_feature_extractor(): 252 | layers = [] 253 | #layers += [SST_6(256, 256), SST_5(256, 256), SST_4(256, 256), SST_3(256, 256), SST_2(256, 256), SST_1(256, 256)] 254 | # 255 | from models.mobilenet import DepthWiseBlock 256 | layers = [DepthWiseBlock(256*3, 512, stride=1), DepthWiseBlock(512, 512, stride=2), 257 | DepthWiseBlock(512, 256, stride=2), DepthWiseBlock(256, 256, stride=2), \ 258 | DepthWiseBlock(256, 128, stride=1, padding=0), DepthWiseBlock(128, 128, stride=1, padding=0)] 259 | 260 | return layers 261 | 262 | 263 | def multibox(fea_channels, cfg, num_classes): 264 | loc_layers = [] 265 | conf_layers = [] 266 | assert len(fea_channels) == len(cfg) 267 | for i, fea_channel in enumerate(fea_channels): 268 | loc_layers += [nn.Conv2d(fea_channel, cfg[i] * 4, kernel_size=3, padding=1)] 269 | conf_layers += [nn.Conv2d(fea_channel, cfg[i] * num_classes, kernel_size=3, padding=1)] 270 | return (loc_layers, conf_layers) 271 | 272 | mbox = { 273 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 274 | '512': [4, 6, 6, 6, 4, 4], 275 | } 276 | fea_channels = [512, 512, 256, 256, 128, 128] 277 | 278 | 279 | def build_net(size=512, num_classes=21): 280 | if size != 300 and size != 512: 281 | print("Error: Sorry only SSD300 and SSD512 are supported currently!") 282 | return 283 | 284 | return FSSD(size, multibox(fea_channels, mbox[str(size)], num_classes), feature_transform_module(1), 285 | pyramid_feature_extractor(), \ 286 | num_classes=num_classes) 287 | 288 | 289 | 290 | #input = torch.tensor(1, 10, 16*10*10).view(1, 16, 10, 10).float() 291 | # pyramid_fea = list() 292 | # for k, v in enumerate(pyramid_feature_extractor()): 293 | # #x = v(input) 294 | # pyramid_fea.append(v) 295 | # print(pyramid_fea) 296 | 297 | 298 | # from 
torch.autograd import Variable 299 | # 300 | # input1 = Variable(torch.randn(1, 3, 300, 300)) 301 | # t = {'im_detect': Timer(), 'misc': Timer()} 302 | # t['im_detect'].tic() 303 | # net = build_net(300,21) 304 | # net = net.forward(input1) 305 | # detect_time = t['im_detect'].toc() 306 | # print(detect_time) 307 | #output = net(input1) 308 | -------------------------------------------------------------------------------- /data/coco.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import json 10 | import pickle 11 | 12 | import cv2 13 | import numpy as np 14 | import os 15 | import os.path 16 | import torch 17 | import torch.utils.data as data 18 | import torchvision.transforms as transforms 19 | 20 | from utils.pycocotools.coco import COCO 21 | from utils.pycocotools.cocoeval import COCOeval 22 | 23 | 24 | class COCODetection(data.Dataset): 25 | """VOC Detection Dataset Object 26 | 27 | input is image, target is annotation 28 | 29 | Arguments: 30 | root (string): filepath to VOCdevkit folder. 31 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 32 | transform (callable, optional): transformation to perform on the 33 | input image 34 | target_transform (callable, optional): transformation to perform on the 35 | target `annotation` 36 | (eg: take in caption string, return tensor of word indices) 37 | dataset_name (string, optional): which dataset to load 38 | (default: 'VOC2007') 39 | """ 40 | 41 | def __init__(self, root, image_sets, preproc=None, target_transform=None, 42 | dataset_name='COCO'): 43 | self.root = root 44 | self.cache_path = os.path.join(self.root, 'cache') 45 | self.image_set = image_sets 46 | self.preproc = preproc 47 | self.target_transform = target_transform 48 | self.name = dataset_name 49 | self.ids = list() 50 | self.annotations = list() 51 | self._view_map = { 52 | 'minival2014': 'val2014', # 5k val2014 subset 53 | 'valminusminival2014': 'val2014', # val2014 \setminus minival2014 54 | 'test-dev2015': 'test2015', 55 | } 56 | 57 | for (year, image_set) in image_sets: 58 | coco_name = image_set + year 59 | data_name = (self._view_map[coco_name] 60 | if coco_name in self._view_map 61 | else coco_name) 62 | annofile = self._get_ann_file(coco_name) 63 | _COCO = COCO(annofile) 64 | self._COCO = _COCO 65 | self.coco_name = coco_name 66 | cats = _COCO.loadCats(_COCO.getCatIds()) 67 | self._classes = tuple(['__background__'] + [c['name'] for c in cats]) 68 | self.num_classes = len(self._classes) 69 | self._class_to_ind = dict(zip(self._classes, range(self.num_classes))) 70 | self._class_to_coco_cat_id = dict(zip([c['name'] for c in cats], 71 | _COCO.getCatIds())) 72 | indexes = _COCO.getImgIds() 73 | self.image_indexes = indexes 74 | self.ids.extend([self.image_path_from_index(data_name, index) for index in indexes]) 75 | if image_set.find('test') != -1: 76 | print('test set will not load annotations!') 77 | else: 78 | self.annotations.extend(self._load_coco_annotations(coco_name, indexes, _COCO)) 79 | 80 | def image_path_from_index(self, name, index): 81 | """ 82 | Construct an image path from the image's "index" identifier. 
83 | """ 84 | # Example image path for index=119993: 85 | # images/train2014/COCO_train2014_000000119993.jpg 86 | if '2014' in name or '2015' in name: 87 | file_name = ('COCO_' + name + '_' + 88 | str(index).zfill(12) + '.jpg') 89 | image_path = os.path.join(self.root, 'images', 90 | name, file_name) 91 | assert os.path.exists(image_path), \ 92 | 'Path does not exist: {}'.format(image_path) 93 | if '2017' in name: 94 | file_name = str(index).zfill(12) + '.jpg' 95 | image_path = os.path.join(self.root, name, file_name) 96 | assert os.path.exists(image_path), \ 97 | 'Path does not exist: {}'.format(image_path) 98 | return image_path 99 | 100 | def _get_ann_file(self, name): 101 | prefix = 'instances' if name.find('test') == -1 \ 102 | else 'image_info' 103 | return os.path.join(self.root, 'annotations', 104 | prefix + '_' + name + '.json') 105 | 106 | def _load_coco_annotations(self, coco_name, indexes, _COCO): 107 | cache_file = os.path.join(self.cache_path, coco_name + '_gt_roidb.pkl') 108 | if not os.path.exists(self.cache_path): 109 | os.makedirs(self.cache_path) 110 | if os.path.exists(cache_file): 111 | with open(cache_file, 'rb') as fid: 112 | roidb = pickle.load(fid) 113 | print('{} gt roidb loaded from {}'.format(coco_name, cache_file)) 114 | return roidb 115 | 116 | gt_roidb = [self._annotation_from_index(index, _COCO) 117 | for index in indexes] 118 | with open(cache_file, 'wb') as fid: 119 | pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) 120 | print('wrote gt roidb to {}'.format(cache_file)) 121 | return gt_roidb 122 | 123 | def _annotation_from_index(self, index, _COCO): 124 | """ 125 | Loads COCO bounding-box instance annotations. Crowd instances are 126 | handled by marking their overlaps (with all categories) to -1. This 127 | overlap value means that crowd "instances" are excluded from training. 
128 | """ 129 | im_ann = _COCO.loadImgs(index)[0] 130 | width = im_ann['width'] 131 | height = im_ann['height'] 132 | 133 | annIds = _COCO.getAnnIds(imgIds=index, iscrowd=None) 134 | objs = _COCO.loadAnns(annIds) 135 | # Sanitize bboxes -- some are invalid 136 | valid_objs = [] 137 | for obj in objs: 138 | x1 = np.max((0, obj['bbox'][0])) 139 | y1 = np.max((0, obj['bbox'][1])) 140 | x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) 141 | y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) 142 | if obj['area'] > 0 and x2 >= x1 and y2 >= y1: 143 | obj['clean_bbox'] = [x1, y1, x2, y2] 144 | valid_objs.append(obj) 145 | objs = valid_objs 146 | num_objs = len(objs) 147 | 148 | res = np.zeros((num_objs, 5)) 149 | 150 | # Lookup table to map from COCO category ids to our internal class 151 | # indices 152 | coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls], 153 | self._class_to_ind[cls]) 154 | for cls in self._classes[1:]]) 155 | 156 | for ix, obj in enumerate(objs): 157 | cls = coco_cat_id_to_class_ind[obj['category_id']] 158 | res[ix, 0:4] = obj['clean_bbox'] 159 | res[ix, 4] = cls 160 | 161 | return res 162 | 163 | def __getitem__(self, index): 164 | img_id = self.ids[index] 165 | target = self.annotations[index] 166 | img = cv2.imread(img_id, cv2.IMREAD_COLOR) 167 | height, width, _ = img.shape 168 | 169 | if self.target_transform is not None: 170 | target = self.target_transform(target) 171 | 172 | if self.preproc is not None: 173 | img, target = self.preproc(img, target) 174 | 175 | # target = self.target_transform(target, width, height) 176 | # print(target.shape) 177 | 178 | return img, target 179 | 180 | def __len__(self): 181 | return len(self.ids) 182 | 183 | def pull_image(self, index): 184 | '''Returns the original image object at index in PIL form 185 | 186 | Note: not using self.__getitem__(), as any transformations passed in 187 | could mess up this functionality. 188 | 189 | Argument: 190 | index (int): index of img to show 191 | Return: 192 | PIL img 193 | ''' 194 | img_id = self.ids[index] 195 | return cv2.imread(img_id, cv2.IMREAD_COLOR) 196 | 197 | def pull_tensor(self, index): 198 | '''Returns the original image at an index in tensor form 199 | 200 | Note: not using self.__getitem__(), as any transformations passed in 201 | could mess up this functionality. 
202 | 203 | Argument: 204 | index (int): index of img to show 205 | Return: 206 | tensorized version of img, squeezed 207 | ''' 208 | to_tensor = transforms.ToTensor() 209 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 210 | 211 | def _print_detection_eval_metrics(self, coco_eval): 212 | IoU_lo_thresh = 0.5 213 | IoU_hi_thresh = 0.95 214 | 215 | def _get_thr_ind(coco_eval, thr): 216 | ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) & 217 | (coco_eval.params.iouThrs < thr + 1e-5))[0][0] 218 | iou_thr = coco_eval.params.iouThrs[ind] 219 | assert np.isclose(iou_thr, thr) 220 | return ind 221 | 222 | ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh) 223 | ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh) 224 | # precision has dims (iou, recall, cls, area range, max dets) 225 | # area range index 0: all area ranges 226 | # max dets index 2: 100 per image 227 | precision = \ 228 | coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] 229 | ap_default = np.mean(precision[precision > -1]) 230 | print('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] ' 231 | '~~~~'.format(IoU_lo_thresh, IoU_hi_thresh)) 232 | print('{:.1f}'.format(100 * ap_default)) 233 | for cls_ind, cls in enumerate(self._classes): 234 | if cls == '__background__': 235 | continue 236 | # minus 1 because of __background__ 237 | precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2] 238 | ap = np.mean(precision[precision > -1]) 239 | print('{:.1f}'.format(100 * ap)) 240 | 241 | print('~~~~ Summary metrics ~~~~') 242 | coco_eval.summarize() 243 | 244 | def _do_detection_eval(self, res_file, output_dir): 245 | ann_type = 'bbox' 246 | coco_dt = self._COCO.loadRes(res_file) 247 | coco_eval = COCOeval(self._COCO, coco_dt) 248 | coco_eval.params.useSegm = (ann_type == 'segm') 249 | coco_eval.evaluate() 250 | coco_eval.accumulate() 251 | self._print_detection_eval_metrics(coco_eval) 252 | eval_file = os.path.join(output_dir, 'detection_results.pkl') 253 | with open(eval_file, 'wb') as fid: 254 | pickle.dump(coco_eval, fid, pickle.HIGHEST_PROTOCOL) 255 | print('Wrote COCO eval results to: {}'.format(eval_file)) 256 | 257 | def _coco_results_one_category(self, boxes, cat_id): 258 | results = [] 259 | for im_ind, index in enumerate(self.image_indexes): 260 | dets = boxes[im_ind].astype(np.float) 261 | if dets == []: 262 | continue 263 | scores = dets[:, -1] 264 | xs = dets[:, 0] 265 | ys = dets[:, 1] 266 | ws = dets[:, 2] - xs + 1 267 | hs = dets[:, 3] - ys + 1 268 | results.extend( 269 | [{'image_id': index, 270 | 'category_id': cat_id, 271 | 'bbox': [xs[k], ys[k], ws[k], hs[k]], 272 | 'score': scores[k]} for k in range(dets.shape[0])]) 273 | return results 274 | 275 | def _write_coco_results_file(self, all_boxes, res_file): 276 | # [{"image_id": 42, 277 | # "category_id": 18, 278 | # "bbox": [258.15,41.29,348.26,243.78], 279 | # "score": 0.236}, ...] 
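# Added note (not in the original source): COCO result boxes use the [x, y, width, height] convention in absolute pixels,
# which is why _coco_results_one_category above converts the internal (x1, y1, x2, y2) detections with
# ws = x2 - x1 + 1 and hs = y2 - y1 + 1 before the json is written.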
280 | results = [] 281 | for cls_ind, cls in enumerate(self._classes): 282 | if cls == '__background__': 283 | continue 284 | print('Collecting {} results ({:d}/{:d})'.format(cls, cls_ind, 285 | self.num_classes)) 286 | coco_cat_id = self._class_to_coco_cat_id[cls] 287 | results.extend(self._coco_results_one_category(all_boxes[cls_ind], 288 | coco_cat_id)) 289 | ''' 290 | if cls_ind ==30: 291 | res_f = res_file+ '_1.json' 292 | print('Writing results json to {}'.format(res_f)) 293 | with open(res_f, 'w') as fid: 294 | json.dump(results, fid) 295 | results = [] 296 | ''' 297 | # res_f2 = res_file+'_2.json' 298 | print('Writing results json to {}'.format(res_file)) 299 | with open(res_file, 'w') as fid: 300 | json.dump(results, fid) 301 | 302 | def evaluate_detections(self, all_boxes, output_dir): 303 | res_file = os.path.join(output_dir, ('detections_' + 304 | self.coco_name + 305 | '_results')) 306 | res_file += '.json' 307 | self._write_coco_results_file(all_boxes, res_file) 308 | # Only do evaluation on non-test sets 309 | if self.coco_name.find('test') == -1: 310 | self._do_detection_eval(res_file, output_dir) 311 | # Optionally cleanup results json file 312 | -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import pickle 10 | import sys 11 | 12 | import cv2 13 | import numpy as np 14 | import os 15 | import os.path 16 | import torch 17 | import torch.utils.data as data 18 | import torchvision.transforms as transforms 19 | from PIL import Image 20 | 21 | from .voc_eval import voc_eval 22 | 23 | if sys.version_info[0] == 2: 24 | import xml.etree.cElementTree as ET 25 | else: 26 | import xml.etree.ElementTree as ET 27 | 28 | VOC_CLASSES = ('__background__', # always index 0 29 | 'aeroplane', 'bicycle', 'bird', 'boat', 30 | 'bottle', 'bus', 'car', 'cat', 'chair', 31 | 'cow', 'diningtable', 'dog', 'horse', 32 | 'motorbike', 'person', 'pottedplant', 33 | 'sheep', 'sofa', 'train', 'tvmonitor') 34 | 35 | # for making bounding boxes pretty 36 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 37 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 38 | 39 | 40 | class VOCSegmentation(data.Dataset): 41 | """VOC Segmentation Dataset Object 42 | input and target are both images 43 | 44 | NOTE: need to address https://github.com/pytorch/vision/issues/9 45 | 46 | Arguments: 47 | root (string): filepath to VOCdevkit folder. 48 | image_set (string): imageset to use (eg: 'train', 'val', 'test'). 
49 | transform (callable, optional): transformation to perform on the 50 | input image 51 | target_transform (callable, optional): transformation to perform on the 52 | target image 53 | dataset_name (string, optional): which dataset to load 54 | (default: 'VOC2007') 55 | """ 56 | 57 | def __init__(self, root, image_set, transform=None, target_transform=None, 58 | dataset_name='VOC2007'): 59 | self.root = '/home/zdh1901/data/VOCdevkit/' 60 | self.image_set = image_set 61 | self.transform = transform 62 | self.target_transform = target_transform 63 | 64 | self._annopath = os.path.join( 65 | self.root, dataset_name, 'SegmentationClass', '%s.png') 66 | self._imgpath = os.path.join( 67 | self.root, dataset_name, 'JPEGImages', '%s.jpg') 68 | self._imgsetpath = os.path.join( 69 | self.root, dataset_name, 'ImageSets', 'Segmentation', '%s.txt') 70 | 71 | with open(self._imgsetpath % self.image_set) as f: 72 | self.ids = f.readlines() 73 | self.ids = [x.strip('\n') for x in self.ids] 74 | 75 | def __getitem__(self, index): 76 | img_id = self.ids[index] 77 | 78 | target = Image.open(self._annopath % img_id).convert('RGB') 79 | img = Image.open(self._imgpath % img_id).convert('RGB') 80 | 81 | if self.transform is not None: 82 | img = self.transform(img) 83 | 84 | if self.target_transform is not None: 85 | target = self.target_transform(target) 86 | 87 | return img, target 88 | 89 | def __len__(self): 90 | return len(self.ids) 91 | 92 | 93 | class AnnotationTransform(object): 94 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 95 | Initilized with a dictionary lookup of classnames to indexes 96 | 97 | Arguments: 98 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 99 | (default: alphabetic indexing of VOC's 20 classes) 100 | keep_difficult (bool, optional): keep difficult instances or not 101 | (default: False) 102 | height (int): height 103 | width (int): width 104 | """ 105 | 106 | def __init__(self, class_to_ind=None, keep_difficult=True): 107 | self.class_to_ind = class_to_ind or dict( 108 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 109 | self.keep_difficult = keep_difficult 110 | 111 | def __call__(self, target): 112 | """ 113 | Arguments: 114 | target (annotation) : the target annotation to be made usable 115 | will be an ET.Element 116 | Returns: 117 | a list containing lists of bounding boxes [bbox coords, class name] 118 | """ 119 | res = np.empty((0, 5)) 120 | for obj in target.iter('object'): 121 | difficult = int(obj.find('difficult').text) == 1 122 | if not self.keep_difficult and difficult: 123 | continue 124 | name = obj.find('name').text.lower().strip() 125 | bbox = obj.find('bndbox') 126 | 127 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 128 | bndbox = [] 129 | for i, pt in enumerate(pts): 130 | cur_pt = int(bbox.find(pt).text) - 1 131 | # scale height or width 132 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 133 | bndbox.append(cur_pt) 134 | label_idx = self.class_to_ind[name] 135 | bndbox.append(label_idx) 136 | res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind] 137 | # img_id = target.find('filename').text[:-4] 138 | 139 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 140 | 141 | 142 | class VOCDetection(data.Dataset): 143 | """VOC Detection Dataset Object 144 | 145 | input is image, target is annotation 146 | 147 | Arguments: 148 | root (string): filepath to VOCdevkit folder. 149 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 150 | transform (callable, optional): transformation to perform on the 151 | input image 152 | target_transform (callable, optional): transformation to perform on the 153 | target `annotation` 154 | (eg: take in caption string, return tensor of word indices) 155 | dataset_name (string, optional): which dataset to load 156 | (default: 'VOC2007') 157 | """ 158 | 159 | def __init__(self, root, image_sets, preproc=None, target_transform=None, 160 | dataset_name='VOC0712'): 161 | self.root = root 162 | self.image_set = image_sets 163 | self.preproc = preproc 164 | self.target_transform = target_transform 165 | self.name = dataset_name 166 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 167 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 168 | self.ids = list() 169 | for (year, name) in image_sets: 170 | self._year = year 171 | rootpath = os.path.join(self.root, 'VOC' + year) 172 | for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 173 | self.ids.append((rootpath, line.strip())) 174 | 175 | def __getitem__(self, index): 176 | img_id = self.ids[index] 177 | target = ET.parse(self._annopath % img_id).getroot() 178 | img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 179 | height, width, _ = img.shape 180 | 181 | if self.target_transform is not None: 182 | target = self.target_transform(target) 183 | 184 | if self.preproc is not None: 185 | img, target = self.preproc(img, target) 186 | # print(img.size()) 187 | 188 | # target = self.target_transform(target, width, height) 189 | # print(target.shape) 190 | 191 | return img, target 192 | 193 | def __len__(self): 194 | return len(self.ids) 195 | 196 | def pull_image(self, index): 197 | '''Returns the original image object at index in PIL form 198 | 199 | Note: not using self.__getitem__(), as any transformations passed in 200 | could mess up this functionality. 201 | 202 | Argument: 203 | index (int): index of img to show 204 | Return: 205 | PIL img 206 | ''' 207 | img_id = self.ids[index] 208 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 209 | 210 | def pull_anno(self, index): 211 | '''Returns the original annotation of image at index 212 | 213 | Note: not using self.__getitem__(), as any transformations passed in 214 | could mess up this functionality. 215 | 216 | Argument: 217 | index (int): index of img to get annotation of 218 | Return: 219 | list: [img_id, [(label, bbox coords),...]] 220 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 221 | ''' 222 | img_id = self.ids[index] 223 | anno = ET.parse(self._annopath % img_id).getroot() 224 | gt = self.target_transform(anno, 1, 1) 225 | return img_id[1], gt 226 | 227 | def pull_tensor(self, index): 228 | '''Returns the original image at an index in tensor form 229 | 230 | Note: not using self.__getitem__(), as any transformations passed in 231 | could mess up this functionality. 232 | 233 | Argument: 234 | index (int): index of img to show 235 | Return: 236 | tensorized version of img, squeezed 237 | ''' 238 | to_tensor = transforms.ToTensor() 239 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 240 | 241 | def evaluate_detections(self, all_boxes, output_dir=None): 242 | """ 243 | all_boxes is a list of length number-of-classes. 244 | Each list element is a list of length number-of-images. 245 | Each of those list elements is either an empty list [] 246 | or a numpy array of detection. 
247 | 248 | all_boxes[class][image] = [] or np.array of shape #dets x 5 249 | """ 250 | self._write_voc_results_file(all_boxes) 251 | aps, map = self._do_python_eval(output_dir) 252 | return aps, map 253 | 254 | def _get_voc_results_file_template(self): 255 | filename = 'comp4_det_test' + '_{:s}.txt' 256 | filedir = os.path.join( 257 | self.root, 'results', 'VOC' + self._year, 'Main') 258 | if not os.path.exists(filedir): 259 | os.makedirs(filedir) 260 | path = os.path.join(filedir, filename) 261 | return path 262 | 263 | def _write_voc_results_file(self, all_boxes): 264 | for cls_ind, cls in enumerate(VOC_CLASSES): 265 | cls_ind = cls_ind 266 | if cls == '__background__': 267 | continue 268 | print('Writing {} VOC results file'.format(cls)) 269 | filename = self._get_voc_results_file_template().format(cls) 270 | with open(filename, 'wt') as f: 271 | for im_ind, index in enumerate(self.ids): 272 | index = index[1] 273 | dets = all_boxes[cls_ind][im_ind] 274 | if dets == []: 275 | continue 276 | for k in range(dets.shape[0]): 277 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 278 | format(index, dets[k, -1], 279 | dets[k, 0] + 1, dets[k, 1] + 1, 280 | dets[k, 2] + 1, dets[k, 3] + 1)) 281 | 282 | def _do_python_eval(self, output_dir='output'): 283 | rootpath = os.path.join(self.root, 'VOC' + self._year) 284 | name = self.image_set[0][1] 285 | annopath = os.path.join( 286 | rootpath, 287 | 'Annotations', 288 | '{:s}.xml') 289 | imagesetfile = os.path.join( 290 | rootpath, 291 | 'ImageSets', 292 | 'Main', 293 | name + '.txt') 294 | cachedir = os.path.join(self.root, 'annotations_cache') 295 | aps = [] 296 | # The PASCAL VOC metric changed in 2010 297 | use_07_metric = True if int(self._year) < 2010 else False 298 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 299 | if output_dir is not None and not os.path.isdir(output_dir): 300 | os.mkdir(output_dir) 301 | for i, cls in enumerate(VOC_CLASSES): 302 | 303 | if cls == '__background__': 304 | continue 305 | 306 | filename = self._get_voc_results_file_template().format(cls) 307 | rec, prec, ap = voc_eval( 308 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 309 | use_07_metric=use_07_metric) 310 | aps += [ap] 311 | print('AP for {} = {:.4f}'.format(cls, ap)) 312 | if output_dir is not None: 313 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 314 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 315 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 316 | print('~~~~~~~~') 317 | print('Results:') 318 | for ap in aps: 319 | print('{:.3f}'.format(ap)) 320 | print('{:.3f}'.format(np.mean(aps))) 321 | print('~~~~~~~~') 322 | print('') 323 | print('--------------------------------------------------------------') 324 | print('Results computed with the **unofficial** Python eval code.') 325 | print('Results should be very close to the official MATLAB eval code.') 326 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 327 | print('-- Thanks, The Management') 328 | print('--------------------------------------------------------------') 329 | return aps, np.mean(aps) 330 | 331 | 332 | def detection_collate(batch): 333 | """Custom collate fn for dealing with batches of images that have a different 334 | number of associated object annotations (bounding boxes). 
335 | 336 | Arguments: 337 | batch: (tuple) A tuple of tensor images and lists of annotations 338 | 339 | Return: 340 | A tuple containing: 341 | 1) (tensor) batch of images stacked on their 0 dim 342 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 343 | """ 344 | targets = [] 345 | imgs = [] 346 | for _, sample in enumerate(batch): 347 | for _, tup in enumerate(sample): 348 | if torch.is_tensor(tup): 349 | imgs.append(tup) 350 | elif isinstance(tup, type(np.empty(0))): 351 | annos = torch.from_numpy(tup).float() 352 | targets.append(annos) 353 | 354 | return (torch.stack(imgs, 0), targets) 355 | -------------------------------------------------------------------------------- /data/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | import cv2 4 | import numpy as np 5 | import types 6 | from numpy import random 7 | 8 | 9 | def intersect(box_a, box_b): 10 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 11 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 12 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 13 | return inter[:, 0] * inter[:, 1] 14 | 15 | 16 | def jaccard_numpy(box_a, box_b): 17 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 18 | is simply the intersection over union of two boxes. 19 | E.g.: 20 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 21 | Args: 22 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 23 | box_b: Single bounding box, Shape: [4] 24 | Return: 25 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 26 | """ 27 | inter = intersect(box_a, box_b) 28 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 29 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 30 | area_b = ((box_b[2]-box_b[0]) * 31 | (box_b[3]-box_b[1])) # [A,B] 32 | union = area_a + area_b - inter 33 | return inter / union # [A,B] 34 | 35 | 36 | class Compose(object): 37 | """Composes several augmentations together. 38 | Args: 39 | transforms (List[Transform]): list of transforms to compose. 
40 | Example: 41 | >>> augmentations.Compose([ 42 | >>> transforms.CenterCrop(10), 43 | >>> transforms.ToTensor(), 44 | >>> ]) 45 | """ 46 | 47 | def __init__(self, transforms): 48 | self.transforms = transforms 49 | 50 | def __call__(self, img, boxes=None, labels=None): 51 | for t in self.transforms: 52 | img, boxes, labels = t(img, boxes, labels) 53 | return img, boxes, labels 54 | 55 | 56 | class Lambda(object): 57 | """Applies a lambda as a transform.""" 58 | 59 | def __init__(self, lambd): 60 | assert isinstance(lambd, types.LambdaType) 61 | self.lambd = lambd 62 | 63 | def __call__(self, img, boxes=None, labels=None): 64 | return self.lambd(img, boxes, labels) 65 | 66 | 67 | class ConvertFromInts(object): 68 | def __call__(self, image, boxes=None, labels=None): 69 | return image.astype(np.float32), boxes, labels 70 | 71 | 72 | class SubtractMeans(object): 73 | def __init__(self, mean): 74 | self.mean = np.array(mean, dtype=np.float32) 75 | 76 | def __call__(self, image, boxes=None, labels=None): 77 | image = image.astype(np.float32) 78 | image -= self.mean 79 | return image.astype(np.float32), boxes, labels 80 | 81 | 82 | class ToAbsoluteCoords(object): 83 | def __call__(self, image, boxes=None, labels=None): 84 | height, width, channels = image.shape 85 | boxes[:, 0] *= width 86 | boxes[:, 2] *= width 87 | boxes[:, 1] *= height 88 | boxes[:, 3] *= height 89 | 90 | return image, boxes, labels 91 | 92 | 93 | class ToPercentCoords(object): 94 | def __call__(self, image, boxes=None, labels=None): 95 | height, width, channels = image.shape 96 | boxes[:, 0] /= width 97 | boxes[:, 2] /= width 98 | boxes[:, 1] /= height 99 | boxes[:, 3] /= height 100 | 101 | return image, boxes, labels 102 | 103 | 104 | class Resize(object): 105 | def __init__(self, size=300): 106 | self.size = size 107 | 108 | def __call__(self, image, boxes=None, labels=None): 109 | image = cv2.resize(image, (self.size, 110 | self.size)) 111 | return image, boxes, labels 112 | 113 | 114 | class RandomSaturation(object): 115 | def __init__(self, lower=0.5, upper=1.5): 116 | self.lower = lower 117 | self.upper = upper 118 | assert self.upper >= self.lower, "contrast upper must be >= lower." 119 | assert self.lower >= 0, "contrast lower must be non-negative." 
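# Added note (not in the original source): despite the "contrast" wording in the asserts above, this transform scales
# channel 1 (the saturation channel) in place and assumes the image has already been converted to a float HSV image,
# e.g. by the ConvertColor step used in PhotometricDistort later in this file.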
120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 124 | 125 | return image, boxes, labels 126 | 127 | 128 | class RandomHue(object): 129 | def __init__(self, delta=18.0): 130 | assert delta >= 0.0 and delta <= 360.0 131 | self.delta = delta 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | if random.randint(2): 135 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 136 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 137 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 138 | return image, boxes, labels 139 | 140 | 141 | class RandomLightingNoise(object): 142 | def __init__(self): 143 | self.perms = ((0, 1, 2), (0, 2, 1), 144 | (1, 0, 2), (1, 2, 0), 145 | (2, 0, 1), (2, 1, 0)) 146 | 147 | def __call__(self, image, boxes=None, labels=None): 148 | if random.randint(2): 149 | swap = self.perms[random.randint(len(self.perms))] 150 | shuffle = SwapChannels(swap) # shuffle channels 151 | image = shuffle(image) 152 | return image, boxes, labels 153 | 154 | 155 | class ConvertColor(object): 156 | def __init__(self, current='BGR', transform='HSV'): 157 | self.transform = transform 158 | self.current = current 159 | 160 | def __call__(self, image, boxes=None, labels=None): 161 | if self.current == 'BGR' and self.transform == 'HSV': 162 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 163 | elif self.current == 'HSV' and self.transform == 'BGR': 164 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 165 | else: 166 | raise NotImplementedError 167 | return image, boxes, labels 168 | 169 | 170 | class RandomContrast(object): 171 | def __init__(self, lower=0.5, upper=1.5): 172 | self.lower = lower 173 | self.upper = upper 174 | assert self.upper >= self.lower, "contrast upper must be >= lower." 175 | assert self.lower >= 0, "contrast lower must be non-negative." 176 | 177 | # expects float image 178 | def __call__(self, image, boxes=None, labels=None): 179 | if random.randint(2): 180 | alpha = random.uniform(self.lower, self.upper) 181 | image *= alpha 182 | return image, boxes, labels 183 | 184 | 185 | class RandomBrightness(object): 186 | def __init__(self, delta=32): 187 | assert delta >= 0.0 188 | assert delta <= 255.0 189 | self.delta = delta 190 | 191 | def __call__(self, image, boxes=None, labels=None): 192 | if random.randint(2): 193 | delta = random.uniform(-self.delta, self.delta) 194 | image += delta 195 | return image, boxes, labels 196 | 197 | 198 | class ToCV2Image(object): 199 | def __call__(self, tensor, boxes=None, labels=None): 200 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 201 | 202 | 203 | class ToTensor(object): 204 | def __call__(self, cvimage, boxes=None, labels=None): 205 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 206 | 207 | 208 | class RandomSampleCrop(object): 209 | """Crop 210 | Arguments: 211 | img (Image): the image being input during training 212 | boxes (Tensor): the original bounding boxes in pt form 213 | labels (Tensor): the class labels for each bbox 214 | mode (float tuple): the min and max jaccard overlaps 215 | Return: 216 | (img, boxes, classes) 217 | img (Image): the cropped image 218 | boxes (Tensor): the adjusted bounding boxes in pt form 219 | labels (Tensor): the class labels for each bbox 220 | """ 221 | def __init__(self): 222 | self.sample_options = ( 223 | # using entire original input image 224 | None, 225 | # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9 226 | (0.1, None), 227 | (0.3, None), 228 | (0.7, None), 229 | (0.9, None), 230 | # randomly sample a patch 231 | (None, None), 232 | ) 233 | 234 | def __call__(self, image, boxes=None, labels=None): 235 | height, width, _ = image.shape 236 | while True: 237 | # randomly choose a mode 238 | mode = random.choice(self.sample_options) 239 | if mode is None: 240 | return image, boxes, labels 241 | 242 | min_iou, max_iou = mode 243 | if min_iou is None: 244 | min_iou = float('-inf') 245 | if max_iou is None: 246 | max_iou = float('inf') 247 | 248 | # max trails (50) 249 | for _ in range(50): 250 | current_image = image 251 | 252 | w = random.uniform(0.3 * width, width) 253 | h = random.uniform(0.3 * height, height) 254 | 255 | # aspect ratio constraint b/t .5 & 2 256 | if h / w < 0.5 or h / w > 2: 257 | continue 258 | 259 | left = random.uniform(width - w) 260 | top = random.uniform(height - h) 261 | 262 | # convert to integer rect x1,y1,x2,y2 263 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 264 | 265 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 266 | overlap = jaccard_numpy(boxes, rect) 267 | 268 | # is min and max overlap constraint satisfied? if not try again 269 | if overlap.min() < min_iou and max_iou < overlap.max(): 270 | continue 271 | 272 | # cut the crop from the image 273 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 274 | :] 275 | 276 | # keep overlap with gt box IF center in sampled patch 277 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 278 | 279 | # mask in all gt boxes that above and to the left of centers 280 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 281 | 282 | # mask in all gt boxes that under and to the right of centers 283 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 284 | 285 | # mask in that both m1 and m2 are true 286 | mask = m1 * m2 287 | 288 | # have any valid boxes? 
try again if not 289 | if not mask.any(): 290 | continue 291 | 292 | # take only matching gt boxes 293 | current_boxes = boxes[mask, :].copy() 294 | 295 | # take only matching gt labels 296 | current_labels = labels[mask] 297 | 298 | # should we use the box left and top corner or the crop's 299 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 300 | rect[:2]) 301 | # adjust to crop (by substracting crop's left,top) 302 | current_boxes[:, :2] -= rect[:2] 303 | 304 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 305 | rect[2:]) 306 | # adjust to crop (by substracting crop's left,top) 307 | current_boxes[:, 2:] -= rect[:2] 308 | 309 | return current_image, current_boxes, current_labels 310 | 311 | 312 | class Expand(object): 313 | def __init__(self, mean): 314 | self.mean = mean 315 | 316 | def __call__(self, image, boxes, labels): 317 | if random.randint(2): 318 | return image, boxes, labels 319 | 320 | height, width, depth = image.shape 321 | ratio = random.uniform(1, 4) 322 | left = random.uniform(0, width*ratio - width) 323 | top = random.uniform(0, height*ratio - height) 324 | 325 | expand_image = np.zeros( 326 | (int(height*ratio), int(width*ratio), depth), 327 | dtype=image.dtype) 328 | expand_image[:, :, :] = self.mean 329 | expand_image[int(top):int(top + height), 330 | int(left):int(left + width)] = image 331 | image = expand_image 332 | 333 | boxes = boxes.copy() 334 | boxes[:, :2] += (int(left), int(top)) 335 | boxes[:, 2:] += (int(left), int(top)) 336 | 337 | return image, boxes, labels 338 | 339 | 340 | class RandomMirror(object): 341 | def __call__(self, image, boxes, classes): 342 | _, width, _ = image.shape 343 | if random.randint(2): 344 | image = image[:, ::-1] 345 | boxes = boxes.copy() 346 | boxes[:, 0::2] = width - boxes[:, 2::-2] 347 | return image, boxes, classes 348 | 349 | 350 | class SwapChannels(object): 351 | """Transforms a tensorized image by swapping the channels in the order 352 | specified in the swap tuple. 
353 | Args: 354 | swaps (int triple): final order of channels 355 | eg: (2, 1, 0) 356 | """ 357 | 358 | def __init__(self, swaps): 359 | self.swaps = swaps 360 | 361 | def __call__(self, image): 362 | """ 363 | Args: 364 | image (Tensor): image tensor to be transformed 365 | Return: 366 | a tensor with channels swapped according to swap 367 | """ 368 | # if torch.is_tensor(image): 369 | # image = image.data.cpu().numpy() 370 | # else: 371 | # image = np.array(image) 372 | image = image[:, :, self.swaps] 373 | return image 374 | 375 | 376 | class PhotometricDistort(object): 377 | def __init__(self): 378 | self.pd = [ 379 | RandomContrast(), 380 | ConvertColor(transform='HSV'), 381 | RandomSaturation(), 382 | RandomHue(), 383 | ConvertColor(current='HSV', transform='BGR'), 384 | RandomContrast() 385 | ] 386 | self.rand_brightness = RandomBrightness() 387 | self.rand_light_noise = RandomLightingNoise() 388 | 389 | def __call__(self, image, boxes, labels): 390 | im = image.copy() 391 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 392 | if random.randint(2): 393 | distort = Compose(self.pd[:-1]) 394 | else: 395 | distort = Compose(self.pd[1:]) 396 | im, boxes, labels = distort(im, boxes, labels) 397 | return self.rand_light_noise(im, boxes, labels) 398 | 399 | 400 | class SSDAugmentation(object): 401 | def __init__(self, size=300, mean=(104, 117, 123)): 402 | self.mean = mean 403 | self.size = size 404 | self.augment = Compose([ 405 | ConvertFromInts(), 406 | ToAbsoluteCoords(), 407 | PhotometricDistort(), 408 | Expand(self.mean), 409 | RandomSampleCrop(), 410 | RandomMirror(), 411 | ToPercentCoords(), 412 | Resize(self.size), 413 | SubtractMeans(self.mean) 414 | ]) 415 | 416 | def __call__(self, img, boxes, labels): 417 | return self.augment(img, boxes, labels) 418 | -------------------------------------------------------------------------------- /utils/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import numpy as np 5 | if torch.cuda.is_available(): 6 | import torch.backends.cudnn as cudnn 7 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 8 | 9 | 10 | def point_form(boxes): 11 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 12 | representation for comparison to point form ground truth data. 13 | Args: 14 | boxes: (tensor) center-size default boxes from priorbox layers. 15 | Return: 16 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 17 | """ 18 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 19 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 20 | 21 | 22 | def center_size(boxes): 23 | """ Convert prior_boxes to (cx, cy, w, h) 24 | representation for comparison to center-size form ground truth data. 25 | Args: 26 | boxes: (tensor) point_form boxes 27 | Return: 28 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 29 | """ 30 | return torch.cat([(boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 31 | boxes[:, 2:] - boxes[:, :2]], 1) # w, h 32 | 33 | 34 | def intersect(box_a, box_b): 35 | """ We resize both tensors to [A,B,2] without new malloc: 36 | [A,2] -> [A,1,2] -> [A,B,2] 37 | [B,2] -> [1,B,2] -> [A,B,2] 38 | Then we compute the area of intersect between box_a and box_b. 39 | Args: 40 | box_a: (tensor) bounding boxes, Shape: [A,4]. 41 | box_b: (tensor) bounding boxes, Shape: [B,4]. 42 | Return: 43 | (tensor) intersection area, Shape: [A,B]. 
44 | """ 45 | A = box_a.size(0) 46 | B = box_b.size(0) 47 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 48 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 49 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 50 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 51 | inter = torch.clamp((max_xy - min_xy), min=0) 52 | return inter[:, :, 0] * inter[:, :, 1] 53 | 54 | 55 | def jaccard(box_a, box_b): 56 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 57 | is simply the intersection over union of two boxes. Here we operate on 58 | ground truth boxes and default boxes. 59 | E.g.: 60 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 61 | Args: 62 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 63 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 64 | Return: 65 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 66 | """ 67 | inter = intersect(box_a, box_b) 68 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 69 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 70 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 71 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 72 | union = area_a + area_b - inter 73 | return inter / union # [A,B] 74 | 75 | def matrix_iou(a,b): 76 | """ 77 | return iou of a and b, numpy version for data augenmentation 78 | """ 79 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 80 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 81 | 82 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 83 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 84 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 85 | return area_i / (area_a[:, np.newaxis] + area_b - area_i) 86 | 87 | 88 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 89 | """Match each prior box with the ground truth box of the highest jaccard 90 | overlap, encode the bounding boxes, then return the matched indices 91 | corresponding to both confidence and location preds. 92 | Args: 93 | threshold: (float) The overlap threshold used when mathing boxes. 94 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 95 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 96 | variances: (tensor) Variances corresponding to each prior coord, 97 | Shape: [num_priors, 4]. 98 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 99 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 100 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 101 | idx: (int) current batch index 102 | Return: 103 | The matched indices corresponding to 1)location and 2)confidence preds. 
104 | """ 105 | # jaccard index 106 | overlaps = jaccard( 107 | truths, 108 | point_form(priors) 109 | ) 110 | # (Bipartite Matching) 111 | # [1,num_objects] best prior for each ground truth 112 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 113 | # [1,num_priors] best ground truth for each prior 114 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 115 | best_truth_idx.squeeze_(0) 116 | best_truth_overlap.squeeze_(0) 117 | best_prior_idx.squeeze_(1) 118 | best_prior_overlap.squeeze_(1) 119 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 120 | # TODO refactor: index best_prior_idx with long tensor 121 | # ensure every gt matches with its prior of max overlap 122 | for j in range(best_prior_idx.size(0)): 123 | best_truth_idx[best_prior_idx[j]] = j 124 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 125 | conf = labels[best_truth_idx] # Shape: [num_priors] 126 | conf[best_truth_overlap < threshold] = 0 # label as background 127 | loc = encode(matches, priors, variances) 128 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 129 | conf_t[idx] = conf # [num_priors] top class label for each prior 130 | 131 | def refine_match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx,arm_loc): 132 | """Match each arm bbox with the ground truth box of the highest jaccard 133 | overlap, encode the bounding boxes, then return the matched indices 134 | corresponding to both confidence and location preds. 135 | Args: 136 | threshold: (float) The overlap threshold used when mathing boxes. 137 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 138 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 139 | variances: (tensor) Variances corresponding to each prior coord, 140 | Shape: [num_priors, 4]. 141 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 142 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 143 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 144 | idx: (int) current batch index 145 | arm_loc: (tensor) arm loc data,shape: [n_priors,4] 146 | Return: 147 | The matched indices corresponding to 1)location and 2)confidence preds. 
148 | """ 149 | # decode arm box 150 | decode_arm = decode(arm_loc,priors=priors,variances=variances) 151 | # jaccard index 152 | overlaps = jaccard( 153 | truths, 154 | decode_arm 155 | ) 156 | # (Bipartite Matching) 157 | # [1,num_objects] best prior for each ground truth 158 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 159 | # [1,num_priors] best ground truth for each prior 160 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 161 | best_truth_idx.squeeze_(0) 162 | best_truth_overlap.squeeze_(0) 163 | best_prior_idx.squeeze_(1) 164 | best_prior_overlap.squeeze_(1) 165 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 166 | # TODO refactor: index best_prior_idx with long tensor 167 | # ensure every gt matches with its prior of max overlap 168 | for j in range(best_prior_idx.size(0)): 169 | best_truth_idx[best_prior_idx[j]] = j 170 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 171 | conf = labels[best_truth_idx] # Shape: [num_priors] 172 | conf[best_truth_overlap < threshold] = 0 # label as background 173 | loc = encode(matches, center_size(decode_arm), variances) 174 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 175 | conf_t[idx] = conf # [num_priors] top class label for each prior 176 | 177 | def encode(matched, priors, variances): 178 | """Encode the variances from the priorbox layers into the ground truth boxes 179 | we have matched (based on jaccard overlap) with the prior boxes. 180 | Args: 181 | matched: (tensor) Coords of ground truth for each prior in point-form 182 | Shape: [num_priors, 4]. 183 | priors: (tensor) Prior boxes in center-offset form 184 | Shape: [num_priors,4]. 185 | variances: (list[float]) Variances of priorboxes 186 | Return: 187 | encoded boxes (tensor), Shape: [num_priors, 4] 188 | """ 189 | 190 | # dist b/t match center and prior's center 191 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 192 | # encode variance 193 | g_cxcy /= (variances[0] * priors[:, 2:]) 194 | # match wh / prior wh 195 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 196 | g_wh = torch.log(g_wh) / variances[1] 197 | # return target for smooth_l1_loss 198 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 199 | 200 | 201 | def encode_multi(matched, priors, offsets, variances): 202 | """Encode the variances from the priorbox layers into the ground truth boxes 203 | we have matched (based on jaccard overlap) with the prior boxes. 204 | Args: 205 | matched: (tensor) Coords of ground truth for each prior in point-form 206 | Shape: [num_priors, 4]. 207 | priors: (tensor) Prior boxes in center-offset form 208 | Shape: [num_priors,4]. 
209 | variances: (list[float]) Variances of priorboxes 210 | Return: 211 | encoded boxes (tensor), Shape: [num_priors, 4] 212 | """ 213 | 214 | # dist b/t match center and prior's center 215 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] - offsets[:,:2] 216 | # encode variance 217 | #g_cxcy /= (variances[0] * priors[:, 2:]) 218 | g_cxcy.div_(variances[0] * offsets[:, 2:]) 219 | # match wh / prior wh 220 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 221 | g_wh = torch.log(g_wh) / variances[1] 222 | # return target for smooth_l1_loss 223 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 224 | 225 | # Adapted from https://github.com/Hakuyume/chainer-ssd 226 | def decode(loc, priors, variances): 227 | """Decode locations from predictions using priors to undo 228 | the encoding we did for offset regression at train time. 229 | Args: 230 | loc (tensor): location predictions for loc layers, 231 | Shape: [num_priors,4] 232 | priors (tensor): Prior boxes in center-offset form. 233 | Shape: [num_priors,4]. 234 | variances: (list[float]) Variances of priorboxes 235 | Return: 236 | decoded bounding box predictions 237 | """ 238 | 239 | boxes = torch.cat(( 240 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 241 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 242 | boxes[:, :2] -= boxes[:, 2:] / 2 243 | boxes[:, 2:] += boxes[:, :2] 244 | return boxes 245 | 246 | def decode_multi(loc, priors, offsets, variances): 247 | """Decode locations from predictions using priors to undo 248 | the encoding we did for offset regression at train time. 249 | Args: 250 | loc (tensor): location predictions for loc layers, 251 | Shape: [num_priors,4] 252 | priors (tensor): Prior boxes in center-offset form. 253 | Shape: [num_priors,4]. 254 | variances: (list[float]) Variances of priorboxes 255 | Return: 256 | decoded bounding box predictions 257 | """ 258 | 259 | boxes = torch.cat(( 260 | priors[:, :2] + offsets[:,:2]+ loc[:, :2] * variances[0] * offsets[:, 2:], 261 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 262 | boxes[:, :2] -= boxes[:, 2:] / 2 263 | boxes[:, 2:] += boxes[:, :2] 264 | return boxes 265 | 266 | def log_sum_exp(x): 267 | """Utility function for computing log_sum_exp while determining 268 | This will be used to determine unaveraged confidence loss across 269 | all examples in a batch. 270 | Args: 271 | x (Variable(tensor)): conf_preds from conf layers 272 | """ 273 | x_max = x.data.max() 274 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 275 | 276 | 277 | # Original author: Francisco Massa: 278 | # https://github.com/fmassa/object-detection.torch 279 | # Ported to PyTorch by Max deGroot (02/01/2017) 280 | def nms(boxes, scores, overlap=0.5, top_k=200): 281 | """Apply non-maximum suppression at test time to avoid detecting too many 282 | overlapping bounding boxes for a given object. 283 | Args: 284 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 285 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 286 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 287 | top_k: (int) The Maximum number of box preds to consider. 288 | Return: 289 | The indices of the kept boxes with respect to num_priors. 
290 | """ 291 | 292 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 293 | if boxes.numel() == 0: 294 | return keep 295 | x1 = boxes[:, 0] 296 | y1 = boxes[:, 1] 297 | x2 = boxes[:, 2] 298 | y2 = boxes[:, 3] 299 | area = torch.mul(x2 - x1, y2 - y1) 300 | v, idx = scores.sort(0) # sort in ascending order 301 | # I = I[v >= 0.01] 302 | idx = idx[-top_k:] # indices of the top-k largest vals 303 | xx1 = boxes.new() 304 | yy1 = boxes.new() 305 | xx2 = boxes.new() 306 | yy2 = boxes.new() 307 | w = boxes.new() 308 | h = boxes.new() 309 | 310 | # keep = torch.Tensor() 311 | count = 0 312 | while idx.numel() > 0: 313 | i = idx[-1] # index of current largest val 314 | # keep.append(i) 315 | keep[count] = i 316 | count += 1 317 | if idx.size(0) == 1: 318 | break 319 | idx = idx[:-1] # remove kept element from view 320 | # load bboxes of next highest vals 321 | torch.index_select(x1, 0, idx, out=xx1) 322 | torch.index_select(y1, 0, idx, out=yy1) 323 | torch.index_select(x2, 0, idx, out=xx2) 324 | torch.index_select(y2, 0, idx, out=yy2) 325 | # store element-wise max with next highest score 326 | xx1 = torch.clamp(xx1, min=x1[i]) 327 | yy1 = torch.clamp(yy1, min=y1[i]) 328 | xx2 = torch.clamp(xx2, max=x2[i]) 329 | yy2 = torch.clamp(yy2, max=y2[i]) 330 | w.resize_as_(xx2) 331 | h.resize_as_(yy2) 332 | w = xx2 - xx1 333 | h = yy2 - yy1 334 | # check sizes of xx1 and xx2.. after each iteration 335 | w = torch.clamp(w, min=0.0) 336 | h = torch.clamp(h, min=0.0) 337 | inter = w*h 338 | # IoU = i / (area(a) + area(b) - i) 339 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 340 | union = (rem_areas - inter) + area[i] 341 | IoU = inter/union # store result in iou 342 | # keep only elements with an IoU <= overlap 343 | idx = idx[IoU.le(overlap)] 344 | return keep, count 345 | --------------------------------------------------------------------------------