├── utils ├── __init__.py ├── nms │ ├── __init__.py │ ├── gpu_nms.hpp │ ├── py_cpu_nms.py │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── cpu_nms.pyx ├── pycocotools │ ├── __init__.py │ ├── maskApi.h │ ├── mask.py │ ├── maskApi.c │ └── _mask.pyx ├── nms_wrapper.py ├── timer.py ├── build.py └── box_utils.py ├── layers ├── __init__.py ├── functions │ ├── __init__.py │ ├── prior_box.py │ └── detection.py └── modules │ ├── __init__.py │ ├── l2norm.py │ ├── multibox_loss.py │ └── refine_multibox_loss.py ├── make.sh ├── data ├── __init__.py ├── scripts │ ├── VOC2012.sh │ └── VOC2007.sh ├── config.py ├── voc0712_aug.py ├── voc_eval.py ├── data_augment.py ├── coco.py ├── voc0712.py └── augmentations.py ├── coco_voc.txt ├── LICENSE ├── .gitignore ├── README.md ├── demo.py └── models ├── base_models.py ├── mobilenet.py ├── misc.py ├── FSSD_vgg_FPN.py └── FSSD_Mob_FPN.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | 8 | cd .. 
9 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .multibox_loss import MultiBoxLoss 2 | from .refine_multibox_loss import RefineMultiBoxLoss 3 | from .l2norm import L2Norm 4 | 5 | __all__ = ['MultiBoxLoss','L2Norm'] 6 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # from .voc import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES 2 | from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES 3 | from .coco import COCODetection 4 | from .data_augment import * 5 | from .config import * 6 | -------------------------------------------------------------------------------- /coco_voc.txt: -------------------------------------------------------------------------------- 1 | 0,0,background 2 | 5,1,aeroplane 3 | 2,2,bicycle 4 | 15,3,bird 5 | 9,4,boat 6 | 40,5,bottle 7 | 6,6,bus 8 | 3,7,car 9 | 16,8,cat 10 | 57,9,chair 11 | 20,10,cow 12 | 61,11,diningtable 13 | 17,12,dog 14 | 18,13,horse 15 | 4,14,motorbike 16 | 1,15,person 17 | 59,16,pottedplant 18 | 19,17,sheep 19 | 58,18,sofa 20 | 7,19,train 21 | 63,20,tvmonitor -------------------------------------------------------------------------------- /layers/modules/l2norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self,n_channels, scale): 9 | super(L2Norm,self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = scale or None 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant(self.weight,self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps 21 | x /= norm 22 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 23 | return out 24 | -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 
20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | #curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" 39 | -------------------------------------------------------------------------------- /utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | 12 | # def nms(dets, thresh, force_cpu=False): 13 | # """Dispatch to either CPU or GPU NMS implementations.""" 14 | # 15 | # if dets.shape[0] == 0: 16 | # return [] 17 | # if cfg.USE_GPU_NMS and not force_cpu: 18 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 19 | # else: 20 | # return cpu_nms(dets, thresh) 21 | 22 | 23 | def nms(dets, thresh, force_cpu=False): 24 | """Dispatch to either CPU or GPU NMS implementations.""" 25 | 26 | if dets.shape[0] == 0: 27 | return [] 28 | if force_cpu: 29 | #return cpu_soft_nms(dets, thresh, method = 0) 30 | return cpu_nms(dets, thresh) 31 | return gpu_nms(dets, thresh) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Fanbinqi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! 
-d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | #echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | #curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | #echo "Downloading VOC2007 test data ..." 27 | #curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | #echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" 43 | -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- 
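For orientation, here is a minimal sketch of how these NMS routines are driven. A `dets` array is an N x 5 `float32` array with rows `[x1, y1, x2, y2, score]`, and the return value is the list of row indices to keep (this is how `demo.py` calls `utils.nms_wrapper.nms`, with an IoU threshold of 0.45). The sketch uses the pure-Python `py_cpu_nms` baseline so it runs without compiling the Cython/CUDA extensions; the box coordinates are made up for illustration.

```python
import numpy as np

from utils.nms.py_cpu_nms import py_cpu_nms  # pure-Python baseline, no compilation required

# Three detections: two heavily overlapping boxes and one separate box.
# Columns are [x1, y1, x2, y2, score]; the values are illustrative only.
dets = np.array([
    [ 10.,  10., 100., 100., 0.95],
    [ 12.,  12.,  98.,  99., 0.80],   # overlaps the first box with a lower score -> suppressed
    [200., 200., 260., 270., 0.60],
], dtype=np.float32)

keep = py_cpu_nms(dets, 0.45)  # 0.45 is the IoU threshold used in demo.py
print(keep)                    # -> [0, 2]
print(dets[keep])              # the surviving detections
```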
/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | def tic(self): 21 | # using time.time instead of time.clock because time time.clock 22 | # does not normalize for multithreading 23 | self.start_time = time.time() 24 | 25 | def toc(self, average=True): 26 | self.diff = time.time() - self.start_time 27 | self.total_time += self.diff 28 | self.calls += 1 29 | self.average_time = self.total_time / self.calls 30 | if average: 31 | return self.average_time 32 | else: 33 | return self.diff 34 | 35 | def clear(self): 36 | self.total_time = 0. 37 | self.calls = 0 38 | self.start_time = 0. 39 | self.diff = 0. 40 | self.average_time = 0. 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | from itertools import product as product 2 | from math import sqrt as sqrt 3 | 4 | import torch 5 | 6 | if torch.cuda.is_available(): 7 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 8 | 9 | 10 | class PriorBox(object): 11 | """Compute priorbox coordinates in center-offset form for each source 12 | feature map. 13 | Note: 14 | This 'layer' has changed between versions of the original SSD 15 | paper, so we include both versions, but note v2 is the most tested and most 16 | recent version of the paper. 
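    For a concrete sense of the output size: with the VOC_300 config from data/config.py
    (feature maps [38, 19, 10, 5, 3, 1] and 4, 6, 6, 6, 4, 4 priors per location),
    forward() returns 38^2*4 + 19^2*6 + 10^2*6 + 5^2*6 + 3^2*4 + 1^2*4 = 8732 priors,
    each stored as [cx, cy, w, h] relative to the image (clamped to [0, 1] when clip is set).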
17 | 18 | """ 19 | 20 | def __init__(self, cfg): 21 | super(PriorBox, self).__init__() 22 | self.image_size = cfg['min_dim'] 23 | # number of priors for feature map location (either 4 or 6) 24 | self.num_priors = len(cfg['aspect_ratios']) 25 | self.variance = cfg['variance'] or [0.1] 26 | self.feature_maps = cfg['feature_maps'] 27 | self.min_sizes = cfg['min_sizes'] 28 | self.max_sizes = cfg['max_sizes'] 29 | self.steps = cfg['steps'] 30 | self.aspect_ratios = cfg['aspect_ratios'] 31 | self.clip = cfg['clip'] 32 | for v in self.variance: 33 | if v <= 0: 34 | raise ValueError('Variances must be greater than 0') 35 | 36 | def forward(self): 37 | mean = [] 38 | for k, f in enumerate(self.feature_maps): 39 | for i, j in product(range(f), repeat=2): 40 | f_k = self.image_size / self.steps[k] 41 | cx = (j + 0.5) / f_k 42 | cy = (i + 0.5) / f_k 43 | 44 | s_k = self.min_sizes[k] / self.image_size 45 | mean += [cx, cy, s_k, s_k] 46 | 47 | # aspect_ratio: 1 48 | # rel size: sqrt(s_k * s_(k+1)) 49 | if self.max_sizes: 50 | s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size)) 51 | mean += [cx, cy, s_k_prime, s_k_prime] 52 | 53 | # rest of aspect ratios 54 | for ar in self.aspect_ratios[k]: 55 | mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 56 | mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 57 | 58 | # back to torch land 59 | output = torch.Tensor(mean).view(-1, 4) 60 | if self.clip: 61 | output.clamp_(max=1, min=0) 62 | return output 63 | -------------------------------------------------------------------------------- /utils/pycocotools/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. 
*/ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## FFBNet 2 | FFBNET : LIGHTWEIGHT BACKBONE FOR OBJECT DETECTION BASED FEATURE FUSION BLOCK 3 | 4 | ## Our paper has been accepted by IEEE ICIP 2019 for presentation. 5 | 6 | ### VOC2007 Test 7 | | System | *mAP* | **FPS** (1080Ti) | 8 | | :--------------------------------------- | :------: | :-----------------------: | 9 | | Mob-SSD | 68 | 190 | 10 | | Tiny-Yolo v3 | 61.3 | 220 | 11 | | Pelee | 70.9 | - | 12 | | SSD | 77.2 | 160 | 13 | | STDN | 78.1 | 41 | 14 | | FSSD | 78.8 | 150 | 15 | | RefineDet | 80.0 | - | 16 | | FFBNet | 73.54 | 185 | 17 | | VGG-FFB | 80.2 | 142 | 18 | 19 | ## Installation 20 | - Install [PyTorch 0.3.1](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 21 | - Clone this repository. It is largely based on [lzx1413/PytorchSSD](https://github.com/lzx1413/PytorchSSD); many thanks to its author. 22 | 23 | - Compile the nms and coco tools: 24 | ```Shell 25 | ./make.sh 26 | ``` 27 | 28 | ## Datasets 29 | 30 | ### VOC Dataset 31 | ##### Download VOC2007 trainval & test 32 | 33 | ```Shell 34 | # specify a directory for the dataset to be downloaded into; the default is ~/data/ 35 | sh data/scripts/VOC2007.sh # 36 | ``` 37 | 38 | ##### Download VOC2012 trainval 39 | 40 | ```Shell 41 | # specify a directory for the dataset to be downloaded into; the default is ~/data/ 42 | sh data/scripts/VOC2012.sh # 43 | ``` 44 | 45 | ## Training 46 | - First download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights from [BaiduYun Driver](https://pan.baidu.com/s/1nzOgaL8mAPex8_HLU4mb8Q) (password: `mu59`). 47 | - The MobileNet backbone is described in the [paper](https://arxiv.org/abs/1704.04861); its weight file is available from [BaiduYun Driver](https://pan.baidu.com/s/1LXq3p6IOoQ6YJMY0xhRkLQ) (password: `f7oe`). 48 | 49 | ```Shell 50 | # Put vgg16_reducedfc.pth and mobilenet_1.pth in a new folder named weights, then run 51 | python train_test_mob.py or python train_test_vgg.py 52 | ``` 53 | ### Personal advice: when using MobileNet v1 to train on the VOC datasets, use a higher learning rate at the beginning; convergence may be better.
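### Demo
To sanity-check a trained model, `demo.py` (as shipped) reads every image in `./image1/`, loads weights from `./weights/FSSD_VGG.pth`, and draws detections with a 0.7 score threshold. A minimal run therefore looks like:

```Shell
mkdir -p image1 weights
# copy a few test images into image1/ and a trained model to weights/FSSD_VGG.pth
python demo.py
```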
54 | 55 | If you are interested in this paper or interested in lightweight detectors, please QQ me (374873360) 56 | -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | # gets home dir cross platform 4 | import cv2 5 | cv2.setNumThreads(0) # pytorch issue 1355: possible deadlock in dataloader 6 | # note: if you used our download scripts, this should be right 7 | VOCroot = '/home/zdh1901/data/VOCdevkit' # path to VOCdevkit root dir 8 | 9 | COCOroot = '/home/zdh1901/data/coco' 10 | 11 | # RFB CONFIGS 12 | VOC_300 = { 13 | 'feature_maps': [38, 19, 10, 5, 3, 1], 14 | #'feature_maps': [1, 3, 5, 10, 19, 38], 15 | 16 | 'min_dim': 300, 17 | 18 | 'steps': [8, 16, 32, 64, 100, 300], 19 | #'steps': [300, 100, 64, 32, 16, 8], 20 | 21 | 'min_sizes': [30, 60, 111, 162, 213, 264], 22 | 23 | 'max_sizes': [60, 111, 162, 213, 264, 315], 24 | # 'min_sizes': [264, 213, 163, 111, 60, 30], 25 | # 26 | # 'max_sizes': [315, 264, 213, 163, 111, 60], 27 | 28 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 29 | #'aspect_ratios' : [[2], [2], [2, 3], [2, 3], [2, 3], [2]], 30 | 31 | 'variance': [0.1, 0.2], 32 | 33 | 'clip': True, 34 | } 35 | 36 | VOC_512 = { 37 | 'feature_maps': [38, 19, 10, 5, 3, 1], 38 | 39 | 'min_dim': 512, 40 | 41 | 'steps': [14, 27, 51, 102, 170, 512], 42 | 43 | 'min_sizes': [35.84, 76.8, 153.6, 230.4, 307.2, 384.0], 44 | 45 | 'max_sizes': [76.8, 153.6, 230.4, 307.2, 384.0, 460.8], 46 | 47 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 48 | 49 | 'variance': [0.1, 0.2], 50 | 51 | 'clip': True, 52 | } 53 | 54 | COCO_300 = { 55 | 'feature_maps': [38, 19, 10, 5, 3, 1], 56 | 57 | 'min_dim': 300, 58 | 59 | 'steps': [8, 16, 32, 64, 100, 300], 60 | 61 | 'min_sizes': [21, 45, 99, 153, 207, 261], 62 | 63 | 'max_sizes': [45, 99, 153, 207, 261, 315], 64 | 65 | 'aspect_ratios': [[2, 3], [2, 3], [2, 3], [2, 3], [2], [2]], 66 | 67 | 'variance': [0.1, 0.2], 68 | 69 | 'clip': True, 70 | } 71 | 72 | COCO_512 = { 73 | 'feature_maps': [64, 32, 16, 8, 4, 2, 1], 74 | 75 | 'min_dim': 512, 76 | 77 | 'steps': [8, 16, 32, 64, 128, 256, 512], 78 | 79 | 'min_sizes': [20.48, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8], 80 | 81 | 'max_sizes': [51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], 82 | 83 | 'aspect_ratios': [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]], 84 | 85 | 'variance': [0.1, 0.2], 86 | 87 | 'clip': True, 88 | } 89 | 90 | COCO_mobile_300 = { 91 | 'feature_maps': [19, 10, 5, 3, 2, 1], 92 | 93 | 'min_dim': 300, 94 | 95 | 'steps': [16, 32, 64, 100, 150, 300], 96 | 97 | 'min_sizes': [45, 90, 135, 180, 225, 270], 98 | 99 | 'max_sizes': [90, 135, 180, 225, 270, 315], 100 | 101 | 'aspect_ratios': [[2, 3], [2, 3], [2, 3], [2, 3], [2], [2]], 102 | 103 | 'variance': [0.1, 0.2], 104 | 105 | 'clip': True, 106 | } 107 | 108 | VOC_320 = { 109 | 'feature_maps': [40, 20, 10, 5], 110 | 111 | 'min_dim': 320, 112 | 113 | 'steps': [8, 16, 32, 64], 114 | 115 | 'min_sizes': [32, 64, 128, 256], 116 | 117 | 'max_sizes': [], 118 | 119 | 'aspect_ratios': [[2], [2], [2], [2]], 120 | 121 | 'variance': [0.1, 0.2], 122 | 123 | 'clip': True, 124 | } 125 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | 4 | from utils.box_utils import decode, center_size 5 
| 6 | 7 | class Detect(Function): 8 | """At test time, Detect is the final layer of SSD. Decode location preds, 9 | apply non-maximum suppression to location predictions based on conf 10 | scores and threshold to a top_k number of output predictions for both 11 | confidence score and locations. 12 | """ 13 | 14 | def __init__(self, num_classes, bkg_label, cfg, object_score=0): 15 | self.num_classes = num_classes 16 | self.background_label = bkg_label 17 | self.object_score = object_score 18 | # self.thresh = thresh 19 | 20 | # Parameters used in nms. 21 | self.variance = cfg['variance'] 22 | 23 | def forward(self, predictions, prior, arm_data=None): 24 | """ 25 | Args: 26 | loc_data: (tensor) Loc preds from loc layers 27 | Shape: [batch,num_priors*4] 28 | conf_data: (tensor) Shape: Conf preds from conf layers 29 | Shape: [batch*num_priors,num_classes] 30 | prior_data: (tensor) Prior boxes and variances from priorbox layers 31 | Shape: [1,num_priors,4] 32 | """ 33 | 34 | loc, conf = predictions 35 | loc_data = loc.data 36 | conf_data = conf.data 37 | prior_data = prior.data 38 | num = loc_data.size(0) # batch size 39 | if arm_data: 40 | arm_loc, arm_conf = arm_data 41 | arm_loc_data = arm_loc.data 42 | arm_conf_data = arm_conf.data 43 | arm_object_conf = arm_conf_data[:, 1:] 44 | no_object_index = arm_object_conf <= self.object_score 45 | conf_data[no_object_index.expand_as(conf_data)] = 0 46 | 47 | self.num_priors = prior_data.size(0) 48 | self.boxes = torch.zeros(num, self.num_priors, 4) 49 | self.scores = torch.zeros(num, self.num_priors, self.num_classes) 50 | 51 | if num == 1: 52 | # size batch x num_classes x num_priors 53 | conf_preds = conf_data.unsqueeze(0) 54 | 55 | else: 56 | conf_preds = conf_data.view(num, self.num_priors, 57 | self.num_classes) 58 | self.boxes.expand(num, self.num_priors, 4) 59 | self.scores.expand(num, self.num_priors, self.num_classes) 60 | # Decode predictions into bboxes. 
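        # decode() comes from utils/box_utils (not included in this listing). Under the
        # standard SSD parameterization -- which this code is assumed to follow -- it maps
        # each predicted offset l and prior p (center-size form [cx, cy, w, h]) to a box:
        #   cx = p_cx + l_cx * variance[0] * p_w,    cy = p_cy + l_cy * variance[0] * p_h
        #   w  = p_w * exp(l_w * variance[1]),       h  = p_h * exp(l_h * variance[1])
        # and then converts [cx, cy, w, h] to corner form [x1, y1, x2, y2].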
61 | for i in range(num): 62 | if arm_data: 63 | default = decode(arm_loc_data[i], prior_data, self.variance) 64 | default = center_size(default) 65 | else: 66 | default = prior_data 67 | decoded_boxes = decode(loc_data[i], default, self.variance) 68 | # For each class, perform nms 69 | conf_scores = conf_preds[i].clone() 70 | ''' 71 | c_mask = conf_scores.gt(self.thresh) 72 | decoded_boxes = decoded_boxes[c_mask] 73 | conf_scores = conf_scores[c_mask] 74 | ''' 75 | 76 | self.boxes[i] = decoded_boxes 77 | self.scores[i] = conf_scores 78 | 79 | return self.boxes, self.scores 80 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import os 4 | import cv2 5 | import pickle 6 | import argparse 7 | import torch 8 | import torch.nn as nn 9 | import torch.backends.cudnn as cudnn 10 | import torchvision.transforms as transforms 11 | import numpy as np 12 | from torch.autograd import Variable 13 | from data import VOCroot,COCOroot 14 | from data import AnnotationTransform, COCODetection, VOCDetection, BaseTransform, VOC_300,VOC_512,COCO_300,COCO_512, COCO_mobile_300 15 | from models.FSSD_vgg_FPN import build_net 16 | import torch.utils.data as data 17 | from layers.functions import Detect,PriorBox 18 | from utils.nms_wrapper import nms 19 | from utils.timer import Timer 20 | from matplotlib import pyplot as plt 21 | 22 | CLASSES = ('__background__', 23 | 'aeroplane', 'bicycle', 'bird', 'boat', 24 | 'bottle', 'bus', 'car', 'cat', 'chair', 25 | 'cow', 'diningtable', 'dog', 'horse', 26 | 'motorbike', 'person', 'pottedplant', 27 | 'sheep', 'sofa', 'train', 'tvmonitor') 28 | 29 | def test_net(net,img,name,detector,transform,priors,top_k=200,thresh=0.01): 30 | 31 | scale = torch.Tensor([img.shape[1], img.shape[0], 32 | img.shape[1], img.shape[0]]) 33 | #cv2.imshow('ori.jpg',img) 34 | #cv2.waitKey(2) 35 | # with torch.no_grad(): 36 | # x = transform(img).unsqueeze(0) 37 | # x = x.cuda() 38 | # scale = scale.cuda() 39 | x = Variable(transform(img).unsqueeze(0), volatile=True) 40 | x = x.cuda() 41 | scale = scale.cuda() 42 | 43 | out = net(x,test=True) 44 | boxes, scores = detector.forward(out, priors) 45 | boxes = boxes[0] 46 | scores = scores[0] 47 | a = [] 48 | boxes *= scale 49 | boxes = boxes.cpu().numpy() 50 | scores = scores.cpu().numpy() 51 | 52 | flag = True 53 | for j in range(1, 21): 54 | inds = np.where(scores[:, j] > thresh)[0] 55 | if len(inds) == 0: 56 | #print ("%s class" %str(j)) 57 | continue 58 | c_bboxes = boxes[inds] 59 | c_scores = scores[inds, j] 60 | c_dets = np.hstack((c_bboxes, c_scores[:, np.newaxis])).astype( 61 | np.float32, copy=False) 62 | keep = nms(c_dets, 0.45, force_cpu=True) 63 | c_dets = c_dets[keep, :] 64 | cls = np.ones(c_dets.shape[0])*j 65 | c_dets = np.column_stack((c_dets,cls)) 66 | if flag: 67 | result = c_dets 68 | flag = False 69 | else: 70 | result = np.vstack((result,c_dets)) 71 | 72 | a = list(result) 73 | #a.append(result) 74 | rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 75 | colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() 76 | plt.imshow(rgb_image) 77 | currentAxis = plt.gca() 78 | 79 | for (x1,y1,x2,y2,s,cls) in a: 80 | x1 = int(x1) 81 | y1 = int(y1) 82 | x2 = int(x2) 83 | y2 = int(y2) 84 | cls = int(cls) 85 | title = "%s:%.2f" % (CLASSES[int(cls)], s) 86 | coords = (x1,y1), x2-x1+1, y2-y1+1 87 | color = colors[cls] 88 | currentAxis.add_patch(plt.Rectangle(*coords, 
fill=False, edgecolor=color, linewidth=2)) 89 | currentAxis.text(x1, y1, title, bbox={'facecolor': color, 'alpha': 0.5}) 90 | plt.axis('off') 91 | plt.savefig(name.split('.')[0]+'.eps',format='eps',bbox_inches = 'tight') 92 | plt.show() 93 | 94 | if __name__ == "__main__": 95 | Image = os.listdir('image1/') 96 | 97 | for img_name in Image: 98 | img = cv2.imread("image1/"+img_name) 99 | model = './weights/FSSD_VGG.pth' 100 | net = build_net(300, 21) 101 | state_dict = torch.load(model) 102 | from collections import OrderedDict 103 | new_state_dict = OrderedDict() 104 | for k, v in state_dict.items(): 105 | head = k[:7] 106 | if head == 'module.': 107 | name = k[7:] # remove `module.` 108 | else: 109 | name = k 110 | new_state_dict[name] = v 111 | net.load_state_dict(new_state_dict) 112 | net.eval() 113 | net = net.cuda() 114 | cudnn.benchmark = True 115 | print("Finished loading model") 116 | transform = BaseTransform(300, (104, 117, 123)) 117 | detector = Detect(21, 0, VOC_300) 118 | priorbox = PriorBox(VOC_300) 119 | # with torch.no_grad(): 120 | # priors = priorbox.forward() 121 | # priors = priors.cuda() 122 | priors = Variable(priorbox.forward(), volatile=True) 123 | priors = priors.cuda() 124 | test_net(net, img, img_name, detector, transform, priors,top_k=200, thresh=0.7) -------------------------------------------------------------------------------- /models/base_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def vgg(cfg, i, batch_norm=False): 6 | layers = [] 7 | in_channels = i 8 | for v in cfg: 9 | if v == 'M': 10 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 11 | elif v == 'C': 12 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 13 | else: 14 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 15 | if batch_norm: 16 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 17 | else: 18 | layers += [conv2d, nn.ReLU(inplace=True)] 19 | in_channels = v 20 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 21 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 22 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 23 | layers += [pool5, conv6, 24 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 25 | return layers 26 | 27 | 28 | vgg_base = { 29 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 30 | 512, 512, 512], 31 | '512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 32 | 512, 512, 512], 33 | } 34 | 35 | 36 | class BasicConv(nn.Module): 37 | 38 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 39 | bn=True, bias=False): 40 | super(BasicConv, self).__init__() 41 | self.out_channels = out_planes 42 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, 43 | dilation=dilation, groups=groups, bias=bias) 44 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 45 | self.relu = nn.ReLU(inplace=True) if relu else None 46 | 47 | def forward(self, x): 48 | x = self.conv(x) 49 | if self.bn is not None: 50 | x = self.bn(x) 51 | if self.relu is not None: 52 | x = self.relu(x) 53 | return x 54 | 55 | 56 | class BasicRFB_a(nn.Module): 57 | 58 | def __init__(self, in_planes, out_planes, stride=1, scale=0.1): 59 | super(BasicRFB_a, self).__init__() 60 | self.scale = scale 61 | self.out_channels = out_planes 62 | inter_planes = 
in_planes // 4 63 | 64 | self.branch0 = nn.Sequential( 65 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 66 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=1, relu=False) 67 | ) 68 | self.branch1 = nn.Sequential( 69 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 70 | BasicConv(inter_planes, inter_planes, kernel_size=(3, 1), stride=1, padding=(1, 0)), 71 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=3, dilation=3, relu=False) 72 | ) 73 | self.branch2 = nn.Sequential( 74 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 75 | BasicConv(inter_planes, inter_planes, kernel_size=(1, 3), stride=stride, padding=(0, 1)), 76 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=3, dilation=3, relu=False) 77 | ) 78 | ''' 79 | self.branch3 = nn.Sequential( 80 | BasicConv(in_planes, inter_planes, kernel_size=1, stride=1), 81 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=1), 82 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=3, dilation=3, relu=False) 83 | ) 84 | ''' 85 | self.branch3 = nn.Sequential( 86 | BasicConv(in_planes, inter_planes // 2, kernel_size=1, stride=1), 87 | BasicConv(inter_planes // 2, (inter_planes // 4) * 3, kernel_size=(1, 3), stride=1, padding=(0, 1)), 88 | BasicConv((inter_planes // 4) * 3, inter_planes, kernel_size=(3, 1), stride=stride, padding=(1, 0)), 89 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=5, dilation=5, relu=False) 90 | ) 91 | 92 | self.ConvLinear = BasicConv(4 * inter_planes, out_planes, kernel_size=1, stride=1, relu=False) 93 | self.shortcut = BasicConv(in_planes, out_planes, kernel_size=1, stride=stride, relu=False) 94 | self.relu = nn.ReLU(inplace=False) 95 | 96 | def forward(self, x): 97 | x0 = self.branch0(x) 98 | x1 = self.branch1(x) 99 | x2 = self.branch2(x) 100 | x3 = self.branch3(x) 101 | 102 | out = torch.cat((x0, x1, x2, x3), 1) 103 | out = self.ConvLinear(out) 104 | short = self.shortcut(x) 105 | out = out * self.scale + short 106 | out = self.relu(out) 107 | 108 | return out 109 | -------------------------------------------------------------------------------- /utils/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | #import pycocotools._mask as _mask 4 | from . import _mask 5 | 6 | # Interface for manipulating masks stored in RLE format. 7 | # 8 | # RLE is a simple yet efficient format for storing binary masks. RLE 9 | # first divides a vector (or vectorized image) into a series of piecewise 10 | # constant regions and then for each piece simply stores the length of 11 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 12 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 13 | # (note that the odd counts are always the numbers of zeros). Instead of 14 | # storing the counts directly, additional compression is achieved with a 15 | # variable bitrate representation based on a common scheme called LEB128. 16 | # 17 | # Compression is greatest given large piecewise constant regions. 18 | # Specifically, the size of the RLE is proportional to the number of 19 | # *boundaries* in M (or for an image the number of boundaries in the y 20 | # direction). Assuming fairly simple shapes, the RLE representation is 21 | # O(sqrt(n)) where n is number of pixels in the object. 
Hence space usage 22 | # is substantially lower, especially for large simple objects (large n). 23 | # 24 | # Many common operations on masks can be computed directly using the RLE 25 | # (without need for decoding). This includes computations such as area, 26 | # union, intersection, etc. All of these operations are linear in the 27 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 28 | # of the object. Computing these operations on the original mask is O(n). 29 | # Thus, using the RLE can result in substantial computational savings. 30 | # 31 | # The following API functions are defined: 32 | # encode - Encode binary masks using RLE. 33 | # decode - Decode binary masks encoded via RLE. 34 | # merge - Compute union or intersection of encoded masks. 35 | # iou - Compute intersection over union between masks. 36 | # area - Compute area of encoded masks. 37 | # toBbox - Get bounding boxes surrounding encoded masks. 38 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 39 | # 40 | # Usage: 41 | # Rs = encode( masks ) 42 | # masks = decode( Rs ) 43 | # R = merge( Rs, intersect=false ) 44 | # o = iou( dt, gt, iscrowd ) 45 | # a = area( Rs ) 46 | # bbs = toBbox( Rs ) 47 | # Rs = frPyObjects( [pyObjects], h, w ) 48 | # 49 | # In the API the following formats are used: 50 | # Rs - [dict] Run-length encoding of binary masks 51 | # R - dict Run-length encoding of binary mask 52 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 53 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 54 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 55 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 56 | # dt,gt - May be either bounding boxes or encoded masks 57 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 58 | # 59 | # Finally, a note about the intersection over union (iou) computation. 60 | # The standard iou of a ground truth (gt) and detected (dt) object is 61 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 62 | # For "crowd" regions, we use a modified criteria. If a gt object is 63 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 64 | # Choosing gt' in the crowd gt that best matches the dt can be done using 65 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 66 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 67 | # For crowd gt regions we use this modified criteria above for the iou. 68 | # 69 | # To compile run "python setup.py build_ext --inplace" 70 | # Please do not contact us for help with compiling. 71 | # 72 | # Microsoft COCO Toolbox. version 2.0 73 | # Data, paper, and tutorials available at: http://mscoco.org/ 74 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
75 | # Licensed under the Simplified BSD License [see coco/license.txt] 76 | 77 | iou = _mask.iou 78 | merge = _mask.merge 79 | frPyObjects = _mask.frPyObjects 80 | 81 | def encode(bimask): 82 | if len(bimask.shape) == 3: 83 | return _mask.encode(bimask) 84 | elif len(bimask.shape) == 2: 85 | h, w = bimask.shape 86 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 87 | 88 | def decode(rleObjs): 89 | if type(rleObjs) == list: 90 | return _mask.decode(rleObjs) 91 | else: 92 | return _mask.decode([rleObjs])[:,:,0] 93 | 94 | def area(rleObjs): 95 | if type(rleObjs) == list: 96 | return _mask.area(rleObjs) 97 | else: 98 | return _mask.area([rleObjs])[0] 99 | 100 | def toBbox(rleObjs): 101 | if type(rleObjs) == list: 102 | return _mask.toBbox(rleObjs) 103 | else: 104 | return _mask.toBbox([rleObjs])[0] 105 | -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from utils.box_utils import match, log_sum_exp 6 | GPU = False 7 | if torch.cuda.is_available(): 8 | GPU = True 9 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 10 | 11 | 12 | class MultiBoxLoss(nn.Module): 13 | """SSD Weighted Loss Function 14 | Compute Targets: 15 | 1) Produce Confidence Target Indices by matching ground truth boxes 16 | with (default) 'priorboxes' that have jaccard index > threshold parameter 17 | (default threshold: 0.5). 18 | 2) Produce localization target by 'encoding' variance into offsets of ground 19 | truth boxes and their matched 'priorboxes'. 20 | 3) Hard negative mining to filter the excessive number of negative examples 21 | that comes with using a large number of default bounding boxes. 22 | (default negative:positive ratio 3:1) 23 | Objective Loss: 24 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 25 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 26 | weighted by α which is set to 1 by cross val. 27 | Args: 28 | c: class confidences, 29 | l: predicted boxes, 30 | g: ground truth boxes 31 | N: number of matched default boxes 32 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 33 | """ 34 | 35 | 36 | def __init__(self, num_classes,overlap_thresh,prior_for_matching,bkg_label,neg_mining,neg_pos,neg_overlap,encode_target): 37 | super(MultiBoxLoss, self).__init__() 38 | self.num_classes = num_classes 39 | self.threshold = overlap_thresh 40 | self.background_label = bkg_label 41 | self.encode_target = encode_target 42 | self.use_prior_for_matching = prior_for_matching 43 | self.do_neg_mining = neg_mining 44 | self.negpos_ratio = neg_pos 45 | self.neg_overlap = neg_overlap 46 | self.variance = [0.1,0.2] 47 | 48 | def forward(self, predictions, priors, targets): 49 | """Multibox Loss 50 | Args: 51 | predictions (tuple): A tuple containing loc preds, conf preds, 52 | and prior boxes from SSD net. 53 | conf shape: torch.size(batch_size,num_priors,num_classes) 54 | loc shape: torch.size(batch_size,num_priors,4) 55 | priors shape: torch.size(num_priors,4) 56 | 57 | ground_truth (tensor): Ground truth boxes and labels for a batch, 58 | shape: [batch_size,num_objs,5] (last idx is the label). 
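            Returns:
                loss_l, loss_c: the Smooth L1 localization loss and the cross-entropy
                confidence loss, each already divided by N, the number of matched
                (positive) priors in the batch (see the normalization at the end of
                this method).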
59 | """ 60 | 61 | loc_data, conf_data = predictions 62 | priors = priors 63 | num = loc_data.size(0) 64 | num_priors = (priors.size(0)) 65 | num_classes = self.num_classes 66 | 67 | # match priors (default boxes) and ground truth boxes 68 | loc_t = torch.Tensor(num, num_priors, 4) 69 | conf_t = torch.LongTensor(num, num_priors) 70 | for idx in range(num): 71 | truths = targets[idx][:,:-1].data 72 | labels = targets[idx][:,-1].data 73 | defaults = priors.data 74 | match(self.threshold,truths,defaults,self.variance,labels,loc_t,conf_t,idx) 75 | if GPU: 76 | loc_t = loc_t.cuda() 77 | conf_t = conf_t.cuda() 78 | # wrap targets 79 | loc_t = Variable(loc_t, requires_grad=False) 80 | conf_t = Variable(conf_t,requires_grad=False) 81 | 82 | pos = conf_t > 0 83 | 84 | # Localization Loss (Smooth L1) 85 | # Shape: [batch,num_priors,4] 86 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 87 | loc_p = loc_data[pos_idx].view(-1,4) 88 | loc_t = loc_t[pos_idx].view(-1,4) 89 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 90 | 91 | # Compute max conf across batch for hard negative mining 92 | batch_conf = conf_data.view(-1,self.num_classes) 93 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1,1)) 94 | 95 | # Hard Negative Mining 96 | loss_c[pos] = 0 # filter out pos boxes for now 97 | loss_c = loss_c.view(num, -1) 98 | _,loss_idx = loss_c.sort(1, descending=True) 99 | _,idx_rank = loss_idx.sort(1) 100 | num_pos = pos.long().sum(1,keepdim=True) 101 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 102 | neg = idx_rank < num_neg.expand_as(idx_rank) 103 | 104 | # Confidence Loss Including Positive and Negative Examples 105 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 106 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 107 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes) 108 | targets_weighted = conf_t[(pos+neg).gt(0)] 109 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 110 | 111 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 112 | 113 | N = num_pos.data.sum() 114 | loss_l/=N 115 | loss_c/=N 116 | return loss_l,loss_c 117 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * 
(b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, 
ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | 70 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 71 | cdef unsigned int N = boxes.shape[0] 72 | cdef float iw, ih, box_area 73 | cdef float ua 74 | cdef int pos = 0 75 | cdef float maxscore = 0 76 | cdef int maxpos = 0 77 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 78 | 79 | for i in range(N): 80 | maxscore = boxes[i, 4] 81 | maxpos = i 82 | 83 | tx1 = boxes[i,0] 84 | ty1 = boxes[i,1] 85 | tx2 = boxes[i,2] 86 | ty2 = boxes[i,3] 87 | ts = boxes[i,4] 88 | 89 | pos = i + 1 90 | # get max box 91 | while pos < N: 92 | if maxscore < boxes[pos, 4]: 93 | maxscore = boxes[pos, 4] 94 | maxpos = pos 95 | pos = pos + 1 96 | 97 | # add max box as a detection 98 | boxes[i,0] = boxes[maxpos,0] 99 | boxes[i,1] = boxes[maxpos,1] 100 | boxes[i,2] = boxes[maxpos,2] 101 | boxes[i,3] = boxes[maxpos,3] 102 | boxes[i,4] = boxes[maxpos,4] 103 | 104 | # swap ith box with position of max box 105 | boxes[maxpos,0] = tx1 106 | boxes[maxpos,1] = ty1 107 | boxes[maxpos,2] = tx2 108 | boxes[maxpos,3] = ty2 109 | boxes[maxpos,4] = ts 110 | 111 | tx1 = boxes[i,0] 112 | ty1 = boxes[i,1] 113 | tx2 = boxes[i,2] 114 | ty2 = boxes[i,3] 115 | ts = boxes[i,4] 116 | 117 | pos = i + 1 118 | # NMS iterations, note that N changes if detection boxes fall below threshold 119 | while pos < N: 120 | x1 = boxes[pos, 0] 121 | y1 = boxes[pos, 1] 122 | x2 = boxes[pos, 2] 123 | y2 = boxes[pos, 3] 124 | s = boxes[pos, 4] 125 | 126 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 127 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 128 | if iw > 0: 129 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 130 | if ih > 0: 131 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 132 | ov = iw * ih / ua #iou between max box and detection box 133 | 134 | if method == 1: # linear 135 | if ov > Nt: 136 | weight = 1 - ov 137 | else: 138 | weight = 1 139 | elif method == 2: # gaussian 140 | weight = np.exp(-(ov * ov)/sigma) 141 | else: # original NMS 142 | if ov > Nt: 143 | weight = 0 144 | else: 145 | weight = 1 146 | 147 | boxes[pos, 4] = weight*boxes[pos, 4] 148 | 149 | # if box score falls below threshold, discard the box by swapping with last box 150 | # update N 151 | if boxes[pos, 4] < threshold: 152 | boxes[pos,0] = boxes[N-1, 0] 153 | boxes[pos,1] = boxes[N-1, 1] 154 | boxes[pos,2] = boxes[N-1, 2] 155 | boxes[pos,3] = boxes[N-1, 3] 156 | boxes[pos,4] = boxes[N-1, 4] 157 | N = N - 1 158 | pos = pos - 1 159 | 160 | pos = pos + 1 161 | 162 | keep = [i for i in range(N)] 163 | return keep 164 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from 
utils.box_utils import match,refine_match, log_sum_exp,decode 7 | GPU = False 8 | if torch.cuda.is_available(): 9 | GPU = True 10 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 11 | 12 | 13 | class RefineMultiBoxLoss(nn.Module): 14 | """SSD Weighted Loss Function 15 | Compute Targets: 16 | 1) Produce Confidence Target Indices by matching ground truth boxes 17 | with (default) 'priorboxes' that have jaccard index > threshold parameter 18 | (default threshold: 0.5). 19 | 2) Produce localization target by 'encoding' variance into offsets of ground 20 | truth boxes and their matched 'priorboxes'. 21 | 3) Hard negative mining to filter the excessive number of negative examples 22 | that comes with using a large number of default bounding boxes. 23 | (default negative:positive ratio 3:1) 24 | Objective Loss: 25 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 26 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 27 | weighted by α which is set to 1 by cross val. 28 | Args: 29 | c: class confidences, 30 | l: predicted boxes, 31 | g: ground truth boxes 32 | N: number of matched default boxes 33 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 34 | """ 35 | 36 | 37 | def __init__(self, num_classes,overlap_thresh,prior_for_matching,bkg_label,neg_mining,neg_pos,neg_overlap,encode_target,object_score = 0): 38 | super(RefineMultiBoxLoss, self).__init__() 39 | self.num_classes = num_classes 40 | self.threshold = overlap_thresh 41 | self.background_label = bkg_label 42 | self.encode_target = encode_target 43 | self.use_prior_for_matching = prior_for_matching 44 | self.do_neg_mining = neg_mining 45 | self.negpos_ratio = neg_pos 46 | self.neg_overlap = neg_overlap 47 | self.object_score = object_score 48 | self.variance = [0.1,0.2] 49 | 50 | def forward(self, odm_data,priors, targets,arm_data = None,filter_object = False): 51 | """Multibox Loss 52 | Args: 53 | predictions (tuple): A tuple containing loc preds, conf preds, 54 | and prior boxes from SSD net. 55 | conf shape: torch.size(batch_size,num_priors,num_classes) 56 | loc shape: torch.size(batch_size,num_priors,4) 57 | priors shape: torch.size(num_priors,4) 58 | 59 | ground_truth (tensor): Ground truth boxes and labels for a batch, 60 | shape: [batch_size,num_objs,5] (last idx is the label). 
61 | arm_data (tuple): arm branch containg arm_loc and arm_conf 62 | filter_object: whether filter out the prediction according to the arm conf score 63 | """ 64 | 65 | loc_data,conf_data = odm_data 66 | if arm_data: 67 | arm_loc,arm_conf = arm_data 68 | priors = priors.data 69 | num = loc_data.size(0) 70 | num_priors = (priors.size(0)) 71 | 72 | # match priors (default boxes) and ground truth boxes 73 | loc_t = torch.Tensor(num, num_priors, 4) 74 | conf_t = torch.LongTensor(num, num_priors) 75 | for idx in range(num): 76 | truths = targets[idx][:,:-1].data 77 | labels = targets[idx][:,-1].data 78 | #for object detection 79 | if self.num_classes == 2: 80 | labels = labels > 0 81 | if arm_data: 82 | refine_match(self.threshold,truths,priors,self.variance,labels,loc_t,conf_t,idx,arm_loc[idx].data) 83 | else: 84 | match(self.threshold,truths,priors,self.variance,labels,loc_t,conf_t,idx) 85 | if GPU: 86 | loc_t = loc_t.cuda() 87 | conf_t = conf_t.cuda() 88 | # wrap targets 89 | loc_t = Variable(loc_t, requires_grad=False) 90 | conf_t = Variable(conf_t,requires_grad=False) 91 | if arm_data and filter_object: 92 | arm_conf_data = arm_conf.data[:,:,1] 93 | pos = conf_t > 0 94 | object_score_index = arm_conf_data <= self.object_score 95 | pos[object_score_index] = 0 96 | 97 | else: 98 | pos = conf_t > 0 99 | 100 | # Localization Loss (Smooth L1) 101 | # Shape: [batch,num_priors,4] 102 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 103 | loc_p = loc_data[pos_idx].view(-1,4) 104 | loc_t = loc_t[pos_idx].view(-1,4) 105 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 106 | 107 | # Compute max conf across batch for hard negative mining 108 | batch_conf = conf_data.view(-1,self.num_classes) 109 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1,1)) 110 | 111 | # Hard Negative Mining 112 | loss_c[pos] = 0 # filter out pos boxes for now 113 | loss_c = loss_c.view(num, -1) 114 | _,loss_idx = loss_c.sort(1, descending=True) 115 | _,idx_rank = loss_idx.sort(1) 116 | num_pos = pos.long().sum(1,keepdim=True) 117 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 118 | neg = idx_rank < num_neg.expand_as(idx_rank) 119 | 120 | # Confidence Loss Including Positive and Negative Examples 121 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 122 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 123 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes) 124 | targets_weighted = conf_t[(pos+neg).gt(0)] 125 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 126 | 127 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 128 | N = num_pos.data.sum() 129 | loss_l/=N 130 | loss_c/=N 131 | return loss_l,loss_c 132 | -------------------------------------------------------------------------------- /utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom 
http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 44 | if nvcc is None: 45 | raise EnvironmentError('The nvcc binary could not be ' 46 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 47 | home = os.path.dirname(os.path.dirname(nvcc)) 48 | 49 | cudaconfig = {'home': home, 'nvcc': nvcc, 50 | 'include': pjoin(home, 'include'), 51 | 'lib64': pjoin(home, 'lib64')} 52 | for k, v in cudaconfig.items(): 53 | if not os.path.exists(v): 54 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 55 | 56 | return cudaconfig 57 | 58 | 59 | CUDA = locate_cuda() 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | 72 | If you subclass UnixCCompiler, it's not trivial to get your subclass 73 | injected in, and still have the right customizations (i.e. 74 | distutils.sysconfig.customize_compiler) run on it. So instead of going 75 | the OO route, I have this. Note, it's kindof like a wierd functional 76 | subclassing going on.""" 77 | 78 | # tell the compiler it can processes .cu 79 | self.src_extensions.append('.cu') 80 | 81 | # save references to the default compiler_so and _comple methods 82 | default_compiler_so = self.compiler_so 83 | super = self._compile 84 | 85 | # now redefine the _compile method. This gets executed for each 86 | # object but distutils doesn't have the ability to change compilers 87 | # based on source extension: we add it. 
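# A minimal sketch of the per-source dispatch implemented by _compile below (an assumed
# reading of the trick, not a distutils API): extra_compile_args is passed as a dict keyed
# by compiler name, and the wrapper picks the matching list for each source file, e.g.
#   postargs = extra_postargs['nvcc'] if src.endswith('.cu') else extra_postargs['gcc']
# while temporarily swapping compiler_so to CUDA['nvcc'] for the .cu sources and restoring
# the default compiler afterwards.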
88 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 89 | print(extra_postargs) 90 | if os.path.splitext(src)[1] == '.cu': 91 | # use the cuda for .cu files 92 | self.set_executable('compiler_so', CUDA['nvcc']) 93 | # use only a subset of the extra_postargs, which are 1-1 translated 94 | # from the extra_compile_args in the Extension class 95 | postargs = extra_postargs['nvcc'] 96 | else: 97 | postargs = extra_postargs['gcc'] 98 | 99 | super(obj, src, ext, cc_args, postargs, pp_opts) 100 | # reset the default compiler_so, which we might have changed for cuda 101 | self.compiler_so = default_compiler_so 102 | 103 | # inject our redefined _compile method into the class 104 | self._compile = _compile 105 | 106 | 107 | # run the customize_compiler 108 | class custom_build_ext(build_ext): 109 | def build_extensions(self): 110 | customize_compiler_for_nvcc(self.compiler) 111 | build_ext.build_extensions(self) 112 | 113 | 114 | ext_modules = [ 115 | Extension( 116 | "nms.cpu_nms", 117 | ["nms/cpu_nms.pyx"], 118 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 119 | include_dirs=[numpy_include] 120 | ), 121 | Extension('nms.gpu_nms', 122 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 123 | library_dirs=[CUDA['lib64']], 124 | libraries=['cudart'], 125 | language='c++', 126 | runtime_library_dirs=[CUDA['lib64']], 127 | # this syntax is specific to this build system 128 | # we're only going to use certain compiler args with nvcc and not with gcc 129 | # the implementation of this trick is in customize_compiler() below 130 | extra_compile_args={'gcc': ["-Wno-unused-function"], 131 | 'nvcc': ['-arch=sm_52', 132 | '--ptxas-options=-v', 133 | '-c', 134 | '--compiler-options', 135 | "'-fPIC'"]}, 136 | include_dirs=[numpy_include, CUDA['include']] 137 | ), 138 | Extension( 139 | 'pycocotools._mask', 140 | sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], 141 | include_dirs=[numpy_include, 'pycocotools'], 142 | extra_compile_args={ 143 | 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, 144 | ), 145 | ] 146 | 147 | setup( 148 | name='mot_utils', 149 | ext_modules=ext_modules, 150 | # inject our custom trigger 151 | cmdclass={'build_ext': custom_build_ext}, 152 | ) 153 | -------------------------------------------------------------------------------- /models/mobilenet.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | """ 4 | Creates a MobileNet Model as defined in: 5 | Andrew G. Howard Menglong Zhu Bo Chen, et.al. (2017). 6 | MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. 
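# A rough cost note (an added sketch, not part of the original header): replacing a dense
# k x k convolution with the depthwise-separable pair used by DepthWiseBlock below
# (a k x k Conv2d with groups=in_channels followed by a 1x1 pointwise Conv2d) cuts the
# multiply count by roughly a factor of 1/C_out + 1/k**2, i.e. about 8-9x for k = 3.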
7 | (c) Yang Lu 8 | """ 9 | import math 10 | import torch.nn as nn 11 | import torch 12 | 13 | __all__ = ['DepthWiseBlock', 'mobilenet', 'mobilenet_2', 'mobilenet_1', 'mobilenet_075', 'mobilenet_05', 14 | 'mobilenet_025'] 15 | 16 | class SELayer(nn.Module): 17 | def __init__(self, channel, reduction=16): 18 | super(SELayer, self).__init__() 19 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 20 | self.fc = nn.Sequential( 21 | nn.Linear(channel, channel // reduction, bias=False), 22 | nn.ReLU(inplace=True), 23 | nn.Linear(channel // reduction, channel, bias=False), 24 | nn.Sigmoid() 25 | ) 26 | 27 | def forward(self, x): 28 | b, c, _, _ = x.size() 29 | y = self.avg_pool(x).view(b,c) 30 | y = self.fc(y).view(b, c, 1, 1) 31 | return x * y.expand_as(x) 32 | 33 | class DepthWiseBlock(nn.Module): 34 | def __init__(self, inplanes, planes, stride=1, padding=1): 35 | super(DepthWiseBlock, self).__init__() 36 | inplanes, planes = int(inplanes), int(planes) 37 | self.conv_dw = nn.Conv2d(inplanes, inplanes, kernel_size=3, padding=padding, stride=stride, groups=inplanes, 38 | bias=False) 39 | self.bn_dw = nn.BatchNorm2d(inplanes) 40 | self.conv_sep = nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False) 41 | self.bn_sep = nn.BatchNorm2d(planes) 42 | self.relu = nn.ReLU(inplace=True) 43 | #self.se = SELayer(inplanes) 44 | 45 | def forward(self, x): 46 | out = self.conv_dw(x) 47 | out = self.bn_dw(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv_sep(out) 51 | out = self.bn_sep(out) 52 | out = self.relu(out) 53 | #out = self.se(out) 54 | 55 | return out 56 | 57 | 58 | class MobileNet(nn.Module): 59 | def __init__(self, widen_factor=1.0, num_classes=1000): 60 | """ Constructor 61 | Args: 62 | widen_factor: config of widen_factor 63 | num_classes: number of classes 64 | """ 65 | super(MobileNet, self).__init__() 66 | 67 | block = DepthWiseBlock 68 | self.conv1 = nn.Conv2d(3, int(32 * widen_factor), kernel_size=3, stride=2, padding=1, bias=False) 69 | self.bn1 = nn.BatchNorm2d(int(32 * widen_factor)) 70 | self.relu = nn.ReLU(inplace=True) 71 | 72 | self.dw2_1 = block(32 * widen_factor, 64 * widen_factor) 73 | self.dw2_2 = block(64 * widen_factor, 128 * widen_factor, stride=2) 74 | 75 | self.dw3_1 = block(128 * widen_factor, 128 * widen_factor) 76 | self.dw3_2 = block(128 * widen_factor, 256 * widen_factor, stride=2) 77 | 78 | self.dw4_1 = block(256 * widen_factor, 256 * widen_factor) 79 | self.dw4_2 = block(256 * widen_factor, 512 * widen_factor, stride=2) 80 | 81 | self.dw5_1 = block(512 * widen_factor, 512 * widen_factor) 82 | self.dw5_2 = block(512 * widen_factor, 512 * widen_factor) 83 | self.dw5_3 = block(512 * widen_factor, 512 * widen_factor) 84 | self.dw5_4 = block(512 * widen_factor, 512 * widen_factor) 85 | self.dw5_5 = block(512 * widen_factor, 512 * widen_factor) 86 | self.dw5_6 = block(512 * widen_factor, 1024 * widen_factor, stride=2) 87 | 88 | self.dw6 = block(1024 * widen_factor, 1024 * widen_factor) 89 | 90 | self.avgpool = nn.AdaptiveAvgPool2d(1) 91 | self.fc = nn.Linear(int(1024 * widen_factor), num_classes) 92 | 93 | for m in self.modules(): 94 | if isinstance(m, nn.Conv2d): 95 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 96 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 97 | elif isinstance(m, nn.BatchNorm2d): 98 | m.weight.data.fill_(1) 99 | m.bias.data.zero_() 100 | 101 | def forward(self, x): 102 | x = self.conv1(x) 103 | x = self.bn1(x) 104 | x = self.relu(x) 105 | 106 | x = self.dw2_1(x) 107 | x = self.dw2_2(x) 108 | x = self.dw3_1(x) 109 | x = self.dw3_2(x) 110 | x0 = self.dw4_1(x) 111 | #print(x0.size(),"layer4_1") 112 | x = self.dw4_2(x0) 113 | x = self.dw5_1(x) 114 | x = self.dw5_2(x) 115 | x = self.dw5_3(x) 116 | x = self.dw5_4(x) 117 | x1 = self.dw5_5(x) 118 | #print(x1.size(), "layer5_5") 119 | x = self.dw5_6(x1) 120 | x2 = self.dw6(x) 121 | #print(x2.size(), "layer6_1") 122 | return x0, x1, x2 123 | 124 | 125 | def mobilenet(widen_factor=1.0, num_classes=1000): 126 | """ 127 | Construct MobileNet. 128 | """ 129 | model = MobileNet(widen_factor=widen_factor, num_classes=num_classes) 130 | return model 131 | 132 | 133 | def mobilenet_2(): 134 | """ 135 | Construct MobileNet. 136 | """ 137 | model = MobileNet(widen_factor=2.0, num_classes=1000) 138 | return model 139 | 140 | 141 | def mobilenet_1(): 142 | """ 143 | Construct MobileNet. 144 | """ 145 | model = MobileNet(widen_factor=1.0, num_classes=1000) 146 | return model 147 | 148 | 149 | def mobilenet_075(): 150 | """ 151 | Construct MobileNet. 152 | """ 153 | model = MobileNet(widen_factor=0.75, num_classes=1000) 154 | return model 155 | 156 | 157 | def mobilenet_05(): 158 | """ 159 | Construct MobileNet. 160 | """ 161 | model = MobileNet(widen_factor=0.5, num_classes=1000) 162 | return model 163 | 164 | 165 | def mobilenet_025(): 166 | """ 167 | Construct MobileNet. 168 | """ 169 | model = MobileNet(widen_factor=0.25, num_classes=1000) 170 | return model 171 | 172 | 173 | # if __name__ == '__main__': 174 | # mobilenet = mobilenet_1() 175 | # print(mobilenet) 176 | # print(mobilenet.state_dict().keys()) 177 | 178 | # from torch.autograd import Variable 179 | # 180 | # input = Variable(torch.randn(1, 3, 300, 300)) 181 | # 182 | # model = mobilenet() 183 | # print(model) 184 | # 185 | # output = model(input) 186 | # print(output.size()) 187 | -------------------------------------------------------------------------------- /data/voc0712_aug.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | #from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | VOC_CLASSES = ( # always index 0 21 | 'aeroplane', 'bicycle', 'bird', 'boat', 22 | 'bottle', 'bus', 'car', 'cat', 'chair', 23 | 'cow', 'diningtable', 'dog', 'horse', 24 | 'motorbike', 'person', 'pottedplant', 25 | 'sheep', 'sofa', 'train', 'tvmonitor') 26 | 27 | # note: if you used our download scripts, this should be right 28 | #VOC_ROOT = osp.join(HOME, "data/VOCdevkit/") 29 | 30 | 31 | class VOCAnnotationTransform(object): 32 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 33 | Initilized with a dictionary lookup of classnames to indexes 34 | 35 | Arguments: 36 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 37 | (default: alphabetic indexing of VOC's 20 classes) 38 | keep_difficult (bool, optional): keep 
difficult instances or not 39 | (default: False) 40 | height (int): height 41 | width (int): width 42 | """ 43 | 44 | def __init__(self, class_to_ind=None, keep_difficult=False): 45 | self.class_to_ind = class_to_ind or dict( 46 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 47 | self.keep_difficult = keep_difficult 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Arguments: 52 | target (annotation) : the target annotation to be made usable 53 | will be an ET.Element 54 | Returns: 55 | a list containing lists of bounding boxes [bbox coords, class name] 56 | """ 57 | res = [] 58 | for obj in target.iter('object'): 59 | difficult = int(obj.find('difficult').text) == 1 60 | if not self.keep_difficult and difficult: 61 | continue 62 | name = obj.find('name').text.lower().strip() 63 | bbox = obj.find('bndbox') 64 | 65 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 66 | bndbox = [] 67 | for i, pt in enumerate(pts): 68 | cur_pt = int(bbox.find(pt).text) - 1 69 | # scale height or width 70 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 71 | bndbox.append(cur_pt) 72 | label_idx = self.class_to_ind[name] 73 | bndbox.append(label_idx) 74 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 75 | # img_id = target.find('filename').text[:-4] 76 | 77 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 78 | 79 | 80 | class VOCDetection(data.Dataset): 81 | """VOC Detection Dataset Object 82 | 83 | input is image, target is annotation 84 | 85 | Arguments: 86 | root (string): filepath to VOCdevkit folder. 87 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 88 | transform (callable, optional): transformation to perform on the 89 | input image 90 | target_transform (callable, optional): transformation to perform on the 91 | target `annotation` 92 | (eg: take in caption string, return tensor of word indices) 93 | dataset_name (string, optional): which dataset to load 94 | (default: 'VOC2007') 95 | """ 96 | 97 | def __init__(self, root, 98 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 99 | transform=None, target_transform=VOCAnnotationTransform(), 100 | dataset_name='VOC0712'): 101 | self.root = root 102 | self.image_set = image_sets 103 | self.transform = transform 104 | self.target_transform = target_transform 105 | self.name = dataset_name 106 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 107 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 108 | self.ids = list() 109 | for (year, name) in image_sets: 110 | rootpath = osp.join(self.root, 'VOC' + year) 111 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 112 | self.ids.append((rootpath, line.strip())) 113 | 114 | def __getitem__(self, index): 115 | im, gt, h, w = self.pull_item(index) 116 | 117 | return im, gt 118 | 119 | def __len__(self): 120 | return len(self.ids) 121 | 122 | def pull_item(self, index): 123 | img_id = self.ids[index] 124 | 125 | target = ET.parse(self._annopath % img_id).getroot() 126 | img = cv2.imread(self._imgpath % img_id) 127 | height, width, channels = img.shape 128 | 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | 132 | if self.transform is not None: 133 | target = np.array(target) 134 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | # img = img.transpose(2, 0, 1) 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), 
target, height, width 140 | # return torch.from_numpy(img), target, height, width 141 | 142 | def pull_image(self, index): 143 | '''Returns the original image object at index in PIL form 144 | 145 | Note: not using self.__getitem__(), as any transformations passed in 146 | could mess up this functionality. 147 | 148 | Argument: 149 | index (int): index of img to show 150 | Return: 151 | PIL img 152 | ''' 153 | img_id = self.ids[index] 154 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | anno = ET.parse(self._annopath % img_id).getroot() 170 | gt = self.target_transform(anno, 1, 1) 171 | return img_id[1], gt 172 | 173 | def pull_tensor(self, index): 174 | '''Returns the original image at an index in tensor form 175 | 176 | Note: not using self.__getitem__(), as any transformations passed in 177 | could mess up this functionality. 178 | 179 | Argument: 180 | index (int): index of img to show 181 | Return: 182 | tensorized version of img, squeezed 183 | ''' 184 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 185 | -------------------------------------------------------------------------------- /data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import pickle 8 | import xml.etree.ElementTree as ET 9 | 10 | import numpy as np 11 | import os 12 | 13 | 14 | def parse_rec(filename): 15 | """ Parse a PASCAL VOC xml file """ 16 | tree = ET.parse(filename) 17 | objects = [] 18 | for obj in tree.findall('object'): 19 | obj_struct = {} 20 | obj_struct['name'] = obj.find('name').text 21 | obj_struct['pose'] = obj.find('pose').text 22 | obj_struct['truncated'] = int(obj.find('truncated').text) 23 | obj_struct['difficult'] = int(obj.find('difficult').text) 24 | bbox = obj.find('bndbox') 25 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 26 | int(bbox.find('ymin').text), 27 | int(bbox.find('xmax').text), 28 | int(bbox.find('ymax').text)] 29 | objects.append(obj_struct) 30 | 31 | return objects 32 | 33 | 34 | def voc_ap(rec, prec, use_07_metric=False): 35 | """ ap = voc_ap(rec, prec, [use_07_metric]) 36 | Compute VOC AP given precision and recall. 37 | If use_07_metric is true, uses the 38 | VOC 07 11 point method (default:False). 39 | """ 40 | if use_07_metric: 41 | # 11 point metric 42 | ap = 0. 43 | for t in np.arange(0., 1.1, 0.1): 44 | if np.sum(rec >= t) == 0: 45 | p = 0 46 | else: 47 | p = np.max(prec[rec >= t]) 48 | ap = ap + p / 11. 
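# A small worked example of the 11-point rule above (illustrative numbers, not from the repo):
# with prec = [1.0, 1.0, 0.5] at rec = [0.1, 0.5, 1.0], the maximum precision is 1.0 for
# t = 0.0 ... 0.5 and 0.5 for t = 0.6 ... 1.0, giving ap = (6*1.0 + 5*0.5)/11 ~= 0.77.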
49 | else: 50 | # correct AP calculation 51 | # first append sentinel values at the end 52 | mrec = np.concatenate(([0.], rec, [1.])) 53 | mpre = np.concatenate(([0.], prec, [0.])) 54 | 55 | # compute the precision envelope 56 | for i in range(mpre.size - 1, 0, -1): 57 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 58 | 59 | # to calculate area under PR curve, look for points 60 | # where X axis (recall) changes value 61 | i = np.where(mrec[1:] != mrec[:-1])[0] 62 | 63 | # and sum (\Delta recall) * prec 64 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 65 | return ap 66 | 67 | 68 | def voc_eval(detpath, 69 | annopath, 70 | imagesetfile, 71 | classname, 72 | cachedir, 73 | ovthresh=0.5, 74 | use_07_metric=False): 75 | """rec, prec, ap = voc_eval(detpath, 76 | annopath, 77 | imagesetfile, 78 | classname, 79 | [ovthresh], 80 | [use_07_metric]) 81 | 82 | Top level function that does the PASCAL VOC evaluation. 83 | 84 | detpath: Path to detections 85 | detpath.format(classname) should produce the detection results file. 86 | annopath: Path to annotations 87 | annopath.format(imagename) should be the xml annotations file. 88 | imagesetfile: Text file containing the list of images, one image per line. 89 | classname: Category name (duh) 90 | cachedir: Directory for caching the annotations 91 | [ovthresh]: Overlap threshold (default = 0.5) 92 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 93 | (default False) 94 | """ 95 | # assumes detections are in detpath.format(classname) 96 | # assumes annotations are in annopath.format(imagename) 97 | # assumes imagesetfile is a text file with each line an image name 98 | # cachedir caches the annotations in a pickle file 99 | 100 | # first load gt 101 | if not os.path.isdir(cachedir): 102 | os.mkdir(cachedir) 103 | cachefile = os.path.join(cachedir, 'annots.pkl') 104 | # read list of images 105 | with open(imagesetfile, 'r') as f: 106 | lines = f.readlines() 107 | imagenames = [x.strip() for x in lines] 108 | 109 | if not os.path.isfile(cachefile): 110 | # load annots 111 | recs = {} 112 | for i, imagename in enumerate(imagenames): 113 | recs[imagename] = parse_rec(annopath.format(imagename)) 114 | if i % 100 == 0: 115 | print('Reading annotation for {:d}/{:d}'.format( 116 | i + 1, len(imagenames))) 117 | # save 118 | print('Saving cached annotations to {:s}'.format(cachefile)) 119 | with open(cachefile, 'wb') as f: 120 | pickle.dump(recs, f) 121 | else: 122 | # load 123 | with open(cachefile, 'rb') as f: 124 | recs = pickle.load(f) 125 | 126 | # extract gt objects for this class 127 | class_recs = {} 128 | npos = 0 129 | for imagename in imagenames: 130 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 131 | bbox = np.array([x['bbox'] for x in R]) 132 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 133 | det = [False] * len(R) 134 | npos = npos + sum(~difficult) 135 | class_recs[imagename] = {'bbox': bbox, 136 | 'difficult': difficult, 137 | 'det': det} 138 | 139 | # read dets 140 | detfile = detpath.format(classname) 141 | with open(detfile, 'r') as f: 142 | lines = f.readlines() 143 | 144 | splitlines = [x.strip().split(' ') for x in lines] 145 | image_ids = [x[0] for x in splitlines] 146 | confidence = np.array([float(x[1]) for x in splitlines]) 147 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 148 | 149 | # sort by confidence 150 | sorted_ind = np.argsort(-confidence) 151 | sorted_scores = np.sort(-confidence) 152 | BB = BB[sorted_ind, :] 153 | image_ids = [image_ids[x] 
for x in sorted_ind] 154 | 155 | # go down dets and mark TPs and FPs 156 | nd = len(image_ids) 157 | tp = np.zeros(nd) 158 | fp = np.zeros(nd) 159 | for d in range(nd): 160 | R = class_recs[image_ids[d]] 161 | bb = BB[d, :].astype(float) 162 | ovmax = -np.inf 163 | BBGT = R['bbox'].astype(float) 164 | 165 | if BBGT.size > 0: 166 | # compute overlaps 167 | # intersection 168 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 169 | iymin = np.maximum(BBGT[:, 1], bb[1]) 170 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 171 | iymax = np.minimum(BBGT[:, 3], bb[3]) 172 | iw = np.maximum(ixmax - ixmin + 1., 0.) 173 | ih = np.maximum(iymax - iymin + 1., 0.) 174 | inters = iw * ih 175 | 176 | # union 177 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 178 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 179 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 180 | 181 | overlaps = inters / uni 182 | ovmax = np.max(overlaps) 183 | jmax = np.argmax(overlaps) 184 | 185 | if ovmax > ovthresh: 186 | if not R['difficult'][jmax]: 187 | if not R['det'][jmax]: 188 | tp[d] = 1. 189 | R['det'][jmax] = 1 190 | else: 191 | fp[d] = 1. 192 | else: 193 | fp[d] = 1. 194 | 195 | # compute precision recall 196 | fp = np.cumsum(fp) 197 | tp = np.cumsum(tp) 198 | rec = tp / float(npos) 199 | # avoid divide by zero in case the first detection matches a difficult 200 | # ground truth 201 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 202 | ap = voc_ap(rec, prec, use_07_metric) 203 | 204 | return rec, prec, ap 205 | -------------------------------------------------------------------------------- /models/misc.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import shutil 4 | import pickle as pkl 5 | import time 6 | import numpy as np 7 | import hashlib 8 | 9 | from IPython import embed 10 | 11 | class Logger(object): 12 | def __init__(self): 13 | self._logger = None 14 | 15 | def init(self, logdir, name='log'): 16 | if self._logger is None: 17 | import logging 18 | if not os.path.exists(logdir): 19 | os.makedirs(logdir) 20 | log_file = os.path.join(logdir, name) 21 | if os.path.exists(log_file): 22 | os.remove(log_file) 23 | self._logger = logging.getLogger() 24 | self._logger.setLevel('INFO') 25 | fh = logging.FileHandler(log_file) 26 | ch = logging.StreamHandler() 27 | self._logger.addHandler(fh) 28 | self._logger.addHandler(ch) 29 | 30 | def info(self, str_info): 31 | self.init('/tmp', 'tmp.log') 32 | self._logger.info(str_info) 33 | logger = Logger() 34 | 35 | print = logger.info 36 | def ensure_dir(path, erase=False): 37 | if os.path.exists(path) and erase: 38 | print("Removing old folder {}".format(path)) 39 | shutil.rmtree(path) 40 | if not os.path.exists(path): 41 | print("Creating folder {}".format(path)) 42 | os.makedirs(path) 43 | 44 | def load_pickle(path): 45 | begin_st = time.time() 46 | with open(path, 'rb') as f: 47 | print("Loading pickle object from {}".format(path)) 48 | v = pkl.load(f) 49 | print("=> Done ({:.4f} s)".format(time.time() - begin_st)) 50 | return v 51 | 52 | def dump_pickle(obj, path): 53 | with open(path, 'wb') as f: 54 | print("Dumping pickle object to {}".format(path)) 55 | pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) 56 | 57 | def auto_select_gpu(mem_bound=500, utility_bound=0, gpus=(0, 1, 2, 3, 4, 5, 6, 7), num_gpu=1, selected_gpus=None): 58 | import sys 59 | import os 60 | import subprocess 61 | import re 62 | import time 63 | import numpy as np 64 | if 'CUDA_VISIBLE_DEVCIES' in os.environ: 65 | sys.exit(0) 66 | if selected_gpus is 
None: 67 | mem_trace = [] 68 | utility_trace = [] 69 | for i in range(5): # sample 5 times 70 | info = subprocess.check_output('nvidia-smi', shell=True).decode('utf-8') 71 | mem = [int(s[:-5]) for s in re.compile('\d+MiB\s/').findall(info)] 72 | utility = [int(re.compile('\d+').findall(s)[0]) for s in re.compile('\d+%\s+Default').findall(info)] 73 | mem_trace.append(mem) 74 | utility_trace.append(utility) 75 | time.sleep(0.1) 76 | mem = np.mean(mem_trace, axis=0) 77 | utility = np.mean(utility_trace, axis=0) 78 | assert(len(mem) == len(utility)) 79 | nGPU = len(utility) 80 | ideal_gpus = [i for i in range(nGPU) if mem[i] <= mem_bound and utility[i] <= utility_bound and i in gpus] 81 | 82 | if len(ideal_gpus) < num_gpu: 83 | print("No sufficient resource, available: {}, require {} gpu".format(ideal_gpus, num_gpu)) 84 | sys.exit(0) 85 | else: 86 | selected_gpus = list(map(str, ideal_gpus[:num_gpu])) 87 | else: 88 | selected_gpus = selected_gpus.split(',') 89 | 90 | print("Setting GPU: {}".format(selected_gpus)) 91 | os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(selected_gpus) 92 | return selected_gpus 93 | 94 | def expand_user(path): 95 | return os.path.abspath(os.path.expanduser(path)) 96 | 97 | def model_snapshot(model, new_file, old_file=None, verbose=False): 98 | from collections import OrderedDict 99 | import torch 100 | if isinstance(model, torch.nn.DataParallel): 101 | model = model.module 102 | if old_file and os.path.exists(expand_user(old_file)): 103 | if verbose: 104 | print("Removing old model {}".format(expand_user(old_file))) 105 | os.remove(expand_user(old_file)) 106 | if verbose: 107 | print("Saving model to {}".format(expand_user(new_file))) 108 | 109 | state_dict = OrderedDict() 110 | for k, v in model.state_dict().items(): 111 | if v.is_cuda: 112 | v = v.cpu() 113 | state_dict[k] = v 114 | torch.save(state_dict, expand_user(new_file)) 115 | 116 | 117 | def load_lmdb(lmdb_file, n_records=None): 118 | import lmdb 119 | import numpy as np 120 | lmdb_file = expand_user(lmdb_file) 121 | if os.path.exists(lmdb_file): 122 | data = [] 123 | env = lmdb.open(lmdb_file, readonly=True, max_readers=512) 124 | with env.begin() as txn: 125 | cursor = txn.cursor() 126 | begin_st = time.time() 127 | print("Loading lmdb file {} into memory".format(lmdb_file)) 128 | for key, value in cursor: 129 | _, target, _ = key.decode('ascii').split(':') 130 | target = int(target) 131 | img = cv2.imdecode(np.fromstring(value, np.uint8), cv2.IMREAD_COLOR) 132 | data.append((img, target)) 133 | if n_records is not None and len(data) >= n_records: 134 | break 135 | env.close() 136 | print("=> Done ({:.4f} s)".format(time.time() - begin_st)) 137 | return data 138 | else: 139 | print("Not found lmdb file".format(lmdb_file)) 140 | 141 | def str2img(str_b): 142 | return cv2.imdecode(np.fromstring(str_b, np.uint8), cv2.IMREAD_COLOR) 143 | 144 | def img2str(img): 145 | return cv2.imencode('.jpg', img)[1].tostring() 146 | 147 | def md5(s): 148 | m = hashlib.md5() 149 | m.update(s) 150 | return m.hexdigest() 151 | 152 | def eval_model(model, ds, n_sample=None, ngpu=1, is_imagenet=False): 153 | import tqdm 154 | import torch 155 | from torch import nn 156 | from torch.autograd import Variable 157 | 158 | class ModelWrapper(nn.Module): 159 | def __init__(self, model): 160 | super(ModelWrapper, self).__init__() 161 | self.model = model 162 | self.mean = [0.485, 0.456, 0.406] 163 | self.std = [0.229, 0.224, 0.225] 164 | 165 | def forward(self, input): 166 | input.data.div_(255.) 
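# The three sub_/div_ lines below apply the usual ImageNet per-channel normalisation after
# the division by 255 above; a minimal equivalent sketch (assuming a float CHW batch x):
#   mean = torch.FloatTensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
#   std = torch.FloatTensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
#   x = (x / 255. - mean) / std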
167 | input.data[:, 0, :, :].sub_(self.mean[0]).div_(self.std[0]) 168 | input.data[:, 1, :, :].sub_(self.mean[1]).div_(self.std[1]) 169 | input.data[:, 2, :, :].sub_(self.mean[2]).div_(self.std[2]) 170 | return self.model(input) 171 | 172 | correct1, correct5 = 0, 0 173 | n_passed = 0 174 | if is_imagenet: 175 | model = ModelWrapper(model) 176 | model = model.eval() 177 | model = torch.nn.DataParallel(model, device_ids=range(ngpu)).cuda() 178 | 179 | n_sample = len(ds) if n_sample is None else n_sample 180 | for idx, (data, target) in enumerate(tqdm.tqdm(ds, total=n_sample)): 181 | n_passed += len(data) 182 | data = Variable(torch.FloatTensor(data)).cuda() 183 | indx_target = torch.LongTensor(target) 184 | output = model(data) 185 | bs = output.size(0) 186 | idx_pred = output.data.sort(1, descending=True)[1] 187 | 188 | idx_gt1 = indx_target.expand(1, bs).transpose_(0, 1) 189 | idx_gt5 = idx_gt1.expand(bs, 5) 190 | 191 | correct1 += idx_pred[:, :1].cpu().eq(idx_gt1).sum() 192 | correct5 += idx_pred[:, :5].cpu().eq(idx_gt5).sum() 193 | 194 | if idx >= n_sample - 1: 195 | break 196 | 197 | acc1 = correct1 * 1.0 / n_passed 198 | acc5 = correct5 * 1.0 / n_passed 199 | return acc1, acc5 200 | 201 | def load_state_dict(model, model_urls, model_root): 202 | from torch.utils import model_zoo 203 | from torch import nn 204 | import re 205 | from collections import OrderedDict 206 | own_state_old = model.state_dict() 207 | own_state = OrderedDict() # remove all 'group' string 208 | for k, v in own_state_old.items(): 209 | k = re.sub('group\d+\.', '', k) 210 | own_state[k] = v 211 | 212 | state_dict = model_zoo.load_url(model_urls, model_root) 213 | 214 | for name, param in state_dict.items(): 215 | if name not in own_state: 216 | print(own_state.keys()) 217 | raise KeyError('unexpected key "{}" in state_dict' 218 | .format(name)) 219 | if isinstance(param, nn.Parameter): 220 | # backwards compatibility for serialized parameters 221 | param = param.data 222 | own_state[name].copy_(param) 223 | 224 | missing = set(own_state.keys()) - set(state_dict.keys()) 225 | if len(missing) > 0: 226 | raise KeyError('missing keys in state_dict: "{}"'.format(missing)) -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | 7 | TODO: implement data_augment for training 8 | 9 | Ellis Brown, Max deGroot 10 | """ 11 | 12 | import math 13 | 14 | import cv2 15 | import numpy as np 16 | import random 17 | import torch 18 | 19 | from utils.box_utils import matrix_iou 20 | 21 | 22 | # import torch_transforms 23 | 24 | def _crop(image, boxes, labels): 25 | height, width, _ = image.shape 26 | 27 | if len(boxes) == 0: 28 | return image, boxes, labels 29 | 30 | while True: 31 | mode = random.choice(( 32 | None, 33 | (0.1, None), 34 | (0.3, None), 35 | (0.5, None), 36 | (0.7, None), 37 | (0.9, None), 38 | (None, None), 39 | )) 40 | 41 | if mode is None: 42 | return image, boxes, labels 43 | 44 | min_iou, max_iou = mode 45 | if min_iou is None: 46 | min_iou = float('-inf') 47 | if max_iou is None: 48 | max_iou = float('inf') 49 | 50 | for _ in range(50): 51 | scale = random.uniform(0.3, 1.) 52 | min_ratio = max(0.5, scale * scale) 53 | max_ratio = min(2, 1. 
/ scale / scale) 54 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 55 | w = int(scale * ratio * width) 56 | h = int((scale / ratio) * height) 57 | 58 | l = random.randrange(width - w) 59 | t = random.randrange(height - h) 60 | roi = np.array((l, t, l + w, t + h)) 61 | 62 | iou = matrix_iou(boxes, roi[np.newaxis]) 63 | 64 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 65 | continue 66 | 67 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 68 | 69 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 70 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 71 | .all(axis=1) 72 | boxes_t = boxes[mask].copy() 73 | labels_t = labels[mask].copy() 74 | if len(boxes_t) == 0: 75 | continue 76 | 77 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 78 | boxes_t[:, :2] -= roi[:2] 79 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 80 | boxes_t[:, 2:] -= roi[:2] 81 | 82 | return image_t, boxes_t, labels_t 83 | 84 | 85 | def _distort(image): 86 | def _convert(image, alpha=1, beta=0): 87 | tmp = image.astype(float) * alpha + beta 88 | tmp[tmp < 0] = 0 89 | tmp[tmp > 255] = 255 90 | image[:] = tmp 91 | 92 | image = image.copy() 93 | 94 | if random.randrange(2): 95 | _convert(image, beta=random.uniform(-32, 32)) 96 | 97 | if random.randrange(2): 98 | _convert(image, alpha=random.uniform(0.5, 1.5)) 99 | 100 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 101 | 102 | if random.randrange(2): 103 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 104 | tmp %= 180 105 | image[:, :, 0] = tmp 106 | 107 | if random.randrange(2): 108 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 109 | 110 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 111 | 112 | return image 113 | 114 | 115 | def _expand(image, boxes, fill, p): 116 | if random.random() > p: 117 | return image, boxes 118 | 119 | height, width, depth = image.shape 120 | for _ in range(50): 121 | scale = random.uniform(1, 4) 122 | 123 | min_ratio = max(0.5, 1. 
/ scale / scale) 124 | max_ratio = min(2, scale * scale) 125 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 126 | ws = scale * ratio 127 | hs = scale / ratio 128 | if ws < 1 or hs < 1: 129 | continue 130 | w = int(ws * width) 131 | h = int(hs * height) 132 | 133 | left = random.randint(0, w - width) 134 | top = random.randint(0, h - height) 135 | 136 | boxes_t = boxes.copy() 137 | boxes_t[:, :2] += (left, top) 138 | boxes_t[:, 2:] += (left, top) 139 | 140 | expand_image = np.empty( 141 | (h, w, depth), 142 | dtype=image.dtype) 143 | expand_image[:, :] = fill 144 | expand_image[top:top + height, left:left + width] = image 145 | image = expand_image 146 | 147 | return image, boxes_t 148 | 149 | 150 | def _mirror(image, boxes): 151 | _, width, _ = image.shape 152 | if random.randrange(2): 153 | image = image[:, ::-1] 154 | boxes = boxes.copy() 155 | boxes[:, 0::2] = width - boxes[:, 2::-2] 156 | return image, boxes 157 | 158 | 159 | def preproc_for_test(image, insize, mean, std=(1, 1, 1)): 160 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 161 | interp_method = interp_methods[random.randrange(5)] 162 | image = cv2.resize(image, (insize, insize), interpolation=interp_method) 163 | image = image.astype(np.float32) 164 | image -= mean 165 | image /= std 166 | return image.transpose(2, 0, 1) 167 | 168 | 169 | class preproc(object): 170 | 171 | def __init__(self, resize, rgb_means, rgb_std=(1, 1, 1), p=0.2): 172 | self.means = rgb_means 173 | self.std = rgb_std 174 | self.resize = resize 175 | self.p = p 176 | 177 | def __call__(self, image, targets): 178 | boxes = targets[:, :-1].copy() 179 | labels = targets[:, -1].copy() 180 | if len(boxes) == 0: 181 | # boxes = np.empty((0, 4)) 182 | targets = np.zeros((1, 5)) 183 | image = preproc_for_test(image, self.resize, self.means, self.std) 184 | return torch.from_numpy(image), targets 185 | 186 | image_o = image.copy() 187 | targets_o = targets.copy() 188 | height_o, width_o, _ = image_o.shape 189 | boxes_o = targets_o[:, :-1] 190 | labels_o = targets_o[:, -1] 191 | boxes_o[:, 0::2] /= width_o 192 | boxes_o[:, 1::2] /= height_o 193 | labels_o = np.expand_dims(labels_o, 1) 194 | targets_o = np.hstack((boxes_o, labels_o)) 195 | 196 | image_t, boxes, labels = _crop(image, boxes, labels) 197 | image_t = _distort(image_t) 198 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 199 | image_t, boxes = _mirror(image_t, boxes) 200 | # image_t, boxes = _mirror(image, boxes) 201 | 202 | height, width, _ = image_t.shape 203 | image_t = preproc_for_test(image_t, self.resize, self.means, self.std) 204 | boxes = boxes.copy() 205 | boxes[:, 0::2] /= width 206 | boxes[:, 1::2] /= height 207 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 208 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 
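# The mask below keeps only boxes whose shorter side, after the normalisation to [0, 1]
# just above, exceeds 1% of the image size; e.g. a 2-px-wide box in a 300-px image has
# b_w of roughly 0.007 and is dropped together with its label.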
209 | mask_b = np.minimum(b_w, b_h) > 0.01 210 | boxes_t = boxes[mask_b] 211 | labels_t = labels[mask_b].copy() 212 | 213 | if len(boxes_t) == 0: 214 | image = preproc_for_test(image_o, self.resize, self.means, self.std) 215 | return torch.from_numpy(image), targets_o 216 | 217 | labels_t = np.expand_dims(labels_t, 1) 218 | targets_t = np.hstack((boxes_t, labels_t)) 219 | 220 | return torch.from_numpy(image_t), targets_t 221 | 222 | 223 | class BaseTransform(object): 224 | """Defines the transformations that should be applied to test PIL image 225 | for input into the network 226 | 227 | dimension -> tensorize -> color adj 228 | 229 | Arguments: 230 | resize (int): input dimension to SSD 231 | rgb_means ((int,int,int)): average RGB of the dataset 232 | (104,117,123) 233 | rgb_std: std of the dataset 234 | swap ((int,int,int)): final order of channels 235 | Returns: 236 | transform (transform) : callable transform to be applied to test/val 237 | data 238 | """ 239 | 240 | def __init__(self, resize, rgb_means, rgb_std=(1, 1, 1), swap=(2, 0, 1)): 241 | self.means = rgb_means 242 | self.resize = resize 243 | self.std = rgb_std 244 | self.swap = swap 245 | 246 | # assume input is cv2 img for now 247 | def __call__(self, img): 248 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 249 | interp_method = interp_methods[0] 250 | img = cv2.resize(np.array(img), (self.resize, 251 | self.resize), interpolation=interp_method).astype(np.float32) 252 | img -= self.means 253 | img /= self.std 254 | img = img.transpose(self.swap) 255 | return torch.from_numpy(img) 256 | -------------------------------------------------------------------------------- /utils/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? 
a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && athr) keep[j]=0; 105 | } 106 | } 107 | } 108 | 109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { 110 | double h, w, i, u, ga, da; siz g, d; int crowd; 111 | for( g=0; gthr) keep[j]=0; 129 | } 130 | } 131 | } 132 | 133 | void rleToBbox( const RLE *R, BB bb, siz n ) { 134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); 173 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 174 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 175 | if(dx>=dy) for( d=0; d<=dx; d++ ) { 176 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 177 | } else for( d=0; d<=dy; d++ ) { 178 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 179 | } 180 | } 181 | /* get points along y-boundary and downsample */ 182 | free(x); free(y); k=m; m=0; double xd, yd; 183 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 184 | for( j=1; jw-1 ) continue; 187 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 189 | x[m]=(int) xd; y[m]=(int) yd; m++; 190 | } 191 | /* compute rle encoding given y-boundary points */ 192 | k=m; a=malloc(sizeof(uint)*(k+1)); 193 | for( j=0; j0) b[m++]=a[j++]; else { 199 | j++; if(jm, p=0; long x; int more; 206 | char *s=malloc(sizeof(char)*m*6); 207 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 209 | while( more ) { 210 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 211 | if(more) c |= 0x20; c+=48; s[p++]=c; 212 | } 213 | } 214 | s[p]=0; return s; 215 | } 216 | 217 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 218 | siz m=0, p=0, k; long x; int more; uint *cnts; 219 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 220 | while( s[p] ) { 221 | x=0; k=0; more=1; 222 | while( more ) { 223 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 224 | more = c & 0x20; p++; k++; 225 | if(!more && (c & 0x10)) x |= -1 << 5*k; 226 | } 227 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 228 | } 229 | rleInit(R,h,w,m,cnts); free(cnts); 230 | } 231 | -------------------------------------------------------------------------------- /utils/pycocotools/_mask.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language = c 2 | # distutils: sources = ../common/maskApi.c 3 | 4 | #************************************************************************** 5 | # Microsoft COCO Toolbox. version 2.0 6 | # Data, paper, and tutorials available at: http://mscoco.org/ 7 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
8 | # Licensed under the Simplified BSD License [see coco/license.txt] 9 | #************************************************************************** 10 | 11 | __author__ = 'tsungyi' 12 | 13 | import sys 14 | PYTHON_VERSION = sys.version_info[0] 15 | 16 | # import both Python-level and C-level symbols of Numpy 17 | # the API uses Numpy to interface C and Python 18 | import numpy as np 19 | cimport numpy as np 20 | from libc.stdlib cimport malloc, free 21 | 22 | # intialized Numpy. must do. 23 | np.import_array() 24 | 25 | # import numpy C function 26 | # we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management 27 | cdef extern from "numpy/arrayobject.h": 28 | void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) 29 | 30 | # Declare the prototype of the C functions in MaskApi.h 31 | cdef extern from "maskApi.h": 32 | ctypedef unsigned int uint 33 | ctypedef unsigned long siz 34 | ctypedef unsigned char byte 35 | ctypedef double* BB 36 | ctypedef struct RLE: 37 | siz h, 38 | siz w, 39 | siz m, 40 | uint* cnts, 41 | void rlesInit( RLE **R, siz n ) 42 | void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) 43 | void rleDecode( const RLE *R, byte *mask, siz n ) 44 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) 45 | void rleArea( const RLE *R, siz n, uint *a ) 46 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) 47 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) 48 | void rleToBbox( const RLE *R, BB bb, siz n ) 49 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) 50 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) 51 | char* rleToString( const RLE *R ) 52 | void rleFrString( RLE *R, char *s, siz h, siz w ) 53 | 54 | # python class to wrap RLE array in C 55 | # the class handles the memory allocation and deallocation 56 | cdef class RLEs: 57 | cdef RLE *_R 58 | cdef siz _n 59 | 60 | def __cinit__(self, siz n =0): 61 | rlesInit(&self._R, n) 62 | self._n = n 63 | 64 | # free the RLE array here 65 | def __dealloc__(self): 66 | if self._R is not NULL: 67 | for i in range(self._n): 68 | free(self._R[i].cnts) 69 | free(self._R) 70 | def __getattr__(self, key): 71 | if key == 'n': 72 | return self._n 73 | raise AttributeError(key) 74 | 75 | # python class to wrap Mask array in C 76 | # the class handles the memory allocation and deallocation 77 | cdef class Masks: 78 | cdef byte *_mask 79 | cdef siz _h 80 | cdef siz _w 81 | cdef siz _n 82 | 83 | def __cinit__(self, h, w, n): 84 | self._mask = malloc(h*w*n* sizeof(byte)) 85 | self._h = h 86 | self._w = w 87 | self._n = n 88 | # def __dealloc__(self): 89 | # the memory management of _mask has been passed to np.ndarray 90 | # it doesn't need to be freed here 91 | 92 | # called when passing into np.array() and return an np.ndarray in column-major order 93 | def __array__(self): 94 | cdef np.npy_intp shape[1] 95 | shape[0] = self._h*self._w*self._n 96 | # Create a 1D array, and reshape it to fortran/Matlab column-major array 97 | ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') 98 | # The _mask allocated by Masks is now handled by ndarray 99 | PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) 100 | return ndarray 101 | 102 | # internal conversion from Python RLEs object to compressed RLE format 103 | def _toString(RLEs Rs): 104 | cdef siz n = Rs.n 105 | cdef bytes py_string 106 | cdef char* c_string 107 | objs = [] 108 | for i in range(n): 109 | c_string = rleToString( 
&Rs._R[i] ) 110 | py_string = c_string 111 | objs.append({ 112 | 'size': [Rs._R[i].h, Rs._R[i].w], 113 | 'counts': py_string 114 | }) 115 | free(c_string) 116 | return objs 117 | 118 | # internal conversion from compressed RLE format to Python RLEs object 119 | def _frString(rleObjs): 120 | cdef siz n = len(rleObjs) 121 | Rs = RLEs(n) 122 | cdef bytes py_string 123 | cdef char* c_string 124 | for i, obj in enumerate(rleObjs): 125 | if PYTHON_VERSION == 2: 126 | py_string = str(obj['counts']).encode('utf8') 127 | elif PYTHON_VERSION == 3: 128 | py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] 129 | else: 130 | raise Exception('Python version must be 2 or 3') 131 | c_string = py_string 132 | rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) 133 | return Rs 134 | 135 | # encode mask to RLEs objects 136 | # list of RLE string can be generated by RLEs member function 137 | def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): 138 | h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] 139 | cdef RLEs Rs = RLEs(n) 140 | rleEncode(Rs._R,mask.data,h,w,n) 141 | objs = _toString(Rs) 142 | return objs 143 | 144 | # decode mask from compressed list of RLE string or RLEs object 145 | def decode(rleObjs): 146 | cdef RLEs Rs = _frString(rleObjs) 147 | h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n 148 | masks = Masks(h, w, n) 149 | rleDecode(Rs._R, masks._mask, n); 150 | return np.array(masks) 151 | 152 | def merge(rleObjs, intersect=0): 153 | cdef RLEs Rs = _frString(rleObjs) 154 | cdef RLEs R = RLEs(1) 155 | rleMerge(Rs._R, R._R, Rs._n, intersect) 156 | obj = _toString(R)[0] 157 | return obj 158 | 159 | def area(rleObjs): 160 | cdef RLEs Rs = _frString(rleObjs) 161 | cdef uint* _a = malloc(Rs._n* sizeof(uint)) 162 | rleArea(Rs._R, Rs._n, _a) 163 | cdef np.npy_intp shape[1] 164 | shape[0] = Rs._n 165 | a = np.array((Rs._n, ), dtype=np.uint8) 166 | a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) 167 | PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) 168 | return a 169 | 170 | # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 171 | def iou( dt, gt, pyiscrowd ): 172 | def _preproc(objs): 173 | if len(objs) == 0: 174 | return objs 175 | if type(objs) == np.ndarray: 176 | if len(objs.shape) == 1: 177 | objs = objs.reshape((objs[0], 1)) 178 | # check if it's Nx4 bbox 179 | if not len(objs.shape) == 2 or not objs.shape[1] == 4: 180 | raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') 181 | objs = objs.astype(np.double) 182 | elif type(objs) == list: 183 | # check if list is in box format and convert it to np.ndarray 184 | isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) 185 | isrle = np.all(np.array([type(obj) == dict for obj in objs])) 186 | if isbox: 187 | objs = np.array(objs, dtype=np.double) 188 | if len(objs.shape) == 1: 189 | objs = objs.reshape((1,objs.shape[0])) 190 | elif isrle: 191 | objs = _frString(objs) 192 | else: 193 | raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') 194 | else: 195 | raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') 196 | return objs 197 | def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 198 | rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) 199 | def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 200 | bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) 201 | def _len(obj): 202 | cdef siz N = 0 203 | if type(obj) == RLEs: 204 | N = obj.n 205 | elif len(obj)==0: 206 | pass 207 | elif type(obj) == np.ndarray: 208 | N = obj.shape[0] 209 | return N 210 | # convert iscrowd to numpy array 211 | cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) 212 | # simple type checking 213 | cdef siz m, n 214 | dt = _preproc(dt) 215 | gt = _preproc(gt) 216 | m = _len(dt) 217 | n = _len(gt) 218 | if m == 0 or n == 0: 219 | return [] 220 | if not type(dt) == type(gt): 221 | raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') 222 | 223 | # define local variables 224 | cdef double* _iou = 0 225 | cdef np.npy_intp shape[1] 226 | # check type and assign iou function 227 | if type(dt) == RLEs: 228 | _iouFun = _rleIou 229 | elif type(dt) == np.ndarray: 230 | _iouFun = _bbIou 231 | else: 232 | raise Exception('input data type not allowed.') 233 | _iou = malloc(m*n* sizeof(double)) 234 | iou = np.zeros((m*n, ), dtype=np.double) 235 | shape[0] = m*n 236 | iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) 237 | PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) 238 | _iouFun(dt, gt, iscrowd, m, n, iou) 239 | return iou.reshape((m,n), order='F') 240 | 241 | def toBbox( rleObjs ): 242 | cdef RLEs Rs = _frString(rleObjs) 243 | cdef siz n = Rs.n 244 | cdef BB _bb = malloc(4*n* sizeof(double)) 245 | rleToBbox( Rs._R, _bb, n ) 246 | cdef np.npy_intp shape[1] 247 | shape[0] = 4*n 248 | bb = np.array((1,4*n), dtype=np.double) 249 | bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) 250 | PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) 251 | return bb 252 | 253 | def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): 254 | cdef siz n = bb.shape[0] 255 | Rs = RLEs(n) 256 | rleFrBbox( Rs._R, bb.data, h, w, n ) 257 | objs = _toString(Rs) 258 | return objs 259 | 260 | def frPoly( poly, siz h, siz w ): 261 | cdef np.ndarray[np.double_t, ndim=1] np_poly 262 | n = len(poly) 263 | Rs = RLEs(n) 264 | for i, p in enumerate(poly): 265 | np_poly = np.array(p, dtype=np.double, order='F') 266 | rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) 267 | objs = _toString(Rs) 268 | return objs 269 | 270 | def frUncompressedRLE(ucRles, siz h, siz w): 271 | cdef np.ndarray[np.uint32_t, ndim=1] cnts 272 | cdef RLE R 273 | cdef uint *data 274 | n = len(ucRles) 275 | objs = [] 276 | for i in range(n): 277 | Rs = RLEs(1) 278 | cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) 279 | # time for malloc can be saved here but it's fine 280 | data = malloc(len(cnts)* sizeof(uint)) 281 | for j in range(len(cnts)): 282 | data[j] = cnts[j] 283 | R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) 284 | Rs._R[0] = R 285 | objs.append(_toString(Rs)[0]) 286 | return objs 287 | 288 | def frPyObjects(pyobj, h, w): 289 | # encode rle from a list of python objects 290 | if type(pyobj) == np.ndarray: 291 | objs = frBbox(pyobj, h, w) 292 | elif type(pyobj) == 
list and len(pyobj[0]) == 4: 293 | objs = frBbox(pyobj, h, w) 294 | elif type(pyobj) == list and len(pyobj[0]) > 4: 295 | objs = frPoly(pyobj, h, w) 296 | elif type(pyobj) == list and type(pyobj[0]) == dict \ 297 | and 'counts' in pyobj[0] and 'size' in pyobj[0]: 298 | objs = frUncompressedRLE(pyobj, h, w) 299 | # encode rle from single python object 300 | elif type(pyobj) == list and len(pyobj) == 4: 301 | objs = frBbox([pyobj], h, w)[0] 302 | elif type(pyobj) == list and len(pyobj) > 4: 303 | objs = frPoly([pyobj], h, w)[0] 304 | elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: 305 | objs = frUncompressedRLE([pyobj], h, w)[0] 306 | else: 307 | raise Exception('input type is not supported.') 308 | return objs 309 | -------------------------------------------------------------------------------- /models/FSSD_vgg_FPN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .base_models import vgg, vgg_base 7 | 8 | 9 | class BasicConv(nn.Module): 10 | 11 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 12 | bn=False, bias=True, up_size=0): 13 | super(BasicConv, self).__init__() 14 | self.out_channels = out_planes 15 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, 16 | dilation=dilation, groups=groups, bias=bias) 17 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 18 | self.relu = nn.ReLU(inplace=True) if relu else None 19 | self.up_size = up_size 20 | self.up_sample = nn.Upsample(size=(up_size, up_size), mode='bilinear') if up_size != 0 else None 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | if self.bn is not None: 25 | x = self.bn(x) 26 | if self.relu is not None: 27 | x = self.relu(x) 28 | if self.up_size > 0: 29 | x = self.up_sample(x) 30 | return x 31 | 32 | 33 | class FSSD(nn.Module): 34 | """Single Shot Multibox Architecture 35 | The network is composed of a base VGG network followed by the 36 | added multibox conv layers. Each multibox layer branches into 37 | 1) conv2d for class conf scores 38 | 2) conv2d for localization predictions 39 | 3) associated priorbox layer to produce default bounding 40 | boxes specific to the layer's feature map size. 41 | See: https://arxiv.org/pdf/1712.00960.pdf or more details. 
42 | Args: 43 | base: VGG16 layers for input, size of either 300 or 500 44 | extras: extra layers that feed to multibox loc and conf layers 45 | head: "multibox head" consists of loc and conf conv layers 46 | """ 47 | 48 | def __init__(self, base, extras, ft_module, pyramid_ext, head, num_classes, size): 49 | super(FSSD, self).__init__() 50 | self.num_classes = num_classes 51 | # TODO: implement __call__ in PriorBox 52 | self.size = size 53 | 54 | # SSD network 55 | self.base = nn.ModuleList(base) 56 | self.extras = nn.ModuleList(extras) 57 | self.ft_module = nn.ModuleList(ft_module) 58 | self.pyramid_ext = nn.ModuleList(pyramid_ext) 59 | self.fea_bn = nn.BatchNorm2d(256 * len(self.ft_module), affine=True) 60 | 61 | self.loc = nn.ModuleList(head[0]) 62 | self.conf = nn.ModuleList(head[1]) 63 | 64 | self.softmax = nn.Softmax() 65 | 66 | self.conv_cat0 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 67 | self.upsample0 = nn.Upsample(size=(3, 3), mode='bilinear') 68 | 69 | self.conv_cat1 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 70 | self.upsample1 = nn.Upsample(size=(5, 5), mode='bilinear') 71 | 72 | self.conv_cat2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 73 | self.upsample2 = nn.Upsample(size=(10, 10), mode='bilinear') 74 | 75 | self.conv_cat3 = nn.Conv2d(768, 512, kernel_size=1, padding=0, stride=1) 76 | self.upsample3 = nn.Upsample(size=(19, 19), mode='bilinear') 77 | 78 | self.conv_cat4 = nn.Conv2d(1024, 512, kernel_size=1, padding=0, stride=1) 79 | self.upsample4 = nn.Upsample(size=(38, 38), mode='bilinear') 80 | 81 | def forward(self, x, test=False): 82 | """Applies network layers and ops on input image(s) x. 83 | Args: 84 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 85 | Return: 86 | Depending on phase: 87 | test: 88 | Variable(tensor) of output class label predictions, 89 | confidence score, and corresponding location predictions for 90 | each object detected. 
Shape: [batch,topk,7] 91 | train: 92 | list of concat outputs from: 93 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 94 | 2: localization layers, Shape: [batch,num_priors*4] 95 | 3: priorbox layers, Shape: [2,num_priors*4] 96 | """ 97 | source_features = list() 98 | transformed_features = list() 99 | loc = list() 100 | conf = list() 101 | 102 | # apply vgg up to conv4_3 relu 103 | for k in range(23): 104 | x = self.base[k](x) 105 | 106 | source_features.append(x) 107 | 108 | # apply vgg up to fc7 109 | for k in range(23, len(self.base)): 110 | x = self.base[k](x) 111 | source_features.append(x) 112 | 113 | # apply extra layers and cache source layer outputs 114 | for k, v in enumerate(self.extras): 115 | x = F.relu(v(x), inplace=True) 116 | source_features.append(x) 117 | assert len(self.ft_module) == len(source_features) 118 | for k, v in enumerate(self.ft_module): 119 | transformed_features.append(v(source_features[k])) 120 | concat_fea = torch.cat(transformed_features, 1) 121 | x = self.fea_bn(concat_fea) 122 | pyramid_fea = list() 123 | for k, v in enumerate(self.pyramid_ext): 124 | x = v(x) 125 | pyramid_fea.append(x) 126 | 127 | # ----------this block is to downsample the 1*1 layer to 3*3, and concat with the original 3*3 layer, like Dense connection 128 | fpn_0 = list() 129 | detect_5 = pyramid_fea[5] 130 | detect_4 = pyramid_fea[4] 131 | detect_5_4 = self.upsample0(detect_5) 132 | fpn_0.append(detect_4) 133 | fpn_0.append(detect_5_4) 134 | detect_4 = torch.cat(fpn_0, 1) 135 | detect_4 = self.conv_cat0(detect_4) 136 | pyramid_fea[4] = detect_4 137 | pyramid_fea[5] = detect_5 138 | 139 | # ----------this block is to downsample the 3*3 layer to 5*5, and concat with the original 5*5 layer, like Dense connection 140 | fpn_1 = list() 141 | detect_3 = pyramid_fea[3] 142 | detect_4_3 = self.upsample1(detect_4) 143 | fpn_1.append(detect_3) 144 | fpn_1.append(detect_4_3) 145 | detect_3 = torch.cat(fpn_1, 1) 146 | detect_3 = self.conv_cat1(detect_3) 147 | pyramid_fea[3] = detect_3 148 | 149 | # ----------this block is to downsample the 5*5 layer to 10*10, and concat with the original 10*10 layer, like Dense connection 150 | fpn_2 = list() 151 | detect_2 = pyramid_fea[2] 152 | detect_3_2 = self.upsample2(detect_3) 153 | fpn_2.append(detect_2) 154 | fpn_2.append(detect_3_2) 155 | detect_2 = torch.cat(fpn_2, 1) 156 | detect_2 = self.conv_cat2(detect_2) 157 | pyramid_fea[2] = detect_2 158 | 159 | # ----------this block is to downsample the 10*10 layer to 19*19, and concat with the original 19*19 layer, like Dense connection 160 | fpn_3 = list() 161 | detect_1 = pyramid_fea[1] 162 | detect_2_1 = self.upsample3(detect_2) 163 | fpn_3.append(detect_1) 164 | fpn_3.append(detect_2_1) 165 | detect_1 = torch.cat(fpn_3, 1) 166 | detect_1 = self.conv_cat3(detect_1) 167 | pyramid_fea[1] = detect_1 168 | 169 | # ----------this block is to downsample the 19*19 layer to 38*38, and concat with the original 38*38 layer, like Dense connection 170 | fpn_4 = list() 171 | detect_0 = pyramid_fea[0] 172 | detect_1_0 = self.upsample4(detect_1) 173 | fpn_4.append(detect_0) 174 | fpn_4.append(detect_1_0) 175 | detect_0 = torch.cat(fpn_4, 1) 176 | detect_0 = self.conv_cat4(detect_0) 177 | pyramid_fea[0] = detect_0 178 | 179 | # apply multibox head to source layers 180 | for (x, l, c) in zip(pyramid_fea, self.loc, self.conf): 181 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 182 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 183 | 184 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 
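# Added note (not in the original source): after the permute above, each per-level head output o has shape [batch, H, W, anchors*k];
# view(o.size(0), -1) flattens it to [batch, H*W*anchors*k], so this cat and the matching one for conf below give
# loc: [batch, num_priors*4] and conf: [batch, num_priors*num_classes].
# As a rough sanity check, assuming the usual 300-input feature-map sizes 38, 19, 10, 5, 3, 1 and mbox [4, 6, 6, 6, 4, 4],
# num_priors = 5776 + 2166 + 600 + 150 + 36 + 4 = 8732.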
185 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 186 | if test: 187 | output = ( 188 | loc.view(loc.size(0), -1, 4), # loc preds 189 | self.softmax(conf.view(-1, self.num_classes)), # conf preds 190 | ) 191 | else: 192 | output = ( 193 | loc.view(loc.size(0), -1, 4), 194 | conf.view(conf.size(0), -1, self.num_classes), 195 | ) 196 | return output 197 | 198 | def load_weights(self, base_file): 199 | other, ext = os.path.splitext(base_file) 200 | if ext == '.pkl' or ext == '.pth': 201 | print('Loading weights into state dict...') 202 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 203 | print('Finished!') 204 | else: 205 | print('Sorry only .pth and .pkl files supported.') 206 | 207 | 208 | def add_extras(cfg, i, batch_norm=False): 209 | # Extra layers added to VGG for feature scaling 210 | layers = [] 211 | in_channels = i 212 | flag = False 213 | for k, v in enumerate(cfg): 214 | if in_channels != 'S': 215 | if v == 'S': 216 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 217 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 218 | else: 219 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 220 | flag = not flag 221 | in_channels = v 222 | return layers 223 | 224 | 225 | def feature_transform_module(vgg, extral, size): 226 | if size == 300: 227 | up_size = 38 228 | elif size == 512: 229 | up_size = 64 230 | 231 | layers = [] 232 | # conv4_3 233 | layers += [BasicConv(vgg[24].out_channels, 256, kernel_size=1, padding=0)] 234 | # fc_7 235 | layers += [BasicConv(vgg[-2].out_channels, 256, kernel_size=1, padding=0, up_size=up_size)] 236 | layers += [BasicConv(extral[-1].out_channels, 256, kernel_size=1, padding=0, up_size=up_size)] 237 | return vgg, extral, layers 238 | 239 | 240 | def pyramid_feature_extractor(size): 241 | if size == 300: 242 | layers = [BasicConv(256 * 3, 512, kernel_size=3, stride=1, padding=1), 243 | BasicConv(512, 512, kernel_size=3, stride=2, padding=1), \ 244 | BasicConv(512, 256, kernel_size=3, stride=2, padding=1), 245 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 246 | BasicConv(256, 256, kernel_size=3, stride=1, padding=0), 247 | BasicConv(256, 256, kernel_size=3, stride=1, padding=0)] 248 | elif size == 512: 249 | layers = [BasicConv(256 * 3, 512, kernel_size=3, stride=1, padding=1), 250 | BasicConv(512, 512, kernel_size=3, stride=2, padding=1), \ 251 | BasicConv(512, 256, kernel_size=3, stride=2, padding=1), 252 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 253 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), 254 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 255 | BasicConv(256, 256, kernel_size=4, padding=1, stride=1)] 256 | return layers 257 | 258 | 259 | def multibox(fea_channels, cfg, num_classes): 260 | loc_layers = [] 261 | conf_layers = [] 262 | assert len(fea_channels) == len(cfg) 263 | for i, fea_channel in enumerate(fea_channels): 264 | loc_layers += [nn.Conv2d(fea_channel, cfg[i] * 4, kernel_size=3, padding=1)] 265 | conf_layers += [nn.Conv2d(fea_channel, cfg[i] * num_classes, kernel_size=3, padding=1)] 266 | return (loc_layers, conf_layers) 267 | 268 | 269 | extras = { 270 | '300': [256, 512, 128, 'S', 256], 271 | '512': [256, 512, 128, 'S', 256], 272 | } 273 | mbox = { 274 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 275 | '512': [6, 6, 6, 6, 6, 4, 4], 276 | } 277 | fea_channels = { 278 | '300': [512, 512, 256, 256, 256, 256], 279 | '512': [512, 512, 256, 256, 256, 256, 256]} 280 | 281 | 282 | def 
build_net(size=300, num_classes=21): 283 | if size != 300 and size != 512: 284 | print("Error: Sorry only FSSD300 and FSSD512 is supported currently!") 285 | return 286 | 287 | return FSSD(*feature_transform_module(vgg(vgg_base[str(size)], 3), add_extras(extras[str(size)], 1024), size=size), 288 | pyramid_ext=pyramid_feature_extractor(size), 289 | head=multibox(fea_channels[str(size)], mbox[str(size)], num_classes), num_classes=num_classes, 290 | size=size) -------------------------------------------------------------------------------- /models/FSSD_Mob_FPN.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import os 4 | import torch 5 | import torch.nn as nn 6 | from utils.timer import Timer 7 | sys.path.append('./') 8 | from models.mobilenet import mobilenet_1 9 | import time 10 | from utils.timer import Timer 11 | 12 | class BasicConv(nn.Module): 13 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 14 | bn=False, bias=True, up_size=0): 15 | super(BasicConv, self).__init__() 16 | self.out_channels = out_planes 17 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, 18 | dilation=dilation, groups=groups, bias=bias) 19 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 20 | self.relu = nn.ReLU(inplace=True) if relu else None 21 | self.up_size = up_size 22 | self.up_sample = nn.Upsample(size=(up_size, up_size), mode='bilinear') if up_size != 0 else None 23 | 24 | def forward(self, x): 25 | x = self.conv(x) 26 | if self.bn is not None: 27 | x = self.bn(x) 28 | if self.relu is not None: 29 | x = self.relu(x) 30 | if self.up_size > 0: 31 | x = self.up_sample(x) 32 | return x 33 | 34 | class FSSD(nn.Module): 35 | """Single Shot Multibox Architecture 36 | The network is composed of a base VGG network followed by the 37 | added multibox conv layers. Each multibox layer branches into 38 | 1) conv2d for class conf scores 39 | 2) conv2d for localization predictions 40 | 3) associated priorbox layer to produce default bounding 41 | boxes specific to the layer's feature map size. 42 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
43 | 44 | Args: 45 | phase: (string) Can be "test" or "train" 46 | base: VGG16 layers for input, size of either 300 or 500 47 | extras: extra layers that feed to multibox loc and conf layers 48 | head: "multibox head" consists of loc and conf conv layers 49 | """ 50 | 51 | def __init__(self, size, head, ft_module, pyramid_ext, num_classes): 52 | super(FSSD, self).__init__() 53 | self.num_classes = num_classes 54 | # TODO: implement __call__ in PriorBox 55 | self.size = size 56 | 57 | # SSD network 58 | self.base = mobilenet_1() 59 | # Layer learns to scale the l2 normalized features from conv4_3 60 | self.ft_module = nn.ModuleList(ft_module) 61 | self.pyramid_ext = nn.ModuleList(pyramid_ext) 62 | 63 | self.loc = nn.ModuleList(head[0]) 64 | self.conf = nn.ModuleList(head[1]) 65 | #self.fea_bn = nn.BatchNorm2d(256, affine=True) 66 | self.fea_bn = nn.BatchNorm2d(256 * len(self.ft_module), affine=True) 67 | self.softmax = nn.Softmax() 68 | 69 | self.conv_cat0 = nn.Conv2d(256, 128, kernel_size=1, padding=0, stride=1) 70 | self.upsample0 = nn.Upsample(size=(3, 3), mode='bilinear') 71 | 72 | self.conv_cat1 = nn.Conv2d(384, 256, kernel_size=1, padding=0, stride=1) 73 | self.upsample1 = nn.Upsample(size=(5, 5), mode='bilinear') 74 | 75 | self.conv_cat2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, stride=1) 76 | self.upsample2 = nn.Upsample(size=(10, 10), mode='bilinear') 77 | 78 | self.conv_cat3 = nn.Conv2d(768, 512, kernel_size=1, padding=0, stride=1) 79 | self.upsample3 = nn.Upsample(size=(19, 19), mode='bilinear') 80 | 81 | self.conv_cat4 = nn.Conv2d(1024, 512, kernel_size=1, padding=0, stride=1) 82 | self.upsample4 = nn.Upsample(size=(38, 38), mode='bilinear') 83 | 84 | 85 | self.time = time 86 | self.timer = Timer 87 | 88 | def forward(self, x, test=False): 89 | """Applies network layers and ops on input image(s) x. 90 | 91 | Args: 92 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 93 | 94 | Return: 95 | Depending on phase: 96 | test: 97 | Variable(tensor) of output class label predictions, 98 | confidence score, and corresponding location predictions for 99 | each object detected. 
Shape: [batch,topk,7] 100 | 101 | train: 102 | list of concat outputs from: 103 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 104 | 2: localization layers, Shape: [batch,num_priors*4] 105 | 3: priorbox layers, Shape: [2,num_priors*4] 106 | """ 107 | source_features = list() 108 | transformed_features = list() 109 | loc = list() 110 | conf = list() 111 | 112 | base_out = self.base(x) 113 | 114 | source_features.append(base_out[0]) # mobilenet 4_1 115 | source_features.append(base_out[1]) # mobilent_5_5 116 | source_features.append(base_out[2]) # mobilenet 6_1 117 | 118 | assert len(self.ft_module) == len(source_features) 119 | for k, v in enumerate(self.ft_module): 120 | transformed_features.append(v(source_features[k])) 121 | concat_fea = torch.cat(transformed_features, 1) 122 | x = self.fea_bn(concat_fea) 123 | fea_bn = x 124 | 125 | # the six detect layers 126 | pyramid_fea = list() 127 | for k, v in enumerate(self.pyramid_ext): 128 | x = v(x) 129 | pyramid_fea.append(x) 130 | 131 | 132 | #----------this block is to downsample the 1*1 layer to 3*3, and concat with the original 3*3 layer, like Dense connection 133 | fpn_0 = list() 134 | detect_5 = pyramid_fea[5] 135 | detect_4 = pyramid_fea[4] 136 | detect_5_4 = self.upsample0(detect_5) 137 | fpn_0.append(detect_4) 138 | fpn_0.append(detect_5_4) 139 | detect_4 = torch.cat(fpn_0, 1) 140 | detect_4 = self.conv_cat0(detect_4) 141 | pyramid_fea[4] = detect_4 142 | pyramid_fea[5] = detect_5 143 | 144 | #----------this block is to downsample the 3*3 layer to 5*5, and concat with the original 5*5 layer, like Dense connection 145 | fpn_1 = list() 146 | detect_3 = pyramid_fea[3] 147 | detect_4_3 = self.upsample1(detect_4) 148 | fpn_1.append(detect_3) 149 | fpn_1.append(detect_4_3) 150 | detect_3 = torch.cat(fpn_1, 1) 151 | detect_3 = self.conv_cat1(detect_3) 152 | pyramid_fea[3] = detect_3 153 | 154 | 155 | #----------this block is to downsample the 5*5 layer to 10*10, and concat with the original 10*10 layer, like Dense connection 156 | fpn_2 = list() 157 | detect_2 = pyramid_fea[2] 158 | detect_3_2 = self.upsample2(detect_3) 159 | fpn_2.append(detect_2) 160 | fpn_2.append(detect_3_2) 161 | detect_2 = torch.cat(fpn_2, 1) 162 | detect_2 = self.conv_cat2(detect_2) 163 | pyramid_fea[2] = detect_2 164 | 165 | 166 | #----------this block is to downsample the 10*10 layer to 19*19, and concat with the original 19*19 layer, like Dense connection 167 | fpn_3 = list() 168 | detect_1 = pyramid_fea[1] 169 | detect_2_1 = self.upsample3(detect_2) 170 | fpn_3.append(detect_1) 171 | fpn_3.append(detect_2_1) 172 | detect_1 = torch.cat(fpn_3, 1) 173 | detect_1 = self.conv_cat3(detect_1) 174 | pyramid_fea[1] = detect_1 175 | 176 | 177 | #----------this block is to downsample the 19*19 layer to 38*38, and concat with the original 38*38 layer, like Dense connection 178 | fpn_4 = list() 179 | detect_0 = pyramid_fea[0] 180 | detect_1_0 = self.upsample4(detect_1) 181 | fpn_4.append(detect_0) 182 | fpn_4.append(detect_1_0) 183 | detect_0 = torch.cat(fpn_4, 1) 184 | detect_0 = self.conv_cat4(detect_0) 185 | pyramid_fea[0] = detect_0 186 | 187 | 188 | # apply multibox head to source layers 189 | for (x, l, c) in zip(pyramid_fea, self.loc, self.conf): 190 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 191 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 192 | 193 | 194 | #every detect layer's cls and reg 195 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 196 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 197 | 198 | if 
test: 199 | output = ( 200 | loc.view(loc.size(0), -1, 4), # loc preds 201 | self.softmax(conf.view(-1, self.num_classes)), # conf preds 202 | ) 203 | features = () 204 | else: 205 | output = ( 206 | loc.view(loc.size(0), -1, 4), 207 | conf.view(conf.size(0), -1, self.num_classes), 208 | ) 209 | features = ( 210 | fea_bn 211 | ) 212 | return output 213 | 214 | def load_weights(self, base_file): 215 | other, ext = os.path.splitext(base_file) 216 | if ext == '.pkl' or ext == '.pth': 217 | print('Loading weights into state dict...') 218 | state_dict = torch.load(base_file, map_location=lambda storage, loc: storage) 219 | from collections import OrderedDict 220 | new_state_dict = OrderedDict() 221 | for k, v in state_dict.items(): 222 | head = k[:7] 223 | if head == 'module.': 224 | name = k[7:] # remove `module.` 225 | else: 226 | name = k 227 | new_state_dict[name] = v 228 | self.base.load_state_dict(new_state_dict) 229 | print('Finished!') 230 | 231 | else: 232 | print('Sorry only .pth and .pkl files supported.') 233 | 234 | from models.smooth_scale_transfer import * 235 | 236 | def feature_transform_module(scale_factor): 237 | layers = [] 238 | # conv4_1 239 | layers += [BasicConv(int(256 * scale_factor), 256, kernel_size=1, padding=0)] 240 | #layers += [down_sample(int(256 * scale_factor), 256)] 241 | # conv5_5 242 | layers += [BasicConv(int(512 * scale_factor), 256, kernel_size=1, padding=0, up_size=38)] 243 | #layers += [BasicConv(int(512 * scale_factor), 256, kernel_size=3, padding=1, stride=2)] 244 | # conv6_mpo1 245 | layers += [BasicConv(int(1024 * scale_factor), 256, kernel_size=1, padding=0, up_size=38)] 246 | #layers += [BasicConv(int(1024 * scale_factor), 256, kernel_size=1, padding=0)] 247 | return layers 248 | 249 | 250 | 251 | def pyramid_feature_extractor(): 252 | layers = [] 253 | #layers += [SST_6(256, 256), SST_5(256, 256), SST_4(256, 256), SST_3(256, 256), SST_2(256, 256), SST_1(256, 256)] 254 | # 255 | from models.mobilenet import DepthWiseBlock 256 | layers = [DepthWiseBlock(256*3, 512, stride=1), DepthWiseBlock(512, 512, stride=2), 257 | DepthWiseBlock(512, 256, stride=2), DepthWiseBlock(256, 256, stride=2), \ 258 | DepthWiseBlock(256, 128, stride=1, padding=0), DepthWiseBlock(128, 128, stride=1, padding=0)] 259 | 260 | return layers 261 | 262 | 263 | def multibox(fea_channels, cfg, num_classes): 264 | loc_layers = [] 265 | conf_layers = [] 266 | assert len(fea_channels) == len(cfg) 267 | for i, fea_channel in enumerate(fea_channels): 268 | loc_layers += [nn.Conv2d(fea_channel, cfg[i] * 4, kernel_size=3, padding=1)] 269 | conf_layers += [nn.Conv2d(fea_channel, cfg[i] * num_classes, kernel_size=3, padding=1)] 270 | return (loc_layers, conf_layers) 271 | 272 | mbox = { 273 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 274 | '512': [4, 6, 6, 6, 4, 4], 275 | } 276 | fea_channels = [512, 512, 256, 256, 128, 128] 277 | 278 | 279 | def build_net(size=512, num_classes=21): 280 | if size != 300 and size != 512: 281 | print("Error: Sorry only SSD300 and SSD512 are supported currently!") 282 | return 283 | 284 | return FSSD(size, multibox(fea_channels, mbox[str(size)], num_classes), feature_transform_module(1), 285 | pyramid_feature_extractor(), \ 286 | num_classes=num_classes) 287 | 288 | 289 | 290 | #input = torch.tensor(1, 10, 16*10*10).view(1, 16, 10, 10).float() 291 | # pyramid_fea = list() 292 | # for k, v in enumerate(pyramid_feature_extractor()): 293 | # #x = v(input) 294 | # pyramid_fea.append(v) 295 | # print(pyramid_fea) 296 | 297 | 298 | # from 
torch.autograd import Variable 299 | # 300 | # input1 = Variable(torch.randn(1, 3, 300, 300)) 301 | # t = {'im_detect': Timer(), 'misc': Timer()} 302 | # t['im_detect'].tic() 303 | # net = build_net(300,21) 304 | # net = net.forward(input1) 305 | # detect_time = t['im_detect'].toc() 306 | # print(detect_time) 307 | #output = net(input1) 308 | -------------------------------------------------------------------------------- /data/coco.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import json 10 | import pickle 11 | 12 | import cv2 13 | import numpy as np 14 | import os 15 | import os.path 16 | import torch 17 | import torch.utils.data as data 18 | import torchvision.transforms as transforms 19 | 20 | from utils.pycocotools.coco import COCO 21 | from utils.pycocotools.cocoeval import COCOeval 22 | 23 | 24 | class COCODetection(data.Dataset): 25 | """VOC Detection Dataset Object 26 | 27 | input is image, target is annotation 28 | 29 | Arguments: 30 | root (string): filepath to VOCdevkit folder. 31 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 32 | transform (callable, optional): transformation to perform on the 33 | input image 34 | target_transform (callable, optional): transformation to perform on the 35 | target `annotation` 36 | (eg: take in caption string, return tensor of word indices) 37 | dataset_name (string, optional): which dataset to load 38 | (default: 'VOC2007') 39 | """ 40 | 41 | def __init__(self, root, image_sets, preproc=None, target_transform=None, 42 | dataset_name='COCO'): 43 | self.root = root 44 | self.cache_path = os.path.join(self.root, 'cache') 45 | self.image_set = image_sets 46 | self.preproc = preproc 47 | self.target_transform = target_transform 48 | self.name = dataset_name 49 | self.ids = list() 50 | self.annotations = list() 51 | self._view_map = { 52 | 'minival2014': 'val2014', # 5k val2014 subset 53 | 'valminusminival2014': 'val2014', # val2014 \setminus minival2014 54 | 'test-dev2015': 'test2015', 55 | } 56 | 57 | for (year, image_set) in image_sets: 58 | coco_name = image_set + year 59 | data_name = (self._view_map[coco_name] 60 | if coco_name in self._view_map 61 | else coco_name) 62 | annofile = self._get_ann_file(coco_name) 63 | _COCO = COCO(annofile) 64 | self._COCO = _COCO 65 | self.coco_name = coco_name 66 | cats = _COCO.loadCats(_COCO.getCatIds()) 67 | self._classes = tuple(['__background__'] + [c['name'] for c in cats]) 68 | self.num_classes = len(self._classes) 69 | self._class_to_ind = dict(zip(self._classes, range(self.num_classes))) 70 | self._class_to_coco_cat_id = dict(zip([c['name'] for c in cats], 71 | _COCO.getCatIds())) 72 | indexes = _COCO.getImgIds() 73 | self.image_indexes = indexes 74 | self.ids.extend([self.image_path_from_index(data_name, index) for index in indexes]) 75 | if image_set.find('test') != -1: 76 | print('test set will not load annotations!') 77 | else: 78 | self.annotations.extend(self._load_coco_annotations(coco_name, indexes, _COCO)) 79 | 80 | def image_path_from_index(self, name, index): 81 | """ 82 | Construct an image path from the image's "index" identifier. 
83 | """ 84 | # Example image path for index=119993: 85 | # images/train2014/COCO_train2014_000000119993.jpg 86 | if '2014' in name or '2015' in name: 87 | file_name = ('COCO_' + name + '_' + 88 | str(index).zfill(12) + '.jpg') 89 | image_path = os.path.join(self.root, 'images', 90 | name, file_name) 91 | assert os.path.exists(image_path), \ 92 | 'Path does not exist: {}'.format(image_path) 93 | if '2017' in name: 94 | file_name = str(index).zfill(12) + '.jpg' 95 | image_path = os.path.join(self.root, name, file_name) 96 | assert os.path.exists(image_path), \ 97 | 'Path does not exist: {}'.format(image_path) 98 | return image_path 99 | 100 | def _get_ann_file(self, name): 101 | prefix = 'instances' if name.find('test') == -1 \ 102 | else 'image_info' 103 | return os.path.join(self.root, 'annotations', 104 | prefix + '_' + name + '.json') 105 | 106 | def _load_coco_annotations(self, coco_name, indexes, _COCO): 107 | cache_file = os.path.join(self.cache_path, coco_name + '_gt_roidb.pkl') 108 | if not os.path.exists(self.cache_path): 109 | os.makedirs(self.cache_path) 110 | if os.path.exists(cache_file): 111 | with open(cache_file, 'rb') as fid: 112 | roidb = pickle.load(fid) 113 | print('{} gt roidb loaded from {}'.format(coco_name, cache_file)) 114 | return roidb 115 | 116 | gt_roidb = [self._annotation_from_index(index, _COCO) 117 | for index in indexes] 118 | with open(cache_file, 'wb') as fid: 119 | pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) 120 | print('wrote gt roidb to {}'.format(cache_file)) 121 | return gt_roidb 122 | 123 | def _annotation_from_index(self, index, _COCO): 124 | """ 125 | Loads COCO bounding-box instance annotations. Crowd instances are 126 | handled by marking their overlaps (with all categories) to -1. This 127 | overlap value means that crowd "instances" are excluded from training. 
128 | """ 129 | im_ann = _COCO.loadImgs(index)[0] 130 | width = im_ann['width'] 131 | height = im_ann['height'] 132 | 133 | annIds = _COCO.getAnnIds(imgIds=index, iscrowd=None) 134 | objs = _COCO.loadAnns(annIds) 135 | # Sanitize bboxes -- some are invalid 136 | valid_objs = [] 137 | for obj in objs: 138 | x1 = np.max((0, obj['bbox'][0])) 139 | y1 = np.max((0, obj['bbox'][1])) 140 | x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) 141 | y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) 142 | if obj['area'] > 0 and x2 >= x1 and y2 >= y1: 143 | obj['clean_bbox'] = [x1, y1, x2, y2] 144 | valid_objs.append(obj) 145 | objs = valid_objs 146 | num_objs = len(objs) 147 | 148 | res = np.zeros((num_objs, 5)) 149 | 150 | # Lookup table to map from COCO category ids to our internal class 151 | # indices 152 | coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls], 153 | self._class_to_ind[cls]) 154 | for cls in self._classes[1:]]) 155 | 156 | for ix, obj in enumerate(objs): 157 | cls = coco_cat_id_to_class_ind[obj['category_id']] 158 | res[ix, 0:4] = obj['clean_bbox'] 159 | res[ix, 4] = cls 160 | 161 | return res 162 | 163 | def __getitem__(self, index): 164 | img_id = self.ids[index] 165 | target = self.annotations[index] 166 | img = cv2.imread(img_id, cv2.IMREAD_COLOR) 167 | height, width, _ = img.shape 168 | 169 | if self.target_transform is not None: 170 | target = self.target_transform(target) 171 | 172 | if self.preproc is not None: 173 | img, target = self.preproc(img, target) 174 | 175 | # target = self.target_transform(target, width, height) 176 | # print(target.shape) 177 | 178 | return img, target 179 | 180 | def __len__(self): 181 | return len(self.ids) 182 | 183 | def pull_image(self, index): 184 | '''Returns the original image object at index in PIL form 185 | 186 | Note: not using self.__getitem__(), as any transformations passed in 187 | could mess up this functionality. 188 | 189 | Argument: 190 | index (int): index of img to show 191 | Return: 192 | PIL img 193 | ''' 194 | img_id = self.ids[index] 195 | return cv2.imread(img_id, cv2.IMREAD_COLOR) 196 | 197 | def pull_tensor(self, index): 198 | '''Returns the original image at an index in tensor form 199 | 200 | Note: not using self.__getitem__(), as any transformations passed in 201 | could mess up this functionality. 
202 | 203 | Argument: 204 | index (int): index of img to show 205 | Return: 206 | tensorized version of img, squeezed 207 | ''' 208 | to_tensor = transforms.ToTensor() 209 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 210 | 211 | def _print_detection_eval_metrics(self, coco_eval): 212 | IoU_lo_thresh = 0.5 213 | IoU_hi_thresh = 0.95 214 | 215 | def _get_thr_ind(coco_eval, thr): 216 | ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) & 217 | (coco_eval.params.iouThrs < thr + 1e-5))[0][0] 218 | iou_thr = coco_eval.params.iouThrs[ind] 219 | assert np.isclose(iou_thr, thr) 220 | return ind 221 | 222 | ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh) 223 | ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh) 224 | # precision has dims (iou, recall, cls, area range, max dets) 225 | # area range index 0: all area ranges 226 | # max dets index 2: 100 per image 227 | precision = \ 228 | coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] 229 | ap_default = np.mean(precision[precision > -1]) 230 | print('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] ' 231 | '~~~~'.format(IoU_lo_thresh, IoU_hi_thresh)) 232 | print('{:.1f}'.format(100 * ap_default)) 233 | for cls_ind, cls in enumerate(self._classes): 234 | if cls == '__background__': 235 | continue 236 | # minus 1 because of __background__ 237 | precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2] 238 | ap = np.mean(precision[precision > -1]) 239 | print('{:.1f}'.format(100 * ap)) 240 | 241 | print('~~~~ Summary metrics ~~~~') 242 | coco_eval.summarize() 243 | 244 | def _do_detection_eval(self, res_file, output_dir): 245 | ann_type = 'bbox' 246 | coco_dt = self._COCO.loadRes(res_file) 247 | coco_eval = COCOeval(self._COCO, coco_dt) 248 | coco_eval.params.useSegm = (ann_type == 'segm') 249 | coco_eval.evaluate() 250 | coco_eval.accumulate() 251 | self._print_detection_eval_metrics(coco_eval) 252 | eval_file = os.path.join(output_dir, 'detection_results.pkl') 253 | with open(eval_file, 'wb') as fid: 254 | pickle.dump(coco_eval, fid, pickle.HIGHEST_PROTOCOL) 255 | print('Wrote COCO eval results to: {}'.format(eval_file)) 256 | 257 | def _coco_results_one_category(self, boxes, cat_id): 258 | results = [] 259 | for im_ind, index in enumerate(self.image_indexes): 260 | dets = boxes[im_ind].astype(np.float) 261 | if dets == []: 262 | continue 263 | scores = dets[:, -1] 264 | xs = dets[:, 0] 265 | ys = dets[:, 1] 266 | ws = dets[:, 2] - xs + 1 267 | hs = dets[:, 3] - ys + 1 268 | results.extend( 269 | [{'image_id': index, 270 | 'category_id': cat_id, 271 | 'bbox': [xs[k], ys[k], ws[k], hs[k]], 272 | 'score': scores[k]} for k in range(dets.shape[0])]) 273 | return results 274 | 275 | def _write_coco_results_file(self, all_boxes, res_file): 276 | # [{"image_id": 42, 277 | # "category_id": 18, 278 | # "bbox": [258.15,41.29,348.26,243.78], 279 | # "score": 0.236}, ...] 
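# Added note (not in the original source): COCO result boxes use the [x, y, width, height] convention in absolute pixels,
# which is why _coco_results_one_category above converts the internal (x1, y1, x2, y2) detections with
# ws = x2 - x1 + 1 and hs = y2 - y1 + 1 before the json is written.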
280 | results = [] 281 | for cls_ind, cls in enumerate(self._classes): 282 | if cls == '__background__': 283 | continue 284 | print('Collecting {} results ({:d}/{:d})'.format(cls, cls_ind, 285 | self.num_classes)) 286 | coco_cat_id = self._class_to_coco_cat_id[cls] 287 | results.extend(self._coco_results_one_category(all_boxes[cls_ind], 288 | coco_cat_id)) 289 | ''' 290 | if cls_ind ==30: 291 | res_f = res_file+ '_1.json' 292 | print('Writing results json to {}'.format(res_f)) 293 | with open(res_f, 'w') as fid: 294 | json.dump(results, fid) 295 | results = [] 296 | ''' 297 | # res_f2 = res_file+'_2.json' 298 | print('Writing results json to {}'.format(res_file)) 299 | with open(res_file, 'w') as fid: 300 | json.dump(results, fid) 301 | 302 | def evaluate_detections(self, all_boxes, output_dir): 303 | res_file = os.path.join(output_dir, ('detections_' + 304 | self.coco_name + 305 | '_results')) 306 | res_file += '.json' 307 | self._write_coco_results_file(all_boxes, res_file) 308 | # Only do evaluation on non-test sets 309 | if self.coco_name.find('test') == -1: 310 | self._do_detection_eval(res_file, output_dir) 311 | # Optionally cleanup results json file 312 | -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import pickle 10 | import sys 11 | 12 | import cv2 13 | import numpy as np 14 | import os 15 | import os.path 16 | import torch 17 | import torch.utils.data as data 18 | import torchvision.transforms as transforms 19 | from PIL import Image 20 | 21 | from .voc_eval import voc_eval 22 | 23 | if sys.version_info[0] == 2: 24 | import xml.etree.cElementTree as ET 25 | else: 26 | import xml.etree.ElementTree as ET 27 | 28 | VOC_CLASSES = ('__background__', # always index 0 29 | 'aeroplane', 'bicycle', 'bird', 'boat', 30 | 'bottle', 'bus', 'car', 'cat', 'chair', 31 | 'cow', 'diningtable', 'dog', 'horse', 32 | 'motorbike', 'person', 'pottedplant', 33 | 'sheep', 'sofa', 'train', 'tvmonitor') 34 | 35 | # for making bounding boxes pretty 36 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 37 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 38 | 39 | 40 | class VOCSegmentation(data.Dataset): 41 | """VOC Segmentation Dataset Object 42 | input and target are both images 43 | 44 | NOTE: need to address https://github.com/pytorch/vision/issues/9 45 | 46 | Arguments: 47 | root (string): filepath to VOCdevkit folder. 48 | image_set (string): imageset to use (eg: 'train', 'val', 'test'). 
49 | transform (callable, optional): transformation to perform on the 50 | input image 51 | target_transform (callable, optional): transformation to perform on the 52 | target image 53 | dataset_name (string, optional): which dataset to load 54 | (default: 'VOC2007') 55 | """ 56 | 57 | def __init__(self, root, image_set, transform=None, target_transform=None, 58 | dataset_name='VOC2007'): 59 | self.root = '/home/zdh1901/data/VOCdevkit/' 60 | self.image_set = image_set 61 | self.transform = transform 62 | self.target_transform = target_transform 63 | 64 | self._annopath = os.path.join( 65 | self.root, dataset_name, 'SegmentationClass', '%s.png') 66 | self._imgpath = os.path.join( 67 | self.root, dataset_name, 'JPEGImages', '%s.jpg') 68 | self._imgsetpath = os.path.join( 69 | self.root, dataset_name, 'ImageSets', 'Segmentation', '%s.txt') 70 | 71 | with open(self._imgsetpath % self.image_set) as f: 72 | self.ids = f.readlines() 73 | self.ids = [x.strip('\n') for x in self.ids] 74 | 75 | def __getitem__(self, index): 76 | img_id = self.ids[index] 77 | 78 | target = Image.open(self._annopath % img_id).convert('RGB') 79 | img = Image.open(self._imgpath % img_id).convert('RGB') 80 | 81 | if self.transform is not None: 82 | img = self.transform(img) 83 | 84 | if self.target_transform is not None: 85 | target = self.target_transform(target) 86 | 87 | return img, target 88 | 89 | def __len__(self): 90 | return len(self.ids) 91 | 92 | 93 | class AnnotationTransform(object): 94 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 95 | Initilized with a dictionary lookup of classnames to indexes 96 | 97 | Arguments: 98 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 99 | (default: alphabetic indexing of VOC's 20 classes) 100 | keep_difficult (bool, optional): keep difficult instances or not 101 | (default: False) 102 | height (int): height 103 | width (int): width 104 | """ 105 | 106 | def __init__(self, class_to_ind=None, keep_difficult=True): 107 | self.class_to_ind = class_to_ind or dict( 108 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 109 | self.keep_difficult = keep_difficult 110 | 111 | def __call__(self, target): 112 | """ 113 | Arguments: 114 | target (annotation) : the target annotation to be made usable 115 | will be an ET.Element 116 | Returns: 117 | a list containing lists of bounding boxes [bbox coords, class name] 118 | """ 119 | res = np.empty((0, 5)) 120 | for obj in target.iter('object'): 121 | difficult = int(obj.find('difficult').text) == 1 122 | if not self.keep_difficult and difficult: 123 | continue 124 | name = obj.find('name').text.lower().strip() 125 | bbox = obj.find('bndbox') 126 | 127 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 128 | bndbox = [] 129 | for i, pt in enumerate(pts): 130 | cur_pt = int(bbox.find(pt).text) - 1 131 | # scale height or width 132 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 133 | bndbox.append(cur_pt) 134 | label_idx = self.class_to_ind[name] 135 | bndbox.append(label_idx) 136 | res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind] 137 | # img_id = target.find('filename').text[:-4] 138 | 139 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 140 | 141 | 142 | class VOCDetection(data.Dataset): 143 | """VOC Detection Dataset Object 144 | 145 | input is image, target is annotation 146 | 147 | Arguments: 148 | root (string): filepath to VOCdevkit folder. 149 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 150 | transform (callable, optional): transformation to perform on the 151 | input image 152 | target_transform (callable, optional): transformation to perform on the 153 | target `annotation` 154 | (eg: take in caption string, return tensor of word indices) 155 | dataset_name (string, optional): which dataset to load 156 | (default: 'VOC2007') 157 | """ 158 | 159 | def __init__(self, root, image_sets, preproc=None, target_transform=None, 160 | dataset_name='VOC0712'): 161 | self.root = root 162 | self.image_set = image_sets 163 | self.preproc = preproc 164 | self.target_transform = target_transform 165 | self.name = dataset_name 166 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 167 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 168 | self.ids = list() 169 | for (year, name) in image_sets: 170 | self._year = year 171 | rootpath = os.path.join(self.root, 'VOC' + year) 172 | for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 173 | self.ids.append((rootpath, line.strip())) 174 | 175 | def __getitem__(self, index): 176 | img_id = self.ids[index] 177 | target = ET.parse(self._annopath % img_id).getroot() 178 | img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 179 | height, width, _ = img.shape 180 | 181 | if self.target_transform is not None: 182 | target = self.target_transform(target) 183 | 184 | if self.preproc is not None: 185 | img, target = self.preproc(img, target) 186 | # print(img.size()) 187 | 188 | # target = self.target_transform(target, width, height) 189 | # print(target.shape) 190 | 191 | return img, target 192 | 193 | def __len__(self): 194 | return len(self.ids) 195 | 196 | def pull_image(self, index): 197 | '''Returns the original image object at index in PIL form 198 | 199 | Note: not using self.__getitem__(), as any transformations passed in 200 | could mess up this functionality. 201 | 202 | Argument: 203 | index (int): index of img to show 204 | Return: 205 | PIL img 206 | ''' 207 | img_id = self.ids[index] 208 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 209 | 210 | def pull_anno(self, index): 211 | '''Returns the original annotation of image at index 212 | 213 | Note: not using self.__getitem__(), as any transformations passed in 214 | could mess up this functionality. 215 | 216 | Argument: 217 | index (int): index of img to get annotation of 218 | Return: 219 | list: [img_id, [(label, bbox coords),...]] 220 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 221 | ''' 222 | img_id = self.ids[index] 223 | anno = ET.parse(self._annopath % img_id).getroot() 224 | gt = self.target_transform(anno, 1, 1) 225 | return img_id[1], gt 226 | 227 | def pull_tensor(self, index): 228 | '''Returns the original image at an index in tensor form 229 | 230 | Note: not using self.__getitem__(), as any transformations passed in 231 | could mess up this functionality. 232 | 233 | Argument: 234 | index (int): index of img to show 235 | Return: 236 | tensorized version of img, squeezed 237 | ''' 238 | to_tensor = transforms.ToTensor() 239 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 240 | 241 | def evaluate_detections(self, all_boxes, output_dir=None): 242 | """ 243 | all_boxes is a list of length number-of-classes. 244 | Each list element is a list of length number-of-images. 245 | Each of those list elements is either an empty list [] 246 | or a numpy array of detection. 
247 | 248 | all_boxes[class][image] = [] or np.array of shape #dets x 5 249 | """ 250 | self._write_voc_results_file(all_boxes) 251 | aps, map = self._do_python_eval(output_dir) 252 | return aps, map 253 | 254 | def _get_voc_results_file_template(self): 255 | filename = 'comp4_det_test' + '_{:s}.txt' 256 | filedir = os.path.join( 257 | self.root, 'results', 'VOC' + self._year, 'Main') 258 | if not os.path.exists(filedir): 259 | os.makedirs(filedir) 260 | path = os.path.join(filedir, filename) 261 | return path 262 | 263 | def _write_voc_results_file(self, all_boxes): 264 | for cls_ind, cls in enumerate(VOC_CLASSES): 265 | cls_ind = cls_ind 266 | if cls == '__background__': 267 | continue 268 | print('Writing {} VOC results file'.format(cls)) 269 | filename = self._get_voc_results_file_template().format(cls) 270 | with open(filename, 'wt') as f: 271 | for im_ind, index in enumerate(self.ids): 272 | index = index[1] 273 | dets = all_boxes[cls_ind][im_ind] 274 | if dets == []: 275 | continue 276 | for k in range(dets.shape[0]): 277 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 278 | format(index, dets[k, -1], 279 | dets[k, 0] + 1, dets[k, 1] + 1, 280 | dets[k, 2] + 1, dets[k, 3] + 1)) 281 | 282 | def _do_python_eval(self, output_dir='output'): 283 | rootpath = os.path.join(self.root, 'VOC' + self._year) 284 | name = self.image_set[0][1] 285 | annopath = os.path.join( 286 | rootpath, 287 | 'Annotations', 288 | '{:s}.xml') 289 | imagesetfile = os.path.join( 290 | rootpath, 291 | 'ImageSets', 292 | 'Main', 293 | name + '.txt') 294 | cachedir = os.path.join(self.root, 'annotations_cache') 295 | aps = [] 296 | # The PASCAL VOC metric changed in 2010 297 | use_07_metric = True if int(self._year) < 2010 else False 298 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 299 | if output_dir is not None and not os.path.isdir(output_dir): 300 | os.mkdir(output_dir) 301 | for i, cls in enumerate(VOC_CLASSES): 302 | 303 | if cls == '__background__': 304 | continue 305 | 306 | filename = self._get_voc_results_file_template().format(cls) 307 | rec, prec, ap = voc_eval( 308 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 309 | use_07_metric=use_07_metric) 310 | aps += [ap] 311 | print('AP for {} = {:.4f}'.format(cls, ap)) 312 | if output_dir is not None: 313 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 314 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 315 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 316 | print('~~~~~~~~') 317 | print('Results:') 318 | for ap in aps: 319 | print('{:.3f}'.format(ap)) 320 | print('{:.3f}'.format(np.mean(aps))) 321 | print('~~~~~~~~') 322 | print('') 323 | print('--------------------------------------------------------------') 324 | print('Results computed with the **unofficial** Python eval code.') 325 | print('Results should be very close to the official MATLAB eval code.') 326 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 327 | print('-- Thanks, The Management') 328 | print('--------------------------------------------------------------') 329 | return aps, np.mean(aps) 330 | 331 | 332 | def detection_collate(batch): 333 | """Custom collate fn for dealing with batches of images that have a different 334 | number of associated object annotations (bounding boxes). 
335 | 336 | Arguments: 337 | batch: (tuple) A tuple of tensor images and lists of annotations 338 | 339 | Return: 340 | A tuple containing: 341 | 1) (tensor) batch of images stacked on their 0 dim 342 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 343 | """ 344 | targets = [] 345 | imgs = [] 346 | for _, sample in enumerate(batch): 347 | for _, tup in enumerate(sample): 348 | if torch.is_tensor(tup): 349 | imgs.append(tup) 350 | elif isinstance(tup, type(np.empty(0))): 351 | annos = torch.from_numpy(tup).float() 352 | targets.append(annos) 353 | 354 | return (torch.stack(imgs, 0), targets) 355 | -------------------------------------------------------------------------------- /data/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | import cv2 4 | import numpy as np 5 | import types 6 | from numpy import random 7 | 8 | 9 | def intersect(box_a, box_b): 10 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 11 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 12 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 13 | return inter[:, 0] * inter[:, 1] 14 | 15 | 16 | def jaccard_numpy(box_a, box_b): 17 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 18 | is simply the intersection over union of two boxes. 19 | E.g.: 20 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 21 | Args: 22 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 23 | box_b: Single bounding box, Shape: [4] 24 | Return: 25 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 26 | """ 27 | inter = intersect(box_a, box_b) 28 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 29 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 30 | area_b = ((box_b[2]-box_b[0]) * 31 | (box_b[3]-box_b[1])) # [A,B] 32 | union = area_a + area_b - inter 33 | return inter / union # [A,B] 34 | 35 | 36 | class Compose(object): 37 | """Composes several augmentations together. 38 | Args: 39 | transforms (List[Transform]): list of transforms to compose. 
40 | Example: 41 | >>> augmentations.Compose([ 42 | >>> transforms.CenterCrop(10), 43 | >>> transforms.ToTensor(), 44 | >>> ]) 45 | """ 46 | 47 | def __init__(self, transforms): 48 | self.transforms = transforms 49 | 50 | def __call__(self, img, boxes=None, labels=None): 51 | for t in self.transforms: 52 | img, boxes, labels = t(img, boxes, labels) 53 | return img, boxes, labels 54 | 55 | 56 | class Lambda(object): 57 | """Applies a lambda as a transform.""" 58 | 59 | def __init__(self, lambd): 60 | assert isinstance(lambd, types.LambdaType) 61 | self.lambd = lambd 62 | 63 | def __call__(self, img, boxes=None, labels=None): 64 | return self.lambd(img, boxes, labels) 65 | 66 | 67 | class ConvertFromInts(object): 68 | def __call__(self, image, boxes=None, labels=None): 69 | return image.astype(np.float32), boxes, labels 70 | 71 | 72 | class SubtractMeans(object): 73 | def __init__(self, mean): 74 | self.mean = np.array(mean, dtype=np.float32) 75 | 76 | def __call__(self, image, boxes=None, labels=None): 77 | image = image.astype(np.float32) 78 | image -= self.mean 79 | return image.astype(np.float32), boxes, labels 80 | 81 | 82 | class ToAbsoluteCoords(object): 83 | def __call__(self, image, boxes=None, labels=None): 84 | height, width, channels = image.shape 85 | boxes[:, 0] *= width 86 | boxes[:, 2] *= width 87 | boxes[:, 1] *= height 88 | boxes[:, 3] *= height 89 | 90 | return image, boxes, labels 91 | 92 | 93 | class ToPercentCoords(object): 94 | def __call__(self, image, boxes=None, labels=None): 95 | height, width, channels = image.shape 96 | boxes[:, 0] /= width 97 | boxes[:, 2] /= width 98 | boxes[:, 1] /= height 99 | boxes[:, 3] /= height 100 | 101 | return image, boxes, labels 102 | 103 | 104 | class Resize(object): 105 | def __init__(self, size=300): 106 | self.size = size 107 | 108 | def __call__(self, image, boxes=None, labels=None): 109 | image = cv2.resize(image, (self.size, 110 | self.size)) 111 | return image, boxes, labels 112 | 113 | 114 | class RandomSaturation(object): 115 | def __init__(self, lower=0.5, upper=1.5): 116 | self.lower = lower 117 | self.upper = upper 118 | assert self.upper >= self.lower, "contrast upper must be >= lower." 119 | assert self.lower >= 0, "contrast lower must be non-negative." 
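# Added note (not in the original source): despite the "contrast" wording in the asserts above, this transform scales
# channel 1 (the saturation channel) in place and assumes the image has already been converted to a float HSV image,
# e.g. by the ConvertColor step used in PhotometricDistort later in this file.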
120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 124 | 125 | return image, boxes, labels 126 | 127 | 128 | class RandomHue(object): 129 | def __init__(self, delta=18.0): 130 | assert delta >= 0.0 and delta <= 360.0 131 | self.delta = delta 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | if random.randint(2): 135 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 136 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 137 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 138 | return image, boxes, labels 139 | 140 | 141 | class RandomLightingNoise(object): 142 | def __init__(self): 143 | self.perms = ((0, 1, 2), (0, 2, 1), 144 | (1, 0, 2), (1, 2, 0), 145 | (2, 0, 1), (2, 1, 0)) 146 | 147 | def __call__(self, image, boxes=None, labels=None): 148 | if random.randint(2): 149 | swap = self.perms[random.randint(len(self.perms))] 150 | shuffle = SwapChannels(swap) # shuffle channels 151 | image = shuffle(image) 152 | return image, boxes, labels 153 | 154 | 155 | class ConvertColor(object): 156 | def __init__(self, current='BGR', transform='HSV'): 157 | self.transform = transform 158 | self.current = current 159 | 160 | def __call__(self, image, boxes=None, labels=None): 161 | if self.current == 'BGR' and self.transform == 'HSV': 162 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 163 | elif self.current == 'HSV' and self.transform == 'BGR': 164 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 165 | else: 166 | raise NotImplementedError 167 | return image, boxes, labels 168 | 169 | 170 | class RandomContrast(object): 171 | def __init__(self, lower=0.5, upper=1.5): 172 | self.lower = lower 173 | self.upper = upper 174 | assert self.upper >= self.lower, "contrast upper must be >= lower." 175 | assert self.lower >= 0, "contrast lower must be non-negative." 176 | 177 | # expects float image 178 | def __call__(self, image, boxes=None, labels=None): 179 | if random.randint(2): 180 | alpha = random.uniform(self.lower, self.upper) 181 | image *= alpha 182 | return image, boxes, labels 183 | 184 | 185 | class RandomBrightness(object): 186 | def __init__(self, delta=32): 187 | assert delta >= 0.0 188 | assert delta <= 255.0 189 | self.delta = delta 190 | 191 | def __call__(self, image, boxes=None, labels=None): 192 | if random.randint(2): 193 | delta = random.uniform(-self.delta, self.delta) 194 | image += delta 195 | return image, boxes, labels 196 | 197 | 198 | class ToCV2Image(object): 199 | def __call__(self, tensor, boxes=None, labels=None): 200 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 201 | 202 | 203 | class ToTensor(object): 204 | def __call__(self, cvimage, boxes=None, labels=None): 205 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 206 | 207 | 208 | class RandomSampleCrop(object): 209 | """Crop 210 | Arguments: 211 | img (Image): the image being input during training 212 | boxes (Tensor): the original bounding boxes in pt form 213 | labels (Tensor): the class labels for each bbox 214 | mode (float tuple): the min and max jaccard overlaps 215 | Return: 216 | (img, boxes, classes) 217 | img (Image): the cropped image 218 | boxes (Tensor): the adjusted bounding boxes in pt form 219 | labels (Tensor): the class labels for each bbox 220 | """ 221 | def __init__(self): 222 | self.sample_options = ( 223 | # using entire original input image 224 | None, 225 | # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9 226 | (0.1, None), 227 | (0.3, None), 228 | (0.7, None), 229 | (0.9, None), 230 | # randomly sample a patch 231 | (None, None), 232 | ) 233 | 234 | def __call__(self, image, boxes=None, labels=None): 235 | height, width, _ = image.shape 236 | while True: 237 | # randomly choose a mode 238 | mode = random.choice(self.sample_options) 239 | if mode is None: 240 | return image, boxes, labels 241 | 242 | min_iou, max_iou = mode 243 | if min_iou is None: 244 | min_iou = float('-inf') 245 | if max_iou is None: 246 | max_iou = float('inf') 247 | 248 | # max trails (50) 249 | for _ in range(50): 250 | current_image = image 251 | 252 | w = random.uniform(0.3 * width, width) 253 | h = random.uniform(0.3 * height, height) 254 | 255 | # aspect ratio constraint b/t .5 & 2 256 | if h / w < 0.5 or h / w > 2: 257 | continue 258 | 259 | left = random.uniform(width - w) 260 | top = random.uniform(height - h) 261 | 262 | # convert to integer rect x1,y1,x2,y2 263 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 264 | 265 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 266 | overlap = jaccard_numpy(boxes, rect) 267 | 268 | # is min and max overlap constraint satisfied? if not try again 269 | if overlap.min() < min_iou and max_iou < overlap.max(): 270 | continue 271 | 272 | # cut the crop from the image 273 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 274 | :] 275 | 276 | # keep overlap with gt box IF center in sampled patch 277 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 278 | 279 | # mask in all gt boxes that above and to the left of centers 280 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 281 | 282 | # mask in all gt boxes that under and to the right of centers 283 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 284 | 285 | # mask in that both m1 and m2 are true 286 | mask = m1 * m2 287 | 288 | # have any valid boxes? 
try again if not 289 | if not mask.any(): 290 | continue 291 | 292 | # take only matching gt boxes 293 | current_boxes = boxes[mask, :].copy() 294 | 295 | # take only matching gt labels 296 | current_labels = labels[mask] 297 | 298 | # should we use the box left and top corner or the crop's 299 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 300 | rect[:2]) 301 | # adjust to crop (by substracting crop's left,top) 302 | current_boxes[:, :2] -= rect[:2] 303 | 304 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 305 | rect[2:]) 306 | # adjust to crop (by substracting crop's left,top) 307 | current_boxes[:, 2:] -= rect[:2] 308 | 309 | return current_image, current_boxes, current_labels 310 | 311 | 312 | class Expand(object): 313 | def __init__(self, mean): 314 | self.mean = mean 315 | 316 | def __call__(self, image, boxes, labels): 317 | if random.randint(2): 318 | return image, boxes, labels 319 | 320 | height, width, depth = image.shape 321 | ratio = random.uniform(1, 4) 322 | left = random.uniform(0, width*ratio - width) 323 | top = random.uniform(0, height*ratio - height) 324 | 325 | expand_image = np.zeros( 326 | (int(height*ratio), int(width*ratio), depth), 327 | dtype=image.dtype) 328 | expand_image[:, :, :] = self.mean 329 | expand_image[int(top):int(top + height), 330 | int(left):int(left + width)] = image 331 | image = expand_image 332 | 333 | boxes = boxes.copy() 334 | boxes[:, :2] += (int(left), int(top)) 335 | boxes[:, 2:] += (int(left), int(top)) 336 | 337 | return image, boxes, labels 338 | 339 | 340 | class RandomMirror(object): 341 | def __call__(self, image, boxes, classes): 342 | _, width, _ = image.shape 343 | if random.randint(2): 344 | image = image[:, ::-1] 345 | boxes = boxes.copy() 346 | boxes[:, 0::2] = width - boxes[:, 2::-2] 347 | return image, boxes, classes 348 | 349 | 350 | class SwapChannels(object): 351 | """Transforms a tensorized image by swapping the channels in the order 352 | specified in the swap tuple. 
353 | Args: 354 | swaps (int triple): final order of channels 355 | eg: (2, 1, 0) 356 | """ 357 | 358 | def __init__(self, swaps): 359 | self.swaps = swaps 360 | 361 | def __call__(self, image): 362 | """ 363 | Args: 364 | image (Tensor): image tensor to be transformed 365 | Return: 366 | a tensor with channels swapped according to swap 367 | """ 368 | # if torch.is_tensor(image): 369 | # image = image.data.cpu().numpy() 370 | # else: 371 | # image = np.array(image) 372 | image = image[:, :, self.swaps] 373 | return image 374 | 375 | 376 | class PhotometricDistort(object): 377 | def __init__(self): 378 | self.pd = [ 379 | RandomContrast(), 380 | ConvertColor(transform='HSV'), 381 | RandomSaturation(), 382 | RandomHue(), 383 | ConvertColor(current='HSV', transform='BGR'), 384 | RandomContrast() 385 | ] 386 | self.rand_brightness = RandomBrightness() 387 | self.rand_light_noise = RandomLightingNoise() 388 | 389 | def __call__(self, image, boxes, labels): 390 | im = image.copy() 391 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 392 | if random.randint(2): 393 | distort = Compose(self.pd[:-1]) 394 | else: 395 | distort = Compose(self.pd[1:]) 396 | im, boxes, labels = distort(im, boxes, labels) 397 | return self.rand_light_noise(im, boxes, labels) 398 | 399 | 400 | class SSDAugmentation(object): 401 | def __init__(self, size=300, mean=(104, 117, 123)): 402 | self.mean = mean 403 | self.size = size 404 | self.augment = Compose([ 405 | ConvertFromInts(), 406 | ToAbsoluteCoords(), 407 | PhotometricDistort(), 408 | Expand(self.mean), 409 | RandomSampleCrop(), 410 | RandomMirror(), 411 | ToPercentCoords(), 412 | Resize(self.size), 413 | SubtractMeans(self.mean) 414 | ]) 415 | 416 | def __call__(self, img, boxes, labels): 417 | return self.augment(img, boxes, labels) 418 | -------------------------------------------------------------------------------- /utils/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import numpy as np 5 | if torch.cuda.is_available(): 6 | import torch.backends.cudnn as cudnn 7 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 8 | 9 | 10 | def point_form(boxes): 11 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 12 | representation for comparison to point form ground truth data. 13 | Args: 14 | boxes: (tensor) center-size default boxes from priorbox layers. 15 | Return: 16 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 17 | """ 18 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 19 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 20 | 21 | 22 | def center_size(boxes): 23 | """ Convert prior_boxes to (cx, cy, w, h) 24 | representation for comparison to center-size form ground truth data. 25 | Args: 26 | boxes: (tensor) point_form boxes 27 | Return: 28 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 29 | """ 30 | return torch.cat([(boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 31 | boxes[:, 2:] - boxes[:, :2]], 1) # w, h 32 | 33 | 34 | def intersect(box_a, box_b): 35 | """ We resize both tensors to [A,B,2] without new malloc: 36 | [A,2] -> [A,1,2] -> [A,B,2] 37 | [B,2] -> [1,B,2] -> [A,B,2] 38 | Then we compute the area of intersect between box_a and box_b. 39 | Args: 40 | box_a: (tensor) bounding boxes, Shape: [A,4]. 41 | box_b: (tensor) bounding boxes, Shape: [B,4]. 42 | Return: 43 | (tensor) intersection area, Shape: [A,B]. 
44 | """ 45 | A = box_a.size(0) 46 | B = box_b.size(0) 47 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 48 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 49 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 50 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 51 | inter = torch.clamp((max_xy - min_xy), min=0) 52 | return inter[:, :, 0] * inter[:, :, 1] 53 | 54 | 55 | def jaccard(box_a, box_b): 56 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 57 | is simply the intersection over union of two boxes. Here we operate on 58 | ground truth boxes and default boxes. 59 | E.g.: 60 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 61 | Args: 62 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 63 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 64 | Return: 65 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 66 | """ 67 | inter = intersect(box_a, box_b) 68 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 69 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 70 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 71 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 72 | union = area_a + area_b - inter 73 | return inter / union # [A,B] 74 | 75 | def matrix_iou(a,b): 76 | """ 77 | return iou of a and b, numpy version for data augenmentation 78 | """ 79 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 80 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 81 | 82 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 83 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 84 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 85 | return area_i / (area_a[:, np.newaxis] + area_b - area_i) 86 | 87 | 88 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 89 | """Match each prior box with the ground truth box of the highest jaccard 90 | overlap, encode the bounding boxes, then return the matched indices 91 | corresponding to both confidence and location preds. 92 | Args: 93 | threshold: (float) The overlap threshold used when mathing boxes. 94 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 95 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 96 | variances: (tensor) Variances corresponding to each prior coord, 97 | Shape: [num_priors, 4]. 98 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 99 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 100 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 101 | idx: (int) current batch index 102 | Return: 103 | The matched indices corresponding to 1)location and 2)confidence preds. 
104 | """ 105 | # jaccard index 106 | overlaps = jaccard( 107 | truths, 108 | point_form(priors) 109 | ) 110 | # (Bipartite Matching) 111 | # [1,num_objects] best prior for each ground truth 112 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 113 | # [1,num_priors] best ground truth for each prior 114 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 115 | best_truth_idx.squeeze_(0) 116 | best_truth_overlap.squeeze_(0) 117 | best_prior_idx.squeeze_(1) 118 | best_prior_overlap.squeeze_(1) 119 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 120 | # TODO refactor: index best_prior_idx with long tensor 121 | # ensure every gt matches with its prior of max overlap 122 | for j in range(best_prior_idx.size(0)): 123 | best_truth_idx[best_prior_idx[j]] = j 124 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 125 | conf = labels[best_truth_idx] # Shape: [num_priors] 126 | conf[best_truth_overlap < threshold] = 0 # label as background 127 | loc = encode(matches, priors, variances) 128 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 129 | conf_t[idx] = conf # [num_priors] top class label for each prior 130 | 131 | def refine_match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx,arm_loc): 132 | """Match each arm bbox with the ground truth box of the highest jaccard 133 | overlap, encode the bounding boxes, then return the matched indices 134 | corresponding to both confidence and location preds. 135 | Args: 136 | threshold: (float) The overlap threshold used when mathing boxes. 137 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 138 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 139 | variances: (tensor) Variances corresponding to each prior coord, 140 | Shape: [num_priors, 4]. 141 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 142 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 143 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 144 | idx: (int) current batch index 145 | arm_loc: (tensor) arm loc data,shape: [n_priors,4] 146 | Return: 147 | The matched indices corresponding to 1)location and 2)confidence preds. 
148 | """ 149 | # decode arm box 150 | decode_arm = decode(arm_loc,priors=priors,variances=variances) 151 | # jaccard index 152 | overlaps = jaccard( 153 | truths, 154 | decode_arm 155 | ) 156 | # (Bipartite Matching) 157 | # [1,num_objects] best prior for each ground truth 158 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 159 | # [1,num_priors] best ground truth for each prior 160 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 161 | best_truth_idx.squeeze_(0) 162 | best_truth_overlap.squeeze_(0) 163 | best_prior_idx.squeeze_(1) 164 | best_prior_overlap.squeeze_(1) 165 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 166 | # TODO refactor: index best_prior_idx with long tensor 167 | # ensure every gt matches with its prior of max overlap 168 | for j in range(best_prior_idx.size(0)): 169 | best_truth_idx[best_prior_idx[j]] = j 170 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 171 | conf = labels[best_truth_idx] # Shape: [num_priors] 172 | conf[best_truth_overlap < threshold] = 0 # label as background 173 | loc = encode(matches, center_size(decode_arm), variances) 174 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 175 | conf_t[idx] = conf # [num_priors] top class label for each prior 176 | 177 | def encode(matched, priors, variances): 178 | """Encode the variances from the priorbox layers into the ground truth boxes 179 | we have matched (based on jaccard overlap) with the prior boxes. 180 | Args: 181 | matched: (tensor) Coords of ground truth for each prior in point-form 182 | Shape: [num_priors, 4]. 183 | priors: (tensor) Prior boxes in center-offset form 184 | Shape: [num_priors,4]. 185 | variances: (list[float]) Variances of priorboxes 186 | Return: 187 | encoded boxes (tensor), Shape: [num_priors, 4] 188 | """ 189 | 190 | # dist b/t match center and prior's center 191 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 192 | # encode variance 193 | g_cxcy /= (variances[0] * priors[:, 2:]) 194 | # match wh / prior wh 195 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 196 | g_wh = torch.log(g_wh) / variances[1] 197 | # return target for smooth_l1_loss 198 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 199 | 200 | 201 | def encode_multi(matched, priors, offsets, variances): 202 | """Encode the variances from the priorbox layers into the ground truth boxes 203 | we have matched (based on jaccard overlap) with the prior boxes. 204 | Args: 205 | matched: (tensor) Coords of ground truth for each prior in point-form 206 | Shape: [num_priors, 4]. 207 | priors: (tensor) Prior boxes in center-offset form 208 | Shape: [num_priors,4]. 
209 | variances: (list[float]) Variances of priorboxes 210 | Return: 211 | encoded boxes (tensor), Shape: [num_priors, 4] 212 | """ 213 | 214 | # dist b/t match center and prior's center 215 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] - offsets[:,:2] 216 | # encode variance 217 | #g_cxcy /= (variances[0] * priors[:, 2:]) 218 | g_cxcy.div_(variances[0] * offsets[:, 2:]) 219 | # match wh / prior wh 220 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 221 | g_wh = torch.log(g_wh) / variances[1] 222 | # return target for smooth_l1_loss 223 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 224 | 225 | # Adapted from https://github.com/Hakuyume/chainer-ssd 226 | def decode(loc, priors, variances): 227 | """Decode locations from predictions using priors to undo 228 | the encoding we did for offset regression at train time. 229 | Args: 230 | loc (tensor): location predictions for loc layers, 231 | Shape: [num_priors,4] 232 | priors (tensor): Prior boxes in center-offset form. 233 | Shape: [num_priors,4]. 234 | variances: (list[float]) Variances of priorboxes 235 | Return: 236 | decoded bounding box predictions 237 | """ 238 | 239 | boxes = torch.cat(( 240 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 241 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 242 | boxes[:, :2] -= boxes[:, 2:] / 2 243 | boxes[:, 2:] += boxes[:, :2] 244 | return boxes 245 | 246 | def decode_multi(loc, priors, offsets, variances): 247 | """Decode locations from predictions using priors to undo 248 | the encoding we did for offset regression at train time. 249 | Args: 250 | loc (tensor): location predictions for loc layers, 251 | Shape: [num_priors,4] 252 | priors (tensor): Prior boxes in center-offset form. 253 | Shape: [num_priors,4]. 254 | variances: (list[float]) Variances of priorboxes 255 | Return: 256 | decoded bounding box predictions 257 | """ 258 | 259 | boxes = torch.cat(( 260 | priors[:, :2] + offsets[:,:2]+ loc[:, :2] * variances[0] * offsets[:, 2:], 261 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 262 | boxes[:, :2] -= boxes[:, 2:] / 2 263 | boxes[:, 2:] += boxes[:, :2] 264 | return boxes 265 | 266 | def log_sum_exp(x): 267 | """Utility function for computing log_sum_exp while determining 268 | This will be used to determine unaveraged confidence loss across 269 | all examples in a batch. 270 | Args: 271 | x (Variable(tensor)): conf_preds from conf layers 272 | """ 273 | x_max = x.data.max() 274 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 275 | 276 | 277 | # Original author: Francisco Massa: 278 | # https://github.com/fmassa/object-detection.torch 279 | # Ported to PyTorch by Max deGroot (02/01/2017) 280 | def nms(boxes, scores, overlap=0.5, top_k=200): 281 | """Apply non-maximum suppression at test time to avoid detecting too many 282 | overlapping bounding boxes for a given object. 283 | Args: 284 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 285 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 286 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 287 | top_k: (int) The Maximum number of box preds to consider. 288 | Return: 289 | The indices of the kept boxes with respect to num_priors. 
290 | """ 291 | 292 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 293 | if boxes.numel() == 0: 294 | return keep 295 | x1 = boxes[:, 0] 296 | y1 = boxes[:, 1] 297 | x2 = boxes[:, 2] 298 | y2 = boxes[:, 3] 299 | area = torch.mul(x2 - x1, y2 - y1) 300 | v, idx = scores.sort(0) # sort in ascending order 301 | # I = I[v >= 0.01] 302 | idx = idx[-top_k:] # indices of the top-k largest vals 303 | xx1 = boxes.new() 304 | yy1 = boxes.new() 305 | xx2 = boxes.new() 306 | yy2 = boxes.new() 307 | w = boxes.new() 308 | h = boxes.new() 309 | 310 | # keep = torch.Tensor() 311 | count = 0 312 | while idx.numel() > 0: 313 | i = idx[-1] # index of current largest val 314 | # keep.append(i) 315 | keep[count] = i 316 | count += 1 317 | if idx.size(0) == 1: 318 | break 319 | idx = idx[:-1] # remove kept element from view 320 | # load bboxes of next highest vals 321 | torch.index_select(x1, 0, idx, out=xx1) 322 | torch.index_select(y1, 0, idx, out=yy1) 323 | torch.index_select(x2, 0, idx, out=xx2) 324 | torch.index_select(y2, 0, idx, out=yy2) 325 | # store element-wise max with next highest score 326 | xx1 = torch.clamp(xx1, min=x1[i]) 327 | yy1 = torch.clamp(yy1, min=y1[i]) 328 | xx2 = torch.clamp(xx2, max=x2[i]) 329 | yy2 = torch.clamp(yy2, max=y2[i]) 330 | w.resize_as_(xx2) 331 | h.resize_as_(yy2) 332 | w = xx2 - xx1 333 | h = yy2 - yy1 334 | # check sizes of xx1 and xx2.. after each iteration 335 | w = torch.clamp(w, min=0.0) 336 | h = torch.clamp(h, min=0.0) 337 | inter = w*h 338 | # IoU = i / (area(a) + area(b) - i) 339 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 340 | union = (rem_areas - inter) + area[i] 341 | IoU = inter/union # store result in iou 342 | # keep only elements with an IoU <= overlap 343 | idx = idx[IoU.le(overlap)] 344 | return keep, count 345 | --------------------------------------------------------------------------------