├── SSD ├── models │ ├── __init__.py │ ├── base_models.py │ └── SSD.py ├── utils │ ├── __init__.py │ ├── nms │ │ ├── __init__.py │ │ ├── gpu_nms.hpp │ │ ├── py_cpu_nms.py │ │ ├── gpu_nms.pyx │ │ ├── nms_kernel.cu │ │ └── cpu_nms.pyx │ ├── pycocotools │ │ ├── __init__.py │ │ ├── maskApi.h │ │ ├── mask.py │ │ ├── maskApi.c │ │ ├── _mask.pyx │ │ ├── coco.py │ │ └── cocoeval.py │ ├── nms_wrapper.py │ ├── timer.py │ ├── build.py │ └── box_utils.py ├── layers │ ├── __init__.py │ ├── modules │ │ ├── __init__.py │ │ ├── loss.py │ │ └── multibox_loss.py │ ├── functions │ │ ├── __init__.py │ │ ├── prior_box.py │ │ └── detection.py │ └── l2norm.py ├── image │ ├── 000050.jpg │ ├── 000753.jpg │ ├── 000762.jpg │ ├── 001070.jpg │ ├── 001136.jpg │ └── 001275.jpg ├── make.sh ├── data │ ├── __init__.py │ ├── scripts │ │ ├── VOC2012.sh │ │ └── VOC2007.sh │ ├── config.py │ ├── voc_eval.py │ ├── coco.py │ ├── data_augment.py │ └── voc0712.py ├── README.md ├── val.py └── train.py ├── YOLO v3 └── README.md └── README.md /SSD/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /SSD/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /SSD/utils/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /SSD/utils/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /SSD/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /SSD/layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .multibox_loss import MultiBoxLoss 2 | 3 | __all__ = ['MultiBoxLoss'] 4 | -------------------------------------------------------------------------------- /SSD/image/000050.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlyldxwl/Stronger-One-stage-detector-with-much-Tricks/HEAD/SSD/image/000050.jpg -------------------------------------------------------------------------------- /SSD/image/000753.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlyldxwl/Stronger-One-stage-detector-with-much-Tricks/HEAD/SSD/image/000753.jpg -------------------------------------------------------------------------------- /SSD/image/000762.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlyldxwl/Stronger-One-stage-detector-with-much-Tricks/HEAD/SSD/image/000762.jpg -------------------------------------------------------------------------------- /SSD/image/001070.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlyldxwl/Stronger-One-stage-detector-with-much-Tricks/HEAD/SSD/image/001070.jpg -------------------------------------------------------------------------------- 
/SSD/image/001136.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlyldxwl/Stronger-One-stage-detector-with-much-Tricks/HEAD/SSD/image/001136.jpg -------------------------------------------------------------------------------- /SSD/image/001275.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlyldxwl/Stronger-One-stage-detector-with-much-Tricks/HEAD/SSD/image/001275.jpg -------------------------------------------------------------------------------- /SSD/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | 8 | cd .. 9 | -------------------------------------------------------------------------------- /SSD/layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /SSD/utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /SSD/data/__init__.py: -------------------------------------------------------------------------------- 1 | # from .voc import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES 2 | from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES 3 | from .coco import COCODetection 4 | from .data_augment import * 5 | from .config import * 6 | -------------------------------------------------------------------------------- /YOLO v3/README.md: -------------------------------------------------------------------------------- 1 | # Stronger YOLO v3 with much Tricks 2 | 3 | This code will be released soon. 4 | 5 | If you are interested in this repo, please star or watch it. 6 | 7 | Furthermore, if you have spare GPUs, please feel free to contact me and help me train YOLO v3.
([yhao.chen0617@gmail.com](yhao.chen0617@gmail.com)) 8 | 9 | -------------------------------------------------------------------------------- /SSD/layers/l2norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self,n_channels, scale): 9 | super(L2Norm,self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = scale or None 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant(self.weight,self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps 21 | #x /= norm 22 | x = torch.div(x,norm) 23 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 24 | return out 25 | -------------------------------------------------------------------------------- /SSD/data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if it is a valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 1 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stronger One-stage detector with much Tricks 2 | 3 | This repo was inspired by the paper [Bag of Freebies for Training Object Detection Neural Networks](https://arxiv.org/pdf/1902.04103). 4 | 5 | I will test as many popular training tricks as I can to improve one-stage detector accuracy; feel free to leave a comment or email me about the tricks you want me to test ([yhao.chen0617@gmail.com](yhao.chen0617@gmail.com)). A minimal sketch of one such trick (warmup + cosine lr) is appended after the notes below. 6 | 7 | **Training Data** : VOC0712 trainval 8 | 9 | **Test data** : VOC07 test 10 | 11 | **GPU** : TITAN X (pascal) 12 | 13 | **Framework** : Pytorch 0.4 14 | 15 | Network | mAP | FPS | Parameter 16 | --|:--:|:--:|:--: 17 | SSD 300| 80.58 | ~100| - 18 | YOLOV3 544| - | - | - 19 | 20 | **Note**: 21 | 22 | - [ ] Stronger YOLOv3 with many tricks will be released soon. 23 | - [ ] This repo does not use **multi-scale training**, which is extremely beneficial to detectors, because of the limited GPU memory (I only have one card).
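Since "Warm up" and "Cos lr" head the trick list in /SSD/README.md but no schedule code appears in this dump, here is a minimal, illustrative sketch of how a linear warmup followed by cosine decay is commonly wired into a PyTorch training loop. The function name `warmup_cosine_lr` and all hyper-parameter values are assumptions for illustration, not this repo's train.py API.

```python
import math

def warmup_cosine_lr(step, total_steps, base_lr, warmup_steps=500):
    # illustrative schedule, not this repo's exact code:
    # linear ramp from 0 to base_lr, then cosine decay back to 0
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * progress))

# usage inside the loop (optimizer is any torch.optim optimizer):
# for group in optimizer.param_groups:
#     group['lr'] = warmup_cosine_lr(iteration, max_iter, base_lr=4e-3)
```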
24 | -------------------------------------------------------------------------------- /SSD/utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | 12 | # def nms(dets, thresh, force_cpu=False): 13 | # """Dispatch to either CPU or GPU NMS implementations.""" 14 | # 15 | # if dets.shape[0] == 0: 16 | # return [] 17 | # if cfg.USE_GPU_NMS and not force_cpu: 18 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 19 | # else: 20 | # return cpu_nms(dets, thresh) 21 | 22 | 23 | def nms(dets, thresh, force_cpu=False): 24 | """Dispatch to either CPU or GPU NMS implementations.""" 25 | 26 | if dets.shape[0] == 0: 27 | return [] 28 | if force_cpu: 29 | #return cpu_soft_nms(dets, thresh, method = 0) 30 | return cpu_nms(dets, thresh) 31 | return gpu_nms(dets, thresh) 32 | -------------------------------------------------------------------------------- /SSD/README.md: -------------------------------------------------------------------------------- 1 | # Stronger SSD with much Tricks 2 | ## Tricks 3 | This repo mainly uses the following tricks. 4 | 5 | Trick | Reference paper 6 | --|:--: 7 | Warm up | - 8 | Cos lr | - 9 | Htd lr | - 10 | Batch Normalization | - 11 | Group Normalization | - 12 | No bias decay | - 13 | Label smooth | - 14 | Mixup | - 15 | Random erasing | - 16 | Balance Smoothl1 | - 17 | Focal loss | - 18 | GIOU | - 19 | Octconv | - 20 | 21 | 22 | ## Result 23 | The pretrained model is VGG-16 (atrous). All models use a 300×300 input. 24 | 25 | **SSD already applies heavy data augmentation, which is why mixup, label smoothing, and some additional data augmentation or regularization methods do not help.** 26 | 27 | ## Note 28 | 29 | - [ ] 80.58 is not the final result. The experiment of SSD300 with Focal loss, GIoU and Octconv is still going on. 30 | 31 | - [ ] BN can be merged into the convolution layers, so it adds no inference time or parameters. The merge code will be public soon. 32 | 33 | - [ ] Multi-scale training with SSD 300 should bring a significant gain; it will be released when I start my internship (about one month from now), because I only have one GPU at the moment. -------------------------------------------------------------------------------- /SSD/data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if it is a valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 1 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..."
32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /SSD/utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /SSD/utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /SSD/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 
15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | def tic(self): 21 | # using time.time instead of time.clock because time.clock 22 | # does not normalize for multithreading 23 | self.start_time = time.time() 24 | 25 | def toc(self, average=True): 26 | self.diff = time.time() - self.start_time 27 | self.total_time += self.diff 28 | self.calls += 1 29 | self.average_time = self.total_time / self.calls 30 | if average: 31 | return self.average_time 32 | else: 33 | return self.diff 34 | 35 | def clear(self): 36 | self.total_time = 0. 37 | self.calls = 0 38 | self.start_time = 0. 39 | self.diff = 0. 40 | self.average_time = 0. 41 | -------------------------------------------------------------------------------- /SSD/layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from math import sqrt as sqrt 5 | from itertools import product as product 6 | 7 | 8 | class PriorBox(object): 9 | """Compute priorbox coordinates in center-offset form for each source 10 | feature map. 11 | Note: 12 | This 'layer' has changed between versions of the original SSD 13 | paper, so we include both versions, but note v2 is the most tested and most 14 | recent version of the paper. 15 | 16 | """ 17 | def __init__(self, cfg): 18 | super(PriorBox, self).__init__() 19 | self.image_size = cfg['min_dim'] 20 | # number of priors for feature map location (either 4 or 6) 21 | self.num_priors = len(cfg['aspect_ratios']) 22 | self.variance = cfg['variance'] or [0.1] 23 | self.feature_maps = cfg['feature_maps'] 24 | self.min_sizes = cfg['min_sizes'] 25 | self.max_sizes = cfg['max_sizes'] 26 | self.steps = cfg['steps'] 27 | self.aspect_ratios = cfg['aspect_ratios'] 28 | self.clip = cfg['clip'] 29 | for v in self.variance: 30 | if v <= 0: 31 | raise ValueError('Variances must be greater than 0') 32 | 33 | def forward(self): 34 | mean = [] 35 | for k, f in enumerate(self.feature_maps): 36 | for i, j in product(range(f), repeat=2): 37 | f_k = self.image_size / self.steps[k] 38 | cx = (j + 0.5) / f_k 39 | cy = (i + 0.5) / f_k 40 | 41 | s_k = self.min_sizes[k]/self.image_size 42 | mean += [cx, cy, s_k, s_k] 43 | 44 | # aspect_ratio: 1 45 | # rel size: sqrt(s_k * s_(k+1)) 46 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) 47 | mean += [cx, cy, s_k_prime, s_k_prime] 48 | 49 | # rest of aspect ratios 50 | for ar in self.aspect_ratios[k]: 51 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)] 52 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)] 53 | 54 | # back to torch land 55 | output = torch.Tensor(mean).view(-1, 4) 56 | if self.clip: 57 | output.clamp_(max=1, min=0) 58 | return output 59 | -------------------------------------------------------------------------------- /SSD/models/base_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | def vgg(cfg, i, batch_norm=False): 5 | layers = [] 6 | in_channels = i 7 | for v in cfg: 8 | if v == 'M': 9 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 10 | elif v == 'C': 11 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 12 | else: 13 | if not batch_norm: 14 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 15 | else: 16 | conv2d = BasicConv(in_channels, v, kernel_size=3, padding=1,relu=False, bn=batch_norm, bias=True) 17 | layers += [conv2d,nn.ReLU(inplace=True)]
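# (added note) numeric cfg entries are conv output-channel counts; 'M' and 'C' entries insert
# max-pooling, with 'C' using ceil_mode=True so a 300x300 input still yields the 38x38 conv4_3
# map (see 'feature_maps' in data/config.py) that SSD's first detection head expects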
18 | in_channels = v 19 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 20 | if not batch_norm: 21 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 22 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 23 | else: 24 | conv6 = BasicConv(512, 1024, kernel_size=3, padding=6, dilation=6, relu=False) 25 | conv7 = BasicConv(1024, 1024, kernel_size=1, relu=False) 26 | layers += [pool5, conv6, nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 27 | return layers 28 | 29 | vgg_base = { 30 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 31 | 512, 512, 512], 32 | '512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 33 | 512, 512, 512], 34 | } 35 | 36 | class BasicConv(nn.Module): 37 | 38 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 39 | bn=True, bias=False): 40 | super(BasicConv, self).__init__() 41 | self.out_channels = out_planes 42 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, 43 | dilation=dilation, groups=groups, bias=bias) 44 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 45 | self.relu = nn.ReLU(inplace=True) if relu else None 46 | 47 | def forward(self, x): 48 | x = self.conv(x) 49 | if self.bn is not None: 50 | x = self.bn(x) 51 | if self.relu is not None: 52 | x = self.relu(x) 53 | return x 54 | -------------------------------------------------------------------------------- /SSD/utils/pycocotools/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. 
*/ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /SSD/data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import os.path 3 | 4 | # gets home dir cross platform 5 | home = os.path.expanduser("~") 6 | ddir = os.path.join(home,"data/VOCdevkit/") 7 | 8 | # note: if you used our download scripts, this should be right 9 | VOCroot = ddir # path to VOCdevkit root dir 10 | COCOroot = os.path.join(home,"data/COCO/") 11 | 12 | 13 | #RFB CONFIGS 14 | VOC_300 = { 15 | 'feature_maps' : [38, 19, 10, 5, 3, 1], 16 | 17 | 'min_dim' : 300, 18 | 19 | 'steps' : [8, 16, 32, 64, 100, 300], 20 | 21 | 'min_sizes' : [30, 60, 111, 162, 213, 264], 22 | 23 | 'max_sizes' : [60, 111, 162, 213, 264, 315], 24 | 25 | 'aspect_ratios' : [[2,3], [2, 3], [2, 3], [2, 3], [2], [2]], 26 | 27 | 'variance' : [0.1, 0.2], 28 | 29 | 'clip' : True, 30 | } 31 | 32 | VOC_512= { 33 | 'feature_maps' : [64, 32, 16, 8, 4, 2, 1], 34 | 35 | 'min_dim' : 512, 36 | 37 | 'steps' : [8, 16, 32, 64, 128, 256, 512], 38 | 39 | 'min_sizes' : [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8 ], 40 | 41 | 'max_sizes' : [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6], 42 | 43 | 'aspect_ratios' : [[2,3], [2, 3], [2, 3], [2, 3], [2,3], [2], [2]], 44 | 45 | 'variance' : [0.1, 0.2], 46 | 47 | 'clip' : True, 48 | } 49 | 50 | 51 | COCO_300 = { 52 | 'feature_maps' : [38, 19, 10, 5, 3, 1], 53 | 54 | 'min_dim' : 300, 55 | 56 | 'steps' : [8, 16, 32, 64, 100, 300], 57 | 58 | 'min_sizes' : [21, 45, 99, 153, 207, 261], 59 | 60 | 'max_sizes' : [45, 99, 153, 207, 261, 315], 61 | 62 | 'aspect_ratios' : [[2,3], [2, 3], [2, 3], [2, 3], [2], [2]], 63 | 64 | 'variance' : [0.1, 0.2], 65 | 66 | 'clip' : True, 67 | } 68 | 69 | COCO_512= { 70 | 'feature_maps' : [64, 32, 16, 8, 4, 2, 1], 71 | 72 | 'min_dim' : 512, 73 | 74 | 'steps' : [8, 16, 32, 64, 128, 256, 512], 75 | 76 | 'min_sizes' : [20.48, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8], 77 | 78 | 'max_sizes' : [51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], 79 | 80 | 'aspect_ratios' : [[2,3], [2, 3], [2, 3], [2, 3], [2,3], [2], [2]], 81 | 82 | 'variance' : [0.1, 0.2], 83 | 84 | 'clip' : True, 85 | } 86 | 87 | COCO_mobile_300 = { 88 | 'feature_maps' : [19, 10, 5, 3, 2, 1], 89 | 90 | 'min_dim' : 300, 91 | 92 | 'steps' : [16, 32, 64, 100, 150, 300], 93 | 94 | 'min_sizes' : [45, 90, 135, 180, 225, 270], 95 | 96 | 'max_sizes' : [90, 135, 180, 225, 270, 315], 97 | 98 | 'aspect_ratios' : [[2,3], [2, 3], [2, 3], [2, 3], [2], [2]], 99 | 100 | 'variance' : [0.1, 0.2], 101 | 102 | 'clip' : True, 103 | } 104 | -------------------------------------------------------------------------------- /SSD/layers/functions/detection.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from torch.autograd import Function 5 | from torch.autograd import Variable 6 | from utils.box_utils import decode, point_form 7 | 8 | 9 | class Detect(Function): 10 | """At test time, Detect is the final layer of SSD. It decodes location 11 | predictions into boxes and returns them together with the confidence 12 | scores; non-maximum suppression and top-k filtering are applied 13 | afterwards (see val.py), not inside this class. 14 | """ 15 | def __init__(self, num_classes, bkg_label, cfg, GIOU=False): 16 | self.num_classes = num_classes 17 | self.background_label = bkg_label 18 | 19 | self.variance = cfg['variance'] 20 | self.giou = GIOU 21 | 22 | def forward(self, predictions, prior): 23 | """ 24 | Args: 25 | loc_data: (tensor) Loc preds from loc layers 26 | Shape: [batch,num_priors*4] 27 | conf_data: (tensor) Shape: Conf preds from conf layers 28 | Shape: [batch*num_priors,num_classes] 29 | prior_data: (tensor) Prior boxes and variances from priorbox layers 30 | Shape: [1,num_priors,4] 31 | """ 32 | 33 | loc, conf = predictions 34 | 35 | loc_data = loc.data 36 | conf_data = conf.data 37 | prior_data = prior.data 38 | num = loc_data.size(0) # batch size 39 | self.num_priors = prior_data.size(0) 40 | self.boxes = torch.zeros(1, self.num_priors, 4) 41 | self.scores = torch.zeros(1, self.num_priors, self.num_classes) 42 | if loc_data.is_cuda: 43 | self.boxes = self.boxes.cuda() 44 | self.scores = self.scores.cuda() 45 | 46 | if num == 1: 47 | # size batch x num_classes x num_priors 48 | conf_preds = conf_data.unsqueeze(0) 49 | 50 | else: 51 | conf_preds = conf_data.view(num, self.num_priors, 52 | self.num_classes) 53 | self.boxes.expand_(num, self.num_priors, 4) 54 | self.scores.expand_(num, self.num_priors, self.num_classes) 55 | 56 | # Decode predictions into bboxes. 57 | for i in range(num): 58 | if self.giou: 59 | p = decode(loc_data[i], prior_data, self.variance) 60 | decoded_boxes = torch.stack([torch.min(p[:,0],p[:,2]), torch.min(p[:,1],p[:,3]), torch.max(p[:,0],p[:,2]), torch.max(p[:,1],p[:,3])],1) 61 | else: 62 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 63 | conf_scores = conf_preds[i].clone() 64 | 65 | self.boxes[i] = decoded_boxes 66 | self.scores[i] = conf_scores 67 | 68 | return self.boxes, self.scores 69 | 70 | -------------------------------------------------------------------------------- /SSD/val.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import os 4 | import pickle 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import torch.backends.cudnn as cudnn 9 | import torchvision.transforms as transforms 10 | import numpy as np 11 | from torch.autograd import Variable 12 | from data import VOCroot,COCOroot 13 | from data import AnnotationTransform, COCODetection, VOCDetection, BaseTransform, VOC_300,VOC_512,COCO_300,COCO_512, COCO_mobile_300 14 | 15 | import torch.utils.data as data 16 | from layers.functions import Detect,PriorBox 17 | from utils.nms_wrapper import nms 18 | from utils.timer import Timer 19 | 20 | 21 | def val_net(priors,save_val_folder,testset,num_classes,net,detector,transform,max_per_image,thresh,cuda,vgg_bn): 22 | if not os.path.exists(save_val_folder): 23 | os.makedirs(save_val_folder) 24 | # dump predictions and assoc.
ground truth to text file for now 25 | num_images = len(testset) 26 | 27 | all_boxes = [[[] for _ in range(num_images)] 28 | for _ in range(num_classes)] 29 | 30 | _t = {'im_detect': Timer(), 'misc': Timer()} 31 | det_file = os.path.join(save_val_folder, 'detections.pkl') 32 | 33 | for i in range(num_images): 34 | img = testset.pull_image(i) 35 | scale = torch.Tensor([img.shape[1], img.shape[0], 36 | img.shape[1], img.shape[0]]) 37 | with torch.no_grad(): 38 | x = transform(img).unsqueeze(0) 39 | if cuda: 40 | x = x.cuda() 41 | scale = scale.cuda() 42 | 43 | _t['im_detect'].tic() 44 | out = net(x,vgg_bn=vgg_bn,test='True') # forward pass 45 | boxes, scores = detector.forward(out, priors) 46 | detect_time = _t['im_detect'].toc() 47 | boxes = boxes[0] 48 | scores = scores[0] 49 | 50 | boxes *= scale 51 | boxes = boxes.cpu().numpy() 52 | scores = scores.cpu().numpy() 53 | # scale each detection back up to the image 54 | 55 | _t['misc'].tic() 56 | 57 | for j in range(1, num_classes): 58 | inds = np.where(scores[:, j] > thresh)[0] 59 | if len(inds) == 0: 60 | all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 61 | continue 62 | c_bboxes = boxes[inds] 63 | c_scores = scores[inds, j] 64 | c_dets = np.hstack((c_bboxes, c_scores[:, np.newaxis])).astype( 65 | np.float32, copy=False) 66 | 67 | keep = nms(c_dets, 0.45, force_cpu=False) 68 | # keep = keep[:40] 69 | c_dets = c_dets[keep, :] 70 | all_boxes[j][i] = c_dets 71 | if max_per_image > 0: 72 | image_scores = np.hstack([all_boxes[j][i][:, -1] for j in range(1, num_classes)]) 73 | if len(image_scores) > max_per_image: 74 | image_thresh = np.sort(image_scores)[-max_per_image] 75 | for j in range(1, num_classes): 76 | keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] 77 | all_boxes[j][i] = all_boxes[j][i][keep, :] 78 | 79 | nms_time = _t['misc'].toc() 80 | 81 | if i % 1000 == 0: 82 | print('im_detect: {:d}/{:d} {:.4f}s {:.3f}s' 83 | .format(i + 1, num_images, detect_time, nms_time)) 84 | _t['im_detect'].clear() 85 | _t['misc'].clear() 86 | 87 | with open(det_file, 'wb') as f: 88 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 89 | 90 | print('Evaluating detections') 91 | testset.evaluate_detections(all_boxes, save_val_folder) 92 | 93 | 94 | if __name__ == "__main__": 95 | pass 96 | 97 | -------------------------------------------------------------------------------- /SSD/layers/modules/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from utils.box_utils import log_sum_exp, focal_sum_exp, decode 8 | 9 | class SmoothL1_Mixup_Balance_loss(nn.Module): 10 | def __init__(self, alpha=0.5, gamma=1.5, balance = False, mixup = False, size_average=False): 11 | super(SmoothL1_Mixup_Balance_loss,self).__init__() 12 | self.balance = balance 13 | self.mixup = mixup 14 | self.size_average = size_average 15 | if self.balance: 16 | self.a = alpha 17 | self.r = gamma 18 | self.b = math.exp(gamma / alpha) - 1 19 | self.c = gamma / self.b - alpha 20 | 21 | def forward(self, predict, truth, weight=None): 22 | if self.mixup: 23 | assert predict.shape[0]== truth.shape[0]== weight.shape[0] 24 | else: 25 | assert predict.shape[0] == truth.shape[0] 26 | t = torch.abs(truth-predict) 27 | if self.balance: 28 | smbloss = torch.where(t < 1, self.a * (self.b * t + 1) * torch.log(self.b * t + 1) / self.b - self.a * t, self.r * t + self.c) 29 | else: 30 | smbloss = 
torch.where(t < 1, 0.5 * t ** 2, t - 0.5) 31 | if self.mixup: 32 | smbloss = smbloss.sum(1, keepdim=True) * weight 33 | else: 34 | smbloss = smbloss.sum(1) 35 | if self.size_average: 36 | return torch.mean(smbloss) 37 | else: 38 | return smbloss.sum() 39 | 40 | class Crossentropy_Mixup_SoftmaxFocal_LableSmooth_loss(nn.Module): 41 | def __init__(self, mixup=False, focal_loss=False, gamma=2, alpha=1, label_smooth=False,size_average=False): 42 | super(Crossentropy_Mixup_SoftmaxFocal_LableSmooth_loss,self).__init__() 43 | self.mixup = mixup 44 | self.softmax_focal = focal_loss 45 | if self.softmax_focal: 46 | self.gamma = gamma 47 | self.alpha = alpha 48 | self.label_smooth = label_smooth 49 | self.size_average = size_average 50 | 51 | def forward(self, predict, truth, weight=None): 52 | if self.mixup: 53 | assert predict.shape[0] == truth.shape[0] == weight.shape[0] 54 | else: 55 | assert predict.shape[0] == truth.shape[0] 56 | if self.softmax_focal: 57 | # using OHEM and focal loss with CE 58 | soft_score = focal_sum_exp(predict) 59 | pro = self.alpha * (1 - soft_score) ** self.gamma 60 | cmsloss = (log_sum_exp(predict) - predict.gather(1, truth.view(-1, 1))) * pro.gather(1, truth.view(-1,1)) 61 | elif self.label_smooth: 62 | cmsloss = (log_sum_exp(predict, label_smooth=True) * truth).sum(1, keepdim=True) 63 | else: 64 | cmsloss = log_sum_exp(predict) - predict.gather(1, truth.view(-1, 1)) 65 | if self.mixup: 66 | cmsloss = cmsloss * weight 67 | if self.size_average: 68 | return cmsloss.mean() 69 | else: 70 | return cmsloss.sum() 71 | 72 | class GIoUloss(nn.Module): 73 | def __init__(self,size_average=False): 74 | super(GIoUloss,self).__init__() 75 | self.size_average = size_average 76 | 77 | def _GIoU(self, p, g): 78 | areas_p = (p[:, 2] - p[:, 0]) * (p[:, 3] - p[:, 1]) 79 | areas_g = (g[:, 2] - g[:, 0]) * (g[:, 3] - g[:, 1]) 80 | x1y1 = torch.max(p[:, :2], g[:, :2]) 81 | x2y2 = torch.min(p[:, 2:], g[:, 2:]) 82 | inter = torch.clamp((x2y2 - x1y1), min=0) 83 | area_inter = inter[:, 0] * inter[:, 1] 84 | x1y1 = torch.min(p[:, :2], g[:, :2]) 85 | x2y2 = torch.max(p[:, 2:], g[:, 2:]) 86 | total = x2y2 - x1y1 87 | area_total = total[:, 0] * total[:, 1] # area of the smallest enclosing box 88 | uni = areas_g + areas_p - area_inter 89 | iou = area_inter / uni 90 | Giou = iou - (area_total - uni) / area_total 91 | return Giou 92 | 93 | def forward(self, predict, priors, target, variance=[0.1,0.2]): 94 | assert priors.shape == predict.shape == target.shape, "GIoU loss ERROR!" 95 | 96 | p = decode(predict, priors, variance) 97 | p_n = torch.stack([torch.min(p[:,0],p[:,2]), torch.min(p[:,1],p[:,3]), torch.max(p[:,0],p[:,2]), torch.max(p[:,1],p[:,3])],1) 98 | loss = 1 - self._GIoU(p_n, target) 99 | if self.size_average: 100 | return loss.mean() 101 | else: 102 | return loss.sum() 103 | 104 | 105 | if __name__ == "__main__": 106 | print("This is a loss function implementation file.") 107 | pass 108 | -------------------------------------------------------------------------------- /SSD/utils/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | #import pycocotools._mask as _mask 4 | from . import _mask 5 | 6 | # Interface for manipulating masks stored in RLE format. 7 | # 8 | # RLE is a simple yet efficient format for storing binary masks. RLE 9 | # first divides a vector (or vectorized image) into a series of piecewise 10 | # constant regions and then for each piece simply stores the length of that piece.
For example, given M=[0 0 1 1 1 0 1] the RLE counts would 12 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 13 | # (note that the odd counts are always the numbers of zeros). Instead of 14 | # storing the counts directly, additional compression is achieved with a 15 | # variable bitrate representation based on a common scheme called LEB128. 16 | # 17 | # Compression is greatest given large piecewise constant regions. 18 | # Specifically, the size of the RLE is proportional to the number of 19 | # *boundaries* in M (or for an image the number of boundaries in the y 20 | # direction). Assuming fairly simple shapes, the RLE representation is 21 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 22 | # is substantially lower, especially for large simple objects (large n). 23 | # 24 | # Many common operations on masks can be computed directly using the RLE 25 | # (without need for decoding). This includes computations such as area, 26 | # union, intersection, etc. All of these operations are linear in the 27 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 28 | # of the object. Computing these operations on the original mask is O(n). 29 | # Thus, using the RLE can result in substantial computational savings. 30 | # 31 | # The following API functions are defined: 32 | # encode - Encode binary masks using RLE. 33 | # decode - Decode binary masks encoded via RLE. 34 | # merge - Compute union or intersection of encoded masks. 35 | # iou - Compute intersection over union between masks. 36 | # area - Compute area of encoded masks. 37 | # toBbox - Get bounding boxes surrounding encoded masks. 38 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 39 | # 40 | # Usage: 41 | # Rs = encode( masks ) 42 | # masks = decode( Rs ) 43 | # R = merge( Rs, intersect=false ) 44 | # o = iou( dt, gt, iscrowd ) 45 | # a = area( Rs ) 46 | # bbs = toBbox( Rs ) 47 | # Rs = frPyObjects( [pyObjects], h, w ) 48 | # 49 | # In the API the following formats are used: 50 | # Rs - [dict] Run-length encoding of binary masks 51 | # R - dict Run-length encoding of binary mask 52 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 53 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 54 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 55 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 56 | # dt,gt - May be either bounding boxes or encoded masks 57 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 58 | # 59 | # Finally, a note about the intersection over union (iou) computation. 60 | # The standard iou of a ground truth (gt) and detected (dt) object is 61 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 62 | # For "crowd" regions, we use a modified criteria. If a gt object is 63 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 64 | # Choosing gt' in the crowd gt that best matches the dt can be done using 65 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 66 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 67 | # For crowd gt regions we use this modified criteria above for the iou. 68 | # 69 | # To compile run "python setup.py build_ext --inplace" 70 | # Please do not contact us for help with compiling. 71 | # 72 | # Microsoft COCO Toolbox. 
version 2.0 73 | # Data, paper, and tutorials available at: http://mscoco.org/ 74 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 75 | # Licensed under the Simplified BSD License [see coco/license.txt] 76 | 77 | iou = _mask.iou 78 | merge = _mask.merge 79 | frPyObjects = _mask.frPyObjects 80 | 81 | def encode(bimask): 82 | if len(bimask.shape) == 3: 83 | return _mask.encode(bimask) 84 | elif len(bimask.shape) == 2: 85 | h, w = bimask.shape 86 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 87 | 88 | def decode(rleObjs): 89 | if type(rleObjs) == list: 90 | return _mask.decode(rleObjs) 91 | else: 92 | return _mask.decode([rleObjs])[:,:,0] 93 | 94 | def area(rleObjs): 95 | if type(rleObjs) == list: 96 | return _mask.area(rleObjs) 97 | else: 98 | return _mask.area([rleObjs])[0] 99 | 100 | def toBbox(rleObjs): 101 | if type(rleObjs) == list: 102 | return _mask.toBbox(rleObjs) 103 | else: 104 | return _mask.toBbox([rleObjs])[0] 105 | -------------------------------------------------------------------------------- /SSD/utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 |
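// (added note) threadsPerBlock is sizeof(unsigned long long) * 8 = 64, so in the block below each
// thread packs its IoU tests against the 64 boxes of this column tile into a single 64-bit word:
// bit i of t is set when the overlap with column box i exceeds nms_overlap_thresh.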
61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(&current_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /SSD/utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef
np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | 70 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 71 | cdef unsigned int N = boxes.shape[0] 72 | cdef float iw, ih, box_area 73 | cdef float ua 74 | cdef int pos = 0 75 | cdef float maxscore = 0 76 | cdef int maxpos = 0 77 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 78 | 79 | for i in range(N): 80 | maxscore = boxes[i, 4] 81 | maxpos = i 82 | 83 | tx1 = boxes[i,0] 84 | ty1 = boxes[i,1] 85 | tx2 = boxes[i,2] 86 | ty2 = boxes[i,3] 87 | ts = boxes[i,4] 88 | 89 | pos = i + 1 90 | # get max box 91 | while pos < N: 92 | if maxscore < boxes[pos, 4]: 93 | maxscore = boxes[pos, 4] 94 | maxpos = pos 95 | pos = pos + 1 96 | 97 | # add max box as a detection 98 | boxes[i,0] = boxes[maxpos,0] 99 | boxes[i,1] = boxes[maxpos,1] 100 | boxes[i,2] = boxes[maxpos,2] 101 | boxes[i,3] = boxes[maxpos,3] 102 | boxes[i,4] = boxes[maxpos,4] 103 | 104 | # swap ith box with position of max box 105 | boxes[maxpos,0] = tx1 106 | boxes[maxpos,1] = ty1 107 | boxes[maxpos,2] = tx2 108 | boxes[maxpos,3] = ty2 109 | boxes[maxpos,4] = ts 110 | 111 | tx1 = boxes[i,0] 112 | ty1 = boxes[i,1] 113 | tx2 = boxes[i,2] 114 | ty2 = boxes[i,3] 115 | ts = boxes[i,4] 116 | 117 | pos = i + 1 118 | # NMS iterations, note that N changes if detection boxes fall below threshold 119 | while pos < N: 120 | x1 = boxes[pos, 0] 121 | y1 = boxes[pos, 1] 122 | x2 = boxes[pos, 2] 123 | y2 = boxes[pos, 3] 124 | s = boxes[pos, 4] 125 | 126 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 127 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 128 | if iw > 0: 129 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 130 | if ih > 0: 131 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 132 | ov = iw * ih / ua #iou between max box and detection box 133 | 134 | if method == 1: # linear 135 | if ov > Nt: 136 | weight = 1 - ov 137 | else: 138 | weight = 1 139 | elif method == 2: # gaussian 140 | weight = np.exp(-(ov * ov)/sigma) 141 | else: # original NMS 142 | if ov > 
Nt: 143 | weight = 0 144 | else: 145 | weight = 1 146 | 147 | boxes[pos, 4] = weight*boxes[pos, 4] 148 | 149 | # if box score falls below threshold, discard the box by swapping with last box 150 | # update N 151 | if boxes[pos, 4] < threshold: 152 | boxes[pos,0] = boxes[N-1, 0] 153 | boxes[pos,1] = boxes[N-1, 1] 154 | boxes[pos,2] = boxes[N-1, 2] 155 | boxes[pos,3] = boxes[N-1, 3] 156 | boxes[pos,4] = boxes[N-1, 4] 157 | N = N - 1 158 | pos = pos - 1 159 | 160 | pos = pos + 1 161 | 162 | keep = [i for i in range(N)] 163 | return keep 164 | -------------------------------------------------------------------------------- /SSD/utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 44 | if nvcc is None: 45 | raise EnvironmentError('The nvcc binary could not be ' 46 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 47 | home = os.path.dirname(os.path.dirname(nvcc)) 48 | 49 | cudaconfig = {'home': home, 'nvcc': nvcc, 50 | 'include': pjoin(home, 'include'), 51 | 'lib64': pjoin(home, 'lib64')} 52 | for k, v in cudaconfig.items(): 53 | if not os.path.exists(v): 54 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 55 | 56 | return cudaconfig 57 | 58 | 59 | CUDA = locate_cuda() 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | 72 | If you subclass UnixCCompiler, it's not trivial to get your subclass 73 | injected in, and still have the right customizations (i.e. 74 | distutils.sysconfig.customize_compiler) run on it. So instead of going 75 | the OO route, I have this.
Note, it's kind of like a weird functional 76 | subclassing going on.""" 77 | 78 | # tell the compiler it can process .cu 79 | self.src_extensions.append('.cu') 80 | 81 | # save references to the default compiler_so and _compile methods 82 | default_compiler_so = self.compiler_so 83 | super = self._compile 84 | 85 | # now redefine the _compile method. This gets executed for each 86 | # object but distutils doesn't have the ability to change compilers 87 | # based on source extension: we add it. 88 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 89 | print(extra_postargs) 90 | if os.path.splitext(src)[1] == '.cu': 91 | # use the cuda for .cu files 92 | self.set_executable('compiler_so', CUDA['nvcc']) 93 | # use only a subset of the extra_postargs, which are 1-1 translated 94 | # from the extra_compile_args in the Extension class 95 | postargs = extra_postargs['nvcc'] 96 | else: 97 | postargs = extra_postargs['gcc'] 98 | 99 | super(obj, src, ext, cc_args, postargs, pp_opts) 100 | # reset the default compiler_so, which we might have changed for cuda 101 | self.compiler_so = default_compiler_so 102 | 103 | # inject our redefined _compile method into the class 104 | self._compile = _compile 105 | 106 | 107 | # run the customize_compiler 108 | class custom_build_ext(build_ext): 109 | def build_extensions(self): 110 | customize_compiler_for_nvcc(self.compiler) 111 | build_ext.build_extensions(self) 112 | 113 | 114 | ext_modules = [ 115 | Extension( 116 | "nms.cpu_nms", 117 | ["nms/cpu_nms.pyx"], 118 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 119 | include_dirs=[numpy_include] 120 | ), 121 | Extension('nms.gpu_nms', 122 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 123 | library_dirs=[CUDA['lib64']], 124 | libraries=['cudart'], 125 | language='c++', 126 | runtime_library_dirs=[CUDA['lib64']], 127 | # this syntax is specific to this build system 128 | # we're only going to use certain compiler args with nvcc and not with gcc 129 | # the implementation of this trick is in customize_compiler() below 130 | extra_compile_args={'gcc': ["-Wno-unused-function"], 131 | 'nvcc': ['-arch=sm_52', 132 | '--ptxas-options=-v', 133 | '-c', 134 | '--compiler-options', 135 | "'-fPIC'"]}, 136 | include_dirs=[numpy_include, CUDA['include']] 137 | ), 138 | Extension( 139 | 'pycocotools._mask', 140 | sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], 141 | include_dirs=[numpy_include, 'pycocotools'], 142 | extra_compile_args={ 143 | 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, 144 | ), 145 | ] 146 | 147 | setup( 148 | name='mot_utils', 149 | ext_modules=ext_modules, 150 | # inject our custom trigger 151 | cmdclass={'build_ext': custom_build_ext}, 152 | ) 153 | -------------------------------------------------------------------------------- /SSD/data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | 13 | 14 | def parse_rec(filename): 15 | """ Parse a PASCAL VOC xml file """ 16 | tree = ET.parse(filename) 17 | objects = [] 18 | for obj in tree.findall('object'): 19 | obj_struct = {} 20 | obj_struct['name'] = obj.find('name').text 21 | obj_struct['pose'] =
obj.find('pose').text 22 | obj_struct['truncated'] = int(obj.find('truncated').text) 23 | obj_struct['difficult'] = int(obj.find('difficult').text) 24 | bbox = obj.find('bndbox') 25 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 26 | int(bbox.find('ymin').text), 27 | int(bbox.find('xmax').text), 28 | int(bbox.find('ymax').text)] 29 | objects.append(obj_struct) 30 | 31 | return objects 32 | 33 | 34 | 35 | def voc_ap(rec, prec, use_07_metric=False): 36 | """ ap = voc_ap(rec, prec, [use_07_metric]) 37 | Compute VOC AP given precision and recall. 38 | If use_07_metric is true, uses the 39 | VOC 07 11 point method (default:False). 40 | """ 41 | if use_07_metric: 42 | # 11 point metric 43 | ap = 0. 44 | for t in np.arange(0., 1.1, 0.1): 45 | if np.sum(rec >= t) == 0: 46 | p = 0 47 | else: 48 | p = np.max(prec[rec >= t]) 49 | ap = ap + p / 11. 50 | else: 51 | # correct AP calculation 52 | # first append sentinel values at the end 53 | mrec = np.concatenate(([0.], rec, [1.])) 54 | mpre = np.concatenate(([0.], prec, [0.])) 55 | 56 | # compute the precision envelope 57 | for i in range(mpre.size - 1, 0, -1): 58 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 59 | 60 | # to calculate area under PR curve, look for points 61 | # where X axis (recall) changes value 62 | i = np.where(mrec[1:] != mrec[:-1])[0] 63 | 64 | # and sum (\Delta recall) * prec 65 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 66 | return ap 67 | 68 | def voc_eval(detpath, 69 | annopath, 70 | imagesetfile, 71 | classname, 72 | cachedir, 73 | ovthresh=0.5, 74 | use_07_metric=False): 75 | """rec, prec, ap = voc_eval(detpath, 76 | annopath, 77 | imagesetfile, 78 | classname, 79 | [ovthresh], 80 | [use_07_metric]) 81 | 82 | Top level function that does the PASCAL VOC evaluation. 83 | 84 | detpath: Path to detections 85 | detpath.format(classname) should produce the detection results file. 86 | annopath: Path to annotations 87 | annopath.format(imagename) should be the xml annotations file. 88 | imagesetfile: Text file containing the list of images, one image per line. 
89 | classname: Category name (duh) 90 | cachedir: Directory for caching the annotations 91 | [ovthresh]: Overlap threshold (default = 0.5) 92 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 93 | (default False) 94 | """ 95 | # assumes detections are in detpath.format(classname) 96 | # assumes annotations are in annopath.format(imagename) 97 | # assumes imagesetfile is a text file with each line an image name 98 | # cachedir caches the annotations in a pickle file 99 | 100 | # first load gt 101 | if not os.path.isdir(cachedir): 102 | os.mkdir(cachedir) 103 | cachefile = os.path.join(cachedir, 'annots.pkl') 104 | # read list of images 105 | with open(imagesetfile, 'r') as f: 106 | lines = f.readlines() 107 | imagenames = [x.strip() for x in lines] 108 | 109 | if not os.path.isfile(cachefile): 110 | # load annots 111 | recs = {} 112 | for i, imagename in enumerate(imagenames): 113 | recs[imagename] = parse_rec(annopath.format(imagename)) 114 | if i % 100 == 0: 115 | print('Reading annotation for {:d}/{:d}'.format( 116 | i + 1, len(imagenames))) 117 | # save 118 | print('Saving cached annotations to {:s}'.format(cachefile)) 119 | with open(cachefile, 'wb') as f: 120 | pickle.dump(recs, f) 121 | else: 122 | # load 123 | with open(cachefile, 'rb') as f: 124 | recs = pickle.load(f) 125 | 126 | # extract gt objects for this class 127 | class_recs = {} 128 | npos = 0 129 | for imagename in imagenames: 130 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 131 | bbox = np.array([x['bbox'] for x in R]) 132 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 133 | det = [False] * len(R) 134 | npos = npos + sum(~difficult) 135 | class_recs[imagename] = {'bbox': bbox, 136 | 'difficult': difficult, 137 | 'det': det} 138 | 139 | # read dets 140 | detfile = detpath.format(classname) 141 | with open(detfile, 'r') as f: 142 | lines = f.readlines() 143 | 144 | splitlines = [x.strip().split(' ') for x in lines] 145 | image_ids = [x[0] for x in splitlines] 146 | confidence = np.array([float(x[1]) for x in splitlines]) 147 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 148 | 149 | # sort by confidence 150 | sorted_ind = np.argsort(-confidence) 151 | sorted_scores = np.sort(-confidence) 152 | BB = BB[sorted_ind, :] 153 | image_ids = [image_ids[x] for x in sorted_ind] 154 | 155 | # go down dets and mark TPs and FPs 156 | nd = len(image_ids) 157 | tp = np.zeros(nd) 158 | fp = np.zeros(nd) 159 | for d in range(nd): 160 | R = class_recs[image_ids[d]] 161 | bb = BB[d, :].astype(float) 162 | ovmax = -np.inf 163 | BBGT = R['bbox'].astype(float) 164 | 165 | if BBGT.size > 0: 166 | # compute overlaps 167 | # intersection 168 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 169 | iymin = np.maximum(BBGT[:, 1], bb[1]) 170 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 171 | iymax = np.minimum(BBGT[:, 3], bb[3]) 172 | iw = np.maximum(ixmax - ixmin + 1., 0.) 173 | ih = np.maximum(iymax - iymin + 1., 0.) 174 | inters = iw * ih 175 | 176 | # union 177 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 178 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 179 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 180 | 181 | overlaps = inters / uni 182 | ovmax = np.max(overlaps) 183 | jmax = np.argmax(overlaps) 184 | 185 | if ovmax > ovthresh: 186 | if not R['difficult'][jmax]: 187 | if not R['det'][jmax]: 188 | tp[d] = 1. 189 | R['det'][jmax] = 1 190 | else: 191 | fp[d] = 1. 192 | else: 193 | fp[d] = 1. 
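# Note on the matching loop above: detections are visited in descending
# confidence order, and a detection counts as a TP only when it overlaps a
# not-yet-claimed, non-difficult GT box by more than ovthresh; R['det'][jmax]
# marks a GT as claimed, so duplicate detections of the same object become
# FPs, while matches to difficult GTs are neither TPs nor FPs.
# Illustrative example with hypothetical values: tp = [1, 1, 0, 1],
# fp = [0, 0, 1, 0] and npos = 5 give, after the cumulative sums below,
# rec = [0.2, 0.4, 0.4, 0.6] and prec = [1.0, 1.0, 0.667, 0.75].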
194 | 195 | # compute precision recall 196 | fp = np.cumsum(fp) 197 | tp = np.cumsum(tp) 198 | rec = tp / float(npos) 199 | # avoid divide by zero in case the first detection matches a difficult 200 | # ground truth 201 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 202 | ap = voc_ap(rec, prec, use_07_metric) 203 | 204 | return rec, prec, ap 205 | -------------------------------------------------------------------------------- /SSD/layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from utils.box_utils import match, match_mixup,point_form 8 | GPU = False 9 | if torch.cuda.is_available(): 10 | GPU = True 11 | from .loss import * 12 | 13 | class MultiBoxLoss(nn.Module): 14 | """SSD Weighted Loss Function 15 | Compute Targets: 16 | 1) Produce Confidence Target Indices by matching ground truth boxes 17 | with (default) 'priorboxes' that have jaccard index > threshold parameter 18 | (default threshold: 0.5). 19 | 2) Produce localization target by 'encoding' variance into offsets of ground 20 | truth boxes and their matched 'priorboxes'. 21 | 3) Hard negative mining to filter the excessive number of negative examples 22 | that comes with using a large number of default bounding boxes. 23 | (default negative:positive ratio 3:1) 24 | Objective Loss: 25 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 26 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 27 | weighted by α which is set to 1 by cross val. 28 | Args: 29 | c: class confidences, 30 | l: predicted boxes, 31 | g: ground truth boxes 32 | N: number of matched default boxes 33 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 34 | """ 35 | 36 | 37 | def __init__(self, num_classes,overlap_thresh,prior_for_matching,bkg_label,neg_mining,neg_pos,neg_overlap,encode_target, 38 | label_smmooth=False, balance_l1=False, focal_loss=False, giou=False): 39 | super(MultiBoxLoss, self).__init__() 40 | self.num_classes = num_classes 41 | self.threshold = overlap_thresh 42 | self.background_label = bkg_label 43 | self.encode_target = encode_target 44 | self.use_prior_for_matching = prior_for_matching 45 | self.do_neg_mining = neg_mining 46 | self.negpos_ratio = neg_pos 47 | self.neg_overlap = neg_overlap 48 | self.variance = [0.1,0.2] 49 | self.label_smooth = label_smmooth 50 | if self.label_smooth: 51 | self.label_pos = 0.9 52 | self.label_neg = (1.0 - self.label_pos) / (self.num_classes - 1) 53 | self.balance_l1 = balance_l1 54 | self.focal_loss = focal_loss 55 | self.softmax_focal = False # using OHEM, CEWithsoftmax and Focal loss 56 | self.sigmoid_focal = False # Original Focal loss(Using sigmoid with CE) 57 | if self.focal_loss: 58 | self.softmax_focal = True 59 | if self.sigmoid_focal: 60 | self.alpha = 0.25 61 | self.gamma = 2.0 62 | self.giou = giou 63 | 64 | def forward(self, predictions, priors, targets): 65 | """Multibox Loss 66 | Args: 67 | predictions (tuple): A tuple containing loc preds, conf preds, 68 | and prior boxes from SSD net. 69 | conf shape: torch.size(batch_size,num_priors,num_classes) 70 | loc shape: torch.size(batch_size,num_priors,4) 71 | priors shape: torch.size(num_priors,4) 72 | 73 | ground_truth (tensor): Ground truth boxes and labels for a batch, 74 | shape: [batch_size,num_objs,5] (last idx is the label). 
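Return:
    loss_l, loss_c (tensor): the localization loss and the confidence
    loss, each already divided by N, the number of matched (positive)
    priors in the batch.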
75 | """ 76 | 77 | loc_data, conf_data = predictions 78 | priors = priors 79 | num = loc_data.size(0) 80 | num_priors = (priors.size(0)) 81 | num_classes = self.num_classes 82 | 83 | # match priors (default boxes) and ground truth boxes 84 | loc_t = torch.Tensor(num, num_priors, 4) 85 | conf_t = torch.LongTensor(num, num_priors) 86 | if targets[0].shape[1] == 6:# mixup 87 | weight_t = torch.Tensor(num, num_priors) 88 | for idx in range(num): 89 | defaults = priors.data 90 | if targets[idx].shape[1] == 6: # mixup 91 | truths = targets[idx][:, :-2].data 92 | labels = targets[idx][:, -2].data 93 | weight_loss = targets[idx][:, -1].data 94 | match_mixup(self.threshold, truths, defaults, self.variance, labels, loc_t, conf_t, idx, weight_t, weight_loss, self.giou) 95 | elif targets[idx].shape[1] == 5: # no moxiup 96 | truths = targets[idx][:, :-1].data 97 | labels = targets[idx][:, -1].data 98 | match(self.threshold, truths, defaults, self.variance, labels, loc_t, conf_t, idx, self.giou) 99 | else: 100 | print('The shape of targets is error') 101 | 102 | if GPU: 103 | loc_t = loc_t.cuda() 104 | conf_t = conf_t.cuda() 105 | # wrap targets 106 | loc_t = Variable(loc_t, requires_grad=False) 107 | conf_t = Variable(conf_t,requires_grad=False) 108 | 109 | pos = conf_t > 0 110 | 111 | mix_up = (False, True)[targets[0].shape[1] == 6] 112 | pos_weight = None 113 | weights_conf = None 114 | 115 | # Localization Loss (Smooth L1) 116 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 117 | loc_p = loc_data[pos_idx].view(-1,4) 118 | loc_t = loc_t[pos_idx].view(-1,4) 119 | 120 | if self.giou: 121 | # prior_giou = point_form(priors) # [x,y,h,w]->[x0,y0,x1,y1] 122 | prior_giou = priors.unsqueeze(0).expand(num, num_priors, 4) 123 | prior_giou = prior_giou[pos_idx].view(-1, 4) 124 | reg_loss = GIoUloss() 125 | loss_l = reg_loss(loc_p, prior_giou, loc_t) 126 | else: 127 | if mix_up: 128 | weight_t = weight_t.cuda() 129 | weight_t = Variable(weight_t, requires_grad=False) 130 | pos_weight = weight_t[pos].view(-1, 1) 131 | 132 | reg_loss = SmoothL1_Mixup_Balance_loss(mixup=mix_up, balance=self.balance_l1, size_average=False) 133 | loss_l = reg_loss(loc_p, loc_t, pos_weight) 134 | 135 | # Confidence Loss 136 | if self.sigmoid_focal: 137 | # if use original focal loss, please modify the output of the test in models/SSD.py to the sigmoid 138 | batch_conf = conf_data.view(-1, self.num_classes) 139 | label_onehot = batch_conf.clone().zero_().scatter(1, conf_t.view(-1,1), 1) 140 | alpha = self.alpha * label_onehot + (1 - self.alpha) * (1 - label_onehot) 141 | p = torch.sigmoid(batch_conf) 142 | pt = torch.where(label_onehot==1, p, 1-p) 143 | loss_c = - alpha * ((1 - pt) ** self.gamma) * torch.log(pt) 144 | loss_c = loss_c.sum() 145 | num_pos = pos.long().sum(1, keepdim=True) 146 | else: 147 | batch_conf = conf_data.view(-1, self.num_classes) 148 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 149 | 150 | # Hard Negative Mining 151 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 152 | loss_c = loss_c.view(num, -1) 153 | _, loss_idx = loss_c.sort(1, descending=True) 154 | _, idx_rank = loss_idx.sort(1) 155 | num_pos = pos.long().sum(1, keepdim=True) 156 | num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1) 157 | neg = idx_rank < num_neg.expand_as(idx_rank) 158 | 159 | # Confidence Loss Including Positive and Negative Examples 160 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 161 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 162 | conf_p = 
conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes) 163 | if self.label_smooth: 164 | p = conf_t.clone().view(-1, 1).float() 165 | lp = torch.where(p < 1, p + 1, torch.tensor(self.label_pos).cuda()) 166 | label = batch_conf.clone().zero_().scatter_(1, conf_t.view(-1, 1), lp) 167 | label[:, 1:][pos.clone().view(-1, 1).flatten()] += self.label_neg 168 | label_ohem = (pos + neg).view(-1, 1).expand_as(batch_conf) 169 | targets_weighted = label[label_ohem.gt(0)].view(-1, self.num_classes) 170 | else: 171 | targets_weighted = conf_t[(pos + neg).gt(0)] 172 | if mix_up: 173 | weights_conf = weight_t[(pos + neg).gt(0)] 174 | weights_conf = torch.where(weights_conf > 0, weights_conf, weights_conf + 1.0).view(-1, 1) 175 | 176 | conf_loss = Crossentropy_Mixup_SoftmaxFocal_LableSmooth_loss(mixup=mix_up,focal_loss=self.softmax_focal,gamma=2.0,alpha=1.0, 177 | label_smooth=self.label_smooth,size_average=False) 178 | loss_c = conf_loss(conf_p, targets_weighted, weights_conf) 179 | 180 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 181 | 182 | N = max(num_pos.data.sum().float(), 1) 183 | loss_l/=N 184 | loss_c/=N 185 | return loss_l,loss_c 186 | -------------------------------------------------------------------------------- /SSD/utils/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && athr) keep[j]=0; 105 | } 106 | } 107 | } 108 | 109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { 110 | double h, w, i, u, ga, da; siz g, d; int crowd; 111 | for( g=0; gthr) keep[j]=0; 129 | } 130 | } 131 | } 132 | 133 | void rleToBbox( const RLE *R, BB bb, siz n ) { 134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); 173 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 174 | s = dx>=dy ? 
(double)(ye-ys)/dx : (double)(xe-xs)/dy; 175 | if(dx>=dy) for( d=0; d<=dx; d++ ) { 176 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 177 | } else for( d=0; d<=dy; d++ ) { 178 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 179 | } 180 | } 181 | /* get points along y-boundary and downsample */ 182 | free(x); free(y); k=m; m=0; double xd, yd; 183 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 184 | for( j=1; jw-1 ) continue; 187 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 189 | x[m]=(int) xd; y[m]=(int) yd; m++; 190 | } 191 | /* compute rle encoding given y-boundary points */ 192 | k=m; a=malloc(sizeof(uint)*(k+1)); 193 | for( j=0; j0) b[m++]=a[j++]; else { 199 | j++; if(jm, p=0; long x; int more; 206 | char *s=malloc(sizeof(char)*m*6); 207 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 209 | while( more ) { 210 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 211 | if(more) c |= 0x20; c+=48; s[p++]=c; 212 | } 213 | } 214 | s[p]=0; return s; 215 | } 216 | 217 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 218 | siz m=0, p=0, k; long x; int more; uint *cnts; 219 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 220 | while( s[p] ) { 221 | x=0; k=0; more=1; 222 | while( more ) { 223 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 224 | more = c & 0x20; p++; k++; 225 | if(!more && (c & 0x10)) x |= -1 << 5*k; 226 | } 227 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 228 | } 229 | rleInit(R,h,w,m,cnts); free(cnts); 230 | } 231 | -------------------------------------------------------------------------------- /SSD/models/SSD.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from .base_models import vgg, vgg_base 6 | from layers import l2norm 7 | 8 | class BasicConv(nn.Module): 9 | 10 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, 11 | gn=False, bn=False): 12 | super(BasicConv, self).__init__() 13 | self.out_channels = out_planes 14 | if gn and bn: 15 | exit("Don't allow simultaneous use of BN and GN !") 16 | bias = (gn == bn) 17 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 18 | self.gn = nn.GroupNorm(32, out_planes,eps=1e-5, affine=True) if gn else None 19 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 20 | self.relu = nn.ReLU(inplace=True) if relu else None 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | if self.gn is not None: 25 | x = self.gn(x) 26 | if self.bn is not None: 27 | x = self.bn(x) 28 | if self.relu is not None: 29 | x = self.relu(x) 30 | return x 31 | 32 | class SSD(nn.Module): 33 | """Single Shot Multibox Architecture 34 | The network is composed of a base VGG network followed by the 35 | added multibox conv layers. Each multibox layer branches into 36 | 1) conv2d for class conf scores 37 | 2) conv2d for localization predictions 38 | 3) associated priorbox layer to produce default bounding 39 | boxes specific to the layer's feature map size. 40 | See: https://arxiv.org/pdf/1712.00960.pdf or more details. 
41 | 42 | Args: 43 | base: VGG16 layers for input, size of either 300 or 512 44 | extras: extra layers that feed to multibox loc and conf layers 45 | head: "multibox head" consists of loc and conf conv layers 46 | """ 47 | 48 | def __init__(self, base, extras, head, num_classes, size, norm): 49 | super(SSD, self).__init__() 50 | self.num_classes = num_classes 51 | # TODO: implement __call__ in PriorBox 52 | self.size = size 53 | 54 | # SSD network 55 | self.base = nn.ModuleList(base) 56 | self.extras = nn.ModuleList(extras) 57 | if norm == "L2Norm": 58 | self.Norm = l2norm.L2Norm(512, 20) 59 | elif norm == "BN": 60 | self.Norm = nn.BatchNorm2d(512, eps=1e-5, momentum=0.01, affine=True) 61 | elif norm == "GN": 62 | self.Norm = nn.GroupNorm(32, 512, eps=1e-5, affine=True) # group is defaulted to 32 63 | else: 64 | exit("Error type of Normalization, please assign one of L2Norm, BN, GN") 65 | 66 | self.loc = nn.ModuleList(head[0]) 67 | self.conf = nn.ModuleList(head[1]) 68 | 69 | self.softmax = nn.Softmax(dim=1) 70 | 71 | def forward(self, x, vgg_bn=False,test=False): 72 | """Applies network layers and ops on input image(s) x. 73 | 74 | Args: 75 | x: input image or batch of images. Shape: [batch,3,300,300]. 76 | 77 | Return: 78 | Depending on phase: 79 | test: 80 | Variable(tensor) of output class label predictions, 81 | confidence score, and corresponding location predictions for 82 | each object detected. Shape: [batch,topk,7] 83 | 84 | train: 85 | list of concat outputs from: 86 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 87 | 2: localization layers, Shape: [batch,num_priors*4] 88 | 3: priorbox layers, Shape: [2,num_priors*4] 89 | """ 90 | source_features = list() 91 | loc = list() 92 | conf = list() 93 | 94 | # apply vgg up to conv4_3 relu 95 | for k in range(23): 96 | x = self.base[k](x) 97 | 98 | if vgg_bn: 99 | x1 = x 100 | else: 101 | x1 = self.Norm(x) 102 | source_features.append(x1) 103 | 104 | # apply vgg up to fc7 105 | for k in range(23, len(self.base)): 106 | x = self.base[k](x) 107 | source_features.append(x) 108 | 109 | for i,k in enumerate(self.extras): 110 | x = k(x) 111 | if i % 2 == 1: 112 | source_features.append(x) 113 | 114 | # apply multibox head to source layers 115 | for (x, l, c) in zip(source_features, self.loc, self.conf): 116 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 117 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 118 | 119 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 120 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 121 | if test: 122 | output = ( 123 | loc.view(loc.size(0), -1, 4), # loc preds 124 | self.softmax(conf.view(-1, self.num_classes)), # conf preds 125 | # torch.sigmoid(conf.view(-1, self.num_classes)) 126 | ) 127 | else: 128 | output = ( 129 | loc.view(loc.size(0), -1, 4), 130 | conf.view(conf.size(0), -1, self.num_classes), 131 | ) 132 | return output 133 | 134 | def load_weights(self, base_file): 135 | other, ext = os.path.splitext(base_file) 136 | if ext == '.pkl' or ext == '.pth': 137 | print('Loading weights into state dict...') 138 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 139 | print('Finished!') 140 | else: 141 | print('Sorry only .pth and .pkl files supported.') 142 | 143 | def add_extras(size,norm): 144 | if size == 300: 145 | if norm == "BN" or norm == "L2Norm": # if using L2Norm, we set BN for normalization on the extra layers 146 | layers = [BasicConv(1024, 256, kernel_size=1, stride=1, padding=0, bn=True), 147 | BasicConv(256, 512, 
kernel_size=3, stride=2, padding=1, bn=True), 148 | BasicConv(512, 128, kernel_size=1, stride=1, padding=0, bn=True), 149 | BasicConv(128, 256, kernel_size=3, stride=2, padding=1, bn=True), 150 | BasicConv(256, 128, kernel_size=1, stride=1, padding=0, bn=True), 151 | BasicConv(128, 256, kernel_size=3, stride=1, padding=0, bn=True), 152 | BasicConv(256, 128, kernel_size=1, stride=1, padding=0, bn=True), 153 | BasicConv(128, 256, kernel_size=3, stride=1, padding=0, bn=True),] 154 | elif norm == "GN": 155 | layers = [BasicConv(1024, 256, kernel_size=1, stride=1, padding=0, gn=True), 156 | BasicConv(256, 512, kernel_size=3, stride=2, padding=1, gn=True), 157 | BasicConv(512, 128, kernel_size=1, stride=1, padding=0, gn=True), 158 | BasicConv(128, 256, kernel_size=3, stride=2, padding=1, gn=True), 159 | BasicConv(256, 128, kernel_size=1, stride=1, padding=0, gn=True), 160 | BasicConv(128, 256, kernel_size=3, stride=1, padding=0, gn=True), 161 | BasicConv(256, 128, kernel_size=1, stride=1, padding=0, gn=True), 162 | BasicConv(128, 256, kernel_size=3, stride=1, padding=0, gn=True), ] 163 | else: 164 | exit("Error type of Normalization, please assign one of L2Norm, BN, GN") 165 | 166 | elif size == 512: 167 | layers = [BasicConv(256 * 3, 512, kernel_size=3, stride=1, padding=1), 168 | BasicConv(512, 512, kernel_size=3, stride=2, padding=1), \ 169 | BasicConv(512, 256, kernel_size=3, stride=2, padding=1), 170 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 171 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), 172 | BasicConv(256, 256, kernel_size=3, stride=2, padding=1), \ 173 | BasicConv(256, 256, kernel_size=4, padding=1, stride=1)] 174 | return layers 175 | 176 | 177 | def multibox(fea_channels, cfg, num_classes): 178 | loc_layers = [] 179 | conf_layers = [] 180 | assert len(fea_channels) == len(cfg) 181 | for i, fea_channel in enumerate(fea_channels): 182 | loc_layers += [nn.Conv2d(fea_channel, cfg[i] * 4, kernel_size=3, padding=1)] 183 | conf_layers += [nn.Conv2d(fea_channel, cfg[i] * num_classes, kernel_size=3, padding=1)] 184 | return (loc_layers, conf_layers) 185 | 186 | 187 | mbox = { 188 | '300': [6, 6, 6, 6, 4, 4], # number of boxes per feature map location 189 | '512': [6, 6, 6, 6, 6, 4, 4], 190 | } 191 | fea_channels = { 192 | '300': [512, 1024, 512, 256, 256, 256], 193 | '512': [512, 512, 256, 256, 256, 256, 256]} 194 | 195 | 196 | def build_net(size=300, num_classes=21, norm="BN",vgg_bn=False): 197 | if size != 300 and size != 512: 198 | print("Error: Sorry only FSSD300 and FSSD512 is supported currently!") 199 | return 200 | 201 | return SSD(base=vgg(vgg_base[str(size)], 3, batch_norm=vgg_bn),extras=add_extras(size,norm),head=multibox(fea_channels[str(size)], mbox[str(size)], num_classes), 202 | num_classes=num_classes, size=size, norm=norm) 203 | 204 | -------------------------------------------------------------------------------- /SSD/utils/pycocotools/_mask.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language = c 2 | # distutils: sources = ../common/maskApi.c 3 | 4 | #************************************************************************** 5 | # Microsoft COCO Toolbox. version 2.0 6 | # Data, paper, and tutorials available at: http://mscoco.org/ 7 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
8 | # Licensed under the Simplified BSD License [see coco/license.txt] 9 | #************************************************************************** 10 | 11 | __author__ = 'tsungyi' 12 | 13 | import sys 14 | PYTHON_VERSION = sys.version_info[0] 15 | 16 | # import both Python-level and C-level symbols of Numpy 17 | # the API uses Numpy to interface C and Python 18 | import numpy as np 19 | cimport numpy as np 20 | from libc.stdlib cimport malloc, free 21 | 22 | # intialized Numpy. must do. 23 | np.import_array() 24 | 25 | # import numpy C function 26 | # we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management 27 | cdef extern from "numpy/arrayobject.h": 28 | void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) 29 | 30 | # Declare the prototype of the C functions in MaskApi.h 31 | cdef extern from "maskApi.h": 32 | ctypedef unsigned int uint 33 | ctypedef unsigned long siz 34 | ctypedef unsigned char byte 35 | ctypedef double* BB 36 | ctypedef struct RLE: 37 | siz h, 38 | siz w, 39 | siz m, 40 | uint* cnts, 41 | void rlesInit( RLE **R, siz n ) 42 | void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) 43 | void rleDecode( const RLE *R, byte *mask, siz n ) 44 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) 45 | void rleArea( const RLE *R, siz n, uint *a ) 46 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) 47 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) 48 | void rleToBbox( const RLE *R, BB bb, siz n ) 49 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) 50 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) 51 | char* rleToString( const RLE *R ) 52 | void rleFrString( RLE *R, char *s, siz h, siz w ) 53 | 54 | # python class to wrap RLE array in C 55 | # the class handles the memory allocation and deallocation 56 | cdef class RLEs: 57 | cdef RLE *_R 58 | cdef siz _n 59 | 60 | def __cinit__(self, siz n =0): 61 | rlesInit(&self._R, n) 62 | self._n = n 63 | 64 | # free the RLE array here 65 | def __dealloc__(self): 66 | if self._R is not NULL: 67 | for i in range(self._n): 68 | free(self._R[i].cnts) 69 | free(self._R) 70 | def __getattr__(self, key): 71 | if key == 'n': 72 | return self._n 73 | raise AttributeError(key) 74 | 75 | # python class to wrap Mask array in C 76 | # the class handles the memory allocation and deallocation 77 | cdef class Masks: 78 | cdef byte *_mask 79 | cdef siz _h 80 | cdef siz _w 81 | cdef siz _n 82 | 83 | def __cinit__(self, h, w, n): 84 | self._mask = malloc(h*w*n* sizeof(byte)) 85 | self._h = h 86 | self._w = w 87 | self._n = n 88 | # def __dealloc__(self): 89 | # the memory management of _mask has been passed to np.ndarray 90 | # it doesn't need to be freed here 91 | 92 | # called when passing into np.array() and return an np.ndarray in column-major order 93 | def __array__(self): 94 | cdef np.npy_intp shape[1] 95 | shape[0] = self._h*self._w*self._n 96 | # Create a 1D array, and reshape it to fortran/Matlab column-major array 97 | ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') 98 | # The _mask allocated by Masks is now handled by ndarray 99 | PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) 100 | return ndarray 101 | 102 | # internal conversion from Python RLEs object to compressed RLE format 103 | def _toString(RLEs Rs): 104 | cdef siz n = Rs.n 105 | cdef bytes py_string 106 | cdef char* c_string 107 | objs = [] 108 | for i in range(n): 109 | c_string = rleToString( 
&Rs._R[i] ) 110 | py_string = c_string 111 | objs.append({ 112 | 'size': [Rs._R[i].h, Rs._R[i].w], 113 | 'counts': py_string 114 | }) 115 | free(c_string) 116 | return objs 117 | 118 | # internal conversion from compressed RLE format to Python RLEs object 119 | def _frString(rleObjs): 120 | cdef siz n = len(rleObjs) 121 | Rs = RLEs(n) 122 | cdef bytes py_string 123 | cdef char* c_string 124 | for i, obj in enumerate(rleObjs): 125 | if PYTHON_VERSION == 2: 126 | py_string = str(obj['counts']).encode('utf8') 127 | elif PYTHON_VERSION == 3: 128 | py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] 129 | else: 130 | raise Exception('Python version must be 2 or 3') 131 | c_string = py_string 132 | rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) 133 | return Rs 134 | 135 | # encode mask to RLEs objects 136 | # list of RLE string can be generated by RLEs member function 137 | def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): 138 | h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] 139 | cdef RLEs Rs = RLEs(n) 140 | rleEncode(Rs._R,mask.data,h,w,n) 141 | objs = _toString(Rs) 142 | return objs 143 | 144 | # decode mask from compressed list of RLE string or RLEs object 145 | def decode(rleObjs): 146 | cdef RLEs Rs = _frString(rleObjs) 147 | h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n 148 | masks = Masks(h, w, n) 149 | rleDecode(Rs._R, masks._mask, n); 150 | return np.array(masks) 151 | 152 | def merge(rleObjs, intersect=0): 153 | cdef RLEs Rs = _frString(rleObjs) 154 | cdef RLEs R = RLEs(1) 155 | rleMerge(Rs._R, R._R, Rs._n, intersect) 156 | obj = _toString(R)[0] 157 | return obj 158 | 159 | def area(rleObjs): 160 | cdef RLEs Rs = _frString(rleObjs) 161 | cdef uint* _a = malloc(Rs._n* sizeof(uint)) 162 | rleArea(Rs._R, Rs._n, _a) 163 | cdef np.npy_intp shape[1] 164 | shape[0] = Rs._n 165 | a = np.array((Rs._n, ), dtype=np.uint8) 166 | a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) 167 | PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) 168 | return a 169 | 170 | # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 171 | def iou( dt, gt, pyiscrowd ): 172 | def _preproc(objs): 173 | if len(objs) == 0: 174 | return objs 175 | if type(objs) == np.ndarray: 176 | if len(objs.shape) == 1: 177 | objs = objs.reshape((objs[0], 1)) 178 | # check if it's Nx4 bbox 179 | if not len(objs.shape) == 2 or not objs.shape[1] == 4: 180 | raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') 181 | objs = objs.astype(np.double) 182 | elif type(objs) == list: 183 | # check if list is in box format and convert it to np.ndarray 184 | isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) 185 | isrle = np.all(np.array([type(obj) == dict for obj in objs])) 186 | if isbox: 187 | objs = np.array(objs, dtype=np.double) 188 | if len(objs.shape) == 1: 189 | objs = objs.reshape((1,objs.shape[0])) 190 | elif isrle: 191 | objs = _frString(objs) 192 | else: 193 | raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') 194 | else: 195 | raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') 196 | return objs 197 | def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 198 | rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) 199 | def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): 200 | bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) 201 | def _len(obj): 202 | cdef siz N = 0 203 | if type(obj) == RLEs: 204 | N = obj.n 205 | elif len(obj)==0: 206 | pass 207 | elif type(obj) == np.ndarray: 208 | N = obj.shape[0] 209 | return N 210 | # convert iscrowd to numpy array 211 | cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) 212 | # simple type checking 213 | cdef siz m, n 214 | dt = _preproc(dt) 215 | gt = _preproc(gt) 216 | m = _len(dt) 217 | n = _len(gt) 218 | if m == 0 or n == 0: 219 | return [] 220 | if not type(dt) == type(gt): 221 | raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') 222 | 223 | # define local variables 224 | cdef double* _iou = 0 225 | cdef np.npy_intp shape[1] 226 | # check type and assign iou function 227 | if type(dt) == RLEs: 228 | _iouFun = _rleIou 229 | elif type(dt) == np.ndarray: 230 | _iouFun = _bbIou 231 | else: 232 | raise Exception('input data type not allowed.') 233 | _iou = malloc(m*n* sizeof(double)) 234 | iou = np.zeros((m*n, ), dtype=np.double) 235 | shape[0] = m*n 236 | iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) 237 | PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) 238 | _iouFun(dt, gt, iscrowd, m, n, iou) 239 | return iou.reshape((m,n), order='F') 240 | 241 | def toBbox( rleObjs ): 242 | cdef RLEs Rs = _frString(rleObjs) 243 | cdef siz n = Rs.n 244 | cdef BB _bb = malloc(4*n* sizeof(double)) 245 | rleToBbox( Rs._R, _bb, n ) 246 | cdef np.npy_intp shape[1] 247 | shape[0] = 4*n 248 | bb = np.array((1,4*n), dtype=np.double) 249 | bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) 250 | PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) 251 | return bb 252 | 253 | def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): 254 | cdef siz n = bb.shape[0] 255 | Rs = RLEs(n) 256 | rleFrBbox( Rs._R, bb.data, h, w, n ) 257 | objs = _toString(Rs) 258 | return objs 259 | 260 | def frPoly( poly, siz h, siz w ): 261 | cdef np.ndarray[np.double_t, ndim=1] np_poly 262 | n = len(poly) 263 | Rs = RLEs(n) 264 | for i, p in enumerate(poly): 265 | np_poly = np.array(p, dtype=np.double, order='F') 266 | rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) 267 | objs = _toString(Rs) 268 | return objs 269 | 270 | def frUncompressedRLE(ucRles, siz h, siz w): 271 | cdef np.ndarray[np.uint32_t, ndim=1] cnts 272 | cdef RLE R 273 | cdef uint *data 274 | n = len(ucRles) 275 | objs = [] 276 | for i in range(n): 277 | Rs = RLEs(1) 278 | cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) 279 | # time for malloc can be saved here but it's fine 280 | data = malloc(len(cnts)* sizeof(uint)) 281 | for j in range(len(cnts)): 282 | data[j] = cnts[j] 283 | R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) 284 | Rs._R[0] = R 285 | objs.append(_toString(Rs)[0]) 286 | return objs 287 | 288 | def frPyObjects(pyobj, h, w): 289 | # encode rle from a list of python objects 290 | if type(pyobj) == np.ndarray: 291 | objs = frBbox(pyobj, h, w) 292 | elif type(pyobj) == 
list and len(pyobj[0]) == 4: 293 | objs = frBbox(pyobj, h, w) 294 | elif type(pyobj) == list and len(pyobj[0]) > 4: 295 | objs = frPoly(pyobj, h, w) 296 | elif type(pyobj) == list and type(pyobj[0]) == dict \ 297 | and 'counts' in pyobj[0] and 'size' in pyobj[0]: 298 | objs = frUncompressedRLE(pyobj, h, w) 299 | # encode rle from single python object 300 | elif type(pyobj) == list and len(pyobj) == 4: 301 | objs = frBbox([pyobj], h, w)[0] 302 | elif type(pyobj) == list and len(pyobj) > 4: 303 | objs = frPoly([pyobj], h, w)[0] 304 | elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: 305 | objs = frUncompressedRLE([pyobj], h, w)[0] 306 | else: 307 | raise Exception('input type is not supported.') 308 | return objs 309 | -------------------------------------------------------------------------------- /SSD/data/coco.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import pickle 11 | import os.path 12 | import sys 13 | import torch 14 | import torch.utils.data as data 15 | import torchvision.transforms as transforms 16 | import cv2 17 | import numpy as np 18 | import json 19 | import uuid 20 | 21 | from utils.pycocotools.coco import COCO 22 | from utils.pycocotools.cocoeval import COCOeval 23 | from utils.pycocotools import mask as COCOmask 24 | 25 | 26 | class COCODetection(data.Dataset): 27 | 28 | """VOC Detection Dataset Object 29 | 30 | input is image, target is annotation 31 | 32 | Arguments: 33 | root (string): filepath to VOCdevkit folder. 34 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 35 | transform (callable, optional): transformation to perform on the 36 | input image 37 | target_transform (callable, optional): transformation to perform on the 38 | target `annotation` 39 | (eg: take in caption string, return tensor of word indices) 40 | dataset_name (string, optional): which dataset to load 41 | (default: 'VOC2007') 42 | """ 43 | 44 | def __init__(self, root, image_sets, preproc=None, target_transform=None, 45 | dataset_name='COCO'): 46 | self.root = root 47 | self.cache_path = os.path.join(self.root, 'cache') 48 | self.image_set = image_sets 49 | self.preproc = preproc 50 | self.target_transform = target_transform 51 | self.name = dataset_name 52 | self.ids = list() 53 | self.annotations = list() 54 | self._view_map = { 55 | 'minival2014' : 'val2014', # 5k val2014 subset 56 | 'valminusminival2014' : 'val2014', # val2014 \setminus minival2014 57 | 'test-dev2015' : 'test2015', 58 | } 59 | 60 | for (year, image_set) in image_sets: 61 | coco_name = image_set+year 62 | data_name = (self._view_map[coco_name] 63 | if coco_name in self._view_map 64 | else coco_name) 65 | annofile = self._get_ann_file(coco_name) 66 | _COCO = COCO(annofile) 67 | self._COCO = _COCO 68 | self.coco_name = coco_name 69 | cats = _COCO.loadCats(_COCO.getCatIds()) 70 | self._classes = tuple(['__background__'] + [c['name'] for c in cats]) 71 | self.num_classes = len(self._classes) 72 | self._class_to_ind = dict(zip(self._classes, range(self.num_classes))) 73 | self._class_to_coco_cat_id = dict(zip([c['name'] for c in cats], 74 | _COCO.getCatIds())) 75 | indexes = _COCO.getImgIds() 76 | self.image_indexes = indexes 77 | self.ids.extend([self.image_path_from_index(data_name, index) for index in indexes ]) 78 | if image_set.find('test') != -1: 79 | print('test set will not load annotations!') 80 | else: 81 | self.annotations.extend(self._load_coco_annotations(coco_name, indexes,_COCO)) 82 | 83 | 84 | 85 | def image_path_from_index(self, name, index): 86 | """ 87 | Construct an image path from the image's "index" identifier. 88 | """ 89 | # Example image path for index=119993: 90 | # images/train2014/COCO_train2014_000000119993.jpg 91 | file_name = ('COCO_' + name + '_' + 92 | str(index).zfill(12) + '.jpg') 93 | image_path = os.path.join(self.root, 'images', 94 | name, file_name) 95 | assert os.path.exists(image_path), \ 96 | 'Path does not exist: {}'.format(image_path) 97 | return image_path 98 | 99 | 100 | def _get_ann_file(self, name): 101 | prefix = 'instances' if name.find('test') == -1 \ 102 | else 'image_info' 103 | return os.path.join(self.root, 'annotations', 104 | prefix + '_' + name + '.json') 105 | 106 | 107 | def _load_coco_annotations(self, coco_name, indexes, _COCO): 108 | cache_file=os.path.join(self.cache_path,coco_name+'_gt_roidb.pkl') 109 | if os.path.exists(cache_file): 110 | with open(cache_file, 'rb') as fid: 111 | roidb = pickle.load(fid) 112 | print('{} gt roidb loaded from {}'.format(coco_name,cache_file)) 113 | return roidb 114 | 115 | gt_roidb = [self._annotation_from_index(index, _COCO) 116 | for index in indexes] 117 | with open(cache_file, 'wb') as fid: 118 | pickle.dump(gt_roidb,fid,pickle.HIGHEST_PROTOCOL) 119 | print('wrote gt roidb to {}'.format(cache_file)) 120 | return gt_roidb 121 | 122 | 123 | def _annotation_from_index(self, index, _COCO): 124 | """ 125 | Loads COCO bounding-box instance annotations. Crowd instances are 126 | handled by marking their overlaps (with all categories) to -1. 
This 127 | overlap value means that crowd "instances" are excluded from training. 128 | """ 129 | im_ann = _COCO.loadImgs(index)[0] 130 | width = im_ann['width'] 131 | height = im_ann['height'] 132 | 133 | annIds = _COCO.getAnnIds(imgIds=index, iscrowd=None) 134 | objs = _COCO.loadAnns(annIds) 135 | # Sanitize bboxes -- some are invalid 136 | valid_objs = [] 137 | for obj in objs: 138 | x1 = np.max((0, obj['bbox'][0])) 139 | y1 = np.max((0, obj['bbox'][1])) 140 | x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) 141 | y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) 142 | if obj['area'] > 0 and x2 >= x1 and y2 >= y1: 143 | obj['clean_bbox'] = [x1, y1, x2, y2] 144 | valid_objs.append(obj) 145 | objs = valid_objs 146 | num_objs = len(objs) 147 | 148 | res = np.zeros((num_objs, 5)) 149 | 150 | # Lookup table to map from COCO category ids to our internal class 151 | # indices 152 | coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls], 153 | self._class_to_ind[cls]) 154 | for cls in self._classes[1:]]) 155 | 156 | for ix, obj in enumerate(objs): 157 | cls = coco_cat_id_to_class_ind[obj['category_id']] 158 | res[ix, 0:4] = obj['clean_bbox'] 159 | res[ix, 4] = cls 160 | 161 | return res 162 | 163 | 164 | 165 | def __getitem__(self, index): 166 | img_id = self.ids[index] 167 | target = self.annotations[index] 168 | img = cv2.imread(img_id, cv2.IMREAD_COLOR) 169 | height, width, _ = img.shape 170 | 171 | if self.target_transform is not None: 172 | target = self.target_transform(target) 173 | 174 | 175 | if self.preproc is not None: 176 | img, target = self.preproc(img, target) 177 | 178 | # target = self.target_transform(target, width, height) 179 | #print(target.shape) 180 | 181 | return img, target 182 | 183 | def __len__(self): 184 | return len(self.ids) 185 | 186 | def pull_image(self, index): 187 | '''Returns the original image object at index in PIL form 188 | 189 | Note: not using self.__getitem__(), as any transformations passed in 190 | could mess up this functionality. 191 | 192 | Argument: 193 | index (int): index of img to show 194 | Return: 195 | PIL img 196 | ''' 197 | img_id = self.ids[index] 198 | return cv2.imread(img_id, cv2.IMREAD_COLOR) 199 | 200 | 201 | def pull_tensor(self, index): 202 | '''Returns the original image at an index in tensor form 203 | 204 | Note: not using self.__getitem__(), as any transformations passed in 205 | could mess up this functionality. 
206 | 207 | Argument: 208 | index (int): index of img to show 209 | Return: 210 | tensorized version of img, squeezed 211 | ''' 212 | to_tensor = transforms.ToTensor() 213 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 214 | 215 | def _print_detection_eval_metrics(self, coco_eval): 216 | IoU_lo_thresh = 0.5 217 | IoU_hi_thresh = 0.95 218 | def _get_thr_ind(coco_eval, thr): 219 | ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) & 220 | (coco_eval.params.iouThrs < thr + 1e-5))[0][0] 221 | iou_thr = coco_eval.params.iouThrs[ind] 222 | assert np.isclose(iou_thr, thr) 223 | return ind 224 | 225 | ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh) 226 | ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh) 227 | # precision has dims (iou, recall, cls, area range, max dets) 228 | # area range index 0: all area ranges 229 | # max dets index 2: 100 per image 230 | precision = \ 231 | coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] 232 | ap_default = np.mean(precision[precision > -1]) 233 | print('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] ' 234 | '~~~~'.format(IoU_lo_thresh, IoU_hi_thresh)) 235 | print('{:.1f}'.format(100 * ap_default)) 236 | for cls_ind, cls in enumerate(self._classes): 237 | if cls == '__background__': 238 | continue 239 | # minus 1 because of __background__ 240 | precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2] 241 | ap = np.mean(precision[precision > -1]) 242 | print('{:.1f}'.format(100 * ap)) 243 | 244 | print('~~~~ Summary metrics ~~~~') 245 | coco_eval.summarize() 246 | 247 | def _do_detection_eval(self, res_file, output_dir): 248 | ann_type = 'bbox' 249 | coco_dt = self._COCO.loadRes(res_file) 250 | coco_eval = COCOeval(self._COCO, coco_dt) 251 | coco_eval.params.useSegm = (ann_type == 'segm') 252 | coco_eval.evaluate() 253 | coco_eval.accumulate() 254 | self._print_detection_eval_metrics(coco_eval) 255 | eval_file = os.path.join(output_dir, 'detection_results.pkl') 256 | with open(eval_file, 'wb') as fid: 257 | pickle.dump(coco_eval, fid, pickle.HIGHEST_PROTOCOL) 258 | print('Wrote COCO eval results to: {}'.format(eval_file)) 259 | 260 | def _coco_results_one_category(self, boxes, cat_id): 261 | results = [] 262 | for im_ind, index in enumerate(self.image_indexes): 263 | dets = boxes[im_ind].astype(np.float) 264 | if dets == []: 265 | continue 266 | scores = dets[:, -1] 267 | xs = dets[:, 0] 268 | ys = dets[:, 1] 269 | ws = dets[:, 2] - xs + 1 270 | hs = dets[:, 3] - ys + 1 271 | results.extend( 272 | [{'image_id' : index, 273 | 'category_id' : cat_id, 274 | 'bbox' : [xs[k], ys[k], ws[k], hs[k]], 275 | 'score' : scores[k]} for k in range(dets.shape[0])]) 276 | return results 277 | 278 | def _write_coco_results_file(self, all_boxes, res_file): 279 | # [{"image_id": 42, 280 | # "category_id": 18, 281 | # "bbox": [258.15,41.29,348.26,243.78], 282 | # "score": 0.236}, ...] 
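# The loop below collects results class by class: the internal class index
# is mapped back to the original COCO category id via _class_to_coco_cat_id,
# and the corner-to-[x, y, w, h] conversion was already done (with the +1
# width/height offsets) in _coco_results_one_category above, so everything
# here is just accumulated and dumped into a single json file.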
283 | results = [] 284 | for cls_ind, cls in enumerate(self._classes): 285 | if cls == '__background__': 286 | continue 287 | print('Collecting {} results ({:d}/{:d})'.format(cls, cls_ind, 288 | self.num_classes )) 289 | coco_cat_id = self._class_to_coco_cat_id[cls] 290 | results.extend(self._coco_results_one_category(all_boxes[cls_ind], 291 | coco_cat_id)) 292 | ''' 293 | if cls_ind ==30: 294 | res_f = res_file+ '_1.json' 295 | print('Writing results json to {}'.format(res_f)) 296 | with open(res_f, 'w') as fid: 297 | json.dump(results, fid) 298 | results = [] 299 | ''' 300 | #res_f2 = res_file+'_2.json' 301 | print('Writing results json to {}'.format(res_file)) 302 | with open(res_file, 'w') as fid: 303 | json.dump(results, fid) 304 | 305 | def evaluate_detections(self, all_boxes, output_dir): 306 | res_file = os.path.join(output_dir, ('detections_' + 307 | self.coco_name + 308 | '_results')) 309 | res_file += '.json' 310 | self._write_coco_results_file(all_boxes, res_file) 311 | # Only do evaluation on non-test sets 312 | if self.coco_name.find('test') == -1: 313 | self._do_detection_eval(res_file, output_dir) 314 | # Optionally cleanup results json file 315 | 316 | -------------------------------------------------------------------------------- /SSD/data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | """ 7 | 8 | import torch 9 | from torchvision import transforms 10 | import cv2 11 | import numpy as np 12 | import random 13 | import math 14 | from utils.box_utils import matrix_iou 15 | # import torch_transforms 16 | 17 | def _crop(image, boxes, labels): 18 | height, width, _ = image.shape 19 | 20 | if len(boxes)== 0: 21 | return image, boxes, labels 22 | 23 | while True: 24 | mode = random.choice(( 25 | None, 26 | (0.1, None), 27 | (0.3, None), 28 | (0.5, None), 29 | (0.7, None), 30 | (0.9, None), 31 | (None, None), 32 | )) 33 | 34 | if mode is None: 35 | return image, boxes, labels 36 | 37 | min_iou, max_iou = mode 38 | if min_iou is None: 39 | min_iou = float('-inf') 40 | if max_iou is None: 41 | max_iou = float('inf') 42 | 43 | for _ in range(50): 44 | scale = random.uniform(0.3,1.) 45 | min_ratio = max(0.5, scale*scale) 46 | max_ratio = min(2, 1. 
/ scale / scale) 47 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 48 | w = int(scale * ratio * width) 49 | h = int((scale / ratio) * height) 50 | 51 | 52 | l = random.randrange(width - w) 53 | t = random.randrange(height - h) 54 | roi = np.array((l, t, l + w, t + h)) 55 | 56 | iou = matrix_iou(boxes, roi[np.newaxis]) 57 | 58 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 59 | continue 60 | 61 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 62 | 63 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 64 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]).all(axis=1) 65 | boxes_t = boxes[mask].copy() 66 | labels_t = labels[mask].copy() 67 | if len(boxes_t) == 0: 68 | continue 69 | 70 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 71 | boxes_t[:, :2] -= roi[:2] 72 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 73 | boxes_t[:, 2:] -= roi[:2] 74 | 75 | return image_t, boxes_t,labels_t 76 | 77 | def _crop_mixup(image, boxes, labels, weights): 78 | height, width, _ = image.shape 79 | 80 | if len(boxes) == 0: 81 | return image, boxes, labels, weights 82 | 83 | while True: 84 | mode = random.choice(( 85 | None, 86 | (0.1, None), 87 | (0.3, None), 88 | (0.5, None), 89 | (0.7, None), 90 | (0.9, None), 91 | (None, None), 92 | )) 93 | 94 | if mode is None: 95 | return image, boxes, labels, weights 96 | 97 | min_iou, max_iou = mode 98 | if min_iou is None: 99 | min_iou = float('-inf') 100 | if max_iou is None: 101 | max_iou = float('inf') 102 | 103 | for _ in range(50): 104 | scale = random.uniform(0.3, 1.) 105 | min_ratio = max(0.5, scale * scale) 106 | max_ratio = min(2, 1. / scale / scale) 107 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 108 | w = int(scale * ratio * width) 109 | h = int((scale / ratio) * height) 110 | 111 | l = random.randrange(width - w) 112 | t = random.randrange(height - h) 113 | roi = np.array((l, t, l + w, t + h)) 114 | 115 | iou = matrix_iou(boxes, roi[np.newaxis]) 116 | 117 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 118 | continue 119 | 120 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 121 | 122 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 123 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]).all(axis=1) 124 | boxes_t = boxes[mask].copy() 125 | labels_t = labels[mask].copy() 126 | weights_t = weights[mask].copy() 127 | if len(boxes_t) == 0: 128 | continue 129 | 130 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 131 | boxes_t[:, :2] -= roi[:2] 132 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 133 | boxes_t[:, 2:] -= roi[:2] 134 | 135 | return image_t, boxes_t, labels_t, weights_t 136 | 137 | 138 | def _distort(image): 139 | def _convert(image, alpha=1, beta=0): 140 | tmp = image.astype(float) * alpha + beta 141 | tmp[tmp < 0] = 0 142 | tmp[tmp > 255] = 255 143 | image[:] = tmp 144 | 145 | image = image.copy() 146 | 147 | if random.randrange(2): 148 | _convert(image, beta=random.uniform(-32, 32)) 149 | 150 | if random.randrange(2): 151 | _convert(image, alpha=random.uniform(0.5, 1.5)) 152 | 153 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 154 | 155 | if random.randrange(2): 156 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 157 | tmp %= 180 158 | image[:, :, 0] = tmp 159 | 160 | if random.randrange(2): 161 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 162 | 163 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 164 | 165 | return image 166 | 167 | 168 | def _expand(image, boxes,fill, p): 169 | if random.random() > p: 170 | return image, boxes 171 | 
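    # The expansion below fires with probability p: a larger canvas (expansion
    # scale drawn uniformly from [1, 4], with aspect-ratio jitter) is filled
    # with `fill` (the per-channel dataset mean), the image is pasted at a
    # random offset, and the boxes are shifted by the same offset. Zooming out
    # this way exposes the network to more small objects during training.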
172 | height, width, depth = image.shape 173 | for _ in range(50): 174 | scale = random.uniform(1,4) 175 | 176 | min_ratio = max(0.5, 1./scale/scale) 177 | max_ratio = min(2, scale*scale) 178 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 179 | ws = scale*ratio 180 | hs = scale/ratio 181 | if ws < 1 or hs < 1: 182 | continue 183 | w = int(ws * width) 184 | h = int(hs * height) 185 | 186 | left = random.randint(0, w - width) 187 | top = random.randint(0, h - height) 188 | 189 | boxes_t = boxes.copy() 190 | boxes_t[:, :2] += (left, top) 191 | boxes_t[:, 2:] += (left, top) 192 | 193 | 194 | expand_image = np.empty( 195 | (h, w, depth), 196 | dtype=image.dtype) 197 | expand_image[:, :] = fill 198 | expand_image[top:top + height, left:left + width] = image 199 | image = expand_image 200 | 201 | return image, boxes_t 202 | 203 | 204 | def _mirror(image, boxes): 205 | _, width, _ = image.shape 206 | if random.randrange(2): 207 | image = image[:, ::-1] 208 | boxes = boxes.copy() 209 | boxes[:, 0::2] = width - boxes[:, 2::-2] 210 | return image, boxes 211 | 212 | 213 | def preproc_for_test(image, insize, mean): 214 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 215 | interp_method = interp_methods[random.randrange(5)] 216 | image = cv2.resize(image, (insize, insize),interpolation=interp_method) 217 | image = image.astype(np.float32) 218 | image -= mean 219 | return image.transpose(2, 0, 1) 220 | 221 | def _random_erasing(image, boxes, means, p=0.6, sl=0.02, sh=0.2, r1=0.3): 222 | if random.uniform(0, 1) > p: 223 | return image 224 | area_boxes = (boxes[:,2]-boxes[:,0])*(boxes[:,3]-boxes[:,1]) 225 | m = [2*j for j in means] # Please ensure the value of 2*mean is less to 255. 226 | for i in range(len(boxes)): 227 | for _ in range(50): 228 | area = random.uniform(sl,sh) * area_boxes[i] 229 | aspect_ratio = random.uniform(r1, 1.0 / r1) 230 | 231 | h = int(round(math.sqrt(area * aspect_ratio))) 232 | w = int(round(math.sqrt(area / aspect_ratio))) 233 | 234 | boxes_w = boxes[i,2] - boxes[i,0] 235 | boxes_h = boxes[i,3] - boxes[i,1] 236 | 237 | if w < boxes_w and h < boxes_h: 238 | x1 = int(random.randint(0,(boxes_w - w)) + boxes[i,0]) 239 | y1 = int(random.randint(0,(boxes_h - h)) + boxes[i,1]) 240 | image[y1:y1 + h, x1:x1 + w, :] = m 241 | break 242 | 243 | for j in range(50): 244 | area = random.uniform(sl, sh) * image.shape[0] * image.shape[1] 245 | aspect_ratio = random.uniform(r1, 1.0 / r1) 246 | h = int(round(math.sqrt(area * aspect_ratio))) 247 | w = int(round(math.sqrt(area / aspect_ratio))) 248 | 249 | if w < image.shape[1] and h < image.shape[0]: 250 | x1 = int(random.randint(0, (image.shape[1] - w))) 251 | y1 = int(random.randint(0, (image.shape[0] - h))) 252 | img_crop = np.array((x1,y1,x1+w,y1+h)) 253 | 254 | ios = matrix_iou(boxes, img_crop[np.newaxis], erasing=True) 255 | if ios.max() < 0.2: 256 | image[y1:y1 + h, x1:x1 + w, :] = m 257 | break 258 | # cv2.imshow('eras.jpg',image) 259 | # cv2.waitKey() 260 | # exit() 261 | return image 262 | 263 | 264 | class preproc(object): 265 | 266 | def __init__(self, resize, rgb_means, p): 267 | self.means = rgb_means 268 | self.resize = resize 269 | self.p = p 270 | 271 | def __call__(self, image, targets, random_erasing): 272 | boxes = targets[:,:-1].copy() 273 | labels = targets[:,-1].copy() 274 | if len(boxes) == 0: 275 | #boxes = np.empty((0, 4)) 276 | targets = np.zeros((1,5)) 277 | image = preproc_for_test(image, self.resize, self.means) 278 | return 
torch.from_numpy(image), targets 279 | 280 | image_o = image.copy() 281 | targets_o = targets.copy() 282 | height_o, width_o, _ = image_o.shape 283 | boxes_o = targets_o[:,:-1] 284 | labels_o = targets_o[:,-1] 285 | boxes_o[:, 0::2] /= width_o 286 | boxes_o[:, 1::2] /= height_o 287 | labels_o = np.expand_dims(labels_o,1) 288 | targets_o = np.hstack((boxes_o,labels_o)) 289 | 290 | image_t, boxes, labels = _crop(image, boxes, labels) 291 | image_t = _distort(image_t) 292 | if random_erasing: 293 | image_t = _random_erasing(image_t, boxes, self.means) 294 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 295 | image_t, boxes = _mirror(image_t, boxes) 296 | 297 | height, width, _ = image_t.shape 298 | image_t = preproc_for_test(image_t, self.resize, self.means) 299 | boxes = boxes.copy() 300 | boxes[:, 0::2] /= width 301 | boxes[:, 1::2] /= height 302 | b_w = (boxes[:, 2] - boxes[:, 0])*1. 303 | b_h = (boxes[:, 3] - boxes[:, 1])*1. 304 | mask_b= np.minimum(b_w, b_h) > 0.01 305 | boxes_t = boxes[mask_b] 306 | labels_t = labels[mask_b].copy() 307 | 308 | if len(boxes_t)==0: 309 | image = preproc_for_test(image_o, self.resize, self.means) 310 | return torch.from_numpy(image),targets_o 311 | 312 | labels_t = np.expand_dims(labels_t,1) 313 | targets_t = np.hstack((boxes_t,labels_t)) 314 | 315 | return torch.from_numpy(image_t), targets_t 316 | 317 | class preproc_mixup(object): 318 | 319 | def __init__(self, resize, rgb_means, p): 320 | self.means = rgb_means 321 | self.resize = resize 322 | self.p = p 323 | 324 | def __call__(self, image, targets, random_erasing): 325 | boxes = targets[:,:-2].copy() 326 | labels = targets[:,-2].copy() 327 | weights = targets[:,-1].copy() 328 | if len(boxes) == 0: 329 | #boxes = np.empty((0, 4)) 330 | targets = np.zeros((1,6)) 331 | image = preproc_for_test(image, self.resize, self.means) 332 | return torch.from_numpy(image), targets 333 | 334 | image_o = image.copy() 335 | targets_o = targets.copy() 336 | height_o, width_o, _ = image_o.shape 337 | boxes_o = targets_o[:,:-2] 338 | labels_o = targets_o[:,-2] 339 | weights_o = targets_o[:, -1] 340 | boxes_o[:, 0::2] /= width_o 341 | boxes_o[:, 1::2] /= height_o 342 | labels_o = np.expand_dims(labels_o,1) 343 | weights_o = np.expand_dims(weights_o, 1) 344 | targets_o = np.hstack((boxes_o,labels_o,weights_o)) 345 | 346 | image_t, boxes, labels, weights = _crop_mixup(image, boxes, labels, weights) 347 | image_t = _distort(image_t) 348 | if random_erasing: 349 | image_t = _random_erasing(image_t, boxes, self.means) 350 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 351 | image_t, boxes = _mirror(image_t, boxes) 352 | 353 | height, width, _ = image_t.shape 354 | image_t = preproc_for_test(image_t, self.resize, self.means) 355 | boxes = boxes.copy() 356 | boxes[:, 0::2] /= width 357 | boxes[:, 1::2] /= height 358 | b_w = (boxes[:, 2] - boxes[:, 0])*1. 359 | b_h = (boxes[:, 3] - boxes[:, 1])*1. 
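        # b_w and b_h are box width/height normalized to [0, 1]; the mask on
        # the next line keeps only boxes whose shorter side exceeds 1% of the
        # image, dropping degenerate slivers produced by aggressive crops.
        # For instance (hypothetical numbers), a 3-pixel-wide box in a
        # 300-pixel image has b_w = 0.01 and is discarded.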
360 | mask_b = np.minimum(b_w, b_h) > 0.01
361 | boxes_t = boxes[mask_b]
362 | labels_t = labels[mask_b].copy()
363 | weights_t = weights[mask_b].copy()
364 | 
365 | if len(boxes_t)==0:
366 | image = preproc_for_test(image_o, self.resize, self.means)
367 | return torch.from_numpy(image),targets_o
368 | 
369 | labels_t = np.expand_dims(labels_t,1)
370 | weights_t = np.expand_dims(weights_t,1)
371 | targets_t = np.hstack((boxes_t,labels_t,weights_t))
372 | 
373 | 
374 | 
375 | return torch.from_numpy(image_t), targets_t
376 | 
377 | class BaseTransform(object):
378 | """Defines the transformations that should be applied to test PIL image
379 | for input into the network
380 | 
381 | dimension -> tensorize -> color adj
382 | 
383 | Arguments:
384 | resize (int): input dimension to SSD
385 | rgb_means ((int,int,int)): average RGB of the dataset
386 | (104,117,123)
387 | swap ((int,int,int)): final order of channels
388 | Returns:
389 | transform (transform) : callable transform to be applied to test/val
390 | data
391 | """
392 | def __init__(self, resize, rgb_means, swap=(2, 0, 1)):
393 | self.means = rgb_means
394 | self.resize = resize
395 | self.swap = swap
396 | 
397 | # assume input is cv2 img for now
398 | def __call__(self, img):
399 | 
400 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4]
401 | interp_method = interp_methods[0]
402 | img = cv2.resize(np.array(img), (self.resize,
403 | self.resize),interpolation = interp_method).astype(np.float32)
404 | img -= self.means
405 | img = img.transpose(self.swap)
406 | return torch.from_numpy(img)
407 | 
--------------------------------------------------------------------------------
/SSD/utils/box_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import math
4 | import numpy as np
5 | if torch.cuda.is_available():
6 | import torch.backends.cudnn as cudnn
7 | 
8 | 
9 | def point_form(boxes):
10 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax)
11 | representation for comparison to point form ground truth data.
12 | Args:
13 | boxes: (tensor) center-size default boxes from priorbox layers.
14 | Return:
15 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes.
16 | """
17 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin
18 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax
19 | 
20 | 
21 | def center_size(boxes):
22 | """ Convert prior_boxes to (cx, cy, w, h)
23 | representation for comparison to center-size form ground truth data.
24 | Args:
25 | boxes: (tensor) point_form boxes
26 | Return:
27 | boxes: (tensor) Converted (cx, cy, w, h) form of boxes.
28 | """
29 | return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy
30 | boxes[:, 2:] - boxes[:, :2]), 1) # w, h
31 | 
32 | 
33 | def intersect(box_a, box_b):
34 | """ We resize both tensors to [A,B,2] without new malloc:
35 | [A,2] -> [A,1,2] -> [A,B,2]
36 | [B,2] -> [1,B,2] -> [A,B,2]
37 | Then we compute the area of intersect between box_a and box_b.
38 | Args:
39 | box_a: (tensor) bounding boxes, Shape: [A,4].
40 | box_b: (tensor) bounding boxes, Shape: [B,4].
41 | Return:
42 | (tensor) intersection area, Shape: [A,B].
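Example (illustrative values, not taken from this repo):
>>> box_a = torch.tensor([[0., 0., 2., 2.]]) # [A=1, 4]
>>> box_b = torch.tensor([[1., 1., 3., 3.], [4., 4., 5., 5.]]) # [B=2, 4]
>>> intersect(box_a, box_b) # 1x1 overlap with the first box, none with the second
tensor([[1., 0.]])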
43 | """
44 | A = box_a.size(0)
45 | B = box_b.size(0)
46 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
47 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
48 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
49 | box_b[:, :2].unsqueeze(0).expand(A, B, 2))
50 | inter = torch.clamp((max_xy - min_xy), min=0)
51 | return inter[:, :, 0] * inter[:, :, 1]
52 | 
53 | 
54 | def jaccard(box_a, box_b):
55 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
56 | is simply the intersection over union of two boxes. Here we operate on
57 | ground truth boxes and default boxes.
58 | E.g.:
59 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
60 | Args:
61 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
62 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
63 | Return:
64 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
65 | """
66 | inter = intersect(box_a, box_b)
67 | area_a = ((box_a[:, 2]-box_a[:, 0]) *
68 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
69 | area_b = ((box_b[:, 2]-box_b[:, 0]) *
70 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
71 | union = area_a + area_b - inter
72 | return inter / union # [A,B]
73 | 
74 | def matrix_iou(a,b,erasing=False):
75 | """
76 | return iou of a and b, numpy version for data augmentation
77 | """
78 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
79 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
80 | 
81 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
82 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
83 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
84 | if erasing:
85 | IoS = area_i / np.minimum(area_a[:, np.newaxis], area_b) # intersection over smaller area
86 | return IoS
87 | return area_i / (area_a[:, np.newaxis] + area_b - area_i)
88 | 
89 | def match_mixup(threshold, truths, priors, variances, labels, loc_t, conf_t, idx, weight_t, weight_loss, giou):
90 | overlaps = jaccard(
91 | truths,
92 | point_form(priors)
93 | )
94 | # (Bipartite Matching)
95 | # [1,num_objects] best prior for each ground truth
96 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
97 | # [1,num_priors] best ground truth for each prior
98 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
99 | best_truth_idx.squeeze_(0)
100 | best_truth_overlap.squeeze_(0)
101 | best_prior_idx.squeeze_(1)
102 | best_prior_overlap.squeeze_(1)
103 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior
104 | # TODO refactor: index best_prior_idx with long tensor
105 | # ensure every gt matches with its prior of max overlap
106 | for j in range(best_prior_idx.size(0)):
107 | best_truth_idx[best_prior_idx[j]] = j
108 | matches = truths[best_truth_idx] # Shape: [num_priors,4]
109 | conf = labels[best_truth_idx] # Shape: [num_priors]
110 | conf[best_truth_overlap < threshold] = 0 # label as background
111 | if not giou:
112 | loc = encode(matches, priors, variances)
113 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn
114 | else:
115 | loc_t[idx] = matches
116 | conf_t[idx] = conf # [num_priors] top class label for each prior
117 | weight = weight_loss[best_truth_idx]
118 | weight[best_truth_overlap < threshold] = 0.0
119 | weight_t[idx] = weight
120 | 
121 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx, giou):
122 | """Match each prior box with the ground truth box of the highest jaccard
123 | overlap, encode the bounding boxes, then return the matched indices
124 | corresponding to both confidence and location preds.
125 | Args:
126 | threshold: (float) The overlap threshold used when matching boxes.
127 | truths: (tensor) Ground truth boxes, Shape: [num_obj, 4].
128 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
129 | variances: (tensor) Variances corresponding to each prior coord,
130 | Shape: [num_priors, 4].
131 | labels: (tensor) All the class labels for the image, Shape: [num_obj].
132 | loc_t: (tensor) Tensor to be filled w/ encoded location targets.
133 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
134 | idx: (int) current batch index
135 | Return:
136 | The matched indices corresponding to 1)location and 2)confidence preds.
137 | """
138 | # jaccard index
139 | overlaps = jaccard(
140 | truths,
141 | point_form(priors)
142 | )
143 | # (Bipartite Matching)
144 | # [1,num_objects] best prior for each ground truth
145 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
146 | # [1,num_priors] best ground truth for each prior
147 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
148 | best_truth_idx.squeeze_(0)
149 | best_truth_overlap.squeeze_(0)
150 | best_prior_idx.squeeze_(1)
151 | best_prior_overlap.squeeze_(1)
152 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior
153 | # TODO refactor: index best_prior_idx with long tensor
154 | # ensure every gt matches with its prior of max overlap
155 | for j in range(best_prior_idx.size(0)):
156 | best_truth_idx[best_prior_idx[j]] = j
157 | matches = truths[best_truth_idx] # Shape: [num_priors,4]
158 | conf = labels[best_truth_idx] # Shape: [num_priors]
159 | conf[best_truth_overlap < threshold] = 0 # label as background
160 | if not giou:
161 | loc = encode(matches, priors, variances)
162 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn
163 | else:
164 | loc_t[idx] = matches
165 | conf_t[idx] = conf # [num_priors] top class label for each prior
166 | 
167 | def encode(matched, priors, variances):
168 | """Encode the variances from the priorbox layers into the ground truth boxes
169 | we have matched (based on jaccard overlap) with the prior boxes.
170 | Args:
171 | matched: (tensor) Coords of ground truth for each prior in point-form
172 | Shape: [num_priors, 4].
173 | priors: (tensor) Prior boxes in center-offset form
174 | Shape: [num_priors,4].
175 | variances: (list[float]) Variances of priorboxes
176 | Return:
177 | encoded boxes (tensor), Shape: [num_priors, 4]
178 | """
179 | 
180 | # dist b/t match center and prior's center
181 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
182 | # encode variance
183 | g_cxcy /= (variances[0] * priors[:, 2:])
184 | # match wh / prior wh
185 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
186 | g_wh = torch.log(g_wh) / variances[1]
187 | # return target for smooth_l1_loss
188 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
189 | 
190 | 
191 | def encode_multi(matched, priors, offsets, variances):
192 | """Encode the variances from the priorbox layers into the ground truth boxes
193 | we have matched (based on jaccard overlap) with the prior boxes.
194 | Args:
195 | matched: (tensor) Coords of ground truth for each prior in point-form
196 | Shape: [num_priors, 4].
197 | priors: (tensor) Prior boxes in center-offset form
198 | Shape: [num_priors,4].
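offsets: (tensor) Per-prior offsets in center-offset form; their
centers shift the prior centers and their sizes scale the
encoded center targets, Shape: [num_priors,4].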
199 | variances: (list[float]) Variances of priorboxes
200 | Return:
201 | encoded boxes (tensor), Shape: [num_priors, 4]
202 | """
203 | 
204 | # dist b/t match center and prior's center
205 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] - offsets[:,:2]
206 | # encode variance
207 | #g_cxcy /= (variances[0] * priors[:, 2:])
208 | g_cxcy.div_(variances[0] * offsets[:, 2:])
209 | # match wh / prior wh
210 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
211 | g_wh = torch.log(g_wh) / variances[1]
212 | # return target for smooth_l1_loss
213 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
214 | 
215 | # Adapted from https://github.com/Hakuyume/chainer-ssd
216 | def decode(loc, priors, variances):
217 | """Decode locations from predictions using priors to undo
218 | the encoding we did for offset regression at train time.
219 | Args:
220 | loc (tensor): location predictions for loc layers,
221 | Shape: [num_priors,4]
222 | priors (tensor): Prior boxes in center-offset form.
223 | Shape: [num_priors,4].
224 | variances: (list[float]) Variances of priorboxes
225 | Return:
226 | decoded bounding box predictions
227 | """
228 | 
229 | boxes = torch.cat((
230 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
231 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
232 | boxes[:, :2] -= boxes[:, 2:] / 2
233 | boxes[:, 2:] += boxes[:, :2]
234 | return boxes
235 | 
236 | def decode_multi(loc, priors, offsets, variances):
237 | """Decode locations from predictions using priors to undo
238 | the encoding we did for offset regression at train time.
239 | Args:
240 | loc (tensor): location predictions for loc layers,
241 | Shape: [num_priors,4]
242 | priors (tensor): Prior boxes in center-offset form.
243 | Shape: [num_priors,4].
244 | variances: (list[float]) Variances of priorboxes
245 | Return:
246 | decoded bounding box predictions
247 | """
248 | 
249 | boxes = torch.cat((
250 | priors[:, :2] + offsets[:,:2]+ loc[:, :2] * variances[0] * offsets[:, 2:],
251 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
252 | boxes[:, :2] -= boxes[:, 2:] / 2
253 | boxes[:, 2:] += boxes[:, :2]
254 | return boxes
255 | 
256 | def log_sum_exp(x,label_smooth=False):
257 | """Utility function for computing log_sum_exp in a numerically stable way.
258 | This will be used to determine unaveraged confidence loss across
259 | all examples in a batch.
260 | Args:
261 | x (Variable(tensor)): conf_preds from conf layers
262 | """
263 | x_max = x.data.max()
264 | if label_smooth:
265 | return torch.log(torch.sum(torch.exp(x - x_max), 1, keepdim=True)) + x_max - x
266 | else:
267 | return torch.log(torch.sum(torch.exp(x - x_max), 1, keepdim=True)) + x_max
268 | 
269 | def focal_sum_exp(x):
270 | """Utility function for computing a numerically stable softmax over
271 | class scores. This will be used when computing the focal variant of
272 | the confidence loss across all examples in a batch.
273 | Args:
274 | x (Variable(tensor)): conf_preds from conf layers
275 | """
276 | x_max = x.data.max()
277 | return torch.exp(x - x_max) / torch.sum(torch.exp(x - x_max), 1, keepdim=True)
278 | 
279 | # Original author: Francisco Massa:
280 | # https://github.com/fmassa/object-detection.torch
281 | # Ported to PyTorch by Max deGroot (02/01/2017)
282 | def nms(boxes, scores, overlap=0.5, top_k=200):
283 | """Apply non-maximum suppression at test time to avoid detecting too many
284 | overlapping bounding boxes for a given object.
285 | Args:
286 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
287 | scores: (tensor) The class pred scores for the img, Shape:[num_priors].
288 | overlap: (float) The overlap thresh for suppressing unnecessary boxes.
289 | top_k: (int) The maximum number of box preds to consider.
290 | Return:
291 | The indices of the kept boxes with respect to num_priors.
292 | """
293 | 
294 | keep = torch.Tensor(scores.size(0)).fill_(0).long()
295 | if boxes.numel() == 0:
296 | return keep
297 | x1 = boxes[:, 0]
298 | y1 = boxes[:, 1]
299 | x2 = boxes[:, 2]
300 | y2 = boxes[:, 3]
301 | area = torch.mul(x2 - x1, y2 - y1)
302 | v, idx = scores.sort(0) # sort in ascending order
303 | # I = I[v >= 0.01]
304 | idx = idx[-top_k:] # indices of the top-k largest vals
305 | xx1 = boxes.new()
306 | yy1 = boxes.new()
307 | xx2 = boxes.new()
308 | yy2 = boxes.new()
309 | w = boxes.new()
310 | h = boxes.new()
311 | 
312 | # keep = torch.Tensor()
313 | count = 0
314 | while idx.numel() > 0:
315 | i = idx[-1] # index of current largest val
316 | # keep.append(i)
317 | keep[count] = i
318 | count += 1
319 | if idx.size(0) == 1:
320 | break
321 | idx = idx[:-1] # remove kept element from view
322 | # load bboxes of next highest vals
323 | torch.index_select(x1, 0, idx, out=xx1)
324 | torch.index_select(y1, 0, idx, out=yy1)
325 | torch.index_select(x2, 0, idx, out=xx2)
326 | torch.index_select(y2, 0, idx, out=yy2)
327 | # store element-wise max with next highest score
328 | xx1 = torch.clamp(xx1, min=x1[i])
329 | yy1 = torch.clamp(yy1, min=y1[i])
330 | xx2 = torch.clamp(xx2, max=x2[i])
331 | yy2 = torch.clamp(yy2, max=y2[i])
332 | w.resize_as_(xx2)
333 | h.resize_as_(yy2)
334 | w = xx2 - xx1
335 | h = yy2 - yy1
336 | # check sizes of xx1 and xx2.. after each iteration
337 | w = torch.clamp(w, min=0.0)
338 | h = torch.clamp(h, min=0.0)
339 | inter = w*h
340 | # IoU = i / (area(a) + area(b) - i)
341 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas
342 | union = (rem_areas - inter) + area[i]
343 | IoU = inter/union # store result in iou
344 | # keep only elements with an IoU <= overlap
345 | idx = idx[IoU.le(overlap)]
346 | return keep, count
347 | 
348 | 
349 | 
--------------------------------------------------------------------------------
/SSD/train.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | import os
4 | import torch
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | import torch.backends.cudnn as cudnn
8 | import torchvision.transforms as transforms
9 | import torch.nn.init as init
10 | import argparse
11 | import numpy as np
12 | from torch.autograd import Variable
13 | import torch.utils.data as data
14 | from data import VOCroot, COCOroot, VOC_300, VOC_512, COCO_300, COCO_512, COCO_mobile_300, AnnotationTransform, COCODetection, VOCDetection, detection_collate, BaseTransform, preproc, preproc_mixup
15 | from layers.modules import MultiBoxLoss
16 | from layers.functions import PriorBox,Detect
17 | import time
18 | import math
19 | from val import val_net
20 | 
21 | parser = argparse.ArgumentParser(description='SSD Training')
22 | parser.add_argument('-v', '--version', default='SSD', help='version.')
23 | parser.add_argument('-s', '--size', default='300', help='300 or 512 input size.')
24 | parser.add_argument('-d', '--dataset', default='VOC', help='VOC or COCO dataset')
25 | parser.add_argument('--basenet', default='vgg16_bn.pth', help='pretrained base model')
26 | parser.add_argument('--jaccard_threshold', default=0.5, type=float, help='Min Jaccard index for matching')
27 | parser.add_argument('-b', '--batch_size', default=32, type=int, help='Batch size for training')
28 | parser.add_argument('--num_workers', default=8, type=int, help='Number of workers used in dataloading')
29 | parser.add_argument('--cuda', default=True, type=bool, help='Use cuda to train model')
30 | parser.add_argument('--ngpu', default=1, type=int, help='gpus')
31 | parser.add_argument('--lr', '--learning-rate', default=4e-3, type=float, help='initial learning rate')
32 | parser.add_argument('--momentum', default=0.9, type=float, help='momentum')
33 | parser.add_argument( '--resume_net', default=None, help='resume net for retraining')
34 | parser.add_argument('--resume_epoch', default=0, type=int, help='resume iter for retraining')
35 | parser.add_argument('-max','--max_epoch', default=250, type=int, help='max epoch for retraining')
36 | parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD')
37 | parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD')
38 | parser.add_argument('--log_iters', default=True, type=bool, help='Print the loss at each iteration')
39 | parser.add_argument('--save_folder', default='./weights/', help='Location to save checkpoint models')
40 | parser.add_argument('--save_val_folder', default='eval/', type=str, help='Dir to save results')
41 | parser.add_argument('-wu','--warm_epoch', default='5', type=int, help='warm up')
42 | parser.add_argument('-ls','--lr_schedule', default='cos', type=str, help='lr schedule: step;cos;htd')
43 | parser.add_argument('--norm', default="BN", type=str, help='L2Norm/BN/GN for normalization')
44 | parser.add_argument('-bd','--bias_decay', default=True, type=bool, help='BN/GN and bias for weight decay')
45 | parser.add_argument('--label_smooth', default=False, type=bool,
46 | help='Label Smooth for cls task, default label_pos=0.9. Please refer to layers/modules/multibox_loss.py')
47 | parser.add_argument('--balance_l1', default=False, type=bool, help='Balanced for SmoothL1, refer to Libra R-CNN')
48 | parser.add_argument('--random_erasing', default=True, type=bool, help='Random Erasing for Data Augmentation')
49 | parser.add_argument('--focal_loss', default=False, type=bool, help='Focal Loss')
50 | parser.add_argument('--alpha', default=0, type=float, help='Mixup for SSD; if alpha is zero, Mixup is not used')
51 | parser.add_argument('--giou', default=False, type=bool, help='GIOU for reg loss')
52 | parser.add_argument('--vgg_bn', default=True, type=bool, help='Use VGG16_BN as backbone for training')
53 | args = parser.parse_args()
54 | 
55 | 
56 | if not os.path.exists(args.save_folder):
57 | os.mkdir(args.save_folder)
58 | 
59 | if args.dataset == 'VOC':
60 | train_sets = [('2007', 'trainval'), ('2012', 'trainval')]
61 | cfg = (VOC_300, VOC_512)[args.size == '512']
62 | else:
63 | train_sets = [('2014', 'train'),('2014', 'valminusminival')]
64 | cfg = (COCO_300, COCO_512)[args.size == '512']
65 | 
66 | if args.version == 'SSD':
67 | from models.SSD import build_net
68 | else:
69 | print('Unknown version!')
70 | 
71 | img_dim = (300,512)[args.size=='512']
72 | rgb_means = (104, 117, 123)
73 | p = 0.6
74 | num_classes = (21, 81)[args.dataset == 'COCO']
75 | batch_size = args.batch_size
76 | weight_decay = args.weight_decay
77 | gamma = args.gamma
78 | momentum = args.momentum
79 | 
80 | net = build_net(img_dim, num_classes,args.norm,args.vgg_bn)
81 | print(net)
82 | if not args.resume_net:
83 | base_weights = torch.load(args.basenet)
84 | print('Loading base
network...') 85 | if args.vgg_bn: 86 | net.base[:-5].load_state_dict(base_weights) 87 | else: 88 | net.base.load_state_dict(base_weights) 89 | 90 | def weights_init(m): 91 | for key in m.state_dict(): 92 | if key.split('.')[-1] == 'weight': 93 | if 'conv' in key: 94 | init.kaiming_normal_(m.state_dict()[key], mode='fan_out',nonlinearity='relu') 95 | if 'bn' in key: 96 | m.state_dict()[key][...] = 1 97 | if 'gn' in key: 98 | m.state_dict()[key][...] = 1 99 | 100 | elif key.split('.')[-1] == 'bias': 101 | m.state_dict()[key][...] = 0 102 | 103 | def head_weights_init(m): 104 | for key in m.state_dict(): 105 | if key.split('.')[-1] == 'weight': 106 | init.xavier_uniform_(m.state_dict()[key]) 107 | elif key.split('.')[-1] == 'bias': 108 | m.state_dict()[key][...] = 0 109 | 110 | print('Initializing weights...') 111 | # initialize newly added layers' weights with kaiming_normal method 112 | if args.vgg_bn: 113 | net.base[-5:].apply(weights_init) 114 | net.extras.apply(weights_init) 115 | 116 | else: 117 | print('Loading resume network') 118 | state_dict = torch.load(args.resume_net) 119 | # create new OrderedDict that does not contain `module.` 120 | from collections import OrderedDict 121 | 122 | # multi-GPU 123 | new_state_dict = OrderedDict() 124 | for k, v in state_dict.items(): 125 | head = k[:7] 126 | if head == 'module.': 127 | name = k[7:] # remove `module.` 128 | else: 129 | name = k 130 | new_state_dict[name] = v 131 | net.load_state_dict(new_state_dict) 132 | 133 | if args.ngpu > 1: 134 | net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu))) 135 | 136 | if args.cuda: 137 | net.cuda() 138 | cudnn.benchmark = True 139 | 140 | if not args.bias_decay: # BN/GN and bias don't use weight decay 141 | spe_params = [] 142 | conv_params = [] 143 | for k, v in net.named_parameters(): 144 | if 'bn' in k or 'bias' in k: 145 | spe_params.append(v) 146 | else: 147 | conv_params.append(v) 148 | params_group = [{'params': spe_params, 'weight_decay': 0.0}, {'params': conv_params}] 149 | optimizer = optim.SGD(params_group, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) 150 | else: 151 | optimizer = optim.SGD(net.parameters(), lr=args.lr,momentum=args.momentum, weight_decay=args.weight_decay) 152 | 153 | criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False,label_smmooth=args.label_smooth,balance_l1=args.balance_l1, 154 | focal_loss=args.focal_loss,giou=args.giou) 155 | 156 | priorbox = PriorBox(cfg) 157 | with torch.no_grad(): 158 | priors = priorbox.forward() 159 | if args.cuda: 160 | priors = priors.cuda() 161 | 162 | def get_features_hook(self,input,output): 163 | print('~'*10) 164 | print('features:') 165 | print('input:',input[0][0,0]) 166 | print('output:',output[0,0]) 167 | 168 | def get_grads_hook(self,input_grad, output_grad): 169 | print('~'*10) 170 | print('grad:') 171 | print('grad_in:',input_grad[0][0,0]) 172 | print('grad_out',output_grad[0][0,0]) 173 | 174 | def train(): 175 | net.train() 176 | # loss counters 177 | loc_loss = 0 # epoch 178 | conf_loss = 0 179 | epoch = 0 + args.resume_epoch 180 | print('Loading Dataset...') 181 | 182 | if args.dataset == 'VOC': 183 | if args.alpha - 0.0 > 1e-5: 184 | dataset = VOCDetection(VOCroot, train_sets, preproc_mixup(img_dim, rgb_means, p), AnnotationTransform(), random_erasing=args.random_erasing, 185 | mixup_alpha=args.alpha) 186 | else: 187 | dataset = VOCDetection(VOCroot, train_sets, preproc(img_dim, rgb_means, p), AnnotationTransform(), random_erasing=args.random_erasing) 188 | elif 
args.dataset == 'COCO': 189 | dataset = COCODetection(COCOroot, train_sets, preproc(img_dim, rgb_means, p)) 190 | else: 191 | print('Only VOC and COCO are supported now!') 192 | return 193 | 194 | epoch_size = len(dataset) // args.batch_size 195 | max_iter = args.max_epoch * epoch_size 196 | 197 | stepvalues_VOC = (150 * epoch_size, 200 * epoch_size, 250 * epoch_size) 198 | stepvalues_COCO = (100 * epoch_size, 135 * epoch_size, 170 * epoch_size) 199 | stepvalues = (stepvalues_VOC,stepvalues_COCO)[args.dataset=='COCO'] 200 | print('Training',args.version, 'on', dataset.name) 201 | step_index = 0 202 | 203 | if args.resume_epoch > 0: 204 | start_iter = args.resume_epoch * epoch_size 205 | for sv in stepvalues: 206 | if start_iter>sv: 207 | step_index+=1 208 | continue 209 | else: 210 | break 211 | else: 212 | start_iter = 0 213 | 214 | lr = args.lr 215 | avg_loss_list = [] 216 | flag = True 217 | for iteration in range(start_iter, max_iter): 218 | if iteration % epoch_size == 0: 219 | # create batch iterator 220 | batch_iterator = iter(data.DataLoader(dataset, batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=detection_collate)) 221 | avg_loss = (loc_loss+conf_loss)/epoch_size 222 | avg_loss_list.append(avg_loss) 223 | print("avg_loss_list:") 224 | if len(avg_loss_list)<=5: 225 | print (avg_loss_list) 226 | else: 227 | print(avg_loss_list[-5:]) 228 | loc_loss = 0 229 | conf_loss = 0 230 | if (epoch<=150 and epoch%10==0) or (150< epoch< 200 and epoch%5==0) or (epoch>200): 231 | torch.save(net.state_dict(), args.save_folder+args.version+'_'+args.dataset + '_epoches_'+ repr(epoch) + '.pth') 232 | if (epoch!=args.resume_epoch): 233 | #if(epoch): 234 | ValNet = build_net(img_dim, num_classes, args.norm, args.vgg_bn) 235 | val_state_dict = torch.load(args.save_folder + args.version + '_' + args.dataset + '_epoches_' + repr(epoch) + '.pth') 236 | from collections import OrderedDict 237 | new_state_dict = OrderedDict() 238 | for k, v in val_state_dict.items(): 239 | head = k[:7] 240 | if head == 'module.': 241 | name = k[7:] 242 | else: 243 | name = k 244 | new_state_dict[name] = v 245 | ValNet.load_state_dict(new_state_dict) 246 | ValNet.eval() 247 | print('Finished loading ' + args.version + '_' + args.dataset + '_epoches_' + repr(epoch) + '.pth model!') 248 | if args.dataset == 'VOC': 249 | testset = VOCDetection(VOCroot, [('2007', 'test')], None, AnnotationTransform()) 250 | elif args.dataset == 'COCO': 251 | testset = COCODetection(COCOroot, [('2014', 'minival')], None) 252 | if args.cuda: 253 | ValNet = ValNet.cuda() 254 | cudnn.benchmark = True 255 | else: 256 | ValNet = ValNet.cpu() 257 | top_k = 200 258 | detector = Detect(num_classes, 0, cfg, GIOU=args.giou) 259 | save_val_folder = os.path.join(args.save_val_folder, args.dataset) 260 | val_transform = BaseTransform(ValNet.size, rgb_means, (2, 0, 1)) 261 | val_net(priors, save_val_folder, testset, num_classes, ValNet, detector, val_transform, top_k, 0.01, 262 | args.cuda,args.vgg_bn) 263 | epoch += 1 264 | 265 | load_t0 = time.time() 266 | if iteration in stepvalues: 267 | step_index += 1 268 | lr = adjust_learning_rate(optimizer, args.gamma, epoch, step_index, iteration, epoch_size) 269 | 270 | images, targets = next(batch_iterator) 271 | 272 | # no mixup 273 | if args.cuda: 274 | images = Variable(images.cuda()) 275 | targets = [Variable(anno.cuda()) for anno in targets] 276 | else: 277 | images = Variable(images) 278 | targets = [Variable(anno) for anno in targets] 279 | 280 | # fh = 
net.base[22].register_forward_hook(get_features_hook)
281 | # bh = net.base[22].register_backward_hook(get_grads_hook)
282 | out = net(images,vgg_bn=args.vgg_bn)
283 | optimizer.zero_grad()
284 | loss_l, loss_c = criterion(out, priors, targets)
285 | loss = loss_l + loss_c
286 | loss.backward()
287 | # fh.remove()
288 | # bh.remove()
289 | 
290 | optimizer.step()
291 | t1 = time.time()
292 | loc_loss += loss_l.item()
293 | conf_loss += loss_c.item()
294 | load_t1 = time.time()
295 | if iteration % 10 == 0:
296 | print('Epoch:' + repr(epoch) + ' || epochiter: ' + repr(iteration % epoch_size) + '/' + repr(epoch_size)
297 | + '|| Total iter ' + repr(iteration) + ' || L: %.4f C: %.4f S: %.4f||' % (loss_l.item(),loss_c.item(),loss_l.item()+loss_c.item()) +
298 | 'Batch time: %.4f ||' % (load_t1 - load_t0) + 'LR: %.7f' % (lr))
299 | 
300 | torch.save(net.state_dict(), args.save_folder + 'Final_' + args.version +'_' + args.dataset+ '.pth')
301 | 
302 | 
303 | def adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size,lr_schedule=args.lr_schedule):
304 | """Sets the learning rate
305 | # Adapted from PyTorch Imagenet example:
306 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py
307 | """
308 | if epoch <= args.warm_epoch:
309 | lr = 1e-6 + (args.lr - 1e-6) * iteration / (epoch_size * args.warm_epoch)
310 | else:
311 | if lr_schedule == 'step':
312 | lr = args.lr * (gamma ** (step_index))
313 | elif lr_schedule == 'cos':
314 | lr = 1e-6 + (args.lr - 1e-6) * 0.5 * (1 + math.cos(
315 | (iteration - args.warm_epoch * epoch_size) * math.pi /((args.max_epoch - args.warm_epoch) * epoch_size)))
316 | elif lr_schedule == 'htd':
317 | l,u = -6,3
318 | lr = 1e-6 + (args.lr - 1e-6) * 0.5 * (1 - math.tanh(l + (u - l) *
319 | ((iteration - args.warm_epoch * epoch_size) /(args.max_epoch - args.warm_epoch) /epoch_size)))
320 | else:
321 | print ('Unknown lr schedule type!')
322 | for param_group in optimizer.param_groups:
323 | param_group['lr'] = lr
324 | return lr
325 | 
326 | 
327 | if __name__ == '__main__':
328 | train()
329 | 
330 | 
--------------------------------------------------------------------------------
/SSD/data/voc0712.py:
--------------------------------------------------------------------------------
1 | """VOC Dataset Classes
2 | 
3 | Original author: Francisco Massa
4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
5 | 
6 | Updated by: Ellis Brown, Max deGroot
7 | """
8 | 
9 | import os
10 | import pickle
11 | import os.path
12 | import sys
13 | import torch
14 | import torch.utils.data as data
15 | import torchvision.transforms as transforms
16 | from PIL import Image, ImageDraw, ImageFont
17 | import cv2
18 | import numpy as np
19 | from .voc_eval import voc_eval
20 | if sys.version_info[0] == 2:
21 | import xml.etree.cElementTree as ET
22 | else:
23 | import xml.etree.ElementTree as ET
24 | 
25 | VOC_CLASSES = ( '__background__', # always index 0
26 | 'aeroplane', 'bicycle', 'bird', 'boat',
27 | 'bottle', 'bus', 'car', 'cat', 'chair',
28 | 'cow', 'diningtable', 'dog', 'horse',
29 | 'motorbike', 'person', 'pottedplant',
30 | 'sheep', 'sofa', 'train', 'tvmonitor')
31 | 
32 | # for making bounding boxes pretty
33 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128),
34 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128))
35 | 
36 | 
37 | class VOCSegmentation(data.Dataset):
38 | 
39 | """VOC Segmentation Dataset Object
40 | input and target are both images
41 | 
42 | NOTE: need to address https://github.com/pytorch/vision/issues/9
43 | 
44 | Arguments:
45 | root (string): filepath to VOCdevkit folder.
46 | image_set (string): imageset to use (eg: 'train', 'val', 'test').
47 | transform (callable, optional): transformation to perform on the
48 | input image
49 | target_transform (callable, optional): transformation to perform on the
50 | target image
51 | dataset_name (string, optional): which dataset to load
52 | (default: 'VOC2007')
53 | """
54 | 
55 | def __init__(self, root, image_set, transform=None, target_transform=None,
56 | dataset_name='VOC2007'):
57 | self.root = root
58 | self.image_set = image_set
59 | self.transform = transform
60 | self.target_transform = target_transform
61 | 
62 | self._annopath = os.path.join(
63 | self.root, dataset_name, 'SegmentationClass', '%s.png')
64 | self._imgpath = os.path.join(
65 | self.root, dataset_name, 'JPEGImages', '%s.jpg')
66 | self._imgsetpath = os.path.join(
67 | self.root, dataset_name, 'ImageSets', 'Segmentation', '%s.txt')
68 | 
69 | with open(self._imgsetpath % self.image_set) as f:
70 | self.ids = f.readlines()
71 | self.ids = [x.strip('\n') for x in self.ids]
72 | 
73 | def __getitem__(self, index):
74 | img_id = self.ids[index]
75 | 
76 | target = Image.open(self._annopath % img_id).convert('RGB')
77 | img = Image.open(self._imgpath % img_id).convert('RGB')
78 | 
79 | if self.transform is not None:
80 | img = self.transform(img)
81 | 
82 | if self.target_transform is not None:
83 | target = self.target_transform(target)
84 | 
85 | return img, target
86 | 
87 | def __len__(self):
88 | return len(self.ids)
89 | 
90 | 
91 | class AnnotationTransform(object):
92 | 
93 | """Transforms a VOC annotation into a Tensor of bbox coords and label index
94 | Initialized with a dictionary lookup of classnames to indexes
95 | 
96 | Arguments:
97 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
98 | (default: alphabetic indexing of VOC's 20 classes)
99 | keep_difficult (bool, optional): keep difficult instances or not
100 | (default: True)
101 | height (int): height
102 | width (int): width
103 | """
104 | 
105 | def __init__(self, class_to_ind=None, keep_difficult=True):
106 | self.class_to_ind = class_to_ind or dict(
107 | zip(VOC_CLASSES, range(len(VOC_CLASSES))))
108 | self.keep_difficult = keep_difficult
109 | 
110 | def __call__(self, target):
111 | """
112 | Arguments:
113 | target (annotation) : the target annotation to be made usable
114 | will be an ET.Element
115 | Returns:
116 | a list containing lists of bounding boxes [bbox coords, label index]
117 | """
118 | res = np.empty((0,5))
119 | for obj in target.iter('object'):
120 | difficult = int(obj.find('difficult').text) == 1
121 | if not self.keep_difficult and difficult:
122 | continue
123 | name = obj.find('name').text.lower().strip()
124 | bbox = obj.find('bndbox')
125 | 
126 | pts = ['xmin', 'ymin', 'xmax', 'ymax']
127 | bndbox = []
128 | for i, pt in enumerate(pts):
129 | cur_pt = int(bbox.find(pt).text) - 1
130 | # scale height or width
131 | #cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
132 | bndbox.append(cur_pt)
133 | label_idx = self.class_to_ind[name]
134 | bndbox.append(label_idx)
135 | res = np.vstack((res,bndbox)) # [xmin, ymin, xmax, ymax, label_ind]
136 | # img_id = target.find('filename').text[:-4]
137 | 
138 | return res # [[xmin, ymin, xmax, ymax, label_ind], ...
] 139 | 140 | 141 | class VOCDetection(data.Dataset): 142 | 143 | """VOC Detection Dataset Object 144 | 145 | input is image, target is annotation 146 | 147 | Arguments: 148 | root (string): filepath to VOCdevkit folder. 149 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 150 | transform (callable, optional): transformation to perform on the 151 | input image 152 | target_transform (callable, optional): transformation to perform on the 153 | target `annotation` 154 | (eg: take in caption string, return tensor of word indices) 155 | dataset_name (string, optional): which dataset to load 156 | (default: 'VOC2007') 157 | """ 158 | 159 | def __init__(self, root, image_sets, preproc=None, target_transform=None, dataset_name='VOC0712', means=(104, 117, 123), 160 | random_erasing=False, mixup_alpha=0.0): 161 | self.root = root 162 | self.image_set = image_sets 163 | self.preproc = preproc 164 | self.target_transform = target_transform 165 | self.name = dataset_name 166 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 167 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 168 | self.ids = list() 169 | self.means = means 170 | for (year, name) in image_sets: 171 | self._year = year 172 | rootpath = os.path.join(self.root, 'VOC' + year) 173 | for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 174 | self.ids.append((rootpath, line.strip())) 175 | self.random_erasing = random_erasing 176 | self.mixup_alpha = mixup_alpha 177 | 178 | def __getitem__(self, index): 179 | img_id = self.ids[index] 180 | target = ET.parse(self._annopath % img_id).getroot() 181 | img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 182 | height, width, _ = img.shape 183 | 184 | if self.target_transform is not None: 185 | target = self.target_transform(target) 186 | 187 | if (self.mixup_alpha - 0.0 > 1e-5): 188 | index2 = np.random.choice(np.delete(np.arange(len(self.ids)), index),replace=False) 189 | img_id2 = self.ids[index2] 190 | target2 = ET.parse(self._annopath % img_id2).getroot() 191 | img2 = cv2.imread(self._imgpath % img_id2, cv2.IMREAD_COLOR) 192 | height2, width2, _ = img2.shape 193 | if self.target_transform is not None: 194 | target2 = self.target_transform(target2) 195 | 196 | height_mix = max(height, height2) 197 | width_mix = max(width, width2) 198 | mix_img = np.zeros((height_mix, width_mix, 3), dtype='float32') 199 | alpha = np.random.uniform(0, self.mixup_alpha) 200 | lam = np.random.beta(alpha,alpha) 201 | mix_img[:height, :width, :] = img.astype('float32') * lam 202 | mix_img[:height2, :width2, :] += img2.astype('float32') * (1.0 - lam) 203 | if (height2-height)*(width2-width) < 0: 204 | mix_img[min(height,height2): height_mix, min(width, width2): width_mix, :] = [2*j for j in self.means] 205 | mix_img = mix_img.astype('uint8') 206 | w1 = np.full((target.shape[0], 1), lam) 207 | w2 = np.full((target2.shape[0], 1), (1 - lam)) 208 | mix_target = np.hstack((np.vstack((target, target2)), np.vstack((w1, w2)))) 209 | 210 | if self.preproc is not None: 211 | img, target = self.preproc(mix_img, mix_target, self.random_erasing) 212 | 213 | else: 214 | if self.preproc is not None: 215 | img, target = self.preproc(img, target, self.random_erasing) 216 | 217 | 218 | return img, target 219 | 220 | def __len__(self): 221 | return len(self.ids) 222 | 223 | def pull_image(self, index): 224 | '''Returns the original image object at index in PIL form 225 | 226 | Note: not using self.__getitem__(), as any transformations passed in 227 | could mess 
up this functionality. 228 | 229 | Argument: 230 | index (int): index of img to show 231 | Return: 232 | PIL img 233 | ''' 234 | img_id = self.ids[index] 235 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 236 | 237 | def pull_anno(self, index): 238 | '''Returns the original annotation of image at index 239 | 240 | Note: not using self.__getitem__(), as any transformations passed in 241 | could mess up this functionality. 242 | 243 | Argument: 244 | index (int): index of img to get annotation of 245 | Return: 246 | list: [img_id, [(label, bbox coords),...]] 247 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 248 | ''' 249 | img_id = self.ids[index] 250 | anno = ET.parse(self._annopath % img_id).getroot() 251 | gt = self.target_transform(anno, 1, 1) 252 | return img_id[1], gt 253 | 254 | def pull_tensor(self, index): 255 | '''Returns the original image at an index in tensor form 256 | 257 | Note: not using self.__getitem__(), as any transformations passed in 258 | could mess up this functionality. 259 | 260 | Argument: 261 | index (int): index of img to show 262 | Return: 263 | tensorized version of img, squeezed 264 | ''' 265 | to_tensor = transforms.ToTensor() 266 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 267 | 268 | def evaluate_detections(self, all_boxes, output_dir=None): 269 | """ 270 | all_boxes is a list of length number-of-classes. 271 | Each list element is a list of length number-of-images. 272 | Each of those list elements is either an empty list [] 273 | or a numpy array of detection. 274 | 275 | all_boxes[class][image] = [] or np.array of shape #dets x 5 276 | """ 277 | self._write_voc_results_file(all_boxes) 278 | self._do_python_eval(output_dir) 279 | 280 | def _get_voc_results_file_template(self): 281 | filename = 'comp4_det_test' + '_{:s}.txt' 282 | filedir = os.path.join( 283 | self.root, 'results', 'VOC' + self._year, 'Main') 284 | if not os.path.exists(filedir): 285 | os.makedirs(filedir) 286 | path = os.path.join(filedir, filename) 287 | return path 288 | 289 | def _write_voc_results_file(self, all_boxes): 290 | for cls_ind, cls in enumerate(VOC_CLASSES): 291 | cls_ind = cls_ind 292 | if cls == '__background__': 293 | continue 294 | print('Writing {} VOC results file'.format(cls)) 295 | filename = self._get_voc_results_file_template().format(cls) 296 | with open(filename, 'wt') as f: 297 | for im_ind, index in enumerate(self.ids): 298 | index = index[1] 299 | dets = all_boxes[cls_ind][im_ind] 300 | if dets == []: 301 | continue 302 | for k in range(dets.shape[0]): 303 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 304 | format(index, dets[k, -1], 305 | dets[k, 0] + 1, dets[k, 1] + 1, 306 | dets[k, 2] + 1, dets[k, 3] + 1)) 307 | 308 | def _do_python_eval(self, output_dir='output'): 309 | rootpath = os.path.join(self.root, 'VOC' + self._year) 310 | name = self.image_set[0][1] 311 | annopath = os.path.join( 312 | rootpath, 313 | 'Annotations', 314 | '{:s}.xml') 315 | imagesetfile = os.path.join( 316 | rootpath, 317 | 'ImageSets', 318 | 'Main', 319 | name+'.txt') 320 | cachedir = os.path.join(self.root, 'annotations_cache') 321 | aps = [] 322 | # The PASCAL VOC metric changed in 2010 323 | use_07_metric = True if int(self._year) < 2010 else False 324 | print('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) 325 | if output_dir is not None and not os.path.isdir(output_dir): 326 | os.mkdir(output_dir) 327 | for i, cls in enumerate(VOC_CLASSES): 328 | 329 | if cls == '__background__': 330 | continue 331 | 332 | filename = self._get_voc_results_file_template().format(cls) 333 | rec, prec, ap = voc_eval( 334 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 335 | use_07_metric=use_07_metric) 336 | aps += [ap] 337 | print('AP for {} = {:.4f}'.format(cls, ap)) 338 | if output_dir is not None: 339 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 340 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 341 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 342 | print('~~~~~~~~') 343 | print('Results:') 344 | for ap in aps: 345 | print('{:.3f}'.format(ap)) 346 | print('{:.3f}'.format(np.mean(aps))) 347 | print('~~~~~~~~') 348 | print('') 349 | print('--------------------------------------------------------------') 350 | print('Results computed with the **unofficial** Python eval code.') 351 | print('Results should be very close to the official MATLAB eval code.') 352 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 353 | print('-- Thanks, The Management') 354 | print('--------------------------------------------------------------') 355 | 356 | def detection_collate(batch): 357 | """Custom collate fn for dealing with batches of images that have a different 358 | number of associated object annotations (bounding boxes). 359 | 360 | Arguments: 361 | batch: (tuple) A tuple of tensor images and lists of annotations 362 | 363 | Return: 364 | A tuple containing: 365 | 1) (tensor) batch of images stacked on their 0 dim 366 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 367 | """ 368 | targets = [] 369 | imgs = [] 370 | for _, sample in enumerate(batch): 371 | for _, tup in enumerate(sample): 372 | if torch.is_tensor(tup): 373 | imgs.append(tup) 374 | elif isinstance(tup, type(np.empty(0))): 375 | annos = torch.from_numpy(tup).float() 376 | targets.append(annos) 377 | 378 | return (torch.stack(imgs, 0), targets) 379 | -------------------------------------------------------------------------------- /SSD/utils/pycocotools/coco.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | __version__ = '2.0' 3 | # Interface for accessing the Microsoft COCO dataset. 4 | 5 | # Microsoft COCO is a large image dataset designed for object detection, 6 | # segmentation, and caption generation. pycocotools is a Python API that 7 | # assists in loading, parsing and visualizing the annotations in COCO. 8 | # Please visit http://mscoco.org/ for more information on COCO, including 9 | # for the data, paper, and tutorials. The exact format of the annotations 10 | # is also described on the COCO website. For example usage of the pycocotools 11 | # please see pycocotools_demo.ipynb. In addition to this API, please download both 12 | # the COCO images and annotations in order to run the demo. 13 | 14 | # An alternative to using the API is to load the annotations directly 15 | # into Python dictionary 16 | # Using the API provides additional utility functions. Note that this API 17 | # supports both *instance* and *caption* annotations. In the case of 18 | # captions not all functions are defined (e.g. categories are undefined). 
19 | 20 | # The following API functions are defined: 21 | # COCO - COCO api class that loads COCO annotation file and prepare data structures. 22 | # decodeMask - Decode binary mask M encoded via run-length encoding. 23 | # encodeMask - Encode binary mask M using run-length encoding. 24 | # getAnnIds - Get ann ids that satisfy given filter conditions. 25 | # getCatIds - Get cat ids that satisfy given filter conditions. 26 | # getImgIds - Get img ids that satisfy given filter conditions. 27 | # loadAnns - Load anns with the specified ids. 28 | # loadCats - Load cats with the specified ids. 29 | # loadImgs - Load imgs with the specified ids. 30 | # annToMask - Convert segmentation in an annotation to binary mask. 31 | # showAnns - Display the specified annotations. 32 | # loadRes - Load algorithm results and create API for accessing them. 33 | # download - Download COCO images from mscoco.org server. 34 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. 35 | # Help on each functions can be accessed by: "help COCO>function". 36 | 37 | # See also COCO>decodeMask, 38 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, 39 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, 40 | # COCO>loadImgs, COCO>annToMask, COCO>showAnns 41 | 42 | # Microsoft COCO Toolbox. version 2.0 43 | # Data, paper, and tutorials available at: http://mscoco.org/ 44 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. 45 | # Licensed under the Simplified BSD License [see bsd.txt] 46 | 47 | import json 48 | import time 49 | import matplotlib.pyplot as plt 50 | from matplotlib.collections import PatchCollection 51 | from matplotlib.patches import Polygon 52 | import numpy as np 53 | import copy 54 | import itertools 55 | from . import mask as maskUtils 56 | import os 57 | from collections import defaultdict 58 | import sys 59 | PYTHON_VERSION = sys.version_info[0] 60 | if PYTHON_VERSION == 2: 61 | from urllib import urlretrieve 62 | elif PYTHON_VERSION == 3: 63 | from urllib.request import urlretrieve 64 | 65 | class COCO: 66 | def __init__(self, annotation_file=None): 67 | """ 68 | Constructor of Microsoft COCO helper class for reading and visualizing annotations. 69 | :param annotation_file (str): location of annotation file 70 | :param image_folder (str): location to the folder that hosts images. 
71 | :return: 72 | """ 73 | # load dataset 74 | self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() 75 | self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) 76 | if not annotation_file == None: 77 | print('loading annotations into memory...') 78 | tic = time.time() 79 | dataset = json.load(open(annotation_file, 'r')) 80 | assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) 81 | print('Done (t={:0.2f}s)'.format(time.time()- tic)) 82 | self.dataset = dataset 83 | self.createIndex() 84 | 85 | def createIndex(self): 86 | # create index 87 | print('creating index...') 88 | anns, cats, imgs = {}, {}, {} 89 | imgToAnns,catToImgs = defaultdict(list),defaultdict(list) 90 | if 'annotations' in self.dataset: 91 | for ann in self.dataset['annotations']: 92 | imgToAnns[ann['image_id']].append(ann) 93 | anns[ann['id']] = ann 94 | 95 | if 'images' in self.dataset: 96 | for img in self.dataset['images']: 97 | imgs[img['id']] = img 98 | 99 | if 'categories' in self.dataset: 100 | for cat in self.dataset['categories']: 101 | cats[cat['id']] = cat 102 | 103 | if 'annotations' in self.dataset and 'categories' in self.dataset: 104 | for ann in self.dataset['annotations']: 105 | catToImgs[ann['category_id']].append(ann['image_id']) 106 | 107 | print('index created!') 108 | 109 | # create class members 110 | self.anns = anns 111 | self.imgToAnns = imgToAnns 112 | self.catToImgs = catToImgs 113 | self.imgs = imgs 114 | self.cats = cats 115 | 116 | def info(self): 117 | """ 118 | Print information about the annotation file. 119 | :return: 120 | """ 121 | for key, value in self.dataset['info'].items(): 122 | print('{}: {}'.format(key, value)) 123 | 124 | def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): 125 | """ 126 | Get ann ids that satisfy given filter conditions. default skips that filter 127 | :param imgIds (int array) : get anns for given imgs 128 | catIds (int array) : get anns for given cats 129 | areaRng (float array) : get anns for given area range (e.g. [0 inf]) 130 | iscrowd (boolean) : get anns for given crowd label (False or True) 131 | :return: ids (int array) : integer array of ann ids 132 | """ 133 | imgIds = imgIds if type(imgIds) == list else [imgIds] 134 | catIds = catIds if type(catIds) == list else [catIds] 135 | 136 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 137 | anns = self.dataset['annotations'] 138 | else: 139 | if not len(imgIds) == 0: 140 | lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] 141 | anns = list(itertools.chain.from_iterable(lists)) 142 | else: 143 | anns = self.dataset['annotations'] 144 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 145 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 146 | if not iscrowd == None: 147 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 148 | else: 149 | ids = [ann['id'] for ann in anns] 150 | return ids 151 | 152 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 153 | """ 154 | filtering parameters. default skips that filter. 
155 | :param catNms (str array) : get cats for given cat names 156 | :param supNms (str array) : get cats for given supercategory names 157 | :param catIds (int array) : get cats for given cat ids 158 | :return: ids (int array) : integer array of cat ids 159 | """ 160 | catNms = catNms if type(catNms) == list else [catNms] 161 | supNms = supNms if type(supNms) == list else [supNms] 162 | catIds = catIds if type(catIds) == list else [catIds] 163 | 164 | if len(catNms) == len(supNms) == len(catIds) == 0: 165 | cats = self.dataset['categories'] 166 | else: 167 | cats = self.dataset['categories'] 168 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 169 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 170 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 171 | ids = [cat['id'] for cat in cats] 172 | return ids 173 | 174 | def getImgIds(self, imgIds=[], catIds=[]): 175 | ''' 176 | Get img ids that satisfy given filter conditions. 177 | :param imgIds (int array) : get imgs for given ids 178 | :param catIds (int array) : get imgs with all given cats 179 | :return: ids (int array) : integer array of img ids 180 | ''' 181 | imgIds = imgIds if type(imgIds) == list else [imgIds] 182 | catIds = catIds if type(catIds) == list else [catIds] 183 | 184 | if len(imgIds) == len(catIds) == 0: 185 | ids = self.imgs.keys() 186 | else: 187 | ids = set(imgIds) 188 | for i, catId in enumerate(catIds): 189 | if i == 0 and len(ids) == 0: 190 | ids = set(self.catToImgs[catId]) 191 | else: 192 | ids &= set(self.catToImgs[catId]) 193 | return list(ids) 194 | 195 | def loadAnns(self, ids=[]): 196 | """ 197 | Load anns with the specified ids. 198 | :param ids (int array) : integer ids specifying anns 199 | :return: anns (object array) : loaded ann objects 200 | """ 201 | if type(ids) == list: 202 | return [self.anns[id] for id in ids] 203 | elif type(ids) == int: 204 | return [self.anns[ids]] 205 | 206 | def loadCats(self, ids=[]): 207 | """ 208 | Load cats with the specified ids. 209 | :param ids (int array) : integer ids specifying cats 210 | :return: cats (object array) : loaded cat objects 211 | """ 212 | if type(ids) == list: 213 | return [self.cats[id] for id in ids] 214 | elif type(ids) == int: 215 | return [self.cats[ids]] 216 | 217 | def loadImgs(self, ids=[]): 218 | """ 219 | Load anns with the specified ids. 220 | :param ids (int array) : integer ids specifying img 221 | :return: imgs (object array) : loaded img objects 222 | """ 223 | if type(ids) == list: 224 | return [self.imgs[id] for id in ids] 225 | elif type(ids) == int: 226 | return [self.imgs[ids]] 227 | 228 | def showAnns(self, anns): 229 | """ 230 | Display the specified annotations. 
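Instance annotations are drawn as polygons or decoded RLE masks
(keypoint skeletons are overlaid when present); caption annotations
are simply printed to stdout.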
231 | :param anns (array of object): annotations to display 232 | :return: None 233 | """ 234 | if len(anns) == 0: 235 | return 0 236 | if 'segmentation' in anns[0] or 'keypoints' in anns[0]: 237 | datasetType = 'instances' 238 | elif 'caption' in anns[0]: 239 | datasetType = 'captions' 240 | else: 241 | raise Exception('datasetType not supported') 242 | if datasetType == 'instances': 243 | ax = plt.gca() 244 | ax.set_autoscale_on(False) 245 | polygons = [] 246 | color = [] 247 | for ann in anns: 248 | c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] 249 | if 'segmentation' in ann: 250 | if type(ann['segmentation']) == list: 251 | # polygon 252 | for seg in ann['segmentation']: 253 | poly = np.array(seg).reshape((int(len(seg)/2), 2)) 254 | polygons.append(Polygon(poly)) 255 | color.append(c) 256 | else: 257 | # mask 258 | t = self.imgs[ann['image_id']] 259 | if type(ann['segmentation']['counts']) == list: 260 | rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) 261 | else: 262 | rle = [ann['segmentation']] 263 | m = maskUtils.decode(rle) 264 | img = np.ones( (m.shape[0], m.shape[1], 3) ) 265 | if ann['iscrowd'] == 1: 266 | color_mask = np.array([2.0,166.0,101.0])/255 267 | if ann['iscrowd'] == 0: 268 | color_mask = np.random.random((1, 3)).tolist()[0] 269 | for i in range(3): 270 | img[:,:,i] = color_mask[i] 271 | ax.imshow(np.dstack( (img, m*0.5) )) 272 | if 'keypoints' in ann and type(ann['keypoints']) == list: 273 | # turn skeleton into zero-based index 274 | sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 275 | kp = np.array(ann['keypoints']) 276 | x = kp[0::3] 277 | y = kp[1::3] 278 | v = kp[2::3] 279 | for sk in sks: 280 | if np.all(v[sk]>0): 281 | plt.plot(x[sk],y[sk], linewidth=3, color=c) 282 | plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) 283 | plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) 284 | p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) 285 | ax.add_collection(p) 286 | p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) 287 | ax.add_collection(p) 288 | elif datasetType == 'captions': 289 | for ann in anns: 290 | print(ann['caption']) 291 | 292 | def loadRes(self, resFile): 293 | """ 294 | Load result file and return a result api object. 
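For box detections, missing 'segmentation' and 'area' fields are
synthesized from the [x, y, width, height] bbox before indexing.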
295 | :param resFile (str) : file name of result file
296 | :return: res (obj) : result api object
297 | """
298 | res = COCO()
299 | res.dataset['images'] = [img for img in self.dataset['images']]
300 | 
301 | print('Loading and preparing results...')
302 | tic = time.time()
303 | if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
304 | anns = json.load(open(resFile))
305 | elif type(resFile) == np.ndarray:
306 | anns = self.loadNumpyAnnotations(resFile)
307 | else:
308 | anns = resFile
309 | assert type(anns) == list, 'results is not an array of objects'
310 | annsImgIds = [ann['image_id'] for ann in anns]
311 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
312 | 'Results do not correspond to current coco set'
313 | if 'caption' in anns[0]:
314 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
315 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
316 | for id, ann in enumerate(anns):
317 | ann['id'] = id+1
318 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
319 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
320 | for id, ann in enumerate(anns):
321 | bb = ann['bbox']
322 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
323 | if not 'segmentation' in ann:
324 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
325 | ann['area'] = bb[2]*bb[3]
326 | ann['id'] = id+1
327 | ann['iscrowd'] = 0
328 | elif 'segmentation' in anns[0]:
329 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
330 | for id, ann in enumerate(anns):
331 | # now only support compressed RLE format as segmentation results
332 | ann['area'] = maskUtils.area(ann['segmentation'])
333 | if not 'bbox' in ann:
334 | ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
335 | ann['id'] = id+1
336 | ann['iscrowd'] = 0
337 | elif 'keypoints' in anns[0]:
338 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
339 | for id, ann in enumerate(anns):
340 | s = ann['keypoints']
341 | x = s[0::3]
342 | y = s[1::3]
343 | x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
344 | ann['area'] = (x1-x0)*(y1-y0)
345 | ann['id'] = id + 1
346 | ann['bbox'] = [x0,y0,x1-x0,y1-y0]
347 | print('DONE (t={:0.2f}s)'.format(time.time()- tic))
348 | 
349 | res.dataset['annotations'] = anns
350 | res.createIndex()
351 | return res
352 | 
353 | def download(self, tarDir = None, imgIds = [] ):
354 | '''
355 | Download COCO images from mscoco.org server.
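Images that already exist under tarDir are skipped, so an interrupted
download can simply be re-run.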
353 |     def download(self, tarDir = None, imgIds = [] ):
354 |         '''
355 |         Download COCO images from mscoco.org server.
356 |         :param tarDir (str): COCO results directory name
357 |                imgIds (list): images to be downloaded
358 |         :return:
359 |         '''
360 |         if tarDir is None:
361 |             print('Please specify target directory')
362 |             return -1
363 |         if len(imgIds) == 0:
364 |             imgs = self.imgs.values()
365 |         else:
366 |             imgs = self.loadImgs(imgIds)
367 |         N = len(imgs)
368 |         if not os.path.exists(tarDir):
369 |             os.makedirs(tarDir)
370 |         for i, img in enumerate(imgs):
371 |             tic = time.time()
372 |             fname = os.path.join(tarDir, img['file_name'])
373 |             if not os.path.exists(fname):
374 |                 urlretrieve(img['coco_url'], fname)
375 |             print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
376 | 
377 |     def loadNumpyAnnotations(self, data):
378 |         """
379 |         Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
380 |         :param  data (numpy.ndarray)
381 |         :return: annotations (python nested list)
382 |         """
383 |         print('Converting ndarray to lists...')
384 |         assert(type(data) == np.ndarray)
385 |         print(data.shape)
386 |         assert(data.shape[1] == 7)
387 |         N = data.shape[0]
388 |         ann = []
389 |         for i in range(N):
390 |             if i % 1000000 == 0:
391 |                 print('{}/{}'.format(i,N))
392 |             ann += [{
393 |                 'image_id'  : int(data[i, 0]),
394 |                 'bbox'  : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
395 |                 'score' : data[i, 5],
396 |                 'category_id': int(data[i, 6]),
397 |                 }]
398 |         return ann
399 | 
400 |     def annToRLE(self, ann):
401 |         """
402 |         Convert annotation which can be polygons, uncompressed RLE to RLE.
403 |         :return: RLE (run-length encoding of the mask)
404 |         """
405 |         t = self.imgs[ann['image_id']]
406 |         h, w = t['height'], t['width']
407 |         segm = ann['segmentation']
408 |         if type(segm) == list:
409 |             # polygon -- a single object might consist of multiple parts
410 |             # we merge all parts into one mask rle code
411 |             rles = maskUtils.frPyObjects(segm, h, w)
412 |             rle = maskUtils.merge(rles)
413 |         elif type(segm['counts']) == list:
414 |             # uncompressed RLE
415 |             rle = maskUtils.frPyObjects(segm, h, w)
416 |         else:
417 |             # rle
418 |             rle = ann['segmentation']
419 |         return rle
420 | 
421 |     def annToMask(self, ann):
422 |         """
423 |         Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
424 |         :return: binary mask (numpy 2D array)
425 |         """
426 |         rle = self.annToRLE(ann)
427 |         m = maskUtils.decode(rle)
428 |         return m
--------------------------------------------------------------------------------
/SSD/utils/pycocotools/cocoeval.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tsungyi'
2 | 
3 | import numpy as np
4 | import datetime
5 | import time
6 | from collections import defaultdict
7 | from . import mask as maskUtils
8 | import copy
9 | 
10 | class COCOeval:
11 |     # Interface for evaluating detection on the Microsoft COCO dataset.
12 |     #
13 |     # The usage for CocoEval is as follows:
14 |     #  cocoGt=..., cocoDt=...       # load dataset and results
15 |     #  E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
16 |     #  E.params.recThrs = ...;      # set parameters as desired
17 |     #  E.evaluate();                # run per image evaluation
18 |     #  E.accumulate();              # accumulate per image results
19 |     #  E.summarize();               # display summary metrics of results
20 |     # For example usage see evalDemo.m and http://mscoco.org/.
21 | # 22 | # The evaluation parameters are as follows (defaults in brackets): 23 | # imgIds - [all] N img ids to use for evaluation 24 | # catIds - [all] K cat ids to use for evaluation 25 | # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation 26 | # recThrs - [0:.01:1] R=101 recall thresholds for evaluation 27 | # areaRng - [...] A=4 object area ranges for evaluation 28 | # maxDets - [1 10 100] M=3 thresholds on max detections per image 29 | # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' 30 | # iouType replaced the now DEPRECATED useSegm parameter. 31 | # useCats - [1] if true use category labels for evaluation 32 | # Note: if useCats=0 category labels are ignored as in proposal scoring. 33 | # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. 34 | # 35 | # evaluate(): evaluates detections on every image and every category and 36 | # concats the results into the "evalImgs" with fields: 37 | # dtIds - [1xD] id for each of the D detections (dt) 38 | # gtIds - [1xG] id for each of the G ground truths (gt) 39 | # dtMatches - [TxD] matching gt id at each IoU or 0 40 | # gtMatches - [TxG] matching dt id at each IoU or 0 41 | # dtScores - [1xD] confidence of each dt 42 | # gtIgnore - [1xG] ignore flag for each gt 43 | # dtIgnore - [TxD] ignore flag for each dt at each IoU 44 | # 45 | # accumulate(): accumulates the per-image, per-category evaluation 46 | # results in "evalImgs" into the dictionary "eval" with fields: 47 | # params - parameters used for evaluation 48 | # date - date evaluation was performed 49 | # counts - [T,R,K,A,M] parameter dimensions (see above) 50 | # precision - [TxRxKxAxM] precision for every evaluation setting 51 | # recall - [TxKxAxM] max recall for every evaluation setting 52 | # Note: precision and recall==-1 for settings with no gt objects. 53 | # 54 | # See also coco, mask, pycocoDemo, pycocoEvalDemo 55 | # 56 | # Microsoft COCO Toolbox. version 2.0 57 | # Data, paper, and tutorials available at: http://mscoco.org/ 58 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 59 | # Licensed under the Simplified BSD License [see coco/license.txt] 60 | def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): 61 | ''' 62 | Initialize CocoEval using coco APIs for gt and dt 63 | :param cocoGt: coco object with ground truth annotations 64 | :param cocoDt: coco object with detection results 65 | :return: None 66 | ''' 67 | if not iouType: 68 | print('iouType not specified. 
use default iouType segm') 69 | self.cocoGt = cocoGt # ground truth COCO API 70 | self.cocoDt = cocoDt # detections COCO API 71 | self.params = {} # evaluation parameters 72 | self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements 73 | self.eval = {} # accumulated evaluation results 74 | self._gts = defaultdict(list) # gt for evaluation 75 | self._dts = defaultdict(list) # dt for evaluation 76 | self.params = Params(iouType=iouType) # parameters 77 | self._paramsEval = {} # parameters for evaluation 78 | self.stats = [] # result summarization 79 | self.ious = {} # ious between all gts and dts 80 | if not cocoGt is None: 81 | self.params.imgIds = sorted(cocoGt.getImgIds()) 82 | self.params.catIds = sorted(cocoGt.getCatIds()) 83 | 84 | 85 | def _prepare(self): 86 | ''' 87 | Prepare ._gts and ._dts for evaluation based on params 88 | :return: None 89 | ''' 90 | def _toMask(anns, coco): 91 | # modify ann['segmentation'] by reference 92 | for ann in anns: 93 | rle = coco.annToRLE(ann) 94 | ann['segmentation'] = rle 95 | p = self.params 96 | if p.useCats: 97 | gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) 98 | dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) 99 | else: 100 | gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) 101 | dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) 102 | 103 | # convert ground truth to mask if iouType == 'segm' 104 | if p.iouType == 'segm': 105 | _toMask(gts, self.cocoGt) 106 | _toMask(dts, self.cocoDt) 107 | # set ignore flag 108 | for gt in gts: 109 | gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 110 | gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] 111 | if p.iouType == 'keypoints': 112 | gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] 113 | self._gts = defaultdict(list) # gt for evaluation 114 | self._dts = defaultdict(list) # dt for evaluation 115 | for gt in gts: 116 | self._gts[gt['image_id'], gt['category_id']].append(gt) 117 | for dt in dts: 118 | self._dts[dt['image_id'], dt['category_id']].append(dt) 119 | self.evalImgs = defaultdict(list) # per-image per-category evaluation results 120 | self.eval = {} # accumulated evaluation results 121 | 122 | def evaluate(self): 123 | ''' 124 | Run per image evaluation on given images and store results (a list of dict) in self.evalImgs 125 | :return: None 126 | ''' 127 | tic = time.time() 128 | print('Running per image evaluation...') 129 | p = self.params 130 | # add backward compatibility if useSegm is specified in params 131 | if not p.useSegm is None: 132 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox' 133 | print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType))
134 |         print('Evaluate annotation type *{}*'.format(p.iouType))
135 |         p.imgIds = list(np.unique(p.imgIds))
136 |         if p.useCats:
137 |             p.catIds = list(np.unique(p.catIds))
138 |         p.maxDets = sorted(p.maxDets)
139 |         self.params=p
140 | 
141 |         self._prepare()
142 |         # loop through images, area range, max detection number
143 |         catIds = p.catIds if p.useCats else [-1]
144 | 
145 |         if p.iouType == 'segm' or p.iouType == 'bbox':
146 |             computeIoU = self.computeIoU
147 |         elif p.iouType == 'keypoints':
148 |             computeIoU = self.computeOks
149 |         self.ious = {(imgId, catId): computeIoU(imgId, catId) \
150 |                         for imgId in p.imgIds
151 |                         for catId in catIds}
152 | 
153 |         evaluateImg = self.evaluateImg
154 |         maxDet = p.maxDets[-1]
155 |         self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
156 |                  for catId in catIds
157 |                  for areaRng in p.areaRng
158 |                  for imgId in p.imgIds
159 |              ]
160 |         self._paramsEval = copy.deepcopy(self.params)
161 |         toc = time.time()
162 |         print('DONE (t={:0.2f}s).'.format(toc-tic))
163 | 
164 |     def computeIoU(self, imgId, catId):
165 |         p = self.params
166 |         if p.useCats:
167 |             gt = self._gts[imgId,catId]
168 |             dt = self._dts[imgId,catId]
169 |         else:
170 |             gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
171 |             dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
172 |         if len(gt) == 0 and len(dt) ==0:
173 |             return []
174 |         inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
175 |         dt = [dt[i] for i in inds]
176 |         if len(dt) > p.maxDets[-1]:
177 |             dt=dt[0:p.maxDets[-1]]
178 | 
179 |         if p.iouType == 'segm':
180 |             g = [g['segmentation'] for g in gt]
181 |             d = [d['segmentation'] for d in dt]
182 |         elif p.iouType == 'bbox':
183 |             g = [g['bbox'] for g in gt]
184 |             d = [d['bbox'] for d in dt]
185 |         else:
186 |             raise Exception('unknown iouType for iou computation')
187 | 
188 |         # compute iou between each dt and gt region
189 |         iscrowd = [int(o['iscrowd']) for o in gt]
190 |         ious = maskUtils.iou(d,g,iscrowd)
191 |         return ious
192 | 
193 |     def computeOks(self, imgId, catId):
194 |         p = self.params
195 |         # dimension here should be Nxm
196 |         gts = self._gts[imgId, catId]
197 |         dts = self._dts[imgId, catId]
198 |         inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
199 |         dts = [dts[i] for i in inds]
200 |         if len(dts) > p.maxDets[-1]:
201 |             dts = dts[0:p.maxDets[-1]]
202 |         # if len(gts) == 0 and len(dts) == 0:
203 |         if len(gts) == 0 or len(dts) == 0:
204 |             return []
205 |         ious = np.zeros((len(dts), len(gts)))
206 |         sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0
207 |         vars = (sigmas * 2)**2
208 |         k = len(sigmas)
209 |         # compute oks between each detection and ground truth object
210 |         for j, gt in enumerate(gts):
211 |             # create bounds for ignore regions(double the gt bbox)
212 |             g = np.array(gt['keypoints'])
213 |             xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
214 |             k1 = np.count_nonzero(vg > 0)
215 |             bb = gt['bbox']
216 |             x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
217 |             y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
218 |             for i, dt in enumerate(dts):
219 |                 d = np.array(dt['keypoints'])
220 |                 xd = d[0::3]; yd = d[1::3]
221 |                 if k1>0:
222 |                     # measure the per-keypoint distance if keypoints visible
223 |                     dx = xd - xg
224 |                     dy = yd - yg
225 |                 else:
226 |                     # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
227 |                     z = np.zeros((k))
228 |                     dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0)
229 |                     dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0)
230 |                 e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2
231 |                 if k1 > 0:
232 |                     e=e[vg > 0]
233 |                 ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
234 |         return ious
235 | 
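    # A minimal standalone sketch (illustrative comment only, assuming the
    # Cython extensions were built via make.sh and the repo root is on
    # sys.path) of the maskUtils.iou call that computeIoU relies on. Boxes
    # are [x, y, w, h]; a nonzero iscrowd flag for a gt makes the union
    # collapse to the detection's own area.
    #
    #     from utils.pycocotools import mask as maskUtils
    #     d = [[0., 0., 10., 10.]]            # one detection box
    #     g = [[5., 5., 10., 10.]]            # one ground-truth box
    #     print(maskUtils.iou(d, g, [0]))     # 25 / 175 ~= 0.143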
236 |     def evaluateImg(self, imgId, catId, aRng, maxDet):
237 |         '''
238 |         perform evaluation for single category and image
239 |         :return: dict (single image results)
240 |         '''
241 |         p = self.params
242 |         if p.useCats:
243 |             gt = self._gts[imgId,catId]
244 |             dt = self._dts[imgId,catId]
245 |         else:
246 |             gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
247 |             dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
248 |         if len(gt) == 0 and len(dt) ==0:
249 |             return None
250 | 
251 |         for g in gt:
252 |             if g['ignore'] or (g['area'] < aRng[0] or g['area'] > aRng[1]):
253 |                 g['_ignore'] = 1
254 |             else:
255 |                 g['_ignore'] = 0
256 | 
257 |         # sort dt highest score first, sort gt ignore last
258 |         gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
259 |         gt = [gt[i] for i in gtind]
260 |         dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
261 |         dt = [dt[i] for i in dtind[0:maxDet]]
262 |         iscrowd = [int(o['iscrowd']) for o in gt]
263 |         # load computed ious
264 |         ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId]
265 | 
266 |         T = len(p.iouThrs)
267 |         G = len(gt)
268 |         D = len(dt)
269 |         gtm = np.zeros((T,G))
270 |         dtm = np.zeros((T,D))
271 |         gtIg = np.array([g['_ignore'] for g in gt])
272 |         dtIg = np.zeros((T,D))
273 |         if not len(ious)==0:
274 |             for tind, t in enumerate(p.iouThrs):
275 |                 for dind, d in enumerate(dt):
276 |                     # information about best match so far (m=-1 -> unmatched)
277 |                     iou = min([t,1-1e-10])
278 |                     m = -1
279 |                     for gind, g in enumerate(gt):
280 |                         # if this gt already matched, and not a crowd, continue
281 |                         if gtm[tind,gind]>0 and not iscrowd[gind]:
282 |                             continue
283 |                         # if dt matched to reg gt, and on ignore gt, stop
284 |                         if m>-1 and gtIg[m]==0 and gtIg[gind]==1:
285 |                             break
286 |                         # continue to next gt unless better match made
287 |                         if ious[dind,gind] < iou:
288 |                             continue
289 |                         # if match successful and best so far, store appropriately
290 |                         iou=ious[dind,gind]
291 |                         m=gind
292 |                     # if match made store id of match for both dt and gt
293 |                     if m == -1:
294 |                         continue
295 |                     dtIg[tind,dind] = gtIg[m]
296 |                     dtm[tind,dind]  = gt[m]['id']
297 |                     gtm[tind,m]     = d['id']
298 |         # set unmatched detections outside of area range to ignore
299 |         a = np.array([d['area'] < aRng[0] or d['area'] > aRng[1] for d in dt]).reshape((1, len(dt)))
300 |         dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0)))
301 |         # store results for given image and category
302 |         return {
303 |                 'image_id':     imgId,
304 |                 'category_id':  catId,
305 |                 'aRng':         aRng,
306 |                 'maxDet':       maxDet,
307 |                 'dtIds':        [d['id'] for d in dt],
308 |                 'gtIds':        [g['id'] for g in gt],
309 |                 'dtMatches':    dtm,
310 |                 'gtMatches':    gtm,
311 |                 'dtScores':     [d['score'] for d in dt],
312 |                 'gtIgnore':     gtIg,
313 |                 'dtIgnore':     dtIg,
314 |             }
315 | 
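    # A hedged toy walk-through (comment only, with made-up values) of the
    # greedy matching above: detections are visited in descending score order,
    # and a non-crowd gt that is already matched is skipped, so the
    # higher-scoring detection keeps the gt even when a later detection
    # overlaps it more.
    #
    #     ious = np.array([[0.6],    # dt0 (higher score) vs gt0
    #                      [0.8]])   # dt1 (lower score)  vs gt0
    #     gtm = np.zeros(1)
    #     for dind in range(2):
    #         iou, m = 0.5, -1                 # IoU threshold t = 0.5
    #         for gind in range(1):
    #             if gtm[gind] > 0:            # gt already taken, not a crowd
    #                 continue
    #             if ious[dind, gind] < iou:
    #                 continue
    #             iou, m = ious[dind, gind], gind
    #         if m > -1:
    #             gtm[m] = dind + 1            # stand-in for gt[m]['id'] bookkeeping
    #     # gtm == [1.]: gt0 goes to dt0; dt1 stays unmatched (a false positive)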
316 |     def accumulate(self, p = None):
317 |         '''
318 |         Accumulate per image evaluation results and store the result in self.eval
319 |         :param p: input params for evaluation
320 |         :return: None
321 |         '''
322 |         print('Accumulating evaluation results...')
323 |         tic = time.time()
324 |         if not self.evalImgs:
325 |             print('Please run evaluate() first')
326 |         # allows input customized parameters
327 |         if p is None:
328 |             p = self.params
329 |         p.catIds = p.catIds if p.useCats == 1 else [-1]
330 |         T           = len(p.iouThrs)
331 |         R           = len(p.recThrs)
332 |         K           = len(p.catIds) if p.useCats else 1
333 |         A           = len(p.areaRng)
334 |         M           = len(p.maxDets)
335 |         precision   = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
336 |         recall      = -np.ones((T,K,A,M))
337 | 
338 |         # create dictionary for future indexing
339 |         _pe = self._paramsEval
340 |         catIds = _pe.catIds if _pe.useCats else [-1]
341 |         setK = set(catIds)
342 |         setA = set(map(tuple, _pe.areaRng))
343 |         setM = set(_pe.maxDets)
344 |         setI = set(_pe.imgIds)
345 |         # get inds to evaluate
346 |         k_list = [n for n, k in enumerate(p.catIds)  if k in setK]
347 |         m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
348 |         a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
349 |         i_list = [n for n, i in enumerate(p.imgIds)  if i in setI]
350 |         I0 = len(_pe.imgIds)
351 |         A0 = len(_pe.areaRng)
352 |         # retrieve E at each category, area range, and max number of detections
353 |         for k, k0 in enumerate(k_list):
354 |             Nk = k0*A0*I0
355 |             for a, a0 in enumerate(a_list):
356 |                 Na = a0*I0
357 |                 for m, maxDet in enumerate(m_list):
358 |                     E = [self.evalImgs[Nk + Na + i] for i in i_list]
359 |                     E = [e for e in E if not e is None]
360 |                     if len(E) == 0:
361 |                         continue
362 |                     dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
363 | 
364 |                     # different sorting method generates slightly different results.
365 |                     # mergesort is used to be consistent as Matlab implementation.
366 |                     inds = np.argsort(-dtScores, kind='mergesort')
367 | 
368 |                     dtm  = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
369 |                     dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet]  for e in E], axis=1)[:,inds]
370 |                     gtIg = np.concatenate([e['gtIgnore'] for e in E])
371 |                     npig = np.count_nonzero(gtIg==0 )
372 |                     if npig == 0:
373 |                         continue
374 |                     tps = np.logical_and(               dtm,  np.logical_not(dtIg) )
375 |                     fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
376 | 
377 |                     tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float64)
378 |                     fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float64)
379 |                     for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
380 |                         tp = np.array(tp)
381 |                         fp = np.array(fp)
382 |                         nd = len(tp)
383 |                         rc = tp / npig
384 |                         pr = tp / (fp+tp+np.spacing(1))
385 |                         q  = np.zeros((R,))
386 | 
387 |                         if nd:
388 |                             recall[t,k,a,m] = rc[-1]
389 |                         else:
390 |                             recall[t,k,a,m] = 0
391 | 
392 |                         # numpy is slow without cython optimization for accessing elements
393 |                         # use python array gets significant speed improvement
394 |                         pr = pr.tolist(); q = q.tolist()
395 | 
396 |                         for i in range(nd-1, 0, -1):
397 |                             if pr[i] > pr[i-1]:
398 |                                 pr[i-1] = pr[i]
399 | 
400 |                         inds = np.searchsorted(rc, p.recThrs, side='left')
401 |                         try:
402 |                             for ri, pi in enumerate(inds):
403 |                                 q[ri] = pr[pi]
404 |                         except:
405 |                             pass
406 |                         precision[t,:,k,a,m] = np.array(q)
407 |         self.eval = {
408 |             'params': p,
409 |             'counts': [T, R, K, A, M],
410 |             'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
411 |             'precision': precision,
412 |             'recall':   recall,
413 |         }
414 |         toc = time.time()
415 |         print('DONE (t={:0.2f}s).'.format( toc-tic))
416 | 
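    # A hedged sketch (comment only) of reading the arrays filled above back
    # out, mirroring what _summarize(ap=1) does in summarize() below; E is
    # assumed to be a COCOeval instance after accumulate(), with p = E.params:
    #
    #     aind = p.areaRngLbl.index('all')
    #     mind = p.maxDets.index(100)
    #     s = E.eval['precision'][:, :, :, aind, mind]   # [T x R x K]
    #     mAP = np.mean(s[s > -1])   # -1 marks settings with no gt objects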
417 |     def summarize(self):
418 |         '''
419 |         Compute and display summary metrics for evaluation results.
420 |         Note this function can *only* be applied on the default parameter setting
421 |         '''
422 |         def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
423 |             p = self.params
424 |             iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
425 |             titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
426 |             typeStr = '(AP)' if ap==1 else '(AR)'
427 |             iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
428 |                 if iouThr is None else '{:0.2f}'.format(iouThr)
429 | 
430 |             aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
431 |             mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
432 |             if ap == 1:
433 |                 # dimension of precision: [TxRxKxAxM]
434 |                 s = self.eval['precision']
435 |                 # IoU
436 |                 if iouThr is not None:
437 |                     t = np.where(iouThr == p.iouThrs)[0]
438 |                     s = s[t]
439 |                 s = s[:,:,:,aind,mind]
440 |             else:
441 |                 # dimension of recall: [TxKxAxM]
442 |                 s = self.eval['recall']
443 |                 if iouThr is not None:
444 |                     t = np.where(iouThr == p.iouThrs)[0]
445 |                     s = s[t]
446 |                 s = s[:,:,aind,mind]
447 |             if len(s[s>-1])==0:
448 |                 mean_s = -1
449 |             else:
450 |                 mean_s = np.mean(s[s>-1])
451 |             print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
452 |             return mean_s
453 |         def _summarizeDets():
454 |             stats = np.zeros((12,))
455 |             stats[0] = _summarize(1)
456 |             stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
457 |             stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
458 |             stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
459 |             stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
460 |             stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
461 |             stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
462 |             stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
463 |             stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
464 |             stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
465 |             stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
466 |             stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
467 |             return stats
468 |         def _summarizeKps():
469 |             stats = np.zeros((10,))
470 |             stats[0] = _summarize(1, maxDets=20)
471 |             stats[1] = _summarize(1, maxDets=20, iouThr=.5)
472 |             stats[2] = _summarize(1, maxDets=20, iouThr=.75)
473 |             stats[3] = _summarize(1, maxDets=20, areaRng='medium')
474 |             stats[4] = _summarize(1, maxDets=20, areaRng='large')
475 |             stats[5] = _summarize(0, maxDets=20)
476 |             stats[6] = _summarize(0, maxDets=20, iouThr=.5)
477 |             stats[7] = _summarize(0, maxDets=20, iouThr=.75)
478 |             stats[8] = _summarize(0, maxDets=20, areaRng='medium')
479 |             stats[9] = _summarize(0, maxDets=20, areaRng='large')
480 |             return stats
481 |         if not self.eval:
482 |             raise Exception('Please run accumulate() first')
483 |         iouType = self.params.iouType
484 |         if iouType == 'segm' or iouType == 'bbox':
485 |             summarize = _summarizeDets
486 |         elif iouType == 'keypoints':
487 |             summarize = _summarizeKps
488 |         self.stats = summarize()
489 | 
490 |     def __str__(self):
491 |         self.summarize()
492 | 
493 | class Params:
494 |     '''
495 |     Params for coco evaluation api
496 |     '''
497 |     def setDetParams(self):
498 |         self.imgIds = []
499 |         self.catIds = []
500 |         # np.arange causes trouble.  the data point on arange is slightly larger than the true value
501 |         self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
502 |         self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
503 |         self.maxDets = [1, 10, 100]
504 |         self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
505 |         self.areaRngLbl = ['all', 'small', 'medium', 'large']
506 |         self.useCats = 1
507 | 
508 |     def setKpParams(self):
509 |         self.imgIds = []
510 |         self.catIds = []
511 |         # np.arange causes trouble.  the data point on arange is slightly larger than the true value
512 |         self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
513 |         self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
514 |         self.maxDets = [20]
515 |         self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
516 |         self.areaRngLbl = ['all', 'medium', 'large']
517 |         self.useCats = 1
518 | 
519 |     def __init__(self, iouType='segm'):
520 |         if iouType == 'segm' or iouType == 'bbox':
521 |             self.setDetParams()
522 |         elif iouType == 'keypoints':
523 |             self.setKpParams()
524 |         else:
525 |             raise Exception('iouType not supported')
526 |         self.iouType = iouType
527 |         # useSegm is deprecated
528 |         self.useSegm = None
--------------------------------------------------------------------------------
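A minimal end-to-end sketch of the evaluation loop described in the COCOeval
header comment; the file names below are placeholders, and the imports assume
the repo layout above with the Cython extensions built via make.sh.

    from utils.pycocotools.coco import COCO
    from utils.pycocotools.cocoeval import COCOeval

    cocoGt = COCO('instances_val.json')          # hypothetical ground-truth file
    cocoDt = cocoGt.loadRes('detections.json')   # hypothetical result file
    E = COCOeval(cocoGt, cocoDt, iouType='bbox')
    E.evaluate()      # per-image, per-category matching
    E.accumulate()    # build the precision/recall arrays
    E.summarize()     # print the 12 AP/AR numbers from _summarizeDets()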