├── README.md ├── data ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── config.cpython-35.pyc │ └── voc0712.cpython-35.pyc ├── config.py ├── scripts │ ├── VOC2007.sh │ └── VOC2012.sh └── voc0712.py ├── layers ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ └── box_utils.cpython-35.pyc ├── box_utils.py ├── functions │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── detection.cpython-35.pyc │ │ └── prior_box.cpython-35.pyc │ ├── detection.py │ └── prior_box.py └── modules │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── l2norm.cpython-35.pyc │ └── multibox_loss.cpython-35.pyc │ ├── l2norm.py │ └── multibox_loss.py ├── object_detection.py ├── prev.png ├── ssd.py ├── ssd300_mAP_77.43_v2.pth.000 ├── ssd300_mAP_77.43_v2.pth.001 ├── ssd300_mAP_77.43_v2.pth.002 ├── ssd300_mAP_77.43_v2.pth.003 ├── ssd300_mAP_77.43_v2.pth.004 ├── virtual_platform_windows.yml └── working.png /README.md: -------------------------------------------------------------------------------- 1 | # Object-Detection SSD 2 |
3 | 4 | 5 | ## Intro 6 | ![Build Status](https://img.shields.io/badge/build-passing-yellowgreen.svg) 7 | ![Python 3.5](https://img.shields.io/badge/Python-3.5-green.svg) 8 | ![Pytorch 0.1.12](https://img.shields.io/badge/pytorch-0.1.12-yellow.svg) 9 | 10 | 11 | 12 | 13 | Detects multiple objects in a video using a Single Shot MultiBox Detector (SSD). 14 | 15 | The pretrained weight file is split into parts (`ssd300_mAP_77.43_v2.pth.000` through `.pth.004`). 16 | 23 | 24 | Join the weight files [here](http://pinetools.com/join-files) into a single `ssd300_mAP_77.43_v2.pth`. 25 | 26 | Read more about SSD [here](https://arxiv.org/pdf/1512.02325.pdf). 27 | 28 | 29 | Click on this image to see a demo of SSD: 30 | 31 | [![img](prev.png)](http://i.imgur.com/EyZZKAA.gif) 32 | 33 | ## Dependencies 34 | 35 | Check out the
`virtual_platform_windows.yml` file. 36 | 37 | ### Getting started 38 | 39 | Create the virtual environment from that file:
40 | ``` 41 | conda env create -f virtual_platform_windows.yml 42 | ``` 43 | 44 | ## Working 45 | ![img](working.png) 46 | 47 | 48 | ## Testing 49 | 50 | **Update `object_detection.py`** so that the `imageio` reader points at the video you want to run detection on, for example:
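A minimal sketch of the edit, assuming your clip is named `my_video.mp4` (a placeholder; any file `imageio` can decode works). The surrounding pieces (`detect`, `net`, `transform`) are already defined earlier in the script:

```python
# read your own clip instead of the bundled example video
reader = imageio.get_reader('my_video.mp4')
fps = reader.get_meta_data()['fps']                  # keep the source frame rate
writer = imageio.get_writer('output.mp4', fps=fps)   # annotated output video
for i, frame in enumerate(reader):
    frame = detect(frame, net.eval(), transform)     # run SSD on each frame and draw boxes
    writer.append_data(frame)
writer.close()
```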
51 | In the stock script the line reads `reader = imageio.get_reader('video.mp4')`; just replace `video.mp4` with the name of your clip. 52 | 53 | 54 | 55 | ## Getting the dataset for training a new model 56 | 57 | Training itself is straightforward, but a CUDA-capable GPU is effectively mandatory; on a CPU it would take far too long: 58 | 59 | ```bash 60 | # Get the PASCAL VOC dataset (http://host.robots.ox.ac.uk/pascal/VOC/index.html), 61 | # e.g. via the bundled download scripts: sh data/scripts/VOC2007.sh && sh data/scripts/VOC2012.sh 62 | ``` 63 | 64 | Created by:
Anubhav Shukla 65 | 66 | ![lic](https://img.shields.io/badge/anubhv-%C2%A92017-blue.svg) 67 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES 2 | from .config import * 3 | import cv2 4 | import numpy as np 5 | 6 | 7 | def base_transform(image, size, mean): 8 | x = cv2.resize(image, (size, size)).astype(np.float32) 9 | # x = cv2.resize(np.array(image), (size, size)).astype(np.float32) 10 | x -= mean 11 | x = x.astype(np.float32) 12 | return x 13 | 14 | 15 | class BaseTransform: 16 | def __init__(self, size, mean): 17 | self.size = size 18 | self.mean = np.array(mean, dtype=np.float32) 19 | 20 | def __call__(self, image, boxes=None, labels=None): 21 | return base_transform(image, self.size, self.mean), boxes, labels 22 | -------------------------------------------------------------------------------- /data/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/data/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /data/__pycache__/config.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/data/__pycache__/config.cpython-35.pyc -------------------------------------------------------------------------------- /data/__pycache__/voc0712.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/data/__pycache__/voc0712.cpython-35.pyc -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import os.path 3 | 4 | # gets home dir cross platform 5 | home = os.path.expanduser("~") 6 | ddir = os.path.join(home,"data/VOCdevkit/") 7 | 8 | # note: if you used our download scripts, this should be right 9 | VOCroot = ddir # path to VOCdevkit root dir 10 | 11 | # default batch size 12 | BATCHES = 32 13 | # data reshuffled at every epoch 14 | SHUFFLE = True 15 | # number of subprocesses to use for data loading 16 | WORKERS = 4 17 | 18 | 19 | #SSD300 CONFIGS 20 | # newer version: use additional conv11_2 layer as last layer before multibox layers 21 | v2 = { 22 | 'feature_maps' : [38, 19, 10, 5, 3, 1], 23 | 24 | 'min_dim' : 300, 25 | 26 | 'steps' : [8, 16, 32, 64, 100, 300], 27 | 28 | 'min_sizes' : [30, 60, 111, 162, 213, 264], 29 | 30 | 'max_sizes' : [60, 111, 162, 213, 264, 315], 31 | 32 | # 'aspect_ratios' : [[2, 1/2], [2, 1/2, 3, 1/3], [2, 1/2, 3, 1/3], 33 | # [2, 1/2, 3, 1/3], [2, 1/2], [2, 1/2]], 34 | 'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 35 | 36 | 'variance' : [0.1, 0.2], 37 | 38 | 'clip' : True, 39 | 40 | 'name' : 'v2', 41 | } 42 | 43 | # use average pooling layer as last layer before multibox layers 44 | v1 = { 45 | 'feature_maps' : [38, 19, 10, 5, 3, 1], 46 | 47 | 'min_dim' : 300, 48 | 49 | 'steps' : [8, 16, 32, 64, 100, 300], 50 | 51 | 'min_sizes' : [30, 60, 114, 168, 222, 276], 52 | 53 | 'max_sizes' : 
[-1, 114, 168, 222, 276, 330], 54 | 55 | # 'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], 56 | 'aspect_ratios' : [[1,1,2,1/2],[1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3], 57 | [1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3]], 58 | 59 | 'variance' : [0.1, 0.2], 60 | 61 | 'clip' : True, 62 | 63 | 'name' : 'v1', 64 | } 65 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 
33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import os.path 11 | import sys 12 | import torch 13 | import torch.utils.data as data 14 | import torchvision.transforms as transforms 15 | from PIL import Image, ImageDraw, ImageFont 16 | import cv2 17 | import numpy as np 18 | if sys.version_info[0] == 2: 19 | import xml.etree.cElementTree as ET 20 | else: 21 | import xml.etree.ElementTree as ET 22 | 23 | VOC_CLASSES = ( # always index 0 24 | 'aeroplane', 'bicycle', 'bird', 'boat', 25 | 'bottle', 'bus', 'car', 'cat', 'chair', 26 | 'cow', 'diningtable', 'dog', 'horse', 27 | 'motorbike', 'person', 'pottedplant', 28 | 'sheep', 'sofa', 'train', 'tvmonitor') 29 | 30 | # for making bounding boxes pretty 31 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 32 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 33 | 34 | 35 | class AnnotationTransform(object): 36 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 37 | Initilized with a dictionary lookup of classnames to indexes 38 | 39 | Arguments: 40 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 41 | (default: alphabetic indexing of VOC's 20 classes) 42 | keep_difficult (bool, optional): keep difficult instances or not 43 | (default: False) 44 | height (int): height 45 | width (int): width 46 | """ 47 | 48 | def __init__(self, class_to_ind=None, keep_difficult=False): 49 | self.class_to_ind = class_to_ind or dict( 50 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 51 | self.keep_difficult = keep_difficult 52 | 53 | def __call__(self, target, width, height): 54 | """ 55 | Arguments: 56 | target (annotation) : the target annotation to be made usable 57 | will be an ET.Element 58 | Returns: 59 | a list containing lists of bounding boxes [bbox coords, class name] 60 | """ 61 | res = [] 62 | for obj in target.iter('object'): 63 | difficult = int(obj.find('difficult').text) == 1 64 | if not self.keep_difficult and difficult: 65 | continue 66 | name = obj.find('name').text.lower().strip() 67 | bbox = obj.find('bndbox') 68 | 69 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 70 | bndbox = [] 71 | for i, pt in enumerate(pts): 72 | cur_pt = int(bbox.find(pt).text) - 1 73 | # scale height or width 74 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 75 | bndbox.append(cur_pt) 76 | label_idx = self.class_to_ind[name] 77 | bndbox.append(label_idx) 78 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 79 | # img_id = target.find('filename').text[:-4] 80 | 81 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 82 | 83 | 84 | class VOCDetection(data.Dataset): 85 | """VOC Detection Dataset Object 86 | 87 | input is image, target is annotation 88 | 89 | Arguments: 90 | root (string): filepath to VOCdevkit folder. 91 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 92 | transform (callable, optional): transformation to perform on the 93 | input image 94 | target_transform (callable, optional): transformation to perform on the 95 | target `annotation` 96 | (eg: take in caption string, return tensor of word indices) 97 | dataset_name (string, optional): which dataset to load 98 | (default: 'VOC2007') 99 | """ 100 | 101 | def __init__(self, root, image_sets, transform=None, target_transform=None, 102 | dataset_name='VOC0712'): 103 | self.root = root 104 | self.image_set = image_sets 105 | self.transform = transform 106 | self.target_transform = target_transform 107 | self.name = dataset_name 108 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 109 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 110 | self.ids = list() 111 | for (year, name) in image_sets: 112 | rootpath = os.path.join(self.root, 'VOC' + year) 113 | for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 114 | self.ids.append((rootpath, line.strip())) 115 | 116 | def __getitem__(self, index): 117 | im, gt, h, w = self.pull_item(index) 118 | 119 | return im, gt 120 | 121 | def __len__(self): 122 | return len(self.ids) 123 | 124 | def pull_item(self, index): 125 | img_id = self.ids[index] 126 | 127 | target = ET.parse(self._annopath % img_id).getroot() 128 | img = cv2.imread(self._imgpath % img_id) 129 | height, width, channels = img.shape 130 | 131 | if self.target_transform is not None: 132 | target = self.target_transform(target, width, height) 133 | 134 | if self.transform is not None: 135 | target = np.array(target) 136 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 137 | # to rgb 138 | img = img[:, :, (2, 1, 0)] 139 | # img = img.transpose(2, 0, 1) 140 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 141 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 142 | # return torch.from_numpy(img), target, height, width 143 | 144 | def pull_image(self, index): 145 | '''Returns the original image object at index in PIL form 146 | 147 | Note: not using self.__getitem__(), as any transformations passed in 148 | could mess up this functionality. 149 | 150 | Argument: 151 | index (int): index of img to show 152 | Return: 153 | PIL img 154 | ''' 155 | img_id = self.ids[index] 156 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 157 | 158 | def pull_anno(self, index): 159 | '''Returns the original annotation of image at index 160 | 161 | Note: not using self.__getitem__(), as any transformations passed in 162 | could mess up this functionality. 163 | 164 | Argument: 165 | index (int): index of img to get annotation of 166 | Return: 167 | list: [img_id, [(label, bbox coords),...]] 168 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 169 | ''' 170 | img_id = self.ids[index] 171 | anno = ET.parse(self._annopath % img_id).getroot() 172 | gt = self.target_transform(anno, 1, 1) 173 | return img_id[1], gt 174 | 175 | def pull_tensor(self, index): 176 | '''Returns the original image at an index in tensor form 177 | 178 | Note: not using self.__getitem__(), as any transformations passed in 179 | could mess up this functionality. 
180 | 181 | Argument: 182 | index (int): index of img to show 183 | Return: 184 | tensorized version of img, squeezed 185 | ''' 186 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 187 | 188 | 189 | def detection_collate(batch): 190 | """Custom collate fn for dealing with batches of images that have a different 191 | number of associated object annotations (bounding boxes). 192 | 193 | Arguments: 194 | batch: (tuple) A tuple of tensor images and lists of annotations 195 | 196 | Return: 197 | A tuple containing: 198 | 1) (tensor) batch of images stacked on their 0 dim 199 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 200 | """ 201 | targets = [] 202 | imgs = [] 203 | for sample in batch: 204 | imgs.append(sample[0]) 205 | targets.append(torch.FloatTensor(sample[1])) 206 | return torch.stack(imgs, 0), targets 207 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /layers/__pycache__/box_utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/__pycache__/box_utils.cpython-35.pyc -------------------------------------------------------------------------------- /layers/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def point_form(boxes): 4 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 5 | representation for comparison to point form ground truth data. 6 | Args: 7 | boxes: (tensor) center-size default boxes from priorbox layers. 8 | Return: 9 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 10 | """ 11 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 12 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 13 | 14 | 15 | def center_size(boxes): 16 | """ Convert prior_boxes to (cx, cy, w, h) 17 | representation for comparison to center-size form ground truth data. 18 | Args: 19 | boxes: (tensor) point_form boxes 20 | Return: 21 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 22 | """ 23 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 24 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 25 | 26 | 27 | def intersect(box_a, box_b): 28 | """ We resize both tensors to [A,B,2] without new malloc: 29 | [A,2] -> [A,1,2] -> [A,B,2] 30 | [B,2] -> [1,B,2] -> [A,B,2] 31 | Then we compute the area of intersect between box_a and box_b. 32 | Args: 33 | box_a: (tensor) bounding boxes, Shape: [A,4]. 34 | box_b: (tensor) bounding boxes, Shape: [B,4]. 35 | Return: 36 | (tensor) intersection area, Shape: [A,B]. 
37 | """ 38 | A = box_a.size(0) 39 | B = box_b.size(0) 40 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 41 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 42 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 43 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 44 | inter = torch.clamp((max_xy - min_xy), min=0) 45 | return inter[:, :, 0] * inter[:, :, 1] 46 | 47 | 48 | def jaccard(box_a, box_b): 49 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 50 | is simply the intersection over union of two boxes. Here we operate on 51 | ground truth boxes and default boxes. 52 | E.g.: 53 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 54 | Args: 55 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 56 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 57 | Return: 58 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 59 | """ 60 | inter = intersect(box_a, box_b) 61 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 62 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 63 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 64 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 65 | union = area_a + area_b - inter 66 | return inter / union # [A,B] 67 | 68 | 69 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 70 | """Match each prior box with the ground truth box of the highest jaccard 71 | overlap, encode the bounding boxes, then return the matched indices 72 | corresponding to both confidence and location preds. 73 | Args: 74 | threshold: (float) The overlap threshold used when mathing boxes. 75 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 76 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 77 | variances: (tensor) Variances corresponding to each prior coord, 78 | Shape: [num_priors, 4]. 79 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 80 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 81 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 82 | idx: (int) current batch index 83 | Return: 84 | The matched indices corresponding to 1)location and 2)confidence preds. 
85 | """ 86 | # jaccard index 87 | overlaps = jaccard( 88 | truths, 89 | point_form(priors) 90 | ) 91 | # (Bipartite Matching) 92 | # [1,num_objects] best prior for each ground truth 93 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 94 | # [1,num_priors] best ground truth for each prior 95 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 96 | best_truth_idx.squeeze_(0) 97 | best_truth_overlap.squeeze_(0) 98 | best_prior_idx.squeeze_(1) 99 | best_prior_overlap.squeeze_(1) 100 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 101 | # TODO refactor: index best_prior_idx with long tensor 102 | # ensure every gt matches with its prior of max overlap 103 | for j in range(best_prior_idx.size(0)): 104 | best_truth_idx[best_prior_idx[j]] = j 105 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 106 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors] 107 | conf[best_truth_overlap < threshold] = 0 # label as background 108 | loc = encode(matches, priors, variances) 109 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 110 | conf_t[idx] = conf # [num_priors] top class label for each prior 111 | 112 | 113 | def encode(matched, priors, variances): 114 | """Encode the variances from the priorbox layers into the ground truth boxes 115 | we have matched (based on jaccard overlap) with the prior boxes. 116 | Args: 117 | matched: (tensor) Coords of ground truth for each prior in point-form 118 | Shape: [num_priors, 4]. 119 | priors: (tensor) Prior boxes in center-offset form 120 | Shape: [num_priors,4]. 121 | variances: (list[float]) Variances of priorboxes 122 | Return: 123 | encoded boxes (tensor), Shape: [num_priors, 4] 124 | """ 125 | 126 | # dist b/t match center and prior's center 127 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 128 | # encode variance 129 | g_cxcy /= (variances[0] * priors[:, 2:]) 130 | # match wh / prior wh 131 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 132 | g_wh = torch.log(g_wh) / variances[1] 133 | # return target for smooth_l1_loss 134 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 135 | 136 | 137 | # Adapted from https://github.com/Hakuyume/chainer-ssd 138 | def decode(loc, priors, variances): 139 | """Decode locations from predictions using priors to undo 140 | the encoding we did for offset regression at train time. 141 | Args: 142 | loc (tensor): location predictions for loc layers, 143 | Shape: [num_priors,4] 144 | priors (tensor): Prior boxes in center-offset form. 145 | Shape: [num_priors,4]. 146 | variances: (list[float]) Variances of priorboxes 147 | Return: 148 | decoded bounding box predictions 149 | """ 150 | 151 | boxes = torch.cat(( 152 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 153 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 154 | boxes[:, :2] -= boxes[:, 2:] / 2 155 | boxes[:, 2:] += boxes[:, :2] 156 | return boxes 157 | 158 | 159 | def log_sum_exp(x): 160 | """Utility function for computing log_sum_exp while determining 161 | This will be used to determine unaveraged confidence loss across 162 | all examples in a batch. 
163 | Args: 164 | x (Variable(tensor)): conf_preds from conf layers 165 | """ 166 | x_max = x.data.max() 167 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 168 | 169 | 170 | # Original author: Francisco Massa: 171 | # https://github.com/fmassa/object-detection.torch 172 | # Ported to PyTorch by Max deGroot (02/01/2017) 173 | def nms(boxes, scores, overlap=0.5, top_k=200): 174 | """Apply non-maximum suppression at test time to avoid detecting too many 175 | overlapping bounding boxes for a given object. 176 | Args: 177 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 178 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 179 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 180 | top_k: (int) The Maximum number of box preds to consider. 181 | Return: 182 | The indices of the kept boxes with respect to num_priors. 183 | """ 184 | 185 | keep = scores.new(scores.size(0)).zero_().long() 186 | if boxes.numel() == 0: 187 | return keep 188 | x1 = boxes[:, 0] 189 | y1 = boxes[:, 1] 190 | x2 = boxes[:, 2] 191 | y2 = boxes[:, 3] 192 | area = torch.mul(x2 - x1, y2 - y1) 193 | v, idx = scores.sort(0) # sort in ascending order 194 | # I = I[v >= 0.01] 195 | idx = idx[-top_k:] # indices of the top-k largest vals 196 | xx1 = boxes.new() 197 | yy1 = boxes.new() 198 | xx2 = boxes.new() 199 | yy2 = boxes.new() 200 | w = boxes.new() 201 | h = boxes.new() 202 | 203 | # keep = torch.Tensor() 204 | count = 0 205 | while idx.numel() > 0: 206 | i = idx[-1] # index of current largest val 207 | # keep.append(i) 208 | keep[count] = i 209 | count += 1 210 | if idx.size(0) == 1: 211 | break 212 | idx = idx[:-1] # remove kept element from view 213 | # load bboxes of next highest vals 214 | torch.index_select(x1, 0, idx, out=xx1) 215 | torch.index_select(y1, 0, idx, out=yy1) 216 | torch.index_select(x2, 0, idx, out=xx2) 217 | torch.index_select(y2, 0, idx, out=yy2) 218 | # store element-wise max with next highest score 219 | xx1 = torch.clamp(xx1, min=x1[i]) 220 | yy1 = torch.clamp(yy1, min=y1[i]) 221 | xx2 = torch.clamp(xx2, max=x2[i]) 222 | yy2 = torch.clamp(yy2, max=y2[i]) 223 | w.resize_as_(xx2) 224 | h.resize_as_(yy2) 225 | w = xx2 - xx1 226 | h = yy2 - yy1 227 | # check sizes of xx1 and xx2.. 
after each iteration 228 | w = torch.clamp(w, min=0.0) 229 | h = torch.clamp(h, min=0.0) 230 | inter = w*h 231 | # IoU = i / (area(a) + area(b) - i) 232 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 233 | union = (rem_areas - inter) + area[i] 234 | IoU = inter/union # store result in iou 235 | # keep only elements with an IoU <= overlap 236 | idx = idx[IoU.le(overlap)] 237 | return keep, count 238 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /layers/functions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/functions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /layers/functions/__pycache__/detection.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/functions/__pycache__/detection.cpython-35.pyc -------------------------------------------------------------------------------- /layers/functions/__pycache__/prior_box.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/functions/__pycache__/prior_box.cpython-35.pyc -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from torch.autograd import Function 5 | from torch.autograd import Variable 6 | from ..box_utils import decode, nms 7 | from data import v2 as cfg 8 | 9 | 10 | class Detect(Function): 11 | """At test time, Detect is the final layer of SSD. Decode location preds, 12 | apply non-maximum suppression to location predictions based on conf 13 | scores and threshold to a top_k number of output predictions for both 14 | confidence score and locations. 15 | """ 16 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): 17 | self.num_classes = num_classes 18 | self.background_label = bkg_label 19 | self.top_k = top_k 20 | # Parameters used in nms. 
21 | self.nms_thresh = nms_thresh 22 | if nms_thresh <= 0: 23 | raise ValueError('nms_threshold must be non negative.') 24 | self.conf_thresh = conf_thresh 25 | self.variance = cfg['variance'] 26 | self.output = torch.zeros(1, self.num_classes, self.top_k, 5) 27 | 28 | def forward(self, loc_data, conf_data, prior_data): 29 | """ 30 | Args: 31 | loc_data: (tensor) Loc preds from loc layers 32 | Shape: [batch,num_priors*4] 33 | conf_data: (tensor) Shape: Conf preds from conf layers 34 | Shape: [batch*num_priors,num_classes] 35 | prior_data: (tensor) Prior boxes and variances from priorbox layers 36 | Shape: [1,num_priors,4] 37 | """ 38 | num = loc_data.size(0) # batch size 39 | num_priors = prior_data.size(0) 40 | self.output.zero_() 41 | if num == 1: 42 | # size batch x num_classes x num_priors 43 | conf_preds = conf_data.t().contiguous().unsqueeze(0) 44 | else: 45 | conf_preds = conf_data.view(num, num_priors, 46 | self.num_classes).transpose(2, 1) 47 | self.output.expand_(num, self.num_classes, self.top_k, 5) 48 | 49 | # Decode predictions into bboxes. 50 | for i in range(num): 51 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 52 | # For each class, perform nms 53 | conf_scores = conf_preds[i].clone() 54 | num_det = 0 55 | for cl in range(1, self.num_classes): 56 | c_mask = conf_scores[cl].gt(self.conf_thresh) 57 | scores = conf_scores[cl][c_mask] 58 | if scores.dim() == 0: 59 | continue 60 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 61 | boxes = decoded_boxes[l_mask].view(-1, 4) 62 | # idx of highest scoring and non-overlapping boxes per class 63 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 64 | self.output[i, cl, :count] = \ 65 | torch.cat((scores[ids[:count]].unsqueeze(1), 66 | boxes[ids[:count]]), 1) 67 | flt = self.output.view(-1, 5) 68 | _, idx = flt[:, 0].sort(0) 69 | _, rank = idx.sort(0) 70 | flt[(rank >= self.top_k).unsqueeze(1).expand_as(flt)].fill_(0) 71 | return self.output 72 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | 5 | class PriorBox(object): 6 | """Compute priorbox coordinates in center-offset form for each source 7 | feature map. 8 | Note: 9 | This 'layer' has changed between versions of the original SSD 10 | paper, so we include both versions, but note v2 is the most tested and most 11 | recent version of the paper. 
12 | 13 | """ 14 | def __init__(self, cfg): 15 | super(PriorBox, self).__init__() 16 | # self.type = cfg.name 17 | self.image_size = cfg['min_dim'] 18 | # number of priors for feature map location (either 4 or 6) 19 | self.num_priors = len(cfg['aspect_ratios']) 20 | self.variance = cfg['variance'] or [0.1] 21 | self.feature_maps = cfg['feature_maps'] 22 | self.min_sizes = cfg['min_sizes'] 23 | self.max_sizes = cfg['max_sizes'] 24 | self.steps = cfg['steps'] 25 | self.aspect_ratios = cfg['aspect_ratios'] 26 | self.clip = cfg['clip'] 27 | self.version = cfg['name'] 28 | for v in self.variance: 29 | if v <= 0: 30 | raise ValueError('Variances must be greater than 0') 31 | 32 | def forward(self): 33 | mean = [] 34 | # TODO merge these 35 | if self.version == 'v2': 36 | for k, f in enumerate(self.feature_maps): 37 | for i, j in product(range(f), repeat=2): 38 | f_k = self.image_size / self.steps[k] 39 | # unit center x,y 40 | cx = (j + 0.5) / f_k 41 | cy = (i + 0.5) / f_k 42 | 43 | # aspect_ratio: 1 44 | # rel size: min_size 45 | s_k = self.min_sizes[k]/self.image_size 46 | mean += [cx, cy, s_k, s_k] 47 | 48 | # aspect_ratio: 1 49 | # rel size: sqrt(s_k * s_(k+1)) 50 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) 51 | mean += [cx, cy, s_k_prime, s_k_prime] 52 | 53 | # rest of aspect ratios 54 | for ar in self.aspect_ratios[k]: 55 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)] 56 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)] 57 | 58 | else: 59 | # original version generation of prior (default) boxes 60 | for i, k in enumerate(self.feature_maps): 61 | step_x = step_y = self.image_size/k 62 | for h, w in product(range(k), repeat=2): 63 | c_x = ((w+0.5) * step_x) 64 | c_y = ((h+0.5) * step_y) 65 | c_w = c_h = self.min_sizes[i] / 2 66 | s_k = self.image_size # 300 67 | # aspect_ratio: 1, 68 | # size: min_size 69 | mean += [(c_x-c_w)/s_k, (c_y-c_h)/s_k, 70 | (c_x+c_w)/s_k, (c_y+c_h)/s_k] 71 | if self.max_sizes[i] > 0: 72 | # aspect_ratio: 1 73 | # size: sqrt(min_size * max_size)/2 74 | c_w = c_h = sqrt(self.min_sizes[i] * 75 | self.max_sizes[i])/2 76 | mean += [(c_x-c_w)/s_k, (c_y-c_h)/s_k, 77 | (c_x+c_w)/s_k, (c_y+c_h)/s_k] 78 | # rest of prior boxes 79 | for ar in self.aspect_ratios[i]: 80 | if not (abs(ar-1) < 1e-6): 81 | c_w = self.min_sizes[i] * sqrt(ar)/2 82 | c_h = self.min_sizes[i] / sqrt(ar)/2 83 | mean += [(c_x-c_w)/s_k, (c_y-c_h)/s_k, 84 | (c_x+c_w)/s_k, (c_y+c_h)/s_k] 85 | # back to torch land 86 | output = torch.Tensor(mean).view(-1, 4) 87 | if self.clip: 88 | output.clamp_(max=1, min=0) 89 | return output 90 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .l2norm import L2Norm 2 | from .multibox_loss import MultiBoxLoss 3 | 4 | __all__ = ['L2Norm', 'MultiBoxLoss'] 5 | -------------------------------------------------------------------------------- /layers/modules/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/modules/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /layers/modules/__pycache__/l2norm.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/modules/__pycache__/l2norm.cpython-35.pyc -------------------------------------------------------------------------------- /layers/modules/__pycache__/multibox_loss.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/modules/__pycache__/multibox_loss.cpython-35.pyc -------------------------------------------------------------------------------- /layers/modules/l2norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self,n_channels, scale): 9 | super(L2Norm,self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = scale or None 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant(self.weight,self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(2).sum(1).sqrt()+self.eps 21 | x/=norm.expand_as(x) 22 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 23 | return out 24 | -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from data import v2 as cfg 6 | from ..box_utils import match, log_sum_exp 7 | 8 | class MultiBoxLoss(nn.Module): 9 | """SSD Weighted Loss Function 10 | Compute Targets: 11 | 1) Produce Confidence Target Indices by matching ground truth boxes 12 | with (default) 'priorboxes' that have jaccard index > threshold parameter 13 | (default threshold: 0.5). 14 | 2) Produce localization target by 'encoding' variance into offsets of ground 15 | truth boxes and their matched 'priorboxes'. 16 | 3) Hard negative mining to filter the excessive number of negative examples 17 | that comes with using a large number of default bounding boxes. 18 | (default negative:positive ratio 3:1) 19 | Objective Loss: 20 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 21 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 22 | weighted by α which is set to 1 by cross val. 23 | Args: 24 | c: class confidences, 25 | l: predicted boxes, 26 | g: ground truth boxes 27 | N: number of matched default boxes 28 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
29 | """ 30 | 31 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, 32 | bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, 33 | use_gpu=True): 34 | super(MultiBoxLoss, self).__init__() 35 | self.use_gpu = use_gpu 36 | self.num_classes = num_classes 37 | self.threshold = overlap_thresh 38 | self.background_label = bkg_label 39 | self.encode_target = encode_target 40 | self.use_prior_for_matching = prior_for_matching 41 | self.do_neg_mining = neg_mining 42 | self.negpos_ratio = neg_pos 43 | self.neg_overlap = neg_overlap 44 | self.variance = cfg['variance'] 45 | 46 | def forward(self, predictions, targets): 47 | """Multibox Loss 48 | Args: 49 | predictions (tuple): A tuple containing loc preds, conf preds, 50 | and prior boxes from SSD net. 51 | conf shape: torch.size(batch_size,num_priors,num_classes) 52 | loc shape: torch.size(batch_size,num_priors,4) 53 | priors shape: torch.size(num_priors,4) 54 | 55 | ground_truth (tensor): Ground truth boxes and labels for a batch, 56 | shape: [batch_size,num_objs,5] (last idx is the label). 57 | """ 58 | loc_data, conf_data, priors = predictions 59 | num = loc_data.size(0) 60 | priors = priors[:loc_data.size(1), :] 61 | num_priors = (priors.size(0)) 62 | num_classes = self.num_classes 63 | 64 | # match priors (default boxes) and ground truth boxes 65 | loc_t = torch.Tensor(num, num_priors, 4) 66 | conf_t = torch.LongTensor(num, num_priors) 67 | for idx in range(num): 68 | truths = targets[idx][:, :-1].data 69 | labels = targets[idx][:, -1].data 70 | defaults = priors.data 71 | match(self.threshold, truths, defaults, self.variance, labels, 72 | loc_t, conf_t, idx) 73 | if self.use_gpu: 74 | loc_t = loc_t.cuda() 75 | conf_t = conf_t.cuda() 76 | # wrap targets 77 | loc_t = Variable(loc_t, requires_grad=False) 78 | conf_t = Variable(conf_t, requires_grad=False) 79 | 80 | pos = conf_t > 0 81 | num_pos = pos.sum(keepdim=True) 82 | 83 | # Localization Loss (Smooth L1) 84 | # Shape: [batch,num_priors,4] 85 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 86 | loc_p = loc_data[pos_idx].view(-1, 4) 87 | loc_t = loc_t[pos_idx].view(-1, 4) 88 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 89 | 90 | # Compute max conf across batch for hard negative mining 91 | batch_conf = conf_data.view(-1, self.num_classes) 92 | 93 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 94 | 95 | # Hard Negative Mining 96 | loss_c[pos] = 0 # filter out pos boxes for now 97 | loss_c = loss_c.view(num, -1) 98 | _, loss_idx = loss_c.sort(1, descending=True) 99 | _, idx_rank = loss_idx.sort(1) 100 | num_pos = pos.long().sum(1, keepdim=True) 101 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 102 | neg = idx_rank < num_neg.expand_as(idx_rank) 103 | 104 | # Confidence Loss Including Positive and Negative Examples 105 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 106 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 107 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes) 108 | targets_weighted = conf_t[(pos+neg).gt(0)] 109 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 110 | 111 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 112 | 113 | N = num_pos.data.sum() 114 | loss_l /= N 115 | loss_c /= N 116 | return loss_l, loss_c 117 | -------------------------------------------------------------------------------- /object_detection.py: -------------------------------------------------------------------------------- 1 | # Object 
Detection 2 | 3 | # Importing the libraries 4 | import torch 5 | from torch.autograd import Variable 6 | import cv2 7 | from data import BaseTransform, VOC_CLASSES as labelmap 8 | from ssd import build_ssd 9 | import imageio 10 | 11 | # Defining a function that will do the detections 12 | def detect(frame, net, transform): 13 | height, width = frame.shape[:2] 14 | frame_t = transform(frame)[0] 15 | x = torch.from_numpy(frame_t).permute(2, 0, 1) 16 | x = Variable(x.unsqueeze(0)) 17 | y = net(x) 18 | detections = y.data 19 | scale = torch.Tensor([width, height, width, height]) 20 | # detections = [batch, number of classes, number of occurence, (score, x0, Y0, x1, y1)] 21 | for i in range(detections.size(1)): 22 | j = 0 23 | while detections[0, i, j, 0] >= 0.6: 24 | pt = (detections[0, i, j, 1:] * scale).numpy() 25 | cv2.rectangle(frame, (int(pt[0]), int(pt[1])), (int(pt[2]), int(pt[3])), (255, 0, 0), 2) 26 | cv2.putText(frame, labelmap[i - 1], (int(pt[0]), int(pt[1])), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2, cv2.LINE_AA) 27 | j += 1 28 | return frame 29 | 30 | # Creating the SSD neural network 31 | net = build_ssd('test') 32 | net.load_state_dict(torch.load('ssd300_mAP_77.43_v2.pth', map_location = lambda storage, loc: storage)) 33 | 34 | # Creating the transformation 35 | transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0)) 36 | 37 | # Doing some Object Detection on a video 38 | reader = imageio.get_reader('dog.mp4') 39 | fps = reader.get_meta_data()['fps'] 40 | writer = imageio.get_writer('output.mp4', fps = fps) 41 | for i, frame in enumerate(reader): 42 | frame = detect(frame, net.eval(), transform) 43 | writer.append_data(frame) 44 | print(i) 45 | writer.close() 46 | -------------------------------------------------------------------------------- /prev.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/prev.png -------------------------------------------------------------------------------- /ssd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from layers import * 6 | from data import v2 7 | import os 8 | 9 | 10 | class SSD(nn.Module): 11 | """Single Shot Multibox Architecture 12 | The network is composed of a base VGG network followed by the 13 | added multibox conv layers. Each multibox layer branches into 14 | 1) conv2d for class conf scores 15 | 2) conv2d for localization predictions 16 | 3) associated priorbox layer to produce default bounding 17 | boxes specific to the layer's feature map size. 18 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
19 | 20 | Args: 21 | phase: (string) Can be "test" or "train" 22 | base: VGG16 layers for input, size of either 300 or 500 23 | extras: extra layers that feed to multibox loc and conf layers 24 | head: "multibox head" consists of loc and conf conv layers 25 | """ 26 | 27 | def __init__(self, phase, base, extras, head, num_classes): 28 | super(SSD, self).__init__() 29 | self.phase = phase 30 | self.num_classes = num_classes 31 | # TODO: implement __call__ in PriorBox 32 | self.priorbox = PriorBox(v2) 33 | self.priors = Variable(self.priorbox.forward(), volatile=True) 34 | self.size = 300 35 | 36 | # SSD network 37 | self.vgg = nn.ModuleList(base) 38 | # Layer learns to scale the l2 normalized features from conv4_3 39 | self.L2Norm = L2Norm(512, 20) 40 | self.extras = nn.ModuleList(extras) 41 | 42 | self.loc = nn.ModuleList(head[0]) 43 | self.conf = nn.ModuleList(head[1]) 44 | 45 | if phase == 'test': 46 | self.softmax = nn.Softmax() 47 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 48 | 49 | def forward(self, x): 50 | """Applies network layers and ops on input image(s) x. 51 | 52 | Args: 53 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 54 | 55 | Return: 56 | Depending on phase: 57 | test: 58 | Variable(tensor) of output class label predictions, 59 | confidence score, and corresponding location predictions for 60 | each object detected. Shape: [batch,topk,7] 61 | 62 | train: 63 | list of concat outputs from: 64 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 65 | 2: localization layers, Shape: [batch,num_priors*4] 66 | 3: priorbox layers, Shape: [2,num_priors*4] 67 | """ 68 | sources = list() 69 | loc = list() 70 | conf = list() 71 | 72 | # apply vgg up to conv4_3 relu 73 | for k in range(23): 74 | x = self.vgg[k](x) 75 | 76 | s = self.L2Norm(x) 77 | sources.append(s) 78 | 79 | # apply vgg up to fc7 80 | for k in range(23, len(self.vgg)): 81 | x = self.vgg[k](x) 82 | sources.append(x) 83 | 84 | # apply extra layers and cache source layer outputs 85 | for k, v in enumerate(self.extras): 86 | x = F.relu(v(x), inplace=True) 87 | if k % 2 == 1: 88 | sources.append(x) 89 | 90 | # apply multibox head to source layers 91 | for (x, l, c) in zip(sources, self.loc, self.conf): 92 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 93 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 94 | 95 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 96 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 97 | if self.phase == "test": 98 | output = self.detect( 99 | loc.view(loc.size(0), -1, 4), # loc preds 100 | self.softmax(conf.view(-1, self.num_classes)), # conf preds 101 | self.priors.type(type(x.data)) # default boxes 102 | ) 103 | else: 104 | output = ( 105 | loc.view(loc.size(0), -1, 4), 106 | conf.view(conf.size(0), -1, self.num_classes), 107 | self.priors 108 | ) 109 | return output 110 | 111 | def load_weights(self, base_file): 112 | other, ext = os.path.splitext(base_file) 113 | if ext == '.pkl' or '.pth': 114 | print('Loading weights into state dict...') 115 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 116 | print('Finished!') 117 | else: 118 | print('Sorry only .pth and .pkl files supported.') 119 | 120 | 121 | # This function is derived from torchvision VGG make_layers() 122 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 123 | def vgg(cfg, i, batch_norm=False): 124 | layers = [] 125 | in_channels = i 126 | for v in cfg: 127 | if v == 'M': 128 | layers += 
[nn.MaxPool2d(kernel_size=2, stride=2)] 129 | elif v == 'C': 130 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 131 | else: 132 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 133 | if batch_norm: 134 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 135 | else: 136 | layers += [conv2d, nn.ReLU(inplace=True)] 137 | in_channels = v 138 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 139 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 140 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 141 | layers += [pool5, conv6, 142 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 143 | return layers 144 | 145 | 146 | def add_extras(cfg, i, batch_norm=False): 147 | # Extra layers added to VGG for feature scaling 148 | layers = [] 149 | in_channels = i 150 | flag = False 151 | for k, v in enumerate(cfg): 152 | if in_channels != 'S': 153 | if v == 'S': 154 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 155 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 156 | else: 157 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 158 | flag = not flag 159 | in_channels = v 160 | return layers 161 | 162 | 163 | def multibox(vgg, extra_layers, cfg, num_classes): 164 | loc_layers = [] 165 | conf_layers = [] 166 | vgg_source = [24, -2] 167 | for k, v in enumerate(vgg_source): 168 | loc_layers += [nn.Conv2d(vgg[v].out_channels, 169 | cfg[k] * 4, kernel_size=3, padding=1)] 170 | conf_layers += [nn.Conv2d(vgg[v].out_channels, 171 | cfg[k] * num_classes, kernel_size=3, padding=1)] 172 | for k, v in enumerate(extra_layers[1::2], 2): 173 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 174 | * 4, kernel_size=3, padding=1)] 175 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 176 | * num_classes, kernel_size=3, padding=1)] 177 | return vgg, extra_layers, (loc_layers, conf_layers) 178 | 179 | 180 | base = { 181 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 182 | 512, 512, 512], 183 | '512': [], 184 | } 185 | extras = { 186 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 187 | '512': [], 188 | } 189 | mbox = { 190 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 191 | '512': [], 192 | } 193 | 194 | 195 | def build_ssd(phase, size=300, num_classes=21): 196 | if phase != "test" and phase != "train": 197 | print("Error: Phase not recognized") 198 | return 199 | if size != 300: 200 | print("Error: Sorry only SSD300 is supported currently!") 201 | return 202 | 203 | return SSD(phase, *multibox(vgg(base[str(size)], 3), 204 | add_extras(extras[str(size)], 1024), 205 | mbox[str(size)], num_classes), num_classes) 206 | -------------------------------------------------------------------------------- /ssd300_mAP_77.43_v2.pth.000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.000 -------------------------------------------------------------------------------- /ssd300_mAP_77.43_v2.pth.001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.001 -------------------------------------------------------------------------------- /ssd300_mAP_77.43_v2.pth.002: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.002 -------------------------------------------------------------------------------- /ssd300_mAP_77.43_v2.pth.003: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.003 -------------------------------------------------------------------------------- /ssd300_mAP_77.43_v2.pth.004: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.004 -------------------------------------------------------------------------------- /virtual_platform_windows.yml: -------------------------------------------------------------------------------- 1 | name: virtual_platform 2 | channels: 3 | - menpo 4 | - conda-forge 5 | - peterjc123 6 | - defaults 7 | dependencies: 8 | - ffmpeg=3.2.4=1 9 | - freetype=2.7=vc14_1 10 | - imageio=2.2.0=py35_0 11 | - libtiff=4.0.6=vc14_7 12 | - olefile=0.44=py35_0 13 | - pillow=4.2.1=py35_0 14 | - vc=14=0 15 | - alabaster=0.7.10=py35_0 16 | - astroid=1.5.3=py35_0 17 | - babel=2.5.0=py35_0 18 | - bleach=1.5.0=py35_0 19 | - certifi=2016.2.28=py35_0 20 | - cffi=1.10.0=py35_0 21 | - chardet=3.0.4=py35_0 22 | - colorama=0.3.9=py35_0 23 | - decorator=4.1.2=py35_0 24 | - docutils=0.14=py35_0 25 | - entrypoints=0.2.3=py35_0 26 | - html5lib=0.9999999=py35_0 27 | - icu=57.1=vc14_0 28 | - imagesize=0.7.1=py35_0 29 | - ipykernel=4.6.1=py35_0 30 | - ipython=6.1.0=py35_0 31 | - ipython_genutils=0.2.0=py35_0 32 | - isort=4.2.15=py35_0 33 | - jedi=0.10.2=py35_2 34 | - jinja2=2.9.6=py35_0 35 | - jpeg=9b=vc14_0 36 | - jsonschema=2.6.0=py35_0 37 | - jupyter_client=5.1.0=py35_0 38 | - jupyter_core=4.3.0=py35_0 39 | - lazy-object-proxy=1.3.1=py35_0 40 | - libpng=1.6.30=vc14_1 41 | - markupsafe=1.0=py35_0 42 | - mistune=0.7.4=py35_0 43 | - mkl=2017.0.3=0 44 | - nbconvert=5.2.1=py35_0 45 | - nbformat=4.4.0=py35_0 46 | - numpy=1.13.1=py35_0 47 | - numpydoc=0.7.0=py35_0 48 | - openssl=1.0.2l=vc14_0 49 | - pandocfilters=1.4.2=py35_0 50 | - path.py=10.3.1=py35_0 51 | - pickleshare=0.7.4=py35_0 52 | - pip=9.0.1=py35_1 53 | - prompt_toolkit=1.0.15=py35_0 54 | - psutil=5.2.2=py35_0 55 | - pycodestyle=2.3.1=py35_0 56 | - pycparser=2.18=py35_0 57 | - pyflakes=1.6.0=py35_0 58 | - pygments=2.2.0=py35_0 59 | - pylint=1.7.2=py35_0 60 | - pyqt=5.6.0=py35_2 61 | - python=3.5.4=0 62 | - python-dateutil=2.6.1=py35_0 63 | - pytz=2017.2=py35_0 64 | - pyzmq=16.0.2=py35_0 65 | - qt=5.6.2=vc14_6 66 | - qtawesome=0.4.4=py35_0 67 | - qtconsole=4.3.1=py35_0 68 | - qtpy=1.3.1=py35_0 69 | - requests=2.14.2=py35_0 70 | - rope=0.9.4=py35_1 71 | - setuptools=36.4.0=py35_1 72 | - simplegeneric=0.8.1=py35_1 73 | - singledispatch=3.4.0.3=py35_0 74 | - sip=4.18=py35_0 75 | - six=1.10.0=py35_1 76 | - snowballstemmer=1.2.1=py35_0 77 | - sphinx=1.6.3=py35_0 78 | - sphinxcontrib=1.0=py35_0 79 | - sphinxcontrib-websupport=1.0.1=py35_0 80 | - spyder=3.2.3=py35_0 81 | - testpath=0.3.1=py35_0 82 | - tornado=4.5.2=py35_0 83 | - traitlets=4.3.2=py35_0 84 | - vs2015_runtime=14.0.25420=0 85 | - wcwidth=0.1.7=py35_0 86 | - wheel=0.29.0=py35_0 87 | - win_unicode_console=0.5=py35_0 88 | - wincertstore=0.2=py35_0 89 | - wrapt=1.10.11=py35_0 90 | - zlib=1.2.11=vc14_0 91 | - opencv3=3.1.0=py35_0 92 | - 
pytorch=0.1.12=py35_0.1.12cu80 93 | - pip: 94 | - ipython-genutils==0.2.0 95 | - jupyter-client==5.1.0 96 | - jupyter-core==4.3.0 97 | - prompt-toolkit==1.0.15 98 | - pyyaml==3.12 99 | - rope-py3k==0.9.4.post1 100 | - torch==0.1.12 101 | - torchvision==0.1.9 102 | - win-unicode-console==0.5 103 | -------------------------------------------------------------------------------- /working.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/working.png --------------------------------------------------------------------------------