├── README.md
├── data
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── config.cpython-35.pyc
│   │   └── voc0712.cpython-35.pyc
│   ├── config.py
│   ├── scripts
│   │   ├── VOC2007.sh
│   │   └── VOC2012.sh
│   └── voc0712.py
├── layers
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   └── box_utils.cpython-35.pyc
│   ├── box_utils.py
│   ├── functions
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-35.pyc
│   │   │   ├── detection.cpython-35.pyc
│   │   │   └── prior_box.cpython-35.pyc
│   │   ├── detection.py
│   │   └── prior_box.py
│   └── modules
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-35.pyc
│       │   ├── l2norm.cpython-35.pyc
│       │   └── multibox_loss.cpython-35.pyc
│       ├── l2norm.py
│       └── multibox_loss.py
├── object_detection.py
├── prev.png
├── ssd.py
├── ssd300_mAP_77.43_v2.pth.000
├── ssd300_mAP_77.43_v2.pth.001
├── ssd300_mAP_77.43_v2.pth.002
├── ssd300_mAP_77.43_v2.pth.003
├── ssd300_mAP_77.43_v2.pth.004
├── virtual_platform_windows.yml
└── working.png
/README.md:
--------------------------------------------------------------------------------
1 | # Object-Detection SSD
2 |
3 |
4 |
5 | ## Intro
6 | 
7 | 
8 | 
9 |
10 |
11 |
12 |
13 | Detecting multiple objects in a video using a Single Shot MultiBox Detector (SSD).
14 |
15 | The pretrained weight file is split into the following parts:
16 |
17 | - ssd300_mAP_77.43_v2.pth.000
18 | - ssd300_mAP_77.43_v2.pth.001
19 | - ssd300_mAP_77.43_v2.pth.002
20 | - ssd300_mAP_77.43_v2.pth.003
21 | - ssd300_mAP_77.43_v2.pth.004
22 |
23 |
24 | Join the weight files with an online tool such as [this one](http://pinetools.com/join-files).
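Alternatively, assuming the parts are a simple byte-wise split, they can be concatenated in order from a Unix-like shell with `cat ssd300_mAP_77.43_v2.pth.* > ssd300_mAP_77.43_v2.pth`.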
25 |
26 | Read more about SSD [here](https://arxiv.org/pdf/1512.02325.pdf)
27 |
28 |
29 | Click on the image below to see a demo of SSD in action:
30 |
31 | [![SSD demo preview](prev.png)](http://i.imgur.com/EyZZKAA.gif)
32 |
33 | ## Dependencies
34 |
35 | All required dependencies are listed in the `virtual_platform_windows.yml` file.
36 |
37 | ### Getting started
38 |
39 | Create the virtual environment from the yml file:
40 | ```bash
41 | conda env create -f virtual_platform_windows.yml
42 | ```
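Then activate the environment (it is named `virtual_platform` in the yml) before running the code, for example with `conda activate virtual_platform`, or `activate virtual_platform` on older conda installations for Windows.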
43 |
44 | ## Working
45 | ![Working demo](working.png)
46 |
47 |
48 | ## Testing
49 |
50 | **Update `object_detection.py`** so that the reader points to the video you want to run detection on, for example:
51 | `reader = imageio.get_reader('video.mp4')`
52 |
53 |
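For reference, here is a minimal sketch of the relevant lines in `object_detection.py` (the filename `my_video.mp4` is only a placeholder for your own clip):

```python
import imageio

# read the input video (placeholder filename) and keep its frame rate
reader = imageio.get_reader('my_video.mp4')
fps = reader.get_meta_data()['fps']

# annotated frames are appended here and saved as output.mp4
writer = imageio.get_writer('output.mp4', fps=fps)
```

After editing the path, run `python object_detection.py`; the annotated video is written to `output.mp4`.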
54 |
55 | ## Getting the dataset for training a new model
56 |
57 | Training is straightforward, but a CUDA-capable GPU is effectively mandatory; training on a CPU would take far too long.
58 |
59 | Get the PASCAL VOC dataset here:
60 |
61 | http://host.robots.ox.ac.uk/pascal/VOC/index.html
62 |
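The bundled scripts `data/scripts/VOC2007.sh` and `data/scripts/VOC2012.sh` download and extract the data into `~/data/VOCdevkit/` by default, which is also the `VOCroot` path expected by `data/config.py`.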
63 |
64 | Created by: Anubhav Shukla
65 |
66 | 
67 |
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES
2 | from .config import *
3 | import cv2
4 | import numpy as np
5 |
6 |
7 | def base_transform(image, size, mean):
8 | x = cv2.resize(image, (size, size)).astype(np.float32)
9 | # x = cv2.resize(np.array(image), (size, size)).astype(np.float32)
10 | x -= mean
11 | x = x.astype(np.float32)
12 | return x
13 |
14 |
15 | class BaseTransform:
16 | def __init__(self, size, mean):
17 | self.size = size
18 | self.mean = np.array(mean, dtype=np.float32)
19 |
20 | def __call__(self, image, boxes=None, labels=None):
21 | return base_transform(image, self.size, self.mean), boxes, labels
22 |
--------------------------------------------------------------------------------
/data/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/data/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/data/__pycache__/config.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/data/__pycache__/config.cpython-35.pyc
--------------------------------------------------------------------------------
/data/__pycache__/voc0712.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/data/__pycache__/voc0712.cpython-35.pyc
--------------------------------------------------------------------------------
/data/config.py:
--------------------------------------------------------------------------------
1 | # config.py
2 | import os.path
3 |
4 | # gets home dir cross platform
5 | home = os.path.expanduser("~")
6 | ddir = os.path.join(home,"data/VOCdevkit/")
7 |
8 | # note: if you used our download scripts, this should be right
9 | VOCroot = ddir # path to VOCdevkit root dir
10 |
11 | # default batch size
12 | BATCHES = 32
13 | # data reshuffled at every epoch
14 | SHUFFLE = True
15 | # number of subprocesses to use for data loading
16 | WORKERS = 4
17 |
18 |
19 | #SSD300 CONFIGS
20 | # newer version: use additional conv11_2 layer as last layer before multibox layers
21 | v2 = {
22 | 'feature_maps' : [38, 19, 10, 5, 3, 1],
23 |
24 | 'min_dim' : 300,
25 |
26 | 'steps' : [8, 16, 32, 64, 100, 300],
27 |
28 | 'min_sizes' : [30, 60, 111, 162, 213, 264],
29 |
30 | 'max_sizes' : [60, 111, 162, 213, 264, 315],
31 |
32 | # 'aspect_ratios' : [[2, 1/2], [2, 1/2, 3, 1/3], [2, 1/2, 3, 1/3],
33 | # [2, 1/2, 3, 1/3], [2, 1/2], [2, 1/2]],
34 | 'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
35 |
36 | 'variance' : [0.1, 0.2],
37 |
38 | 'clip' : True,
39 |
40 | 'name' : 'v2',
41 | }
42 |
43 | # use average pooling layer as last layer before multibox layers
44 | v1 = {
45 | 'feature_maps' : [38, 19, 10, 5, 3, 1],
46 |
47 | 'min_dim' : 300,
48 |
49 | 'steps' : [8, 16, 32, 64, 100, 300],
50 |
51 | 'min_sizes' : [30, 60, 114, 168, 222, 276],
52 |
53 | 'max_sizes' : [-1, 114, 168, 222, 276, 330],
54 |
55 | # 'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
56 | 'aspect_ratios' : [[1,1,2,1/2],[1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3],
57 | [1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3]],
58 |
59 | 'variance' : [0.1, 0.2],
60 |
61 | 'clip' : True,
62 |
63 | 'name' : 'v1',
64 | }
65 |
--------------------------------------------------------------------------------
/data/scripts/VOC2007.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Ellis Brown
3 |
4 | start=`date +%s`
5 |
6 | # handle optional download dir
7 | if [ -z "$1" ]
8 | then
9 | # navigate to ~/data
10 | echo "navigating to ~/data/ ..."
11 | mkdir -p ~/data
12 | cd ~/data/
13 | else
14 | # check if is valid directory
15 | if [ ! -d $1 ]; then
16 | echo $1 "is not a valid directory"
17 | exit 0
18 | fi
19 | echo "navigating to" $1 "..."
20 | cd $1
21 | fi
22 |
23 | echo "Downloading VOC2007 trainval ..."
24 | # Download the data.
25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
26 | echo "Downloading VOC2007 test data ..."
27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
28 | echo "Done downloading."
29 |
30 | # Extract data
31 | echo "Extracting trainval ..."
32 | tar -xvf VOCtrainval_06-Nov-2007.tar
33 | echo "Extracting test ..."
34 | tar -xvf VOCtest_06-Nov-2007.tar
35 | echo "removing tars ..."
36 | rm VOCtrainval_06-Nov-2007.tar
37 | rm VOCtest_06-Nov-2007.tar
38 |
39 | end=`date +%s`
40 | runtime=$((end-start))
41 |
42 | echo "Completed in" $runtime "seconds"
--------------------------------------------------------------------------------
/data/scripts/VOC2012.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Ellis Brown
3 |
4 | start=`date +%s`
5 |
6 | # handle optional download dir
7 | if [ -z "$1" ]
8 | then
9 | # navigate to ~/data
10 | echo "navigating to ~/data/ ..."
11 | mkdir -p ~/data
12 | cd ~/data/
13 | else
14 | # check if is valid directory
15 | if [ ! -d $1 ]; then
16 | echo $1 "is not a valid directory"
17 | exit 0
18 | fi
19 | echo "navigating to" $1 "..."
20 | cd $1
21 | fi
22 |
23 | echo "Downloading VOC2012 trainval ..."
24 | # Download the data.
25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
26 | echo "Done downloading."
27 |
28 |
29 | # Extract data
30 | echo "Extracting trainval ..."
31 | tar -xvf VOCtrainval_11-May-2012.tar
32 | echo "removing tar ..."
33 | rm VOCtrainval_11-May-2012.tar
34 |
35 | end=`date +%s`
36 | runtime=$((end-start))
37 |
38 | echo "Completed in" $runtime "seconds"
--------------------------------------------------------------------------------
/data/voc0712.py:
--------------------------------------------------------------------------------
1 | """VOC Dataset Classes
2 |
3 | Original author: Francisco Massa
4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
5 |
6 | Updated by: Ellis Brown, Max deGroot
7 | """
8 |
9 | import os
10 | import os.path
11 | import sys
12 | import torch
13 | import torch.utils.data as data
14 | import torchvision.transforms as transforms
15 | from PIL import Image, ImageDraw, ImageFont
16 | import cv2
17 | import numpy as np
18 | if sys.version_info[0] == 2:
19 | import xml.etree.cElementTree as ET
20 | else:
21 | import xml.etree.ElementTree as ET
22 |
23 | VOC_CLASSES = ( # always index 0
24 | 'aeroplane', 'bicycle', 'bird', 'boat',
25 | 'bottle', 'bus', 'car', 'cat', 'chair',
26 | 'cow', 'diningtable', 'dog', 'horse',
27 | 'motorbike', 'person', 'pottedplant',
28 | 'sheep', 'sofa', 'train', 'tvmonitor')
29 |
30 | # for making bounding boxes pretty
31 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128),
32 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128))
33 |
34 |
35 | class AnnotationTransform(object):
36 | """Transforms a VOC annotation into a Tensor of bbox coords and label index
37 |     Initialized with a dictionary lookup of classnames to indexes
38 |
39 | Arguments:
40 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
41 | (default: alphabetic indexing of VOC's 20 classes)
42 | keep_difficult (bool, optional): keep difficult instances or not
43 | (default: False)
44 | height (int): height
45 | width (int): width
46 | """
47 |
48 | def __init__(self, class_to_ind=None, keep_difficult=False):
49 | self.class_to_ind = class_to_ind or dict(
50 | zip(VOC_CLASSES, range(len(VOC_CLASSES))))
51 | self.keep_difficult = keep_difficult
52 |
53 | def __call__(self, target, width, height):
54 | """
55 | Arguments:
56 | target (annotation) : the target annotation to be made usable
57 | will be an ET.Element
58 | Returns:
59 | a list containing lists of bounding boxes [bbox coords, class name]
60 | """
61 | res = []
62 | for obj in target.iter('object'):
63 | difficult = int(obj.find('difficult').text) == 1
64 | if not self.keep_difficult and difficult:
65 | continue
66 | name = obj.find('name').text.lower().strip()
67 | bbox = obj.find('bndbox')
68 |
69 | pts = ['xmin', 'ymin', 'xmax', 'ymax']
70 | bndbox = []
71 | for i, pt in enumerate(pts):
72 | cur_pt = int(bbox.find(pt).text) - 1
73 | # scale height or width
74 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
75 | bndbox.append(cur_pt)
76 | label_idx = self.class_to_ind[name]
77 | bndbox.append(label_idx)
78 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind]
79 | # img_id = target.find('filename').text[:-4]
80 |
81 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ]
82 |
83 |
84 | class VOCDetection(data.Dataset):
85 | """VOC Detection Dataset Object
86 |
87 | input is image, target is annotation
88 |
89 | Arguments:
90 | root (string): filepath to VOCdevkit folder.
91 | image_set (string): imageset to use (eg. 'train', 'val', 'test')
92 | transform (callable, optional): transformation to perform on the
93 | input image
94 | target_transform (callable, optional): transformation to perform on the
95 | target `annotation`
96 | (eg: take in caption string, return tensor of word indices)
97 | dataset_name (string, optional): which dataset to load
98 | (default: 'VOC2007')
99 | """
100 |
101 | def __init__(self, root, image_sets, transform=None, target_transform=None,
102 | dataset_name='VOC0712'):
103 | self.root = root
104 | self.image_set = image_sets
105 | self.transform = transform
106 | self.target_transform = target_transform
107 | self.name = dataset_name
108 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml')
109 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg')
110 | self.ids = list()
111 | for (year, name) in image_sets:
112 | rootpath = os.path.join(self.root, 'VOC' + year)
113 | for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')):
114 | self.ids.append((rootpath, line.strip()))
115 |
116 | def __getitem__(self, index):
117 | im, gt, h, w = self.pull_item(index)
118 |
119 | return im, gt
120 |
121 | def __len__(self):
122 | return len(self.ids)
123 |
124 | def pull_item(self, index):
125 | img_id = self.ids[index]
126 |
127 | target = ET.parse(self._annopath % img_id).getroot()
128 | img = cv2.imread(self._imgpath % img_id)
129 | height, width, channels = img.shape
130 |
131 | if self.target_transform is not None:
132 | target = self.target_transform(target, width, height)
133 |
134 | if self.transform is not None:
135 | target = np.array(target)
136 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4])
137 | # to rgb
138 | img = img[:, :, (2, 1, 0)]
139 | # img = img.transpose(2, 0, 1)
140 | target = np.hstack((boxes, np.expand_dims(labels, axis=1)))
141 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width
142 | # return torch.from_numpy(img), target, height, width
143 |
144 | def pull_image(self, index):
145 | '''Returns the original image object at index in PIL form
146 |
147 | Note: not using self.__getitem__(), as any transformations passed in
148 | could mess up this functionality.
149 |
150 | Argument:
151 | index (int): index of img to show
152 | Return:
153 | PIL img
154 | '''
155 | img_id = self.ids[index]
156 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)
157 |
158 | def pull_anno(self, index):
159 | '''Returns the original annotation of image at index
160 |
161 | Note: not using self.__getitem__(), as any transformations passed in
162 | could mess up this functionality.
163 |
164 | Argument:
165 | index (int): index of img to get annotation of
166 | Return:
167 | list: [img_id, [(label, bbox coords),...]]
168 | eg: ('001718', [('dog', (96, 13, 438, 332))])
169 | '''
170 | img_id = self.ids[index]
171 | anno = ET.parse(self._annopath % img_id).getroot()
172 | gt = self.target_transform(anno, 1, 1)
173 | return img_id[1], gt
174 |
175 | def pull_tensor(self, index):
176 | '''Returns the original image at an index in tensor form
177 |
178 | Note: not using self.__getitem__(), as any transformations passed in
179 | could mess up this functionality.
180 |
181 | Argument:
182 | index (int): index of img to show
183 | Return:
184 | tensorized version of img, squeezed
185 | '''
186 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0)
187 |
188 |
189 | def detection_collate(batch):
190 | """Custom collate fn for dealing with batches of images that have a different
191 | number of associated object annotations (bounding boxes).
192 |
193 | Arguments:
194 | batch: (tuple) A tuple of tensor images and lists of annotations
195 |
196 | Return:
197 | A tuple containing:
198 | 1) (tensor) batch of images stacked on their 0 dim
199 | 2) (list of tensors) annotations for a given image are stacked on 0 dim
200 | """
201 | targets = []
202 | imgs = []
203 | for sample in batch:
204 | imgs.append(sample[0])
205 | targets.append(torch.FloatTensor(sample[1]))
206 | return torch.stack(imgs, 0), targets
207 |
--------------------------------------------------------------------------------
/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .functions import *
2 | from .modules import *
3 |
--------------------------------------------------------------------------------
/layers/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/__pycache__/box_utils.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/__pycache__/box_utils.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/box_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def point_form(boxes):
4 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax)
5 | representation for comparison to point form ground truth data.
6 | Args:
7 | boxes: (tensor) center-size default boxes from priorbox layers.
8 | Return:
9 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes.
10 | """
11 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin
12 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax
13 |
14 |
15 | def center_size(boxes):
16 | """ Convert prior_boxes to (cx, cy, w, h)
17 | representation for comparison to center-size form ground truth data.
18 | Args:
19 | boxes: (tensor) point_form boxes
20 | Return:
21 |         boxes: (tensor) Converted (cx, cy, w, h) form of boxes.
22 |     """
23 |     return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2,  # cx, cy
24 |                       boxes[:, 2:] - boxes[:, :2]), 1)  # w, h
25 |
26 |
27 | def intersect(box_a, box_b):
28 | """ We resize both tensors to [A,B,2] without new malloc:
29 | [A,2] -> [A,1,2] -> [A,B,2]
30 | [B,2] -> [1,B,2] -> [A,B,2]
31 | Then we compute the area of intersect between box_a and box_b.
32 | Args:
33 | box_a: (tensor) bounding boxes, Shape: [A,4].
34 | box_b: (tensor) bounding boxes, Shape: [B,4].
35 | Return:
36 | (tensor) intersection area, Shape: [A,B].
37 | """
38 | A = box_a.size(0)
39 | B = box_b.size(0)
40 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
41 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
42 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
43 | box_b[:, :2].unsqueeze(0).expand(A, B, 2))
44 | inter = torch.clamp((max_xy - min_xy), min=0)
45 | return inter[:, :, 0] * inter[:, :, 1]
46 |
47 |
48 | def jaccard(box_a, box_b):
49 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
50 | is simply the intersection over union of two boxes. Here we operate on
51 | ground truth boxes and default boxes.
52 | E.g.:
53 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
54 | Args:
55 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
56 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
57 | Return:
58 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
59 | """
60 | inter = intersect(box_a, box_b)
61 | area_a = ((box_a[:, 2]-box_a[:, 0]) *
62 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
63 | area_b = ((box_b[:, 2]-box_b[:, 0]) *
64 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
65 | union = area_a + area_b - inter
66 | return inter / union # [A,B]
67 |
68 |
69 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
70 | """Match each prior box with the ground truth box of the highest jaccard
71 | overlap, encode the bounding boxes, then return the matched indices
72 | corresponding to both confidence and location preds.
73 | Args:
74 |         threshold: (float) The overlap threshold used when matching boxes.
75 |         truths: (tensor) Ground truth boxes, Shape: [num_obj, 4].
76 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
77 | variances: (tensor) Variances corresponding to each prior coord,
78 | Shape: [num_priors, 4].
79 | labels: (tensor) All the class labels for the image, Shape: [num_obj].
80 |         loc_t: (tensor) Tensor to be filled w/ encoded location targets.
81 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
82 | idx: (int) current batch index
83 | Return:
84 | The matched indices corresponding to 1)location and 2)confidence preds.
85 | """
86 | # jaccard index
87 | overlaps = jaccard(
88 | truths,
89 | point_form(priors)
90 | )
91 | # (Bipartite Matching)
92 | # [1,num_objects] best prior for each ground truth
93 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
94 | # [1,num_priors] best ground truth for each prior
95 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
96 | best_truth_idx.squeeze_(0)
97 | best_truth_overlap.squeeze_(0)
98 | best_prior_idx.squeeze_(1)
99 | best_prior_overlap.squeeze_(1)
100 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior
101 | # TODO refactor: index best_prior_idx with long tensor
102 | # ensure every gt matches with its prior of max overlap
103 | for j in range(best_prior_idx.size(0)):
104 | best_truth_idx[best_prior_idx[j]] = j
105 | matches = truths[best_truth_idx] # Shape: [num_priors,4]
106 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors]
107 | conf[best_truth_overlap < threshold] = 0 # label as background
108 | loc = encode(matches, priors, variances)
109 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn
110 | conf_t[idx] = conf # [num_priors] top class label for each prior
111 |
112 |
113 | def encode(matched, priors, variances):
114 | """Encode the variances from the priorbox layers into the ground truth boxes
115 | we have matched (based on jaccard overlap) with the prior boxes.
116 | Args:
117 | matched: (tensor) Coords of ground truth for each prior in point-form
118 | Shape: [num_priors, 4].
119 | priors: (tensor) Prior boxes in center-offset form
120 | Shape: [num_priors,4].
121 | variances: (list[float]) Variances of priorboxes
122 | Return:
123 | encoded boxes (tensor), Shape: [num_priors, 4]
124 | """
125 |
126 | # dist b/t match center and prior's center
127 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
128 | # encode variance
129 | g_cxcy /= (variances[0] * priors[:, 2:])
130 | # match wh / prior wh
131 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
132 | g_wh = torch.log(g_wh) / variances[1]
133 | # return target for smooth_l1_loss
134 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
135 |
136 |
137 | # Adapted from https://github.com/Hakuyume/chainer-ssd
138 | def decode(loc, priors, variances):
139 | """Decode locations from predictions using priors to undo
140 | the encoding we did for offset regression at train time.
141 | Args:
142 | loc (tensor): location predictions for loc layers,
143 | Shape: [num_priors,4]
144 | priors (tensor): Prior boxes in center-offset form.
145 | Shape: [num_priors,4].
146 | variances: (list[float]) Variances of priorboxes
147 | Return:
148 | decoded bounding box predictions
149 | """
150 |
151 | boxes = torch.cat((
152 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
153 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
154 | boxes[:, :2] -= boxes[:, 2:] / 2
155 | boxes[:, 2:] += boxes[:, :2]
156 | return boxes
157 |
158 |
159 | def log_sum_exp(x):
160 |     """Utility function for computing log_sum_exp in a numerically stable way.
161 |     This will be used to determine the unaveraged confidence loss across
162 |     all examples in a batch.
163 | Args:
164 | x (Variable(tensor)): conf_preds from conf layers
165 | """
166 | x_max = x.data.max()
167 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max
168 |
169 |
170 | # Original author: Francisco Massa:
171 | # https://github.com/fmassa/object-detection.torch
172 | # Ported to PyTorch by Max deGroot (02/01/2017)
173 | def nms(boxes, scores, overlap=0.5, top_k=200):
174 | """Apply non-maximum suppression at test time to avoid detecting too many
175 | overlapping bounding boxes for a given object.
176 | Args:
177 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
178 | scores: (tensor) The class predscores for the img, Shape:[num_priors].
179 | overlap: (float) The overlap thresh for suppressing unnecessary boxes.
180 | top_k: (int) The Maximum number of box preds to consider.
181 | Return:
182 | The indices of the kept boxes with respect to num_priors.
183 | """
184 |
185 | keep = scores.new(scores.size(0)).zero_().long()
186 | if boxes.numel() == 0:
187 | return keep
188 | x1 = boxes[:, 0]
189 | y1 = boxes[:, 1]
190 | x2 = boxes[:, 2]
191 | y2 = boxes[:, 3]
192 | area = torch.mul(x2 - x1, y2 - y1)
193 | v, idx = scores.sort(0) # sort in ascending order
194 | # I = I[v >= 0.01]
195 | idx = idx[-top_k:] # indices of the top-k largest vals
196 | xx1 = boxes.new()
197 | yy1 = boxes.new()
198 | xx2 = boxes.new()
199 | yy2 = boxes.new()
200 | w = boxes.new()
201 | h = boxes.new()
202 |
203 | # keep = torch.Tensor()
204 | count = 0
205 | while idx.numel() > 0:
206 | i = idx[-1] # index of current largest val
207 | # keep.append(i)
208 | keep[count] = i
209 | count += 1
210 | if idx.size(0) == 1:
211 | break
212 | idx = idx[:-1] # remove kept element from view
213 | # load bboxes of next highest vals
214 | torch.index_select(x1, 0, idx, out=xx1)
215 | torch.index_select(y1, 0, idx, out=yy1)
216 | torch.index_select(x2, 0, idx, out=xx2)
217 | torch.index_select(y2, 0, idx, out=yy2)
218 | # store element-wise max with next highest score
219 | xx1 = torch.clamp(xx1, min=x1[i])
220 | yy1 = torch.clamp(yy1, min=y1[i])
221 | xx2 = torch.clamp(xx2, max=x2[i])
222 | yy2 = torch.clamp(yy2, max=y2[i])
223 | w.resize_as_(xx2)
224 | h.resize_as_(yy2)
225 | w = xx2 - xx1
226 | h = yy2 - yy1
227 | # check sizes of xx1 and xx2.. after each iteration
228 | w = torch.clamp(w, min=0.0)
229 | h = torch.clamp(h, min=0.0)
230 | inter = w*h
231 | # IoU = i / (area(a) + area(b) - i)
232 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas)
233 | union = (rem_areas - inter) + area[i]
234 | IoU = inter/union # store result in iou
235 | # keep only elements with an IoU <= overlap
236 | idx = idx[IoU.le(overlap)]
237 | return keep, count
238 |
--------------------------------------------------------------------------------
/layers/functions/__init__.py:
--------------------------------------------------------------------------------
1 | from .detection import Detect
2 | from .prior_box import PriorBox
3 |
4 |
5 | __all__ = ['Detect', 'PriorBox']
6 |
--------------------------------------------------------------------------------
/layers/functions/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/functions/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/functions/__pycache__/detection.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/functions/__pycache__/detection.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/functions/__pycache__/prior_box.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/functions/__pycache__/prior_box.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/functions/detection.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.backends.cudnn as cudnn
4 | from torch.autograd import Function
5 | from torch.autograd import Variable
6 | from ..box_utils import decode, nms
7 | from data import v2 as cfg
8 |
9 |
10 | class Detect(Function):
11 | """At test time, Detect is the final layer of SSD. Decode location preds,
12 | apply non-maximum suppression to location predictions based on conf
13 | scores and threshold to a top_k number of output predictions for both
14 | confidence score and locations.
15 | """
16 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh):
17 | self.num_classes = num_classes
18 | self.background_label = bkg_label
19 | self.top_k = top_k
20 | # Parameters used in nms.
21 | self.nms_thresh = nms_thresh
22 | if nms_thresh <= 0:
23 |             raise ValueError('nms_threshold must be greater than 0.')
24 | self.conf_thresh = conf_thresh
25 | self.variance = cfg['variance']
26 | self.output = torch.zeros(1, self.num_classes, self.top_k, 5)
27 |
28 | def forward(self, loc_data, conf_data, prior_data):
29 | """
30 | Args:
31 | loc_data: (tensor) Loc preds from loc layers
32 | Shape: [batch,num_priors*4]
33 |             conf_data: (tensor) Conf preds from conf layers
34 | Shape: [batch*num_priors,num_classes]
35 | prior_data: (tensor) Prior boxes and variances from priorbox layers
36 | Shape: [1,num_priors,4]
37 | """
38 | num = loc_data.size(0) # batch size
39 | num_priors = prior_data.size(0)
40 | self.output.zero_()
41 | if num == 1:
42 | # size batch x num_classes x num_priors
43 | conf_preds = conf_data.t().contiguous().unsqueeze(0)
44 | else:
45 | conf_preds = conf_data.view(num, num_priors,
46 | self.num_classes).transpose(2, 1)
47 | self.output.expand_(num, self.num_classes, self.top_k, 5)
48 |
49 | # Decode predictions into bboxes.
50 | for i in range(num):
51 | decoded_boxes = decode(loc_data[i], prior_data, self.variance)
52 | # For each class, perform nms
53 | conf_scores = conf_preds[i].clone()
54 | num_det = 0
55 | for cl in range(1, self.num_classes):
56 | c_mask = conf_scores[cl].gt(self.conf_thresh)
57 | scores = conf_scores[cl][c_mask]
58 | if scores.dim() == 0:
59 | continue
60 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes)
61 | boxes = decoded_boxes[l_mask].view(-1, 4)
62 | # idx of highest scoring and non-overlapping boxes per class
63 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k)
64 | self.output[i, cl, :count] = \
65 | torch.cat((scores[ids[:count]].unsqueeze(1),
66 | boxes[ids[:count]]), 1)
67 | flt = self.output.view(-1, 5)
68 | _, idx = flt[:, 0].sort(0)
69 | _, rank = idx.sort(0)
70 | flt[(rank >= self.top_k).unsqueeze(1).expand_as(flt)].fill_(0)
71 | return self.output
72 |
--------------------------------------------------------------------------------
/layers/functions/prior_box.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from math import sqrt as sqrt
3 | from itertools import product as product
4 |
5 | class PriorBox(object):
6 | """Compute priorbox coordinates in center-offset form for each source
7 | feature map.
8 | Note:
9 | This 'layer' has changed between versions of the original SSD
10 | paper, so we include both versions, but note v2 is the most tested and most
11 | recent version of the paper.
12 |
13 | """
14 | def __init__(self, cfg):
15 | super(PriorBox, self).__init__()
16 | # self.type = cfg.name
17 | self.image_size = cfg['min_dim']
18 | # number of priors for feature map location (either 4 or 6)
19 | self.num_priors = len(cfg['aspect_ratios'])
20 | self.variance = cfg['variance'] or [0.1]
21 | self.feature_maps = cfg['feature_maps']
22 | self.min_sizes = cfg['min_sizes']
23 | self.max_sizes = cfg['max_sizes']
24 | self.steps = cfg['steps']
25 | self.aspect_ratios = cfg['aspect_ratios']
26 | self.clip = cfg['clip']
27 | self.version = cfg['name']
28 | for v in self.variance:
29 | if v <= 0:
30 | raise ValueError('Variances must be greater than 0')
31 |
32 | def forward(self):
33 | mean = []
34 | # TODO merge these
35 | if self.version == 'v2':
36 | for k, f in enumerate(self.feature_maps):
37 | for i, j in product(range(f), repeat=2):
38 | f_k = self.image_size / self.steps[k]
39 | # unit center x,y
40 | cx = (j + 0.5) / f_k
41 | cy = (i + 0.5) / f_k
42 |
43 | # aspect_ratio: 1
44 | # rel size: min_size
45 | s_k = self.min_sizes[k]/self.image_size
46 | mean += [cx, cy, s_k, s_k]
47 |
48 | # aspect_ratio: 1
49 | # rel size: sqrt(s_k * s_(k+1))
50 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size))
51 | mean += [cx, cy, s_k_prime, s_k_prime]
52 |
53 | # rest of aspect ratios
54 | for ar in self.aspect_ratios[k]:
55 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)]
56 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)]
57 |
58 | else:
59 | # original version generation of prior (default) boxes
60 | for i, k in enumerate(self.feature_maps):
61 | step_x = step_y = self.image_size/k
62 | for h, w in product(range(k), repeat=2):
63 | c_x = ((w+0.5) * step_x)
64 | c_y = ((h+0.5) * step_y)
65 | c_w = c_h = self.min_sizes[i] / 2
66 | s_k = self.image_size # 300
67 | # aspect_ratio: 1,
68 | # size: min_size
69 | mean += [(c_x-c_w)/s_k, (c_y-c_h)/s_k,
70 | (c_x+c_w)/s_k, (c_y+c_h)/s_k]
71 | if self.max_sizes[i] > 0:
72 | # aspect_ratio: 1
73 | # size: sqrt(min_size * max_size)/2
74 | c_w = c_h = sqrt(self.min_sizes[i] *
75 | self.max_sizes[i])/2
76 | mean += [(c_x-c_w)/s_k, (c_y-c_h)/s_k,
77 | (c_x+c_w)/s_k, (c_y+c_h)/s_k]
78 | # rest of prior boxes
79 | for ar in self.aspect_ratios[i]:
80 | if not (abs(ar-1) < 1e-6):
81 | c_w = self.min_sizes[i] * sqrt(ar)/2
82 | c_h = self.min_sizes[i] / sqrt(ar)/2
83 | mean += [(c_x-c_w)/s_k, (c_y-c_h)/s_k,
84 | (c_x+c_w)/s_k, (c_y+c_h)/s_k]
85 | # back to torch land
86 | output = torch.Tensor(mean).view(-1, 4)
87 | if self.clip:
88 | output.clamp_(max=1, min=0)
89 | return output
90 |
--------------------------------------------------------------------------------
/layers/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .l2norm import L2Norm
2 | from .multibox_loss import MultiBoxLoss
3 |
4 | __all__ = ['L2Norm', 'MultiBoxLoss']
5 |
--------------------------------------------------------------------------------
/layers/modules/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/modules/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/modules/__pycache__/l2norm.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/modules/__pycache__/l2norm.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/modules/__pycache__/multibox_loss.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/layers/modules/__pycache__/multibox_loss.cpython-35.pyc
--------------------------------------------------------------------------------
/layers/modules/l2norm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Function
4 | from torch.autograd import Variable
5 | import torch.nn.init as init
6 |
7 | class L2Norm(nn.Module):
8 | def __init__(self,n_channels, scale):
9 | super(L2Norm,self).__init__()
10 | self.n_channels = n_channels
11 | self.gamma = scale or None
12 | self.eps = 1e-10
13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels))
14 | self.reset_parameters()
15 |
16 | def reset_parameters(self):
17 | init.constant(self.weight,self.gamma)
18 |
19 | def forward(self, x):
20 | norm = x.pow(2).sum(1).sqrt()+self.eps
21 | x/=norm.expand_as(x)
22 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
23 | return out
24 |
--------------------------------------------------------------------------------
/layers/modules/multibox_loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | from data import v2 as cfg
6 | from ..box_utils import match, log_sum_exp
7 |
8 | class MultiBoxLoss(nn.Module):
9 | """SSD Weighted Loss Function
10 | Compute Targets:
11 | 1) Produce Confidence Target Indices by matching ground truth boxes
12 | with (default) 'priorboxes' that have jaccard index > threshold parameter
13 | (default threshold: 0.5).
14 | 2) Produce localization target by 'encoding' variance into offsets of ground
15 | truth boxes and their matched 'priorboxes'.
16 | 3) Hard negative mining to filter the excessive number of negative examples
17 | that comes with using a large number of default bounding boxes.
18 | (default negative:positive ratio 3:1)
19 | Objective Loss:
20 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
21 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
22 | weighted by α which is set to 1 by cross val.
23 | Args:
24 | c: class confidences,
25 | l: predicted boxes,
26 | g: ground truth boxes
27 | N: number of matched default boxes
28 | See: https://arxiv.org/pdf/1512.02325.pdf for more details.
29 | """
30 |
31 | def __init__(self, num_classes, overlap_thresh, prior_for_matching,
32 | bkg_label, neg_mining, neg_pos, neg_overlap, encode_target,
33 | use_gpu=True):
34 | super(MultiBoxLoss, self).__init__()
35 | self.use_gpu = use_gpu
36 | self.num_classes = num_classes
37 | self.threshold = overlap_thresh
38 | self.background_label = bkg_label
39 | self.encode_target = encode_target
40 | self.use_prior_for_matching = prior_for_matching
41 | self.do_neg_mining = neg_mining
42 | self.negpos_ratio = neg_pos
43 | self.neg_overlap = neg_overlap
44 | self.variance = cfg['variance']
45 |
46 | def forward(self, predictions, targets):
47 | """Multibox Loss
48 | Args:
49 | predictions (tuple): A tuple containing loc preds, conf preds,
50 | and prior boxes from SSD net.
51 | conf shape: torch.size(batch_size,num_priors,num_classes)
52 | loc shape: torch.size(batch_size,num_priors,4)
53 | priors shape: torch.size(num_priors,4)
54 |
55 | ground_truth (tensor): Ground truth boxes and labels for a batch,
56 | shape: [batch_size,num_objs,5] (last idx is the label).
57 | """
58 | loc_data, conf_data, priors = predictions
59 | num = loc_data.size(0)
60 | priors = priors[:loc_data.size(1), :]
61 | num_priors = (priors.size(0))
62 | num_classes = self.num_classes
63 |
64 | # match priors (default boxes) and ground truth boxes
65 | loc_t = torch.Tensor(num, num_priors, 4)
66 | conf_t = torch.LongTensor(num, num_priors)
67 | for idx in range(num):
68 | truths = targets[idx][:, :-1].data
69 | labels = targets[idx][:, -1].data
70 | defaults = priors.data
71 | match(self.threshold, truths, defaults, self.variance, labels,
72 | loc_t, conf_t, idx)
73 | if self.use_gpu:
74 | loc_t = loc_t.cuda()
75 | conf_t = conf_t.cuda()
76 | # wrap targets
77 | loc_t = Variable(loc_t, requires_grad=False)
78 | conf_t = Variable(conf_t, requires_grad=False)
79 |
80 | pos = conf_t > 0
81 |         num_pos = pos.sum(dim=1, keepdim=True)
82 |
83 | # Localization Loss (Smooth L1)
84 | # Shape: [batch,num_priors,4]
85 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
86 | loc_p = loc_data[pos_idx].view(-1, 4)
87 | loc_t = loc_t[pos_idx].view(-1, 4)
88 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)
89 |
90 | # Compute max conf across batch for hard negative mining
91 | batch_conf = conf_data.view(-1, self.num_classes)
92 |
93 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))
94 |
95 | # Hard Negative Mining
96 | loss_c[pos] = 0 # filter out pos boxes for now
97 | loss_c = loss_c.view(num, -1)
98 | _, loss_idx = loss_c.sort(1, descending=True)
99 | _, idx_rank = loss_idx.sort(1)
100 | num_pos = pos.long().sum(1, keepdim=True)
101 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
102 | neg = idx_rank < num_neg.expand_as(idx_rank)
103 |
104 | # Confidence Loss Including Positive and Negative Examples
105 | pos_idx = pos.unsqueeze(2).expand_as(conf_data)
106 | neg_idx = neg.unsqueeze(2).expand_as(conf_data)
107 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
108 | targets_weighted = conf_t[(pos+neg).gt(0)]
109 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)
110 |
111 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
112 |
113 | N = num_pos.data.sum()
114 | loss_l /= N
115 | loss_c /= N
116 | return loss_l, loss_c
117 |
--------------------------------------------------------------------------------
/object_detection.py:
--------------------------------------------------------------------------------
1 | # Object Detection
2 |
3 | # Importing the libraries
4 | import torch
5 | from torch.autograd import Variable
6 | import cv2
7 | from data import BaseTransform, VOC_CLASSES as labelmap
8 | from ssd import build_ssd
9 | import imageio
10 |
11 | # Defining a function that will do the detections
12 | def detect(frame, net, transform):
13 | height, width = frame.shape[:2]
14 | frame_t = transform(frame)[0]
15 | x = torch.from_numpy(frame_t).permute(2, 0, 1)
16 | x = Variable(x.unsqueeze(0))
17 | y = net(x)
18 | detections = y.data
19 | scale = torch.Tensor([width, height, width, height])
20 |     # detections = [batch, number of classes, number of occurrences, (score, x0, y0, x1, y1)]
21 | for i in range(detections.size(1)):
22 | j = 0
23 | while detections[0, i, j, 0] >= 0.6:
24 | pt = (detections[0, i, j, 1:] * scale).numpy()
25 | cv2.rectangle(frame, (int(pt[0]), int(pt[1])), (int(pt[2]), int(pt[3])), (255, 0, 0), 2)
26 | cv2.putText(frame, labelmap[i - 1], (int(pt[0]), int(pt[1])), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2, cv2.LINE_AA)
27 | j += 1
28 | return frame
29 |
30 | # Creating the SSD neural network
31 | net = build_ssd('test')
32 | net.load_state_dict(torch.load('ssd300_mAP_77.43_v2.pth', map_location = lambda storage, loc: storage))
33 |
34 | # Creating the transformation
35 | transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0))
36 |
37 | # Doing some Object Detection on a video
38 | reader = imageio.get_reader('dog.mp4')
39 | fps = reader.get_meta_data()['fps']
40 | writer = imageio.get_writer('output.mp4', fps = fps)
41 | for i, frame in enumerate(reader):
42 | frame = detect(frame, net.eval(), transform)
43 | writer.append_data(frame)
44 | print(i)
45 | writer.close()
46 |
--------------------------------------------------------------------------------
/prev.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/prev.png
--------------------------------------------------------------------------------
/ssd.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | from layers import *
6 | from data import v2
7 | import os
8 |
9 |
10 | class SSD(nn.Module):
11 | """Single Shot Multibox Architecture
12 | The network is composed of a base VGG network followed by the
13 | added multibox conv layers. Each multibox layer branches into
14 | 1) conv2d for class conf scores
15 | 2) conv2d for localization predictions
16 | 3) associated priorbox layer to produce default bounding
17 | boxes specific to the layer's feature map size.
18 | See: https://arxiv.org/pdf/1512.02325.pdf for more details.
19 |
20 | Args:
21 | phase: (string) Can be "test" or "train"
22 | base: VGG16 layers for input, size of either 300 or 500
23 | extras: extra layers that feed to multibox loc and conf layers
24 | head: "multibox head" consists of loc and conf conv layers
25 | """
26 |
27 | def __init__(self, phase, base, extras, head, num_classes):
28 | super(SSD, self).__init__()
29 | self.phase = phase
30 | self.num_classes = num_classes
31 | # TODO: implement __call__ in PriorBox
32 | self.priorbox = PriorBox(v2)
33 | self.priors = Variable(self.priorbox.forward(), volatile=True)
34 | self.size = 300
35 |
36 | # SSD network
37 | self.vgg = nn.ModuleList(base)
38 | # Layer learns to scale the l2 normalized features from conv4_3
39 | self.L2Norm = L2Norm(512, 20)
40 | self.extras = nn.ModuleList(extras)
41 |
42 | self.loc = nn.ModuleList(head[0])
43 | self.conf = nn.ModuleList(head[1])
44 |
45 | if phase == 'test':
46 | self.softmax = nn.Softmax()
47 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)
48 |
49 | def forward(self, x):
50 | """Applies network layers and ops on input image(s) x.
51 |
52 | Args:
53 |             x: input image or batch of images. Shape: [batch,3,300,300].
54 |
55 | Return:
56 | Depending on phase:
57 | test:
58 | Variable(tensor) of output class label predictions,
59 | confidence score, and corresponding location predictions for
60 | each object detected. Shape: [batch,topk,7]
61 |
62 | train:
63 | list of concat outputs from:
64 | 1: confidence layers, Shape: [batch*num_priors,num_classes]
65 | 2: localization layers, Shape: [batch,num_priors*4]
66 | 3: priorbox layers, Shape: [2,num_priors*4]
67 | """
68 | sources = list()
69 | loc = list()
70 | conf = list()
71 |
72 | # apply vgg up to conv4_3 relu
73 | for k in range(23):
74 | x = self.vgg[k](x)
75 |
76 | s = self.L2Norm(x)
77 | sources.append(s)
78 |
79 | # apply vgg up to fc7
80 | for k in range(23, len(self.vgg)):
81 | x = self.vgg[k](x)
82 | sources.append(x)
83 |
84 | # apply extra layers and cache source layer outputs
85 | for k, v in enumerate(self.extras):
86 | x = F.relu(v(x), inplace=True)
87 | if k % 2 == 1:
88 | sources.append(x)
89 |
90 | # apply multibox head to source layers
91 | for (x, l, c) in zip(sources, self.loc, self.conf):
92 | loc.append(l(x).permute(0, 2, 3, 1).contiguous())
93 | conf.append(c(x).permute(0, 2, 3, 1).contiguous())
94 |
95 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
96 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
97 | if self.phase == "test":
98 | output = self.detect(
99 | loc.view(loc.size(0), -1, 4), # loc preds
100 | self.softmax(conf.view(-1, self.num_classes)), # conf preds
101 | self.priors.type(type(x.data)) # default boxes
102 | )
103 | else:
104 | output = (
105 | loc.view(loc.size(0), -1, 4),
106 | conf.view(conf.size(0), -1, self.num_classes),
107 | self.priors
108 | )
109 | return output
110 |
111 | def load_weights(self, base_file):
112 | other, ext = os.path.splitext(base_file)
113 |         if ext == '.pkl' or ext == '.pth':
114 | print('Loading weights into state dict...')
115 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage))
116 | print('Finished!')
117 | else:
118 | print('Sorry only .pth and .pkl files supported.')
119 |
120 |
121 | # This function is derived from torchvision VGG make_layers()
122 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py
123 | def vgg(cfg, i, batch_norm=False):
124 | layers = []
125 | in_channels = i
126 | for v in cfg:
127 | if v == 'M':
128 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
129 | elif v == 'C':
130 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
131 | else:
132 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
133 | if batch_norm:
134 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
135 | else:
136 | layers += [conv2d, nn.ReLU(inplace=True)]
137 | in_channels = v
138 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
139 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
140 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
141 | layers += [pool5, conv6,
142 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
143 | return layers
144 |
145 |
146 | def add_extras(cfg, i, batch_norm=False):
147 | # Extra layers added to VGG for feature scaling
148 | layers = []
149 | in_channels = i
150 | flag = False
151 | for k, v in enumerate(cfg):
152 | if in_channels != 'S':
153 | if v == 'S':
154 | layers += [nn.Conv2d(in_channels, cfg[k + 1],
155 | kernel_size=(1, 3)[flag], stride=2, padding=1)]
156 | else:
157 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])]
158 | flag = not flag
159 | in_channels = v
160 | return layers
161 |
162 |
163 | def multibox(vgg, extra_layers, cfg, num_classes):
164 | loc_layers = []
165 | conf_layers = []
166 | vgg_source = [24, -2]
167 | for k, v in enumerate(vgg_source):
168 | loc_layers += [nn.Conv2d(vgg[v].out_channels,
169 | cfg[k] * 4, kernel_size=3, padding=1)]
170 | conf_layers += [nn.Conv2d(vgg[v].out_channels,
171 | cfg[k] * num_classes, kernel_size=3, padding=1)]
172 | for k, v in enumerate(extra_layers[1::2], 2):
173 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k]
174 | * 4, kernel_size=3, padding=1)]
175 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k]
176 | * num_classes, kernel_size=3, padding=1)]
177 | return vgg, extra_layers, (loc_layers, conf_layers)
178 |
179 |
180 | base = {
181 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
182 | 512, 512, 512],
183 | '512': [],
184 | }
185 | extras = {
186 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256],
187 | '512': [],
188 | }
189 | mbox = {
190 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location
191 | '512': [],
192 | }
193 |
194 |
195 | def build_ssd(phase, size=300, num_classes=21):
196 | if phase != "test" and phase != "train":
197 | print("Error: Phase not recognized")
198 | return
199 | if size != 300:
200 | print("Error: Sorry only SSD300 is supported currently!")
201 | return
202 |
203 | return SSD(phase, *multibox(vgg(base[str(size)], 3),
204 | add_extras(extras[str(size)], 1024),
205 | mbox[str(size)], num_classes), num_classes)
206 |
--------------------------------------------------------------------------------
/ssd300_mAP_77.43_v2.pth.000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.000
--------------------------------------------------------------------------------
/ssd300_mAP_77.43_v2.pth.001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.001
--------------------------------------------------------------------------------
/ssd300_mAP_77.43_v2.pth.002:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.002
--------------------------------------------------------------------------------
/ssd300_mAP_77.43_v2.pth.003:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.003
--------------------------------------------------------------------------------
/ssd300_mAP_77.43_v2.pth.004:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/ssd300_mAP_77.43_v2.pth.004
--------------------------------------------------------------------------------
/virtual_platform_windows.yml:
--------------------------------------------------------------------------------
1 | name: virtual_platform
2 | channels:
3 | - menpo
4 | - conda-forge
5 | - peterjc123
6 | - defaults
7 | dependencies:
8 | - ffmpeg=3.2.4=1
9 | - freetype=2.7=vc14_1
10 | - imageio=2.2.0=py35_0
11 | - libtiff=4.0.6=vc14_7
12 | - olefile=0.44=py35_0
13 | - pillow=4.2.1=py35_0
14 | - vc=14=0
15 | - alabaster=0.7.10=py35_0
16 | - astroid=1.5.3=py35_0
17 | - babel=2.5.0=py35_0
18 | - bleach=1.5.0=py35_0
19 | - certifi=2016.2.28=py35_0
20 | - cffi=1.10.0=py35_0
21 | - chardet=3.0.4=py35_0
22 | - colorama=0.3.9=py35_0
23 | - decorator=4.1.2=py35_0
24 | - docutils=0.14=py35_0
25 | - entrypoints=0.2.3=py35_0
26 | - html5lib=0.9999999=py35_0
27 | - icu=57.1=vc14_0
28 | - imagesize=0.7.1=py35_0
29 | - ipykernel=4.6.1=py35_0
30 | - ipython=6.1.0=py35_0
31 | - ipython_genutils=0.2.0=py35_0
32 | - isort=4.2.15=py35_0
33 | - jedi=0.10.2=py35_2
34 | - jinja2=2.9.6=py35_0
35 | - jpeg=9b=vc14_0
36 | - jsonschema=2.6.0=py35_0
37 | - jupyter_client=5.1.0=py35_0
38 | - jupyter_core=4.3.0=py35_0
39 | - lazy-object-proxy=1.3.1=py35_0
40 | - libpng=1.6.30=vc14_1
41 | - markupsafe=1.0=py35_0
42 | - mistune=0.7.4=py35_0
43 | - mkl=2017.0.3=0
44 | - nbconvert=5.2.1=py35_0
45 | - nbformat=4.4.0=py35_0
46 | - numpy=1.13.1=py35_0
47 | - numpydoc=0.7.0=py35_0
48 | - openssl=1.0.2l=vc14_0
49 | - pandocfilters=1.4.2=py35_0
50 | - path.py=10.3.1=py35_0
51 | - pickleshare=0.7.4=py35_0
52 | - pip=9.0.1=py35_1
53 | - prompt_toolkit=1.0.15=py35_0
54 | - psutil=5.2.2=py35_0
55 | - pycodestyle=2.3.1=py35_0
56 | - pycparser=2.18=py35_0
57 | - pyflakes=1.6.0=py35_0
58 | - pygments=2.2.0=py35_0
59 | - pylint=1.7.2=py35_0
60 | - pyqt=5.6.0=py35_2
61 | - python=3.5.4=0
62 | - python-dateutil=2.6.1=py35_0
63 | - pytz=2017.2=py35_0
64 | - pyzmq=16.0.2=py35_0
65 | - qt=5.6.2=vc14_6
66 | - qtawesome=0.4.4=py35_0
67 | - qtconsole=4.3.1=py35_0
68 | - qtpy=1.3.1=py35_0
69 | - requests=2.14.2=py35_0
70 | - rope=0.9.4=py35_1
71 | - setuptools=36.4.0=py35_1
72 | - simplegeneric=0.8.1=py35_1
73 | - singledispatch=3.4.0.3=py35_0
74 | - sip=4.18=py35_0
75 | - six=1.10.0=py35_1
76 | - snowballstemmer=1.2.1=py35_0
77 | - sphinx=1.6.3=py35_0
78 | - sphinxcontrib=1.0=py35_0
79 | - sphinxcontrib-websupport=1.0.1=py35_0
80 | - spyder=3.2.3=py35_0
81 | - testpath=0.3.1=py35_0
82 | - tornado=4.5.2=py35_0
83 | - traitlets=4.3.2=py35_0
84 | - vs2015_runtime=14.0.25420=0
85 | - wcwidth=0.1.7=py35_0
86 | - wheel=0.29.0=py35_0
87 | - win_unicode_console=0.5=py35_0
88 | - wincertstore=0.2=py35_0
89 | - wrapt=1.10.11=py35_0
90 | - zlib=1.2.11=vc14_0
91 | - opencv3=3.1.0=py35_0
92 | - pytorch=0.1.12=py35_0.1.12cu80
93 | - pip:
94 | - ipython-genutils==0.2.0
95 | - jupyter-client==5.1.0
96 | - jupyter-core==4.3.0
97 | - prompt-toolkit==1.0.15
98 | - pyyaml==3.12
99 | - rope-py3k==0.9.4.post1
100 | - torch==0.1.12
101 | - torchvision==0.1.9
102 | - win-unicode-console==0.5
103 |
--------------------------------------------------------------------------------
/working.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anushuk/Object-Detection-SSD/3cc4dcad5ec6fba640cdd96882780a687b475742/working.png
--------------------------------------------------------------------------------