├── .gitattributes
├── .gitignore
├── README.md
├── YOLOv4
│   ├── app.py
│   ├── camera.py
│   ├── cfg
│   │   ├── tiny-yolo-voc.cfg
│   │   ├── yolo-voc.cfg
│   │   ├── yolo.cfg
│   │   ├── yolov3-spp.cfg
│   │   ├── yolov3-tiny.cfg
│   │   ├── yolov3.cfg
│   │   └── yolov4.cfg
│   ├── darknet.py
│   ├── data
│   │   ├── coco.names
│   │   └── voc.names
│   ├── object_detection.py
│   ├── pallete
│   ├── requirements.txt
│   ├── templates
│   │   ├── 12.jpg
│   │   ├── base.html
│   │   └── index.html
│   └── tool
│       ├── config.py
│       ├── region_loss.py
│       ├── torch_utils.py
│       ├── utils.py
│       └── yolo_layer.py
├── app.py
├── bbox.py
├── camera.py
├── cfg
│   ├── tiny-yolo-voc.cfg
│   ├── yolo-voc.cfg
│   ├── yolo.cfg
│   ├── yolov3-spp.cfg
│   ├── yolov3-tiny.cfg
│   └── yolov3.cfg
├── darknet.py
├── data
│   ├── coco.names
│   └── voc.names
├── object_detection.py
├── pallete
├── preprocess.py
├── requirements.txt
├── templates
│   ├── 12.jpg
│   ├── base.html
│   └── index.html
├── util.py
└── utils
    ├── app_utils.py
    └── objDet_utils.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.weights filter=lfs diff=lfs merge=lfs -text
2 | *.mp4 filter=lfs diff=lfs merge=lfs -text
3 | *.avi filter=lfs diff=lfs merge=lfs -text
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Object Detection and Distance Measurement
2 |
3 | [](https://pjreddie.com/darknet/yolo/?style=centerme)
4 |
5 |
6 | ## Introduction
7 | This repo contains object_detection.py, which can perform the following tasks:
8 | - Object detection from a live video frame, a video file, or an image
9 | - Counting the number of objects in a frame
10 | - Measuring the distance of an object using depth information
11 | - Inference on multiple camera feeds at a time
12 |
13 | For object detection, YOLO-V3 is used, which can detect 80 different object classes. Some of them are:
14 | - person
15 | - car
16 | - bus
17 | - stop sign
18 | - bench
19 | - dog
20 | - bear
21 | - backpack, and so on.
22 |
23 | ### User Instruction
24 |
25 | ## Update
26 |
27 | **There is a new update with the [YOLOv4 release](https://github.com/Tianxiaomo/pytorch-YOLOv4). After downloading the project, all you have to do is run the following command and then follow the rest of the process as before.**
28 |
29 | ```
30 | cd YOLOv4
31 | ```
32 |
33 | You can also use Yolact++ as an object detector using [this repo](https://github.com/paul-pias/Social-Distance-Monitoring).
34 |
35 |
36 | To execute object_detection.py, you need Python version > 3.5 (the exact setup depends on whether you are using a GPU or not) and the following libraries installed.
37 |
38 | ### Installation
39 | ``` python
40 | $ pip install -r requirements.txt
41 | or
42 | $ pip install opencv-python
43 | $ pip install numpy
44 | $ pip install pandas
45 | $ pip install matplotlib
46 | $ pip install Pillow
47 | $ pip install imutils
48 | ```
49 |
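50 | Once the libraries are installed, place the pre-trained YOLOv3 weights file in the project root (camera.py loads it by the relative path `yolov3.weights`) and start the Flask streaming app with `python app.py`. By default Flask serves on `http://127.0.0.1:5000`, and the live detection stream is exposed at the `/video_feed` route defined in `app.py`.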
--------------------------------------------------------------------------------
/YOLOv4/tool/config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tool.torch_utils import convert2cpu
3 | import sys  # used by the route-error diagnostics in print_cfg
4 |
5 | def parse_cfg(cfgfile):
6 | blocks = []
7 | fp = open(cfgfile, 'r')
8 | block = None
9 | line = fp.readline()
10 | while line != '':
11 | line = line.rstrip()
12 | if line == '' or line[0] == '#':
13 | line = fp.readline()
14 | continue
15 | elif line[0] == '[':
16 | if block:
17 | blocks.append(block)
18 | block = dict()
19 | block['type'] = line.lstrip('[').rstrip(']')
20 | # set default value
21 | if block['type'] == 'convolutional':
22 | block['batch_normalize'] = 0
23 | else:
24 | key, value = line.split('=')
25 | key = key.strip()
26 | if key == 'type':
27 | key = '_type'
28 | value = value.strip()
29 | block[key] = value
30 | line = fp.readline()
31 |
32 | if block:
33 | blocks.append(block)
34 | fp.close()
35 | return blocks
36 |
37 |
38 | def print_cfg(blocks):
39 | print('layer filters size input output');
40 | prev_width = 416
41 | prev_height = 416
42 | prev_filters = 3
43 | out_filters = []
44 | out_widths = []
45 | out_heights = []
46 | ind = -2
47 | for block in blocks:
48 | ind = ind + 1
49 | if block['type'] == 'net':
50 | prev_width = int(block['width'])
51 | prev_height = int(block['height'])
52 | continue
53 | elif block['type'] == 'convolutional':
54 | filters = int(block['filters'])
55 | kernel_size = int(block['size'])
56 | stride = int(block['stride'])
57 | is_pad = int(block['pad'])
58 | pad = (kernel_size - 1) // 2 if is_pad else 0
59 | width = (prev_width + 2 * pad - kernel_size) // stride + 1
60 | height = (prev_height + 2 * pad - kernel_size) // stride + 1
61 | print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
62 | ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width,
63 | height, filters))
64 | prev_width = width
65 | prev_height = height
66 | prev_filters = filters
67 | out_widths.append(prev_width)
68 | out_heights.append(prev_height)
69 | out_filters.append(prev_filters)
70 | elif block['type'] == 'maxpool':
71 | pool_size = int(block['size'])
72 | stride = int(block['stride'])
73 | width = prev_width // stride
74 | height = prev_height // stride
75 | print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
76 | ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height,
77 | filters))
78 | prev_width = width
79 | prev_height = height
80 | prev_filters = filters
81 | out_widths.append(prev_width)
82 | out_heights.append(prev_height)
83 | out_filters.append(prev_filters)
84 | elif block['type'] == 'avgpool':
85 | width = 1
86 | height = 1
87 | print('%5d %-6s %3d x %3d x%4d -> %3d' % (
88 | ind, 'avg', prev_width, prev_height, prev_filters, prev_filters))
89 | prev_width = width
90 | prev_height = height
91 | prev_filters = filters
92 | out_widths.append(prev_width)
93 | out_heights.append(prev_height)
94 | out_filters.append(prev_filters)
95 | elif block['type'] == 'softmax':
96 | print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters))
97 | out_widths.append(prev_width)
98 | out_heights.append(prev_height)
99 | out_filters.append(prev_filters)
100 | elif block['type'] == 'cost':
101 | print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters))
102 | out_widths.append(prev_width)
103 | out_heights.append(prev_height)
104 | out_filters.append(prev_filters)
105 | elif block['type'] == 'reorg':
106 | stride = int(block['stride'])
107 | filters = stride * stride * prev_filters
108 | width = prev_width // stride
109 | height = prev_height // stride
110 | print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
111 | ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters))
112 | prev_width = width
113 | prev_height = height
114 | prev_filters = filters
115 | out_widths.append(prev_width)
116 | out_heights.append(prev_height)
117 | out_filters.append(prev_filters)
118 | elif block['type'] == 'upsample':
119 | stride = int(block['stride'])
120 | filters = prev_filters
121 | width = prev_width * stride
122 | height = prev_height * stride
123 | print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
124 | ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters))
125 | prev_width = width
126 | prev_height = height
127 | prev_filters = filters
128 | out_widths.append(prev_width)
129 | out_heights.append(prev_height)
130 | out_filters.append(prev_filters)
131 | elif block['type'] == 'route':
132 | layers = block['layers'].split(',')
133 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
134 | if len(layers) == 1:
135 | print('%5d %-6s %d' % (ind, 'route', layers[0]))
136 | prev_width = out_widths[layers[0]]
137 | prev_height = out_heights[layers[0]]
138 | prev_filters = out_filters[layers[0]]
139 | elif len(layers) == 2:
140 | print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1]))
141 | prev_width = out_widths[layers[0]]
142 | prev_height = out_heights[layers[0]]
143 | assert (prev_width == out_widths[layers[1]])
144 | assert (prev_height == out_heights[layers[1]])
145 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
146 | elif len(layers) == 4:
147 | print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3]))
148 | prev_width = out_widths[layers[0]]
149 | prev_height = out_heights[layers[0]]
150 | assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]])
151 | assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]])
152 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[
153 | layers[3]]
154 | else:
155 | print("route error !!! {} {} {}".format(sys._getframe().f_code.co_filename,
156 | sys._getframe().f_code.co_name, sys._getframe().f_lineno))
157 |
158 | out_widths.append(prev_width)
159 | out_heights.append(prev_height)
160 | out_filters.append(prev_filters)
161 | elif block['type'] in ['region', 'yolo']:
162 | print('%5d %-6s' % (ind, 'detection'))
163 | out_widths.append(prev_width)
164 | out_heights.append(prev_height)
165 | out_filters.append(prev_filters)
166 | elif block['type'] == 'shortcut':
167 | from_id = int(block['from'])
168 | from_id = from_id if from_id > 0 else from_id + ind
169 | print('%5d %-6s %d' % (ind, 'shortcut', from_id))
170 | prev_width = out_widths[from_id]
171 | prev_height = out_heights[from_id]
172 | prev_filters = out_filters[from_id]
173 | out_widths.append(prev_width)
174 | out_heights.append(prev_height)
175 | out_filters.append(prev_filters)
176 | elif block['type'] == 'connected':
177 | filters = int(block['output'])
178 | print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters))
179 | prev_filters = filters
180 | out_widths.append(1)
181 | out_heights.append(1)
182 | out_filters.append(prev_filters)
183 | else:
184 | print('unknown type %s' % (block['type']))
185 |
186 |
187 | def load_conv(buf, start, conv_model):
188 | num_w = conv_model.weight.numel()
189 | num_b = conv_model.bias.numel()
190 | conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]));
191 | start = start + num_b
192 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape));
193 | start = start + num_w
194 | return start
195 |
196 |
197 | def save_conv(fp, conv_model):
198 | if conv_model.bias.is_cuda:
199 | convert2cpu(conv_model.bias.data).numpy().tofile(fp)
200 | convert2cpu(conv_model.weight.data).numpy().tofile(fp)
201 | else:
202 | conv_model.bias.data.numpy().tofile(fp)
203 | conv_model.weight.data.numpy().tofile(fp)
204 |
205 |
206 | def load_conv_bn(buf, start, conv_model, bn_model):
207 | num_w = conv_model.weight.numel()
208 | num_b = bn_model.bias.numel()
209 | bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]));
210 | start = start + num_b
211 | bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b]));
212 | start = start + num_b
213 | bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b]));
214 | start = start + num_b
215 | bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b]));
216 | start = start + num_b
217 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape));
218 | start = start + num_w
219 | return start
220 |
221 |
222 | def save_conv_bn(fp, conv_model, bn_model):
223 | if bn_model.bias.is_cuda:
224 | convert2cpu(bn_model.bias.data).numpy().tofile(fp)
225 | convert2cpu(bn_model.weight.data).numpy().tofile(fp)
226 | convert2cpu(bn_model.running_mean).numpy().tofile(fp)
227 | convert2cpu(bn_model.running_var).numpy().tofile(fp)
228 | convert2cpu(conv_model.weight.data).numpy().tofile(fp)
229 | else:
230 | bn_model.bias.data.numpy().tofile(fp)
231 | bn_model.weight.data.numpy().tofile(fp)
232 | bn_model.running_mean.numpy().tofile(fp)
233 | bn_model.running_var.numpy().tofile(fp)
234 | conv_model.weight.data.numpy().tofile(fp)
235 |
236 |
237 | def load_fc(buf, start, fc_model):
238 | num_w = fc_model.weight.numel()
239 | num_b = fc_model.bias.numel()
240 | fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]));
241 | start = start + num_b
242 | fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]));
243 | start = start + num_w
244 | return start
245 |
246 |
247 | def save_fc(fp, fc_model):
248 | fc_model.bias.data.numpy().tofile(fp)
249 | fc_model.weight.data.numpy().tofile(fp)
250 |
251 |
252 | if __name__ == '__main__':
253 | import sys
254 |
255 | blocks = parse_cfg('cfg/yolo.cfg')
256 | if len(sys.argv) == 2:
257 | blocks = parse_cfg(sys.argv[1])
258 | print_cfg(blocks)
--------------------------------------------------------------------------------
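A minimal sketch of how the helpers above are typically used, assuming it is run from inside the `YOLOv4` directory so that `cfg/yolov4.cfg` resolves (note that importing anything from `tool` pulls in the Windows-only `win32com` text-to-speech dependency via `tool/utils.py`):

```python
from tool.config import parse_cfg, print_cfg

blocks = parse_cfg('cfg/yolov4.cfg')    # one dict per [section] in the cfg file
print(blocks[0]['type'])                # 'net'
print(blocks[1]['type'], blocks[1].get('filters'))  # e.g. convolutional 32 (values are kept as strings)
print_cfg(blocks)                       # layer-by-layer shape table, as in darknet
```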
/YOLOv4/tool/region_loss.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | from tool.torch_utils import *
4 | from tool.utils import bbox_iou  # needed by build_targets; bbox_iou lives in tool.utils
5 |
6 | def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale,
7 | sil_thresh, seen):
8 | nB = target.size(0)
9 | nA = num_anchors
10 | nC = num_classes
11 |     anchor_step = len(anchors) // num_anchors
12 | conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale
13 | coord_mask = torch.zeros(nB, nA, nH, nW)
14 | cls_mask = torch.zeros(nB, nA, nH, nW)
15 | tx = torch.zeros(nB, nA, nH, nW)
16 | ty = torch.zeros(nB, nA, nH, nW)
17 | tw = torch.zeros(nB, nA, nH, nW)
18 | th = torch.zeros(nB, nA, nH, nW)
19 | tconf = torch.zeros(nB, nA, nH, nW)
20 | tcls = torch.zeros(nB, nA, nH, nW)
21 |
22 | nAnchors = nA * nH * nW
23 | nPixels = nH * nW
24 | for b in range(nB):
25 | cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t()
26 | cur_ious = torch.zeros(nAnchors)
27 | for t in range(50):
28 | if target[b][t * 5 + 1] == 0:
29 | break
30 | gx = target[b][t * 5 + 1] * nW
31 | gy = target[b][t * 5 + 2] * nH
32 | gw = target[b][t * 5 + 3] * nW
33 | gh = target[b][t * 5 + 4] * nH
34 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t()
35 | cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False))
36 | conf_mask[b][cur_ious > sil_thresh] = 0
37 | if seen < 12800:
38 | if anchor_step == 4:
39 | tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1,
40 | 1).repeat(
41 | nB, 1, nH, nW)
42 | ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view(
43 | 1, nA, 1, 1).repeat(nB, 1, nH, nW)
44 | else:
45 | tx.fill_(0.5)
46 | ty.fill_(0.5)
47 | tw.zero_()
48 | th.zero_()
49 | coord_mask.fill_(1)
50 |
51 | nGT = 0
52 | nCorrect = 0
53 | for b in range(nB):
54 | for t in range(50):
55 | if target[b][t * 5 + 1] == 0:
56 | break
57 | nGT = nGT + 1
58 | best_iou = 0.0
59 | best_n = -1
60 | min_dist = 10000
61 | gx = target[b][t * 5 + 1] * nW
62 | gy = target[b][t * 5 + 2] * nH
63 | gi = int(gx)
64 | gj = int(gy)
65 | gw = target[b][t * 5 + 3] * nW
66 | gh = target[b][t * 5 + 4] * nH
67 | gt_box = [0, 0, gw, gh]
68 | for n in range(nA):
69 | aw = anchors[anchor_step * n]
70 | ah = anchors[anchor_step * n + 1]
71 | anchor_box = [0, 0, aw, ah]
72 | iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False)
73 | if anchor_step == 4:
74 | ax = anchors[anchor_step * n + 2]
75 | ay = anchors[anchor_step * n + 3]
76 | dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2)
77 | if iou > best_iou:
78 | best_iou = iou
79 | best_n = n
80 | elif anchor_step == 4 and iou == best_iou and dist < min_dist:
81 | best_iou = iou
82 | best_n = n
83 | min_dist = dist
84 |
85 | gt_box = [gx, gy, gw, gh]
86 | pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi]
87 |
88 | coord_mask[b][best_n][gj][gi] = 1
89 | cls_mask[b][best_n][gj][gi] = 1
90 | conf_mask[b][best_n][gj][gi] = object_scale
91 | tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi
92 | ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj
93 | tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n])
94 | th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1])
95 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou
96 | tconf[b][best_n][gj][gi] = iou
97 | tcls[b][best_n][gj][gi] = target[b][t * 5]
98 | if iou > 0.5:
99 | nCorrect = nCorrect + 1
100 |
101 | return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls
102 |
103 |
104 | class RegionLoss(nn.Module):
105 | def __init__(self, num_classes=0, anchors=[], num_anchors=1):
106 | super(RegionLoss, self).__init__()
107 | self.num_classes = num_classes
108 | self.anchors = anchors
109 | self.num_anchors = num_anchors
110 |         self.anchor_step = len(anchors) // num_anchors
111 | self.coord_scale = 1
112 | self.noobject_scale = 1
113 | self.object_scale = 5
114 | self.class_scale = 1
115 | self.thresh = 0.6
116 | self.seen = 0
117 |
118 | def forward(self, output, target):
119 | # output : BxAs*(4+1+num_classes)*H*W
120 | t0 = time.time()
121 | nB = output.data.size(0)
122 | nA = self.num_anchors
123 | nC = self.num_classes
124 | nH = output.data.size(2)
125 | nW = output.data.size(3)
126 |
127 | output = output.view(nB, nA, (5 + nC), nH, nW)
128 | x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW))
129 | y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW))
130 | w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW)
131 | h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW)
132 | conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW))
133 | cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda()))
134 | cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC)
135 | t1 = time.time()
136 |
137 | pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW)
138 | grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
139 | grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
140 | anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda()
141 | anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda()
142 | anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
143 | anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
144 | pred_boxes[0] = x.data + grid_x
145 | pred_boxes[1] = y.data + grid_y
146 | pred_boxes[2] = torch.exp(w.data) * anchor_w
147 | pred_boxes[3] = torch.exp(h.data) * anchor_h
148 | pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4))
149 | t2 = time.time()
150 |
151 | nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes,
152 | target.data,
153 | self.anchors, nA,
154 | nC, \
155 | nH, nW,
156 | self.noobject_scale,
157 | self.object_scale,
158 | self.thresh,
159 | self.seen)
160 | cls_mask = (cls_mask == 1)
161 | nProposals = int((conf > 0.25).sum().data[0])
162 |
163 | tx = Variable(tx.cuda())
164 | ty = Variable(ty.cuda())
165 | tw = Variable(tw.cuda())
166 | th = Variable(th.cuda())
167 | tconf = Variable(tconf.cuda())
168 | tcls = Variable(tcls.view(-1)[cls_mask].long().cuda())
169 |
170 | coord_mask = Variable(coord_mask.cuda())
171 | conf_mask = Variable(conf_mask.cuda().sqrt())
172 | cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda())
173 | cls = cls[cls_mask].view(-1, nC)
174 |
175 | t3 = time.time()
176 |
177 | loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x * coord_mask, tx * coord_mask) / 2.0
178 | loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y * coord_mask, ty * coord_mask) / 2.0
179 | loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w * coord_mask, tw * coord_mask) / 2.0
180 | loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h * coord_mask, th * coord_mask) / 2.0
181 | loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) / 2.0
182 | loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls)
183 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
184 | t4 = time.time()
185 | if False:
186 | print('-----------------------------------')
187 | print(' activation : %f' % (t1 - t0))
188 | print(' create pred_boxes : %f' % (t2 - t1))
189 | print(' build targets : %f' % (t3 - t2))
190 | print(' create loss : %f' % (t4 - t3))
191 | print(' total : %f' % (t4 - t0))
192 | print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % (
193 | self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0],
194 | loss_conf.data[0], loss_cls.data[0], loss.data[0]))
195 | return loss
--------------------------------------------------------------------------------
/YOLOv4/tool/torch_utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import time
4 | import math
5 | import torch
6 | import numpy as np
7 | from torch.autograd import Variable
8 |
9 | import itertools
10 | import struct # get_image_size
11 | import imghdr # get_image_size
12 |
13 | from tool import utils
14 |
15 |
16 | def bbox_ious(boxes1, boxes2, x1y1x2y2=True):
17 | if x1y1x2y2:
18 | mx = torch.min(boxes1[0], boxes2[0])
19 | Mx = torch.max(boxes1[2], boxes2[2])
20 | my = torch.min(boxes1[1], boxes2[1])
21 | My = torch.max(boxes1[3], boxes2[3])
22 | w1 = boxes1[2] - boxes1[0]
23 | h1 = boxes1[3] - boxes1[1]
24 | w2 = boxes2[2] - boxes2[0]
25 | h2 = boxes2[3] - boxes2[1]
26 | else:
27 | mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0)
28 | Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0)
29 | my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0)
30 | My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0)
31 | w1 = boxes1[2]
32 | h1 = boxes1[3]
33 | w2 = boxes2[2]
34 | h2 = boxes2[3]
35 | uw = Mx - mx
36 | uh = My - my
37 | cw = w1 + w2 - uw
38 | ch = h1 + h2 - uh
39 | mask = ((cw <= 0) + (ch <= 0) > 0)
40 | area1 = w1 * h1
41 | area2 = w2 * h2
42 | carea = cw * ch
43 | carea[mask] = 0
44 | uarea = area1 + area2 - carea
45 | return carea / uarea
46 |
47 |
48 | def get_region_boxes(boxes_and_confs):
49 |
50 | # print('Getting boxes from boxes and confs ...')
51 |
52 | boxes_list = []
53 | confs_list = []
54 |
55 | for item in boxes_and_confs:
56 | boxes_list.append(item[0])
57 | confs_list.append(item[1])
58 |
59 | # boxes: [batch, num1 + num2 + num3, 4]
60 | # confs: [batch, num1 + num2 + num3, num_classes]
61 | boxes = torch.cat(boxes_list, dim=1)
62 | confs = torch.cat(confs_list, dim=1)
63 |
64 | output = torch.cat((boxes, confs), dim=2)
65 |
66 | return output
67 |
68 |
69 | def convert2cpu(gpu_matrix):
70 | return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix)
71 |
72 |
73 | def convert2cpu_long(gpu_matrix):
74 | return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix)
75 |
76 |
77 |
78 | def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1):
79 | model.eval()
80 | t0 = time.time()
81 |
82 | if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image
83 | img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
84 | elif type(img) == np.ndarray and len(img.shape) == 4:
85 | img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
86 | else:
87 |         print("unknown image type")
88 | exit(-1)
89 |
90 | if use_cuda:
91 | img = img.cuda()
92 | img = torch.autograd.Variable(img)
93 |
94 | t1 = time.time()
95 |
96 | output = model(img)
97 |
98 | t2 = time.time()
99 |
100 | print('-----------------------------------')
101 | print(' Preprocess : %f' % (t1 - t0))
102 | print(' Model Inference : %f' % (t2 - t1))
103 | print('-----------------------------------')
104 |
105 | return utils.post_processing(img, conf_thresh, nms_thresh, output)
--------------------------------------------------------------------------------
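A small sketch of `bbox_ious` above. Boxes are passed column-wise (shape `[4, N]`, rows `x1, y1, x2, y2`); the same Windows-only `win32com` caveat applies because `tool/torch_utils.py` imports `tool.utils`:

```python
import torch
from tool.torch_utils import bbox_ious

# One box per column: rows are x1, y1, x2, y2
a = torch.tensor([[0.0], [0.0], [10.0], [10.0]])
b = torch.tensor([[5.0], [5.0], [15.0], [15.0]])
print(bbox_ious(a, b, x1y1x2y2=True))  # tensor([0.1429]) = 25 / (100 + 100 - 25)
```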
/YOLOv4/tool/utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import time
4 | import math
5 | import numpy as np
6 |
7 | import itertools
8 | import struct # get_image_size
9 | import imghdr # get_image_size
10 |
11 | import win32com.client as wincl #### Python's Text-to-speech (tts) engine for windows, multiprocessing
12 | speak = wincl.Dispatch("SAPI.SpVoice") #### This initiates the tts engine
13 |
14 |
15 | def sigmoid(x):
16 | return 1.0 / (np.exp(-x) + 1.)
17 |
18 |
19 | def softmax(x):
20 | x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
21 | x = x / np.expand_dims(x.sum(axis=1), axis=1)
22 | return x
23 |
24 |
25 | def bbox_iou(box1, box2, x1y1x2y2=True):
26 |
27 | # print('iou box1:', box1)
28 | # print('iou box2:', box2)
29 |
30 | if x1y1x2y2:
31 | mx = min(box1[0], box2[0])
32 | Mx = max(box1[2], box2[2])
33 | my = min(box1[1], box2[1])
34 | My = max(box1[3], box2[3])
35 | w1 = box1[2] - box1[0]
36 | h1 = box1[3] - box1[1]
37 | w2 = box2[2] - box2[0]
38 | h2 = box2[3] - box2[1]
39 | else:
40 | w1 = box1[2]
41 | h1 = box1[3]
42 | w2 = box2[2]
43 | h2 = box2[3]
44 |
45 | mx = min(box1[0], box2[0])
46 | Mx = max(box1[0] + w1, box2[0] + w2)
47 | my = min(box1[1], box2[1])
48 | My = max(box1[1] + h1, box2[1] + h2)
49 | uw = Mx - mx
50 | uh = My - my
51 | cw = w1 + w2 - uw
52 | ch = h1 + h2 - uh
53 | carea = 0
54 | if cw <= 0 or ch <= 0:
55 | return 0.0
56 |
57 | area1 = w1 * h1
58 | area2 = w2 * h2
59 | carea = cw * ch
60 | uarea = area1 + area2 - carea
61 | return carea / uarea
62 |
63 |
64 | def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
65 | # print(boxes.shape)
66 | x1 = boxes[:, 0]
67 | y1 = boxes[:, 1]
68 | x2 = boxes[:, 0] + boxes[:, 2]
69 | y2 = boxes[:, 1] + boxes[:, 3]
70 |
71 | areas = (x2 - x1) * (y2 - y1)
72 | order = confs.argsort()[::-1]
73 |
74 | keep = []
75 | while order.size > 0:
76 | idx_self = order[0]
77 | idx_other = order[1:]
78 |
79 | keep.append(idx_self)
80 |
81 | xx1 = np.maximum(x1[idx_self], x1[idx_other])
82 | yy1 = np.maximum(y1[idx_self], y1[idx_other])
83 | xx2 = np.minimum(x2[idx_self], x2[idx_other])
84 | yy2 = np.minimum(y2[idx_self], y2[idx_other])
85 |
86 | w = np.maximum(0.0, xx2 - xx1)
87 | h = np.maximum(0.0, yy2 - yy1)
88 | inter = w * h
89 |
90 | if min_mode:
91 | over = inter / np.minimum(areas[order[0]], areas[order[1:]])
92 | else:
93 | over = inter / (areas[order[0]] + areas[order[1:]] - inter)
94 |
95 | inds = np.where(over <= nms_thresh)[0]
96 | order = order[inds + 1]
97 |
98 | return np.array(keep)
99 |
100 |
101 |
102 | def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None, colors = None):
103 | import cv2
104 | img = np.copy(img)
105 | # colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32)
106 | colors = np.array(colors)
107 |
108 | def get_color(c, x, max_val):
109 | ratio = float(x) / max_val * 5
110 | i = int(math.floor(ratio))
111 | j = int(math.ceil(ratio))
112 | ratio = ratio - i
113 | r = (1 - ratio) * colors[i][c] + ratio * colors[j][c]
114 | return int(r * 255)
115 |
116 | width = img.shape[1]
117 | height = img.shape[0]
118 | # print("weight{} , height {}".format(width, height))
119 | for i in range(len(boxes)):
120 | box = boxes[i]
121 | x1 = int((box[0] - box[2] / 2.0) * width)
122 | y1 = int((box[1] - box[3] / 2.0) * height)
123 | x2 = int((box[0] + box[2] / 2.0) * width)
124 | y2 = int((box[1] + box[3] / 2.0) * height)
125 | x,y,w,h = x1,y1,x2,y2
126 |
127 | font_face = cv2.FONT_HERSHEY_DUPLEX
128 | font_scale = 1.2
129 | font_thickness = 1
130 |
131 | text_pt = (box[0], box[1] - 3)
132 | text_color = [255, 255, 255]
133 | if color:
134 | rgb = color
135 | else:
136 | rgb = (255, 0, 0)
137 | if len(box) >= 7 and class_names:
138 | cls_conf = box[5]
139 | cls_id = box[6]
140 | print('%s: %f' % (class_names[cls_id], cls_conf))
141 |
142 | distance = (2 * 3.14 * 180) / (w+ h * 360) * 1000 + 3 ### Distance measuring in Inch
143 | feedback = ("{}".format(class_names[cls_id])+ " " +"is"+" at {} ".format(round(distance))+"Inches")
144 | # speak.Speak(feedback)
145 | print(feedback)
146 | text_str = '%s' % (class_names[cls_id])
147 | text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0]
148 | classes = len(class_names)
149 | offset = cls_id * 123457 % classes
150 | red = get_color(2, offset, classes)
151 | green = get_color(1, offset, classes)
152 | blue = get_color(0, offset, classes)
153 | if color is None:
154 | rgb = (red, green, blue)
155 | cv2.putText(img, str("{:.2f} Inches".format(distance)), (text_w+x,y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, rgb, font_thickness, cv2.LINE_AA)
156 | img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1)
157 |
158 |
159 | img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1)
160 |
161 |
162 | return img
163 |
164 |
165 | def read_truths(lab_path):
166 | if not os.path.exists(lab_path):
167 | return np.array([])
168 | if os.path.getsize(lab_path):
169 | truths = np.loadtxt(lab_path)
170 |         truths = truths.reshape(truths.size // 5, 5)  # to avoid single truth problem
171 | return truths
172 | else:
173 | return np.array([])
174 |
175 |
176 |
177 |
178 |
179 | def post_processing(img, conf_thresh, nms_thresh, output):
180 |
181 | # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
182 | # num_anchors = 9
183 | # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
184 | # strides = [8, 16, 32]
185 | # anchor_step = len(anchors) // num_anchors
186 |
187 | t1 = time.time()
188 |
189 | if type(output).__name__ != 'ndarray':
190 | output = output.cpu().detach().numpy()
191 |
192 | # [batch, num, 4]
193 | box_array = output[:, :, :4]
194 |
195 | # [batch, num, num_classes]
196 | confs = output[:, :, 4:]
197 |
198 | # [batch, num, num_classes] --> [batch, num]
199 | max_conf = np.max(confs, axis=2)
200 | max_id = np.argmax(confs, axis=2)
201 |
202 | t2 = time.time()
203 |
204 | bboxes_batch = []
205 | for i in range(box_array.shape[0]):
206 |
207 | argwhere = max_conf[i] > conf_thresh
208 | l_box_array = box_array[i, argwhere, :]
209 | l_max_conf = max_conf[i, argwhere]
210 | l_max_id = max_id[i, argwhere]
211 |
212 | keep = nms_cpu(l_box_array, l_max_conf, nms_thresh)
213 |
214 | bboxes = []
215 | if (keep.size > 0):
216 | l_box_array = l_box_array[keep, :]
217 | l_max_conf = l_max_conf[keep]
218 | l_max_id = l_max_id[keep]
219 |
220 | for j in range(l_box_array.shape[0]):
221 | bboxes.append([l_box_array[j, 0], l_box_array[j, 1], l_box_array[j, 2], l_box_array[j, 3], l_max_conf[j], l_max_conf[j], l_max_id[j]])
222 |
223 | bboxes_batch.append(bboxes)
224 |
225 | t3 = time.time()
226 |
227 | # print('-----------------------------------')
228 | # print(' max and argmax : %f' % (t2 - t1))
229 | # print(' nms : %f' % (t3 - t2))
230 | # print('Post processing total : %f' % (t3 - t1))
231 | # print('-----------------------------------')
232 |
233 | return bboxes_batch
--------------------------------------------------------------------------------
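A toy example of `nms_cpu` above: boxes are `[x, y, w, h]`, and the lower-scoring near-duplicate is suppressed. As written the module imports `win32com`, so on non-Windows platforms copy the function out before trying this:

```python
import numpy as np
from tool.utils import nms_cpu  # Windows-only import chain as written

boxes = np.array([[10, 10, 100, 100],    # box A
                  [12, 12, 100, 100],    # near-duplicate of A
                  [300, 300, 50, 50]],   # unrelated box
                 dtype=np.float32)
confs = np.array([0.9, 0.8, 0.7], dtype=np.float32)

keep = nms_cpu(boxes, confs, nms_thresh=0.5)
print(keep)  # [0 2] -- the duplicate of box A is dropped
```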
/YOLOv4/tool/yolo_layer.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | from tool.torch_utils import *
4 |
5 |
6 | def yolo_forward_alternative(output, conf_thresh, num_classes, anchors, num_anchors, only_objectness=1,
7 | validation=False):
8 | # Output would be invalid if it does not satisfy this assert
9 | # assert (output.size(1) == (5 + num_classes) * num_anchors)
10 |
11 | # print(output.size())
12 |
13 | # Slice the second dimension (channel) of output into:
14 | # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
15 | # And then into
16 | # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ]
17 | batch = output.size(0)
18 | H = output.size(2)
19 | W = output.size(3)
20 |
21 | device = None
22 | cuda_check = output.is_cuda
23 | if cuda_check:
24 | device = output.get_device()
25 |
26 |
27 | # Prepare C-x, C-y, P-w, P-h (None of them are torch related)
28 | grid_x = np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0).reshape(1, 1, H * W).repeat(batch, 0).repeat(num_anchors, 1)
29 | grid_y = np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1).reshape(1, 1, H * W).repeat(batch, 0).repeat(num_anchors, 1)
30 | # Shape: [batch, num_anchors, H * W]
31 | grid_x_tensor = torch.tensor(grid_x, device=device, dtype=torch.float32)
32 | grid_y_tensor = torch.tensor(grid_y, device=device, dtype=torch.float32)
33 |
34 | anchor_array = np.array(anchors).reshape(1, num_anchors, 2)
35 | anchor_array = anchor_array.repeat(batch, 0)
36 | anchor_array = np.expand_dims(anchor_array, axis=3).repeat(H * W, 3)
37 | # Shape: [batch, num_anchors, 2, H * W]
38 | anchor_tensor = torch.tensor(anchor_array, device=device, dtype=torch.float32)
39 |
40 | # normalize coordinates to [0, 1]
41 | normal_array = np.array([1.0 / W, 1.0 / H, 1.0 / W, 1.0 / H], dtype=np.float32).reshape(1, 1, 4)
42 | normal_array = normal_array.repeat(batch, 0)
43 | normal_array = normal_array.repeat(num_anchors * H * W, 1)
44 | # Shape: [batch, num_anchors * H * W, 4]
45 | normal_tensor = torch.tensor(normal_array, device=device, dtype=torch.float32)
46 |
47 | bxy_list = []
48 | bwh_list = []
49 | det_confs_list = []
50 | cls_confs_list = []
51 |
52 | for i in range(num_anchors):
53 | begin = i * (5 + num_classes)
54 | end = (i + 1) * (5 + num_classes)
55 |
56 | bxy_list.append(output[:, begin : begin + 2])
57 | bwh_list.append(output[:, begin + 2 : begin + 4])
58 | det_confs_list.append(output[:, begin + 4 : begin + 5])
59 | cls_confs_list.append(output[:, begin + 5 : end])
60 |
61 | # Shape: [batch, num_anchors * 2, H, W]
62 | bxy = torch.cat(bxy_list, dim=1)
63 | # Shape: [batch, num_anchors * 2, H, W]
64 | bwh = torch.cat(bwh_list, dim=1)
65 |
66 | # Shape: [batch, num_anchors, H, W]
67 | det_confs = torch.cat(det_confs_list, dim=1)
68 | # Shape: [batch, num_anchors * H * W]
69 | det_confs = det_confs.view(batch, num_anchors * H * W)
70 |
71 | # Shape: [batch, num_anchors * num_classes, H, W]
72 | cls_confs = torch.cat(cls_confs_list, dim=1)
73 | # Shape: [batch, num_anchors, num_classes, H * W]
74 | cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W)
75 | # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
76 | cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes)
77 |
78 | # Apply sigmoid(), exp() and softmax() to slices
79 | #
80 | bxy = torch.sigmoid(bxy)
81 | bwh = torch.exp(bwh)
82 | det_confs = torch.sigmoid(det_confs)
83 | cls_confs = torch.nn.Softmax(dim=2)(cls_confs)
84 |
85 | # Shape: [batch, num_anchors, 2, H * W]
86 | bxy = bxy.view(batch, num_anchors, 2, H * W)
87 | # Shape: [batch, num_anchors, 2, H * W]
88 | bwh = bwh.view(batch, num_anchors, 2, H * W)
89 |
90 | # Apply C-x, C-y, P-w, P-h
91 | bxy[:, :, 0] += grid_x_tensor
92 | bxy[:, :, 1] += grid_y_tensor
93 |
94 | print(anchor_tensor.size())
95 | bwh *= anchor_tensor
96 |
97 | # Shape: [batch, num_anchors, 4, H * W] --> [batch, num_anchors * H * W, 4]
98 | boxes = torch.cat((bxy, bwh), dim=2).permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, 4)
99 |
100 | print(normal_tensor.size())
101 | boxes *= normal_tensor
102 |
103 | det_confs = det_confs.view(batch, num_anchors * H * W, 1)
104 | confs = cls_confs * det_confs
105 |
106 | # boxes: [batch, num_anchors * H * W, 4]
107 | # confs: [batch, num_anchors * H * W, num_classes]
108 |
109 | return boxes, confs
110 |
111 |
112 |
113 | def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
114 | validation=False):
115 | # Output would be invalid if it does not satisfy this assert
116 | # assert (output.size(1) == (5 + num_classes) * num_anchors)
117 |
118 | # print(output.size())
119 |
120 | # Slice the second dimension (channel) of output into:
121 | # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
122 | # And then into
123 | # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ]
124 | batch = output.size(0)
125 | H = output.size(2)
126 | W = output.size(3)
127 |
128 | bxy_list = []
129 | bwh_list = []
130 | det_confs_list = []
131 | cls_confs_list = []
132 |
133 | for i in range(num_anchors):
134 | begin = i * (5 + num_classes)
135 | end = (i + 1) * (5 + num_classes)
136 |
137 | bxy_list.append(output[:, begin : begin + 2])
138 | bwh_list.append(output[:, begin + 2 : begin + 4])
139 | det_confs_list.append(output[:, begin + 4 : begin + 5])
140 | cls_confs_list.append(output[:, begin + 5 : end])
141 |
142 | # Shape: [batch, num_anchors * 2, H, W]
143 | bxy = torch.cat(bxy_list, dim=1)
144 | # Shape: [batch, num_anchors * 2, H, W]
145 | bwh = torch.cat(bwh_list, dim=1)
146 |
147 | # Shape: [batch, num_anchors, H, W]
148 | det_confs = torch.cat(det_confs_list, dim=1)
149 | # Shape: [batch, num_anchors * H * W]
150 | det_confs = det_confs.view(batch, num_anchors * H * W)
151 |
152 | # Shape: [batch, num_anchors * num_classes, H, W]
153 | cls_confs = torch.cat(cls_confs_list, dim=1)
154 | # Shape: [batch, num_anchors, num_classes, H * W]
155 | cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W)
156 | # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
157 | cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes)
158 |
159 | # Apply sigmoid(), exp() and softmax() to slices
160 | #
161 | bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
162 | bwh = torch.exp(bwh)
163 | det_confs = torch.sigmoid(det_confs)
164 | cls_confs = torch.nn.Softmax(dim=2)(cls_confs)
165 |
166 | # Prepare C-x, C-y, P-w, P-h (None of them are torch related)
167 | grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0)
168 | grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0)
169 | # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1)
170 | # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W)
171 |
172 | anchor_w = []
173 | anchor_h = []
174 | for i in range(num_anchors):
175 | anchor_w.append(anchors[i * 2])
176 | anchor_h.append(anchors[i * 2 + 1])
177 |
178 | device = None
179 | cuda_check = output.is_cuda
180 | if cuda_check:
181 | device = output.get_device()
182 |
183 | bx_list = []
184 | by_list = []
185 | bw_list = []
186 | bh_list = []
187 |
188 | # Apply C-x, C-y, P-w, P-h
189 | for i in range(num_anchors):
190 | ii = i * 2
191 | # Shape: [batch, 1, H, W]
192 | bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32)
193 | # Shape: [batch, 1, H, W]
194 | by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32)
195 | # Shape: [batch, 1, H, W]
196 | bw = bwh[:, ii : ii + 1] * anchor_w[i]
197 | # Shape: [batch, 1, H, W]
198 | bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]
199 |
200 | bx_list.append(bx)
201 | by_list.append(by)
202 | bw_list.append(bw)
203 | bh_list.append(bh)
204 |
205 |
206 | ########################################
207 | # Figure out bboxes from slices #
208 | ########################################
209 |
210 | # Shape: [batch, num_anchors, H, W]
211 | bx = torch.cat(bx_list, dim=1)
212 | # Shape: [batch, num_anchors, H, W]
213 | by = torch.cat(by_list, dim=1)
214 | # Shape: [batch, num_anchors, H, W]
215 | bw = torch.cat(bw_list, dim=1)
216 | # Shape: [batch, num_anchors, H, W]
217 | bh = torch.cat(bh_list, dim=1)
218 |
219 | # Shape: [batch, 2 * num_anchors, H, W]
220 | bx_bw = torch.cat((bx, bw), dim=1)
221 | # Shape: [batch, 2 * num_anchors, H, W]
222 | by_bh = torch.cat((by, bh), dim=1)
223 |
224 | # normalize coordinates to [0, 1]
225 | bx_bw /= W
226 | by_bh /= H
227 |
228 | # Shape: [batch, num_anchors * H * W, 1]
229 | bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1)
230 | by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1)
231 | bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
232 | bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)
233 |
234 | # Shape: [batch, num_anchors * h * w, 4]
235 | boxes = torch.cat((bx, by, bw, bh), dim=2).view(batch, num_anchors * H * W, 4)
236 |
237 | # boxes: [batch, num_anchors * H * W, num_classes, 4]
238 | # cls_confs: [batch, num_anchors * H * W, num_classes]
239 | # det_confs: [batch, num_anchors * H * W]
240 |
241 | det_confs = det_confs.view(batch, num_anchors * H * W, 1)
242 | confs = cls_confs * det_confs
243 |
244 | # boxes: [batch, num_anchors * H * W, 4]
245 | # confs: [batch, num_anchors * H * W, num_classes]
246 |
247 | return boxes, confs
248 |
249 |
250 | class YoloLayer(nn.Module):
251 | ''' Yolo layer
252 | model_out: while inference,is post-processing inside or outside the model
253 | true:outside
254 | '''
255 | def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False):
256 | super(YoloLayer, self).__init__()
257 | self.anchor_mask = anchor_mask
258 | self.num_classes = num_classes
259 | self.anchors = anchors
260 | self.num_anchors = num_anchors
261 | self.anchor_step = len(anchors) // num_anchors
262 | self.coord_scale = 1
263 | self.noobject_scale = 1
264 | self.object_scale = 5
265 | self.class_scale = 1
266 | self.thresh = 0.6
267 | self.stride = stride
268 | self.seen = 0
269 | self.scale_x_y = 1
270 |
271 | self.model_out = model_out
272 |
273 | def forward(self, output, target=None):
274 | if self.training:
275 | return output
276 | masked_anchors = []
277 | for m in self.anchor_mask:
278 | masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step]
279 | masked_anchors = [anchor / self.stride for anchor in masked_anchors]
280 |
281 | return yolo_forward(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y)
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | # Flask utils
2 | from flask import Flask, redirect, url_for, request, render_template, Response
3 | from werkzeug.utils import secure_filename
4 | from gevent.pywsgi import WSGIServer
5 | from camera import ObjectDetection
6 |
7 | app = Flask(__name__)
8 | @app.route("/")
9 | def main():
10 | return render_template("index.html")
11 |
12 | def gen(camera):
13 | while True:
14 | frame = camera.main()
15 |         if frame:
16 | yield (b'--frame\r\n'
17 | b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')
18 |
19 | @app.route('/video_feed')
20 | def video_feed():
21 | id = 0
22 | return Response(gen(ObjectDetection(id)), mimetype='multipart/x-mixed-replace; boundary=frame')
23 |
24 |
25 | if __name__ == '__main__':
26 | # Serve the app with gevent
27 | app.run(host='127.0.0.1', threaded=True, debug = True)
28 |
--------------------------------------------------------------------------------
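The `/video_feed` route above returns a `multipart/x-mixed-replace` (MJPEG) response, so a browser, or an `<img>` tag in the template, can render the live stream directly. A rough client sketch (hypothetical, using the `requests` library) for checking the stream while `app.py` is running:

```python
import requests

# Default Flask address/port used by app.run() above
resp = requests.get("http://127.0.0.1:5000/video_feed", stream=True, timeout=10)
print(resp.headers.get("Content-Type"))                  # multipart/x-mixed-replace; boundary=frame
first_chunk = next(resp.iter_content(chunk_size=4096))   # raw bytes of the first JPEG part
print(len(first_chunk))
resp.close()
```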
/bbox.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import torch
4 | import random
5 |
6 | import numpy as np
7 | import cv2
8 |
9 | def confidence_filter(result, confidence):
10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2)
11 | result = result*conf_mask
12 |
13 | return result
14 |
15 | def confidence_filter_cls(result, confidence):
16 | max_scores = torch.max(result[:,:,5:25], 2)[0]
17 | res = torch.cat((result, max_scores),2)
18 | print(res.shape)
19 |
20 |
21 | cond_1 = (res[:,:,4] > confidence).float()
22 | cond_2 = (res[:,:,25] > 0.995).float()
23 |
24 | conf = cond_1 + cond_2
25 | conf = torch.clamp(conf, 0.0, 1.0)
26 | conf = conf.unsqueeze(2)
27 | result = result*conf
28 | return result
29 |
30 |
31 |
32 | def get_abs_coord(box):
33 | box[2], box[3] = abs(box[2]), abs(box[3])
34 | x1 = (box[0] - box[2]/2) - 1
35 | y1 = (box[1] - box[3]/2) - 1
36 | x2 = (box[0] + box[2]/2) - 1
37 | y2 = (box[1] + box[3]/2) - 1
38 | return x1, y1, x2, y2
39 |
40 |
41 |
42 | def sanity_fix(box):
43 | if (box[0] > box[2]):
44 | box[0], box[2] = box[2], box[0]
45 |
46 | if (box[1] > box[3]):
47 | box[1], box[3] = box[3], box[1]
48 |
49 | return box
50 |
51 | def bbox_iou(box1, box2):
52 | """
53 | Returns the IoU of two bounding boxes
54 |
55 |
56 | """
57 | #Get the coordinates of bounding boxes
58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3]
59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3]
60 |
61 | #get the corrdinates of the intersection rectangle
62 | inter_rect_x1 = torch.max(b1_x1, b2_x1)
63 | inter_rect_y1 = torch.max(b1_y1, b2_y1)
64 | inter_rect_x2 = torch.min(b1_x2, b2_x2)
65 | inter_rect_y2 = torch.min(b1_y2, b2_y2)
66 |
67 | #Intersection area
68 | if torch.cuda.is_available():
69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda())
70 | else:
71 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape))
72 |
73 | #Union Area
74 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
75 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
76 |
77 | iou = inter_area / (b1_area + b2_area - inter_area)
78 |
79 | return iou
80 |
81 |
82 | def pred_corner_coord(prediction):
83 | #Get indices of non-zero confidence bboxes
84 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous()
85 |
86 | box = prediction[ind_nz[0], ind_nz[1]]
87 |
88 |
89 | box_a = box.new(box.shape)
90 | box_a[:,0] = (box[:,0] - box[:,2]/2)
91 | box_a[:,1] = (box[:,1] - box[:,3]/2)
92 | box_a[:,2] = (box[:,0] + box[:,2]/2)
93 | box_a[:,3] = (box[:,1] + box[:,3]/2)
94 | box[:,:4] = box_a[:,:4]
95 |
96 | prediction[ind_nz[0], ind_nz[1]] = box
97 |
98 | return prediction
99 |
100 |
101 |
102 |
103 | def write(x, batches, results, colors, classes):
104 | c1 = tuple(x[1:3].int())
105 | c2 = tuple(x[3:5].int())
106 | img = results[int(x[0])]
107 | cls = int(x[-1])
108 | label = "{0}".format(classes[cls])
109 | color = random.choice(colors)
110 | cv2.rectangle(img, c1, c2,color, 1)
111 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0]
112 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
113 | cv2.rectangle(img, c1, c2,color, -1)
114 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1);
115 | return img
116 |
--------------------------------------------------------------------------------
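A minimal check of `bbox_iou` above with two corner-format boxes; note the `+ 1` offsets in the implementation, so the value is slightly higher than the textbook IoU:

```python
import torch
from bbox import bbox_iou

# One box per row, in [x1, y1, x2, y2] corner format
a = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
b = torch.tensor([[5.0, 5.0, 15.0, 15.0]])
print(bbox_iou(a, b))  # tensor([0.1748]) = 36 / (121 + 121 - 36), with the +1 offsets
```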
/camera.py:
--------------------------------------------------------------------------------
1 | import torch,cv2,random,os,time
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import numpy as np
5 | import pickle as pkl
6 | import argparse
7 | import threading, queue
8 | from torch.multiprocessing import Pool, Process, set_start_method
9 | from util import write_results, load_classes
10 | from preprocess import letterbox_image
11 | from darknet import Darknet
12 | from imutils.video import WebcamVideoStream,FPS
13 | # from camera import write
14 | import win32com.client as wincl #### Python's Text-to-speech (tts) engine for windows, multiprocessing
15 | speak = wincl.Dispatch("SAPI.SpVoice") #### This initiates the tts engine
16 |
17 | torch.multiprocessing.set_start_method('spawn', force=True)
18 |
19 | ## Setting up torch for gpu utilization
20 | if torch.cuda.is_available():
21 | torch.backends.cudnn.enabled = True
22 | torch.backends.cudnn.benchmark = True
23 | torch.backends.cudnn.deterministic = True
24 | torch.set_default_tensor_type('torch.cuda.FloatTensor')
25 |
26 | def prep_image(img, inp_dim):
27 | """
28 | Prepare image for inputting to the neural network.
29 | Returns a Variable
30 | """
31 | orig_im = img
32 | dim = orig_im.shape[1], orig_im.shape[0]
33 | img = (letterbox_image(orig_im, (inp_dim, inp_dim)))
34 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
35 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
36 | return img_, orig_im, dim
37 |
38 | labels = {}
39 | b_boxes = {}
40 | def write(bboxes, img, classes, colors):
41 | """
42 | Draws the bounding box in every frame over the objects that the model detects
43 | """
44 | class_idx = bboxes
45 | bboxes = bboxes[1:5]
46 | bboxes = bboxes.cpu().data.numpy()
47 | bboxes = bboxes.astype(int)
48 | b_boxes.update({"bbox":bboxes.tolist()})
49 | # bboxes = bboxes + [150,100,200,200] # personal choice you can modify this to get distance as accurate as possible
50 | bboxes = torch.from_numpy(bboxes)
51 | cls = int(class_idx[-1])
52 | label = "{0}".format(classes[cls])
53 | labels.update({"Current Object":label})
54 | color = random.choice(colors)
55 |
56 | ## Put text configuration on frame
57 | text_str = '%s' % (label)
58 | font_face = cv2.FONT_HERSHEY_DUPLEX
59 | font_scale = 0.6
60 | font_thickness = 1
61 | text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0]
62 | text_pt = (bboxes[0], bboxes[1] - 3)
63 | text_color = [255, 255, 255]
64 |
65 |
66 |     ## Distance Measurement for each bounding box
67 | x, y, w, h = bboxes[0], bboxes[1], bboxes[2], bboxes[3]
68 | ## item() is used to retrieve the value from the tensor
69 | distance = (2 * 3.14 * 180) / (w.item()+ h.item() * 360) * 1000 + 3 ### Distance measuring in Inch
70 | feedback = ("{}".format(labels["Current Object"])+ " " +"is"+" at {} ".format(round(distance))+"Inches")
71 |     # # speak.Speak(feedback) # If you are running this on a Linux-based OS, kindly use espeak. Using this speaking library on Windows will add unnecessary latency
72 | print(feedback)
73 |
74 | cv2.putText(img, str("{:.2f} Inches".format(distance)), (text_w+x,y), cv2.FONT_HERSHEY_DUPLEX, font_scale, (0,255,0), font_thickness, cv2.LINE_AA)
75 | cv2.rectangle(img, (bboxes[0],bboxes[1]),(bboxes[2] + text_w -30,bboxes[3]), color, 2)
76 | cv2.putText(img, text_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA)
77 |
78 | return img
79 |
80 | class ObjectDetection:
81 | def __init__(self, id):
82 | # self.cap = cv2.VideoCapture(id)
83 | self.cap = WebcamVideoStream(src = id).start()
84 | self.cfgfile = "cfg/yolov3.cfg"
85 | # self.cfgfile = 'cfg/yolov3-tiny.cfg'
86 | self.weightsfile = "yolov3.weights"
87 | # self.weightsfile = 'yolov3-tiny.weights'
88 | self.confidence = float(0.6)
89 | self.nms_thesh = float(0.8)
90 | self.num_classes = 80
91 | self.classes = load_classes('data/coco.names')
92 | self.colors = pkl.load(open("pallete", "rb"))
93 | self.model = Darknet(self.cfgfile)
94 | self.CUDA = torch.cuda.is_available()
95 | self.model.load_weights(self.weightsfile)
96 | self.model.net_info["height"] = 160
97 | self.inp_dim = int(self.model.net_info["height"])
98 | self.width = 1280 #640#1280
99 | self.height = 720 #360#720
100 | print("Loading network.....")
101 | if self.CUDA:
102 | self.model.cuda()
103 | print("Network successfully loaded")
104 | assert self.inp_dim % 32 == 0
105 | assert self.inp_dim > 32
106 | self.model.eval()
107 |
108 | def main(self):
109 | q = queue.Queue()
110 | while True:
111 | def frame_render(queue_from_cam):
112 |                 frame = self.cap.read() # If you capture the stream using OpenCV (cv2.VideoCapture()) then use the following line instead
113 | # ret, frame = self.cap.read()
114 | frame = cv2.resize(frame,(self.width, self.height))
115 | queue_from_cam.put(frame)
116 | cam = threading.Thread(target=frame_render, args=(q,))
117 | cam.start()
118 | cam.join()
119 | frame = q.get()
120 | q.task_done()
121 | fps = FPS().start()
122 | try:
123 | img, orig_im, dim = prep_image(frame, self.inp_dim)
124 | im_dim = torch.FloatTensor(dim).repeat(1,2)
125 | if self.CUDA: #### If you have a gpu properly installed then it will run on the gpu
126 | im_dim = im_dim.cuda()
127 | img = img.cuda()
128 | # with torch.no_grad(): #### Set the model in the evaluation mode
129 | output = self.model(Variable(img), self.CUDA)
130 | output = write_results(output, self.confidence, self.num_classes, nms = True, nms_conf = self.nms_thesh) #### Localize the objects in a frame
131 | output = output.type(torch.half)
132 |
133 | if list(output.size()) == [1,86]:
134 | print(output.size())
135 | pass
136 | else:
137 | output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(self.inp_dim))/self.inp_dim
138 |
139 | # im_dim = im_dim.repeat(output.size(0), 1)
140 | output[:,[1,3]] *= frame.shape[1]
141 | output[:,[2,4]] *= frame.shape[0]
142 | list(map(lambda boxes: write(boxes, frame, self.classes, self.colors),output))
143 |
144 | except:
145 | pass
146 |
147 | fps.update()
148 | fps.stop()
149 | ret, jpeg = cv2.imencode('.jpg', frame)
150 |             print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
151 | print("[INFO] approx. FPS: {:.1f}".format(fps.fps()))
152 |
153 |             return jpeg.tobytes()  # tostring() is deprecated in newer NumPy
--------------------------------------------------------------------------------
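For reference, the distance estimate inside `write()` above is a monocular heuristic based only on the bounding-box size (the same formula appears in `YOLOv4/tool/utils.py`). A standalone sketch, with a hypothetical helper name but the same constants as the code:

```python
def approx_distance_inches(w: float, h: float) -> float:
    """Rough distance estimate in inches for a bounding box of width w and height h (pixels),
    mirroring camera.py: (2 * pi * 180) / (w + h * 360) * 1000 + 3."""
    return (2 * 3.14 * 180) / (w + h * 360) * 1000 + 3

# A 200 x 400 px box maps to roughly 11 inches under this heuristic
print(round(approx_distance_inches(200, 400)))
```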
/cfg/tiny-yolo-voc.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | batch=64
3 | subdivisions=8
4 | width=416
5 | height=416
6 | channels=3
7 | momentum=0.9
8 | decay=0.0005
9 | angle=0
10 | saturation = 1.5
11 | exposure = 1.5
12 | hue=.1
13 |
14 | learning_rate=0.001
15 | max_batches = 40200
16 | policy=steps
17 | steps=-1,100,20000,30000
18 | scales=.1,10,.1,.1
19 |
20 | [convolutional]
21 | batch_normalize=1
22 | filters=16
23 | size=3
24 | stride=1
25 | pad=1
26 | activation=leaky
27 |
28 | [maxpool]
29 | size=2
30 | stride=2
31 |
32 | [convolutional]
33 | batch_normalize=1
34 | filters=32
35 | size=3
36 | stride=1
37 | pad=1
38 | activation=leaky
39 |
40 | [maxpool]
41 | size=2
42 | stride=2
43 |
44 | [convolutional]
45 | batch_normalize=1
46 | filters=64
47 | size=3
48 | stride=1
49 | pad=1
50 | activation=leaky
51 |
52 | [maxpool]
53 | size=2
54 | stride=2
55 |
56 | [convolutional]
57 | batch_normalize=1
58 | filters=128
59 | size=3
60 | stride=1
61 | pad=1
62 | activation=leaky
63 |
64 | [maxpool]
65 | size=2
66 | stride=2
67 |
68 | [convolutional]
69 | batch_normalize=1
70 | filters=256
71 | size=3
72 | stride=1
73 | pad=1
74 | activation=leaky
75 |
76 | [maxpool]
77 | size=2
78 | stride=2
79 |
80 | [convolutional]
81 | batch_normalize=1
82 | filters=512
83 | size=3
84 | stride=1
85 | pad=1
86 | activation=leaky
87 |
88 | [maxpool]
89 | size=2
90 | stride=1
91 |
92 | [convolutional]
93 | batch_normalize=1
94 | filters=1024
95 | size=3
96 | stride=1
97 | pad=1
98 | activation=leaky
99 |
100 | ###########
101 |
102 | [convolutional]
103 | batch_normalize=1
104 | size=3
105 | stride=1
106 | pad=1
107 | filters=1024
108 | activation=leaky
109 |
110 | [convolutional]
111 | size=1
112 | stride=1
113 | pad=1
114 | filters=125
115 | activation=linear
116 |
117 | [region]
118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52
119 | bias_match=1
120 | classes=20
121 | coords=4
122 | num=5
123 | softmax=1
124 | jitter=.2
125 | rescore=1
126 |
127 | object_scale=5
128 | noobject_scale=1
129 | class_scale=1
130 | coord_scale=1
131 |
132 | absolute=1
133 | thresh = .6
134 | random=1
135 |
--------------------------------------------------------------------------------
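
All of the .cfg files in this directory share the same INI-like Darknet layout: a `[net]` block with input size and training hyper-parameters, followed by one block per layer. A minimal, illustrative parser is sketched below; the repo's darknet.py presumably does something equivalent when it builds the model.

```python
# Illustrative parser for Darknet-style .cfg files such as the one above.
# Each "[section]" line starts a new block; "key=value" lines fill it in.
def parse_cfg(path):
    blocks, block = [], {}
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue                          # skip blanks and comments
            if line.startswith('['):
                if block:
                    blocks.append(block)
                block = {"type": line[1:-1].strip()}
            else:
                key, value = line.split('=', 1)
                block[key.strip()] = value.strip()
    if block:
        blocks.append(block)
    return blocks

# e.g. parse_cfg("cfg/tiny-yolo-voc.cfg")[0]["type"] == "net"
```
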
/cfg/yolo-voc.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=64
4 | subdivisions=8
5 | # Training
6 | # batch=64
7 | # subdivisions=8
8 | height=416
9 | width=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 80200
21 | policy=steps
22 | steps=-1,500,40000,60000
23 | scales=0.1,10,.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=64
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=128
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [convolutional]
58 | batch_normalize=1
59 | filters=64
60 | size=1
61 | stride=1
62 | pad=1
63 | activation=leaky
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=1
70 | pad=1
71 | activation=leaky
72 |
73 | [maxpool]
74 | size=2
75 | stride=2
76 |
77 | [convolutional]
78 | batch_normalize=1
79 | filters=256
80 | size=3
81 | stride=1
82 | pad=1
83 | activation=leaky
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=128
88 | size=1
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=256
96 | size=3
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [maxpool]
102 | size=2
103 | stride=2
104 |
105 | [convolutional]
106 | batch_normalize=1
107 | filters=512
108 | size=3
109 | stride=1
110 | pad=1
111 | activation=leaky
112 |
113 | [convolutional]
114 | batch_normalize=1
115 | filters=256
116 | size=1
117 | stride=1
118 | pad=1
119 | activation=leaky
120 |
121 | [convolutional]
122 | batch_normalize=1
123 | filters=512
124 | size=3
125 | stride=1
126 | pad=1
127 | activation=leaky
128 |
129 | [convolutional]
130 | batch_normalize=1
131 | filters=256
132 | size=1
133 | stride=1
134 | pad=1
135 | activation=leaky
136 |
137 | [convolutional]
138 | batch_normalize=1
139 | filters=512
140 | size=3
141 | stride=1
142 | pad=1
143 | activation=leaky
144 |
145 | [maxpool]
146 | size=2
147 | stride=2
148 |
149 | [convolutional]
150 | batch_normalize=1
151 | filters=1024
152 | size=3
153 | stride=1
154 | pad=1
155 | activation=leaky
156 |
157 | [convolutional]
158 | batch_normalize=1
159 | filters=512
160 | size=1
161 | stride=1
162 | pad=1
163 | activation=leaky
164 |
165 | [convolutional]
166 | batch_normalize=1
167 | filters=1024
168 | size=3
169 | stride=1
170 | pad=1
171 | activation=leaky
172 |
173 | [convolutional]
174 | batch_normalize=1
175 | filters=512
176 | size=1
177 | stride=1
178 | pad=1
179 | activation=leaky
180 |
181 | [convolutional]
182 | batch_normalize=1
183 | filters=1024
184 | size=3
185 | stride=1
186 | pad=1
187 | activation=leaky
188 |
189 |
190 | #######
191 |
192 | [convolutional]
193 | batch_normalize=1
194 | size=3
195 | stride=1
196 | pad=1
197 | filters=1024
198 | activation=leaky
199 |
200 | [convolutional]
201 | batch_normalize=1
202 | size=3
203 | stride=1
204 | pad=1
205 | filters=1024
206 | activation=leaky
207 |
208 | [route]
209 | layers=-9
210 |
211 | [convolutional]
212 | batch_normalize=1
213 | size=1
214 | stride=1
215 | pad=1
216 | filters=64
217 | activation=leaky
218 |
219 | [reorg]
220 | stride=2
221 |
222 | [route]
223 | layers=-1,-4
224 |
225 | [convolutional]
226 | batch_normalize=1
227 | size=3
228 | stride=1
229 | pad=1
230 | filters=1024
231 | activation=leaky
232 |
233 | [convolutional]
234 | size=1
235 | stride=1
236 | pad=1
237 | filters=125
238 | activation=linear
239 |
240 |
241 | [region]
242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071
243 | bias_match=1
244 | classes=20
245 | coords=4
246 | num=5
247 | softmax=1
248 | jitter=.3
249 | rescore=1
250 |
251 | object_scale=5
252 | noobject_scale=1
253 | class_scale=1
254 | coord_scale=1
255 |
256 | absolute=1
257 | thresh = .6
258 | random=1
259 |
--------------------------------------------------------------------------------
/cfg/yolo.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=8
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=64
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=128
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [convolutional]
58 | batch_normalize=1
59 | filters=64
60 | size=1
61 | stride=1
62 | pad=1
63 | activation=leaky
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=1
70 | pad=1
71 | activation=leaky
72 |
73 | [maxpool]
74 | size=2
75 | stride=2
76 |
77 | [convolutional]
78 | batch_normalize=1
79 | filters=256
80 | size=3
81 | stride=1
82 | pad=1
83 | activation=leaky
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=128
88 | size=1
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=256
96 | size=3
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [maxpool]
102 | size=2
103 | stride=2
104 |
105 | [convolutional]
106 | batch_normalize=1
107 | filters=512
108 | size=3
109 | stride=1
110 | pad=1
111 | activation=leaky
112 |
113 | [convolutional]
114 | batch_normalize=1
115 | filters=256
116 | size=1
117 | stride=1
118 | pad=1
119 | activation=leaky
120 |
121 | [convolutional]
122 | batch_normalize=1
123 | filters=512
124 | size=3
125 | stride=1
126 | pad=1
127 | activation=leaky
128 |
129 | [convolutional]
130 | batch_normalize=1
131 | filters=256
132 | size=1
133 | stride=1
134 | pad=1
135 | activation=leaky
136 |
137 | [convolutional]
138 | batch_normalize=1
139 | filters=512
140 | size=3
141 | stride=1
142 | pad=1
143 | activation=leaky
144 |
145 | [maxpool]
146 | size=2
147 | stride=2
148 |
149 | [convolutional]
150 | batch_normalize=1
151 | filters=1024
152 | size=3
153 | stride=1
154 | pad=1
155 | activation=leaky
156 |
157 | [convolutional]
158 | batch_normalize=1
159 | filters=512
160 | size=1
161 | stride=1
162 | pad=1
163 | activation=leaky
164 |
165 | [convolutional]
166 | batch_normalize=1
167 | filters=1024
168 | size=3
169 | stride=1
170 | pad=1
171 | activation=leaky
172 |
173 | [convolutional]
174 | batch_normalize=1
175 | filters=512
176 | size=1
177 | stride=1
178 | pad=1
179 | activation=leaky
180 |
181 | [convolutional]
182 | batch_normalize=1
183 | filters=1024
184 | size=3
185 | stride=1
186 | pad=1
187 | activation=leaky
188 |
189 |
190 | #######
191 |
192 | [convolutional]
193 | batch_normalize=1
194 | size=3
195 | stride=1
196 | pad=1
197 | filters=1024
198 | activation=leaky
199 |
200 | [convolutional]
201 | batch_normalize=1
202 | size=3
203 | stride=1
204 | pad=1
205 | filters=1024
206 | activation=leaky
207 |
208 | [route]
209 | layers=-9
210 |
211 | [convolutional]
212 | batch_normalize=1
213 | size=1
214 | stride=1
215 | pad=1
216 | filters=64
217 | activation=leaky
218 |
219 | [reorg]
220 | stride=2
221 |
222 | [route]
223 | layers=-1,-4
224 |
225 | [convolutional]
226 | batch_normalize=1
227 | size=3
228 | stride=1
229 | pad=1
230 | filters=1024
231 | activation=leaky
232 |
233 | [convolutional]
234 | size=1
235 | stride=1
236 | pad=1
237 | filters=425
238 | activation=linear
239 |
240 |
241 | [region]
242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
243 | bias_match=1
244 | classes=80
245 | coords=4
246 | num=5
247 | softmax=1
248 | jitter=.3
249 | rescore=1
250 |
251 | object_scale=5
252 | noobject_scale=1
253 | class_scale=1
254 | coord_scale=1
255 |
256 | absolute=1
257 | thresh = .6
258 | random=1
259 |
--------------------------------------------------------------------------------
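
The output filter counts that differ between these configs (125, 425, 255) all come from the same rule: the last 1x1 convolution before a detection layer must emit `boxes_per_cell * (5 + classes)` channels, where the 5 covers x, y, w, h and objectness. A quick check against the values in the cfgs:

```python
# Why the detection-layer convolutions have 125 / 425 / 255 filters.
def head_filters(boxes_per_cell, classes):
    return boxes_per_cell * (5 + classes)   # 5 = x, y, w, h, objectness

print(head_filters(5, 20))   # 125 -> tiny-yolo-voc.cfg / yolo-voc.cfg ([region], num=5, 20 VOC classes)
print(head_filters(5, 80))   # 425 -> yolo.cfg ([region], num=5, 80 COCO classes)
print(head_filters(3, 80))   # 255 -> yolov3*.cfg ([yolo], 3 anchors per head, 80 COCO classes)
```
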
/cfg/yolov3-spp.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=255
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=80
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=leaky
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=255
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=80
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=leaky
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=leaky
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=leaky
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=leaky
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=leaky
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=leaky
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=leaky
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=255
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=80
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
823 |
--------------------------------------------------------------------------------
/cfg/yolov3-tiny.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=2
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=16
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=32
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=64
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [maxpool]
58 | size=2
59 | stride=2
60 |
61 | [convolutional]
62 | batch_normalize=1
63 | filters=128
64 | size=3
65 | stride=1
66 | pad=1
67 | activation=leaky
68 |
69 | [maxpool]
70 | size=2
71 | stride=2
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=256
76 | size=3
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [maxpool]
82 | size=2
83 | stride=2
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=512
88 | size=3
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [maxpool]
94 | size=2
95 | stride=1
96 |
97 | [convolutional]
98 | batch_normalize=1
99 | filters=1024
100 | size=3
101 | stride=1
102 | pad=1
103 | activation=leaky
104 |
105 | ###########
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=256
110 | size=1
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=512
118 | size=3
119 | stride=1
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | size=1
125 | stride=1
126 | pad=1
127 | filters=255
128 | activation=linear
129 |
130 |
131 |
132 | [yolo]
133 | mask = 3,4,5
134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
135 | classes=80
136 | num=6
137 | jitter=.3
138 | ignore_thresh = .7
139 | truth_thresh = 1
140 | random=1
141 |
142 | [route]
143 | layers = -4
144 |
145 | [convolutional]
146 | batch_normalize=1
147 | filters=128
148 | size=1
149 | stride=1
150 | pad=1
151 | activation=leaky
152 |
153 | [upsample]
154 | stride=2
155 |
156 | [route]
157 | layers = -1, 8
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [convolutional]
168 | size=1
169 | stride=1
170 | pad=1
171 | filters=255
172 | activation=linear
173 |
174 | [yolo]
175 | mask = 0,1,2
176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
177 | classes=80
178 | num=6
179 | jitter=.3
180 | ignore_thresh = .7
181 | truth_thresh = 1
182 | random=1
183 |
--------------------------------------------------------------------------------
/cfg/yolov3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=16
8 | width= 320
9 | height = 320
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .5
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .5
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .5
787 | truth_thresh = 1
788 | random=1
789 |
790 |
--------------------------------------------------------------------------------
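
yolov3.cfg defines three `[yolo]` heads at strides 32, 16 and 8, each responsible for 3 of the 9 anchors (masks 6,7,8, then 3,4,5, then 0,1,2). With the 320x320 input set in its `[net]` block, the raw number of candidate boxes before confidence thresholding and NMS works out as follows:

```python
# Candidate-box count for cfg/yolov3.cfg at its default 320x320 input.
inp = 320
strides = [32, 16, 8]          # one per [yolo] head, coarsest to finest
anchors_per_head = 3           # each head uses 3 of the 9 anchors (its mask)

total = sum((inp // s) ** 2 * anchors_per_head for s in strides)
print(total)                   # (10*10 + 20*20 + 40*40) * 3 = 6300 boxes
```
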
/data/coco.names:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorbike
5 | aeroplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 |
--------------------------------------------------------------------------------
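
coco.names holds one label per line plus a trailing blank line; `load_classes` in util.py splits on newlines and drops that last empty entry, so the class index produced by the detector maps straight into this list. A quick sanity check, run from the repo root as the detector scripts are:

```python
# Assumes the working directory is the repo root, as the detector scripts do.
from util import load_classes

classes = load_classes('data/coco.names')
print(len(classes))    # 80
print(classes[0])      # person
print(classes[79])     # toothbrush
```
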
/data/voc.names:
--------------------------------------------------------------------------------
1 | aeroplane
2 | bicycle
3 | bird
4 | boat
5 | bottle
6 | bus
7 | car
8 | cat
9 | chair
10 | cow
11 | diningtable
12 | dog
13 | horse
14 | motorbike
15 | person
16 | pottedplant
17 | sheep
18 | sofa
19 | train
20 | tvmonitor
21 |
--------------------------------------------------------------------------------
/object_detection.py:
--------------------------------------------------------------------------------
1 | import torch,cv2,random,os,time
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import numpy as np
5 | import pickle as pkl
6 | import argparse
7 | import threading, queue
8 | from torch.multiprocessing import Pool, Process, set_start_method
9 | from util import write_results, load_classes
10 | from preprocess import letterbox_image
11 | from darknet import Darknet
12 | from imutils.video import WebcamVideoStream,FPS
13 | # from camera import write
14 | import win32com.client as wincl #### Python text-to-speech (TTS) engine for Windows
15 | speak = wincl.Dispatch("SAPI.SpVoice") #### Initialise the TTS engine
16 |
17 | torch.multiprocessing.set_start_method('spawn', force=True)
18 |
19 | ## Setting up torch for gpu utilization
20 | if torch.cuda.is_available():
21 | torch.backends.cudnn.enabled = True
22 | torch.backends.cudnn.benchmark = True
23 | torch.backends.cudnn.deterministic = True
24 | torch.set_default_tensor_type('torch.cuda.FloatTensor')
25 |
26 | def prep_image(img, inp_dim):
27 | """
28 | Prepare image for inputting to the neural network.
29 | Returns a Variable
30 | """
31 | orig_im = img
32 | dim = orig_im.shape[1], orig_im.shape[0]
33 | img = (letterbox_image(orig_im, (inp_dim, inp_dim)))
34 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
35 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
36 | return img_, orig_im, dim
37 |
38 | labels = {}
39 | b_boxes = {}
40 | def write(bboxes, img, classes, colors):
41 | """
42 | Draws the bounding box in every frame over the objects that the model detects
43 | """
44 | class_idx = bboxes
45 | bboxes = bboxes[1:5]
46 | bboxes = bboxes.cpu().data.numpy()
47 | bboxes = bboxes.astype(int)
48 | b_boxes.update({"bbox":bboxes.tolist()})
49 | # bboxes = bboxes + [150,100,200,200] # personal choice you can modify this to get distance as accurate as possible
50 | bboxes = torch.from_numpy(bboxes)
51 | cls = int(class_idx[-1])
52 | label = "{0}".format(classes[cls])
53 | labels.update({"Current Object":label})
54 | color = random.choice(colors)
55 |
56 | ## Put text configuration on frame
57 | text_str = '%s' % (label)
58 | font_face = cv2.FONT_HERSHEY_DUPLEX
59 | font_scale = 0.6
60 | font_thickness = 1
61 | text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0]
62 | text_pt = (bboxes[0], bboxes[1] - 3)
63 | text_color = [255, 255, 255]
64 |
65 |
66 | ## Distance measurement for each bounding box (see the note after this file for a worked example)
67 | x, y, w, h = bboxes[0], bboxes[1], bboxes[2], bboxes[3]
68 | ## item() is used to retrieve the value from the tensor
69 | distance = (2 * 3.14 * 180) / (w.item()+ h.item() * 360) * 1000 + 3 ### Distance estimate in inches
70 | feedback = ("{}".format(labels["Current Object"])+ " " +"is"+" at {} ".format(round(distance))+"Inches")
71 | # # speak.Speak(feedback) # If you are running this on a Linux-based OS, use espeak instead. Calling this speech library on Windows adds unnecessary latency
72 | print(feedback)
73 |
74 | cv2.putText(img, str("{:.2f} Inches".format(distance)), (text_w+x,y), cv2.FONT_HERSHEY_DUPLEX, font_scale, (0,255,0), font_thickness, cv2.LINE_AA)
75 | cv2.rectangle(img, (bboxes[0],bboxes[1]),(bboxes[2] + text_w -30,bboxes[3]), color, 2)
76 | cv2.putText(img, text_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA)
77 |
78 | return img
79 |
80 | class ObjectDetection:
81 | def __init__(self, id):
82 | # self.cap = cv2.VideoCapture(id)
83 | self.cap = WebcamVideoStream(src = id).start()
84 | self.cfgfile = "cfg/yolov3.cfg"
85 | # self.cfgfile = 'cfg/yolov3-tiny.cfg'
86 | self.weightsfile = "yolov3.weights"
87 | # self.weightsfile = 'yolov3-tiny.weights'
88 | self.confidence = float(0.6)
89 | self.nms_thesh = float(0.8)
90 | self.num_classes = 80
91 | self.classes = load_classes('data/coco.names')
92 | self.colors = pkl.load(open("pallete", "rb"))
93 | self.model = Darknet(self.cfgfile)
94 | self.CUDA = torch.cuda.is_available()
95 | self.model.load_weights(self.weightsfile)
96 | self.model.net_info["height"] = 160
97 | self.inp_dim = int(self.model.net_info["height"])
98 | self.width = 1280 #640#1280
99 | self.height = 720 #360#720
100 | print("Loading network.....")
101 | if self.CUDA:
102 | self.model.cuda()
103 | print("Network successfully loaded")
104 | assert self.inp_dim % 32 == 0
105 | assert self.inp_dim > 32
106 | self.model.eval()
107 |
108 | def main(self):
109 | q = queue.Queue()
110 | while True:
111 | def frame_render(queue_from_cam):
112 | frame = self.cap.read() # If you capture the stream with OpenCV (cv2.VideoCapture()), use the following line instead
113 | # ret, frame = self.cap.read()
114 | frame = cv2.resize(frame,(self.width, self.height))
115 | queue_from_cam.put(frame)
116 | cam = threading.Thread(target=frame_render, args=(q,))
117 | cam.start()
118 | cam.join()
119 | frame = q.get()
120 | q.task_done()
121 | fps = FPS().start()
122 | try:
123 | img, orig_im, dim = prep_image(frame, self.inp_dim)
124 | im_dim = torch.FloatTensor(dim).repeat(1,2)
125 | if self.CUDA: #### If a GPU is available, run inference on the GPU
126 | im_dim = im_dim.cuda()
127 | img = img.cuda()
128 | # with torch.no_grad(): #### Disables gradient tracking during inference (the model is already in eval mode)
129 | output = self.model(Variable(img), self.CUDA)
130 | output = write_results(output, self.confidence, self.num_classes, nms = True, nms_conf = self.nms_thesh) #### Localize the objects in a frame
131 | output = output.type(torch.half)
132 |
133 | if list(output.size()) == [1,86]:
134 | print(output.size())
135 | pass
136 | else:
137 | output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(self.inp_dim))/self.inp_dim
138 |
139 | # im_dim = im_dim.repeat(output.size(0), 1)
140 | output[:,[1,3]] *= frame.shape[1]
141 | output[:,[2,4]] *= frame.shape[0]
142 | list(map(lambda boxes: write(boxes, frame, self.classes, self.colors),output))
143 |
144 | except:
145 | pass
146 |
147 | fps.update()
148 | fps.stop()
149 | print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
150 | print("[INFO] approx. FPS: {:.1f}".format(fps.fps()))
151 | cv2.imshow("Object Detection Window", frame)
152 |
153 | if cv2.waitKey(1) & 0xFF == ord('q'):
154 | break
155 | continue
156 | torch.cuda.empty_cache()
157 |
158 |
159 | if __name__ == "__main__":
160 | id = 0
161 | ObjectDetection(id).main()
162 |
--------------------------------------------------------------------------------
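
The distance printed by `write()` above comes from a single-formula heuristic on the box values the detector returns: `(2 * 3.14 * 180) / (w + h * 360) * 1000 + 3`, reported in inches. As the commented-out box offset in `write()` suggests, the constants are tuned by hand for this setup rather than derived from a calibrated camera model, so expect to re-tune them for a different camera or resolution. A standalone restatement with one worked number:

```python
# Standalone restatement of the distance heuristic used in write() above.
# w and h are the third and fourth box values the caller passes in; the
# constants are the repo's hand-tuned values, not a calibrated camera model.
def estimate_distance_inches(w: float, h: float) -> float:
    return (2 * 3.14 * 180) / (w + h * 360) * 1000 + 3

# Worked example with w = 200, h = 300:
#   denominator = 200 + 300 * 360 = 108200
#   1130.4 / 108200 * 1000 ≈ 10.45, plus 3 ≈ 13.4 inches
print(round(estimate_distance_inches(200, 300), 1))   # 13.4
```
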
/pallete:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paul-pias/Object-Detection-and-Distance-Measurement/d03baa0d99626190c87fccdd75fbc67ce8d176f8/pallete
--------------------------------------------------------------------------------
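
`pallete` is a small binary asset (only its raw URL appears above): a pickled collection of colours that `pkl.load(open("pallete", "rb"))` supplies to `random.choice()` for the box colours. If the file is ever missing, a drop-in substitute can be generated; the count and colour values below are arbitrary choices, not what the original file contains.

```python
# Generate a substitute colour palette compatible with
# pkl.load(open("pallete", "rb")) in object_detection.py.
# 100 colours is an arbitrary choice; the original file differs.
import pickle as pkl
import random

palette = [tuple(random.randint(0, 255) for _ in range(3)) for _ in range(100)]
with open("pallete", "wb") as f:
    pkl.dump(palette, f)
```
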
/preprocess.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.autograd import Variable
7 | import numpy as np
8 | import cv2
9 | import matplotlib.pyplot as plt
10 | from util import count_parameters as count
11 | from util import convert2cpu as cpu
12 | from PIL import Image, ImageDraw
13 |
14 |
15 | def letterbox_image(img, inp_dim):
16 | '''resize image with unchanged aspect ratio using padding'''
17 | img_w, img_h = img.shape[1], img.shape[0]
18 | w, h = inp_dim
19 | new_w = int(img_w * min(w/img_w, h/img_h))
20 | new_h = int(img_h * min(w/img_w, h/img_h))
21 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC)
22 |
23 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)
24 |
25 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image
26 |
27 | return canvas
28 |
29 |
30 |
31 | def prep_image(img, inp_dim):
32 | """
33 | Prepare image for inputting to the neural network.
34 |
35 | Returns a Variable
36 | """
37 |
38 | orig_im = cv2.imread(img)
39 | dim = orig_im.shape[1], orig_im.shape[0]
40 | img = (letterbox_image(orig_im, (inp_dim, inp_dim)))
41 | img_ = img[:,:,::-1].transpose((2,0,1)).copy()
42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
43 | return img_, orig_im, dim
44 |
45 | def prep_image_pil(img, network_dim):
46 | orig_im = Image.open(img)
47 | img = orig_im.convert('RGB')
48 | dim = img.size
49 | img = img.resize(network_dim)
50 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes()))
51 | img = img.view(*network_dim, 3).transpose(0,1).transpose(0,2).contiguous()
52 | img = img.view(1, 3,*network_dim)
53 | img = img.float().div(255.0)
54 | return (img, orig_im, dim)
55 |
56 | def inp_to_image(inp):
57 | inp = inp.cpu().squeeze()
58 | inp = inp*255
59 | try:
60 | inp = inp.data.numpy()
61 | except RuntimeError:
62 | inp = inp.numpy()
63 | inp = inp.transpose(1,2,0)
64 |
65 | inp = inp[:,:,::-1]
66 | return inp
67 |
68 |
69 |
--------------------------------------------------------------------------------
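
`letterbox_image` preserves the aspect ratio: the frame is scaled by `min(w/img_w, h/img_h)` and pasted onto a square canvas filled with grey (128). For the 1280x720 frames and the 160-pixel network size used in object_detection.py, the arithmetic comes out as follows:

```python
# Arithmetic behind letterbox_image() for the defaults in object_detection.py:
# a 1280x720 frame letterboxed into a 160x160 network input (inp_dim = 160).
img_w, img_h = 1280, 720
w = h = 160

scale = min(w / img_w, h / img_h)      # min(0.125, 0.222...) = 0.125
new_w, new_h = int(img_w * scale), int(img_h * scale)
print(new_w, new_h)                    # 160 90  -> resized frame

top = (h - new_h) // 2                 # 35 rows of grey padding above (and 35 below)
print(top, top + new_h)                # 35 125 -> rows the frame occupies on the canvas
```
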
/requirements.txt:
--------------------------------------------------------------------------------
1 | opencv-python==4.1.0.25
2 | numpy==1.17.0
3 | pandas==0.25.1
4 | torch_nightly==1.2.0.dev20190807+cpu
5 | matplotlib==3.1.1
6 | Pillow>=7.1.0
7 | torch==1.2.0
8 | imutils
9 |
--------------------------------------------------------------------------------
/templates/12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paul-pias/Object-Detection-and-Distance-Measurement/d03baa0d99626190c87fccdd75fbc67ce8d176f8/templates/12.jpg
--------------------------------------------------------------------------------
/templates/base.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
31 |
32 |
33 |
34 |
35 | {% block content %}
36 |
37 | {% endblock %}
38 |
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %} {% block content %}
2 |
3 |
4 |
5 |
6 |
7 |
8 | Camera - 01
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | {% endblock %}
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import division
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import numpy as np
9 | import cv2
10 | import matplotlib.pyplot as plt
11 | from bbox import bbox_iou
12 |
13 | def count_parameters(model):
14 | return sum(p.numel() for p in model.parameters())
15 |
16 | def count_learnable_parameters(model):
17 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
18 |
19 | def convert2cpu(matrix):
20 | if matrix.is_cuda:
21 | return torch.FloatTensor(matrix.size()).copy_(matrix)
22 | else:
23 | return matrix
24 |
25 | def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True):
26 | batch_size = prediction.size(0)
27 | stride = inp_dim // prediction.size(2)
28 | grid_size = inp_dim // stride
29 | bbox_attrs = 5 + num_classes
30 | num_anchors = len(anchors)
31 |
32 | anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
33 |
34 |
35 |
36 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
37 | prediction = prediction.transpose(1,2).contiguous()
38 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
39 |
40 |
41 | #Sigmoid the centre_X, centre_Y, and object confidence
42 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0])
43 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1])
44 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4])
45 |
46 |
47 |
48 | #Add the center offsets
49 | grid_len = np.arange(grid_size)
50 | a,b = np.meshgrid(grid_len, grid_len)
51 |
52 | x_offset = torch.FloatTensor(a).view(-1,1)
53 | y_offset = torch.FloatTensor(b).view(-1,1)
54 |
55 | if CUDA:
56 | x_offset = x_offset.cuda()
57 | y_offset = y_offset.cuda()
58 |
59 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0)
60 |
61 | prediction[:,:,:2] += x_y_offset
62 |
63 | #log space transform height and the width
64 | anchors = torch.FloatTensor(anchors)
65 |
66 | if CUDA:
67 | anchors = anchors.cuda()
68 |
69 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
70 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors
71 |
72 | #Sigmoid the class scores (independent logistic classifiers rather than a softmax)
73 | prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes]))
74 |
75 | prediction[:,:,:4] *= stride
76 |
77 |
78 | return prediction
79 |
80 | def load_classes(namesfile):
81 | fp = open(namesfile, "r")
82 | names = fp.read().split("\n")[:-1]
83 | return names
84 |
85 | def get_im_dim(im):
86 | im = cv2.imread(im)
87 | w,h = im.shape[1], im.shape[0]
88 | return w,h
89 |
90 | def unique(tensor):
91 | tensor_np = tensor.cpu().numpy()
92 | unique_np = np.unique(tensor_np)
93 | unique_tensor = torch.from_numpy(unique_np)
94 |
95 | tensor_res = tensor.new(unique_tensor.shape)
96 | tensor_res.copy_(unique_tensor)
97 | return tensor_res
98 |
99 | def write_results(prediction, confidence, num_classes, nms = True, nms_conf = 0.4):
100 | conf_mask = (prediction[:,:,4] > confidence).float().unsqueeze(2)
101 | prediction = prediction*conf_mask
102 |
103 |
104 | try:
105 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous()
106 | except:
107 | return 0
108 |
109 |
110 | box_a = prediction.new(prediction.shape)
111 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
112 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
113 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
114 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
115 | prediction[:,:,:4] = box_a[:,:,:4]
116 |
117 |
118 |
119 | batch_size = prediction.size(0)
120 |
121 | output = prediction.new(1, prediction.size(2) + 1)
122 | write = False
123 |
124 |
125 | for ind in range(batch_size):
126 | #select the image from the batch
127 | image_pred = prediction[ind]
128 |
129 |
130 |
131 | #Get the class with the maximum score and the index of that class,
132 | #then replace the num_classes per-class scores with just that
133 | #maximum score and its class index
134 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1)
135 | max_conf = max_conf.float().unsqueeze(1)
136 | max_conf_score = max_conf_score.float().unsqueeze(1)
137 | seq = (image_pred[:,:5], max_conf, max_conf_score)
138 | image_pred = torch.cat(seq, 1)
139 |
140 |
141 |
142 | #Get rid of the zero entries
143 | non_zero_ind = (torch.nonzero(image_pred[:,4]))
144 |
145 |
146 | image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7)
147 |
148 | #Get the various classes detected in the image
149 | try:
150 | img_classes = unique(image_pred_[:,-1])
151 | except:
152 | continue
153 | #We will do NMS class-wise
154 | for cls in img_classes:
155 | #get the detections with one particular class
156 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1)
157 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze()
158 |
159 |
160 | image_pred_class = image_pred_[class_mask_ind].view(-1,7)
161 |
162 |
163 |
164 | #sort the detections such that the entry with the maximum objectness
165 | #confidence is at the top
166 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1]
167 | image_pred_class = image_pred_class[conf_sort_index]
168 | idx = image_pred_class.size(0)
169 |
170 | #if nms has to be done
171 | if nms:
172 | #For each detection
173 | for i in range(idx):
174 | #Get the IOUs of all boxes that come after the one we are looking at
175 | #in the loop
176 | try:
177 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
178 | except ValueError:
179 | break
180 |
181 | except IndexError:
182 | break
183 |
184 | #Zero out all the detections that have IoU > threshold
185 | iou_mask = (ious < nms_conf).float().unsqueeze(1)
186 | image_pred_class[i+1:] *= iou_mask
187 |
188 | #Remove the zeroed-out entries
189 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze()
190 | image_pred_class = image_pred_class[non_zero_ind].view(-1,7)
191 |
192 |
193 |
194 | #Concatenate the batch_id of the image to the detection;
195 | #this tells us which image each detection corresponds to.
196 | #We use a flat structure to hold ALL the detections from the batch:
197 | #the batch dimension is flattened and each image in the
198 | #batch is identified by the extra batch column
199 |
200 |
201 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
202 | seq = batch_ind, image_pred_class
203 | if not write:
204 | output = torch.cat(seq,1)
205 | write = True
206 | else:
207 | out = torch.cat(seq,1)
208 | output = torch.cat((output,out))
209 |
210 | return output
211 |
212 | #!/usr/bin/env python3
213 | # -*- coding: utf-8 -*-
214 | """
215 | Created on Sat Mar 24 00:12:16 2018
216 |
217 | @author: ayooshmac
218 | """
219 |
220 | def predict_transform_half(prediction, inp_dim, anchors, num_classes, CUDA = True):
221 | batch_size = prediction.size(0)
222 | stride = inp_dim // prediction.size(2)
223 |
224 | bbox_attrs = 5 + num_classes
225 | num_anchors = len(anchors)
226 | grid_size = inp_dim // stride
227 |
228 |
229 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
230 | prediction = prediction.transpose(1,2).contiguous()
231 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
232 |
233 |
234 | #Sigmoid the centre_X, centre_Y, and object confidence
235 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0])
236 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1])
237 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4])
238 |
239 |
240 | #Add the center offsets
241 | grid_len = np.arange(grid_size)
242 | a,b = np.meshgrid(grid_len, grid_len)
243 |
244 | x_offset = torch.FloatTensor(a).view(-1,1)
245 | y_offset = torch.FloatTensor(b).view(-1,1)
246 |
247 | if CUDA:
248 | x_offset = x_offset.cuda().half()
249 | y_offset = y_offset.cuda().half()
250 |
251 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0)
252 |
253 | prediction[:,:,:2] += x_y_offset
254 |
255 | #log-space transform of the height and width
256 | anchors = torch.HalfTensor(anchors)
257 |
258 | if CUDA:
259 | anchors = anchors.cuda()
260 |
261 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
262 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors
263 |
264 | #Softmax the class scores
265 | prediction[:,:,5: 5 + num_classes] = nn.Softmax(-1)(Variable(prediction[:,:, 5 : 5 + num_classes])).data
266 |
267 | prediction[:,:,:4] *= stride
268 |
269 |
270 | return prediction
271 |
272 |
273 | def write_results_half(prediction, confidence, num_classes, nms = True, nms_conf = 0.4):
274 | conf_mask = (prediction[:,:,4] > confidence).half().unsqueeze(2)
275 | prediction = prediction*conf_mask
276 |
277 | try:
278 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous()
279 | except:
280 | return 0
281 |
282 |
283 |
284 | box_a = prediction.new(prediction.shape)
285 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
286 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
287 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
288 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
289 | prediction[:,:,:4] = box_a[:,:,:4]
290 |
291 |
292 |
293 | batch_size = prediction.size(0)
294 |
295 | output = prediction.new(1, prediction.size(2) + 1)
296 | write = False
297 |
298 | for ind in range(batch_size):
299 | #select the image from the batch
300 | image_pred = prediction[ind]
301 |
302 |
303 | #Get the highest class score and the index of that class for each box
304 | #(torch.max returns the scores first, then the class indices),
305 | #then replace the num_classes per-class scores with just those two columns
306 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1)
307 | max_conf = max_conf.half().unsqueeze(1)
308 | max_conf_score = max_conf_score.half().unsqueeze(1)
309 | seq = (image_pred[:,:5], max_conf, max_conf_score)
310 | image_pred = torch.cat(seq, 1)
311 |
312 |
313 | #Get rid of the zero entries
314 | non_zero_ind = (torch.nonzero(image_pred[:,4]))
315 | try:
316 | image_pred_ = image_pred[non_zero_ind.squeeze(),:]
317 | except:
318 | continue
319 |
320 | #Get the various classes detected in the image
321 | img_classes = unique(image_pred_[:,-1].long()).half()
322 |
323 |
324 |
325 |
326 | #We will do NMS class-wise
327 | for cls in img_classes:
328 | #get the detections with one particular class
329 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).half().unsqueeze(1)
330 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze()
331 |
332 |
333 | image_pred_class = image_pred_[class_mask_ind]
334 |
335 |
336 | #sort the detections such that the entry with the maximum objectness
337 | #confidence is at the top
338 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1]
339 | image_pred_class = image_pred_class[conf_sort_index]
340 | idx = image_pred_class.size(0)
341 |
342 | #if nms has to be done
343 | if nms:
344 | #For each detection
345 | for i in range(idx):
346 | #Get the IOUs of all boxes that come after the one we are looking at
347 | #in the loop
348 | try:
349 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
350 | except ValueError:
351 | break
352 |
353 | except IndexError:
354 | break
355 |
356 | #Zero out all the detections that have IoU > threshold
357 | iou_mask = (ious < nms_conf).half().unsqueeze(1)
358 | image_pred_class[i+1:] *= iou_mask
359 |
360 | #Remove the zeroed-out entries
361 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze()
362 | image_pred_class = image_pred_class[non_zero_ind]
363 |
364 |
365 |
366 | #Concatenate the batch_id of the image to the detection;
367 | #this tells us which image each detection corresponds to.
368 | #We use a flat structure to hold ALL the detections from the batch:
369 | #the batch dimension is flattened and each image in the
370 | #batch is identified by the extra batch column
371 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
372 | seq = batch_ind, image_pred_class
373 |
374 | if not write:
375 | output = torch.cat(seq,1)
376 | write = True
377 | else:
378 | out = torch.cat(seq,1)
379 | output = torch.cat((output,out))
380 |
381 | return output
382 |
--------------------------------------------------------------------------------
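
A minimal usage sketch of the helpers above (not a file from this repo): it assumes the top-level darknet.py exposes the pytorch-yolo-v3 style Darknet class with load_weights(), that util.py is importable as util, and that a yolov3.weights file has been downloaded locally. The 416x416 input size and the 0.5/0.4 thresholds are illustrative choices, not values fixed by the repo.

import cv2
import torch
from darknet import Darknet                   # repo's top-level darknet.py (assumed interface)
from util import load_classes, write_results  # the helpers shown above

model = Darknet("cfg/yolov3.cfg")
model.load_weights("yolov3.weights")          # assumed local weights file
model.eval()
classes = load_classes("data/coco.names")

# BGR -> RGB, HWC -> CHW, scale to [0, 1], add a batch dimension
img = cv2.resize(cv2.imread("templates/12.jpg"), (416, 416))
inp = torch.from_numpy(img[:, :, ::-1].transpose(2, 0, 1).copy()).float().div(255.0).unsqueeze(0)

with torch.no_grad():
    prediction = model(inp, False)            # CUDA flag kept False for a CPU-only run

# keep boxes with objectness > 0.5, then class-wise NMS at IoU 0.4
dets = write_results(prediction, 0.5, len(classes), nms=True, nms_conf=0.4)
if isinstance(dets, int):                     # write_results returns 0 when nothing survives
    print("no detections")
else:
    for det in dets:                          # [batch, x1, y1, x2, y2, objectness, class score, class idx]
        print(classes[int(det[-1])], float(det[5]))

Each row of the returned tensor carries the flattened batch index in its first column, which is how detections are mapped back to their source image when batching.
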
/utils/app_utils.py:
--------------------------------------------------------------------------------
1 | # import the necessary packages
2 | from threading import Thread
3 | import datetime
4 | import cv2
5 |
6 | class FPS:
7 | def __init__(self):
8 | # store the start time, end time, and total number of frames
9 | # that were examined between the start and end intervals
10 | self._start = None
11 | self._end = None
12 | self._numFrames = 0
13 |
14 | def start(self):
15 | # start the timer
16 | self._start = datetime.datetime.now()
17 | return self
18 |
19 | def stop(self):
20 | # stop the timer
21 | self._end = datetime.datetime.now()
22 |
23 | def update(self):
24 | # increment the total number of frames examined between the
25 | # start and end intervals
26 | self._numFrames += 1
27 |
28 | def elapsed(self):
29 | # return the total number of seconds between the start and
30 | # end interval
31 | return (self._end - self._start).total_seconds()
32 |
33 | def fps(self):
34 | # compute the (approximate) frames per second
35 | return self._numFrames / self.elapsed()
36 |
37 |
38 | class WebcamVideoStream:
39 | def __init__(self, src=0):
40 | # initialize the video camera stream and read the first frame
41 | # from the stream
42 | self.stream = cv2.VideoCapture(src)
43 | (self.grabbed, self.frame) = self.stream.read()
44 |
45 | # initialize the variable used to indicate if the thread should
46 | # be stopped
47 | self.stopped = False
48 |
49 | def start(self):
50 | # start the thread to read frames from the video stream
51 | Thread(target=self.update, args=()).start()
52 | return self
53 |
54 | def update(self):
55 | # keep looping infinitely until the thread is stopped
56 | while True:
57 | # if the thread indicator variable is set, stop the thread
58 | if self.stopped:
59 | return
60 |
61 | # otherwise, read the next frame from the stream
62 | (self.grabbed, self.frame) = self.stream.read()
63 |
64 | def read(self):
65 | # return the frame most recently read
66 | return self.grabbed, self.frame
67 |
68 | def stop(self):
69 | # indicate that the thread should be stopped
70 | self.stopped = True
71 |
72 | def getWidth(self):
73 | # Get the width of the frames
74 | return int(self.stream.get(cv2.CAP_PROP_FRAME_WIDTH))
75 |
76 | def getHeight(self):
77 | # Get the height of the frames
78 | return int(self.stream.get(cv2.CAP_PROP_FRAME_HEIGHT))
79 |
80 | def getFPS(self):
81 | # Get the frame rate of the video stream
82 | return int(self.stream.get(cv2.CAP_PROP_FPS))
83 |
84 | def isOpen(self):
85 | # Check whether the video stream is open
86 | return self.stream.isOpened()
87 |
88 | def setFramePosition(self, framePos):
89 | self.stream.set(cv2.CAP_PROP_POS_FRAMES, framePos)
90 |
91 | def getFramePosition(self):
92 | return int(self.stream.get(cv2.CAP_PROP_POS_FRAMES))
93 |
94 | def getFrameCount(self):
95 | return int(self.stream.get(cv2.CAP_PROP_FRAME_COUNT))
96 |
--------------------------------------------------------------------------------
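
A minimal sketch (not a file from this repo) of how the two classes above are meant to be combined: the threaded reader keeps grabbing frames in the background while the main loop displays them and measures throughput. The camera index 0, the window name, and the "q" quit key are assumptions.

import cv2
from utils.app_utils import FPS, WebcamVideoStream

stream = WebcamVideoStream(src=0).start()     # background thread starts reading frames
fps = FPS().start()

while stream.isOpen():
    grabbed, frame = stream.read()            # returns the most recently grabbed frame
    if not grabbed or frame is None:
        break
    cv2.imshow("preview", frame)
    fps.update()
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

fps.stop()
stream.stop()
cv2.destroyAllWindows()
print("approx. FPS: {:.2f}".format(fps.fps()))
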
/utils/objDet_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from utils.app_utils import *
3 | import numpy as np
4 | import tensorflow as tf
5 | import sys
6 |
7 | sys.path.append("C:\\Users\\okaya\\Documents\\tensorflow\\models\\research\\")
8 | sys.path.append("C:\\Users\\okaya\\Documents\\tensorflow\\models\\research\\object_detection\\utils")
9 |
10 | from object_detection.utils import label_map_util
11 | from object_detection.utils import visualization_utils as vis_util
12 |
13 |
14 | # Path to frozen detection graph. This is the actual model that is used for the object detection.
15 | PATH_TO_CKPT = 'model/frozen_inference_graph.pb'
16 |
17 | # List of the strings that are used to add the correct label to each box.
18 | PATH_TO_LABELS = 'model/mscoco_label_map.pbtxt'
19 |
20 | NUM_CLASSES = 90
21 |
22 | # Loading label map
23 | label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
24 | categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES,
25 | use_display_name=True)
26 | category_index = label_map_util.create_category_index(categories)
27 |
28 | def detect_objects(image_np, sess, detection_graph):
29 | # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
30 | image_np_expanded = np.expand_dims(image_np, axis=0)
31 | image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
32 |
33 | # Each box represents a part of the image where a particular object was detected.
34 | boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
35 |
36 | # Each score represents the level of confidence for each of the objects.
37 | # Score is shown on the result image, together with the class label.
38 | scores = detection_graph.get_tensor_by_name('detection_scores:0')
39 | classes = detection_graph.get_tensor_by_name('detection_classes:0')
40 | num_detections = detection_graph.get_tensor_by_name('num_detections:0')
41 |
42 | # Actual detection.
43 | (boxes, scores, classes, num_detections) = sess.run(
44 | [boxes, scores, classes, num_detections],
45 | feed_dict={image_tensor: image_np_expanded})
46 |
47 | # Visualization of the results of a detection.
48 | vis_util.visualize_boxes_and_labels_on_image_array(
49 | image_np,
50 | np.squeeze(boxes),
51 | np.squeeze(classes).astype(np.int32),
52 | np.squeeze(scores),
53 | category_index,
54 | use_normalized_coordinates=True,
55 | line_thickness=4)
56 |
57 | return image_np
58 |
59 |
60 |
61 | def worker(input_q, output_q):
62 | # Load a (frozen) Tensorflow model into memory.
63 | detection_graph = tf.Graph()
64 | with detection_graph.as_default():
65 | od_graph_def = tf.GraphDef()
66 | with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
67 | serialized_graph = fid.read()
68 | od_graph_def.ParseFromString(serialized_graph)
69 | tf.import_graph_def(od_graph_def, name='')
70 | sess = tf.Session(graph=detection_graph)
71 |
72 | fps = FPS().start()
73 | while True:
74 | fps.update()
75 | frame = input_q.get()
76 |
77 | # Check whether the item is a (frame number, frame) pair (video file) or a bare frame (webcam)
78 | if len(frame) == 2:
79 | frame_rgb = cv2.cvtColor(frame[1], cv2.COLOR_BGR2RGB)
80 | output_q.put((frame[0], detect_objects(frame_rgb, sess, detection_graph)))
81 | else:
82 | frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
83 | output_q.put(detect_objects(frame_rgb, sess, detection_graph))
84 | fps.stop()
85 | sess.close()
86 |
--------------------------------------------------------------------------------
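
A minimal sketch (not a file from this repo) of how worker() is typically wired to a pair of multiprocessing queues, with the threaded webcam reader from app_utils feeding frames in and annotated frames coming back out. It assumes the TensorFlow 1.x object_detection paths configured at the top of objDet_utils.py are valid on the machine; the queue sizes, the single worker process, and the "q" quit key are also assumptions.

import cv2
from multiprocessing import Process, Queue
from utils.app_utils import WebcamVideoStream
from utils.objDet_utils import worker

if __name__ == "__main__":
    input_q = Queue(maxsize=5)
    output_q = Queue(maxsize=5)

    # the worker loads the frozen graph once, then keeps serving frames
    Process(target=worker, args=(input_q, output_q), daemon=True).start()

    stream = WebcamVideoStream(src=0).start()
    while True:
        grabbed, frame = stream.read()
        if not grabbed or frame is None:
            break
        input_q.put(frame)                    # bare frame -> the webcam branch inside worker()
        annotated = output_q.get()            # RGB image with boxes and labels drawn
        cv2.imshow("detections", cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    stream.stop()
    cv2.destroyAllWindows()
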