├── DNModel.py ├── LICENSE ├── README.md ├── cfg ├── tiny-yolo-voc.cfg ├── yolo-voc.cfg ├── yolo.cfg └── yolov3.cfg ├── data ├── coco.names └── voc.names ├── detect.py ├── detect_video.py ├── images ├── 1480611559-palm-beach-home-living-room.jpg ├── 62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg ├── dog.jpg ├── eagle.jpg ├── giraffe.jpg ├── herd_of_horses.jpg ├── img1.jpg ├── img2.jpg ├── img3.jpg ├── img4.jpg ├── messi.jpg └── person.jpg ├── img_process.py ├── pallete ├── result ├── det_1480611559-palm-beach-home-living-room.jpg ├── det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg ├── det_dog.jpg ├── det_eagle.jpg ├── det_giraffe.jpg ├── det_herd_of_horses.jpg ├── det_img1.jpg ├── det_img2.jpg ├── det_img3.jpg ├── det_img4.jpg ├── det_messi.jpg └── det_person.jpg └── util.py /DNModel.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | 10 | 11 | 12 | class dummyLayer(nn.Module): 13 | def __init__(self): 14 | super(dummyLayer, self).__init__() 15 | 16 | 17 | class detector(nn.Module): 18 | def __init__(self, anchors): 19 | super(detector, self).__init__() 20 | self.anchors = anchors 21 | 22 | 23 | def construct_cfg(configFile): 24 | ''' 25 | Build the network blocks using the configuration file. 26 | Pre-process it to form easy to manupulate using pytorch. 27 | ''' 28 | 29 | # Read and pre-process the configuration file 30 | 31 | config = open(configFile,'r') 32 | file = config.read().split('\n') 33 | 34 | file = [line for line in file if len(line) > 0 and line[0]!= '#'] 35 | file = [line.lstrip().rstrip() for line in file] 36 | 37 | 38 | #Separate network blocks in a list 39 | 40 | networkBlocks = [] 41 | networkBlock = {} 42 | 43 | for x in file: 44 | if x[0] == '[': 45 | if len(networkBlock) != 0: 46 | networkBlocks.append(networkBlock) 47 | networkBlock = {} 48 | networkBlock["type"] = x[1:-1].rstrip() 49 | else: 50 | entity , value = x.split('=') 51 | networkBlock[entity.rstrip()] = value.lstrip() 52 | networkBlocks.append(networkBlock) 53 | 54 | return networkBlocks 55 | 56 | 57 | def buildNetwork(networkBlocks): 58 | DNInfo = networkBlocks[0] 59 | modules = nn.ModuleList([]) 60 | channels = 3 61 | filterTracker = [] 62 | 63 | for i,x in enumerate(networkBlocks[1:]): 64 | seqModule = nn.Sequential() 65 | if (x["type"] == "convolutional"): 66 | 67 | filters= int(x["filters"]) 68 | pad = int(x["pad"]) 69 | kernelSize = int(x["size"]) 70 | stride = int(x["stride"]) 71 | 72 | if pad: 73 | padding = (kernelSize - 1) // 2 74 | else: 75 | padding = 0 76 | 77 | activation = x["activation"] 78 | try: 79 | bn = int(x["batch_normalize"]) 80 | bias = False 81 | except: 82 | bn = 0 83 | bias = True 84 | 85 | conv = nn.Conv2d(channels, filters, kernelSize, stride, padding, bias = bias) 86 | seqModule.add_module("conv_{0}".format(i), conv) 87 | 88 | if bn: 89 | bn = nn.BatchNorm2d(filters) 90 | seqModule.add_module("batch_norm_{0}".format(i), bn) 91 | 92 | if activation == "leaky": 93 | activn = nn.LeakyReLU(0.1, inplace = True) 94 | seqModule.add_module("leaky_{0}".format(i), activn) 95 | 96 | 97 | elif (x["type"] == "upsample"): 98 | upsample = nn.Upsample(scale_factor = 2, mode = "bilinear") 99 | seqModule.add_module("upsample_{}".format(i), upsample) 100 | 101 | elif (x["type"] == "route"): 102 | x['layers'] = x["layers"].split(',') 103 | start = int(x['layers'][0]) 104 | try: 
105 | end = int(x['layers'][1]) 106 | except: 107 | end =0 108 | 109 | if start > 0: 110 | start = start - i 111 | if end > 0: 112 | end = end - i 113 | 114 | route = dummyLayer() 115 | seqModule.add_module("route_{0}".format(i),route) 116 | if end < 0: 117 | filters = filterTracker[i+start] + filterTracker[i+end] 118 | else: 119 | filters = filterTracker[i+start] 120 | elif (x["type"] == "shortcut"): 121 | shortcut = dummyLayer() 122 | seqModule.add_module("shortcut_{0}".format(i),shortcut) 123 | elif (x["type"] == "yolo"): 124 | anchors = x["anchors"].split(',') 125 | anchors = [int(a) for a in anchors] 126 | masks = x["mask"].split(',') 127 | masks = [int(a) for a in masks] 128 | anchors = [(anchors[j],anchors[j+1]) for j in range(0,len(anchors),2)] 129 | anchors = [anchors[j] for j in masks] 130 | detectorLayer = detector(anchors) 131 | 132 | seqModule.add_module("Detection_{0}".format(i),detectorLayer) 133 | 134 | modules.append(seqModule) 135 | channels = filters 136 | filterTracker.append(filters) 137 | return (DNInfo, modules) 138 | 139 | 140 | 141 | class net(nn.Module): 142 | def __init__(self, cfgfile): 143 | super(net, self).__init__() 144 | self.netBlocks = construct_cfg(cfgfile) 145 | self.DNInfo, self.moduleList = buildNetwork(self.netBlocks) 146 | self.header = torch.IntTensor([0,0,0,0]) 147 | self.seen = 0 148 | 149 | def forward(self, x, CUDA): 150 | detections = [] 151 | modules = self.netBlocks[1:] 152 | layerOutputs = {} 153 | 154 | 155 | written_output = 0 156 | #Iterate through each module 157 | for i in range(len(modules)): 158 | 159 | module_type = (modules[i]["type"]) 160 | #Convolutional and upsample layers are applied directly 161 | if module_type == "convolutional" or module_type == "upsample" : 162 | 163 | x = self.moduleList[i](x) 164 | layerOutputs[i] = x 165 | 166 | #Concatenate outputs from earlier layers at this layer 167 | elif module_type == "route": 168 | layers = modules[i]["layers"] 169 | layers = [int(a) for a in layers] 170 | 171 | #If the absolute layer index is given instead of an offset relative to the current layer 172 | if (layers[0]) > 0: 173 | layers[0] = layers[0] - i 174 | 175 | if len(layers) == 1: 176 | x = layerOutputs[i + (layers[0])] 177 | 178 | else: 179 | #If the absolute layer index is given instead of an offset relative to the current layer 180 | if (layers[1]) > 0: 181 | layers[1] = layers[1] - i 182 | 183 | map1 = layerOutputs[i + layers[0]] 184 | map2 = layerOutputs[i + layers[1]] 185 | 186 | 187 | x = torch.cat((map1, map2), 1) 188 | layerOutputs[i] = x 189 | 190 | #Shortcut is a residual connection, as in ResNet 191 | elif module_type == "shortcut": 192 | from_ = int(modules[i]["from"]) 193 | x = layerOutputs[i-1] + layerOutputs[i+from_] 194 | layerOutputs[i] = x 195 | 196 | 197 | 198 | elif module_type == 'yolo': 199 | 200 | anchors = self.moduleList[i][0].anchors 201 | #Get the input dimensions 202 | inp_dim = int (self.DNInfo["height"]) 203 | 204 | #Get the number of classes 205 | num_classes = int (modules[i]["classes"]) 206 | 207 | #Output the result 208 | x = x.data 209 | print("Size before transform => " ,x.size()) 210 | 211 | #Convert the output to 2D (batch x grids x bounding box attributes) 212 | x = transformOutput(x, inp_dim, anchors, num_classes, CUDA) 213 | print("Size after transform => " ,x.size()) 214 | 215 | 216 | #If no detections were made 217 | if type(x) == int: 218 | continue 219 | 220 | 221 | if not written_output: 222 | detections = x 223 | written_output = 1 224 | 225 | else: 226 | detections = torch.cat((detections, 
x), 1) 227 | 228 | layerOutputs[i] = layerOutputs[i-1] 229 | 230 | 231 | try: 232 | return detections 233 | except: 234 | return 0 235 | 236 | 237 | def load_weights(self, weightfile): 238 | 239 | fp = open(weightfile, "rb") 240 | 241 | #The first 5 values are header information 242 | # 1. Major version number 243 | # 2. Minor version number 244 | # 3. Subversion number 245 | # 4. Images seen during training (an int64, so it occupies two int32 slots) 246 | header = np.fromfile(fp, dtype = np.int32, count = 5) 247 | self.header = torch.from_numpy(header) 248 | self.seen = self.header[3] 249 | 250 | 251 | weights = np.fromfile(fp, dtype = np.float32) 252 | 253 | tracker = 0 254 | for i in range(len(self.moduleList)): 255 | module_type = self.netBlocks[i + 1]["type"] 256 | 257 | if module_type == "convolutional": 258 | model = self.moduleList[i] 259 | try: 260 | batch_normalize = int(self.netBlocks[i+1]["batch_normalize"]) 261 | except: 262 | batch_normalize = 0 263 | 264 | convPart = model[0] 265 | 266 | if (batch_normalize): 267 | #Weights file configuration => bn biases -> bn weights -> running mean -> running var 268 | #The weights are arranged in the above mentioned order 269 | bnPart = model[1] 270 | 271 | biasCount = bnPart.bias.numel() 272 | 273 | bnBias = torch.from_numpy(weights[tracker:tracker + biasCount]) 274 | tracker += biasCount 275 | 276 | bnPart_weights = torch.from_numpy(weights[tracker: tracker + biasCount]) 277 | tracker += biasCount 278 | 279 | bnPart_running_mean = torch.from_numpy(weights[tracker: tracker + biasCount]) 280 | tracker += biasCount 281 | 282 | bnPart_running_var = torch.from_numpy(weights[tracker: tracker + biasCount]) 283 | tracker += biasCount 284 | 285 | bnBias = bnBias.view_as(bnPart.bias.data) 286 | bnPart_weights = bnPart_weights.view_as(bnPart.weight.data) 287 | bnPart_running_mean = bnPart_running_mean.view_as(bnPart.running_mean) 288 | bnPart_running_var = bnPart_running_var.view_as(bnPart.running_var) 289 | 290 | bnPart.bias.data.copy_(bnBias) 291 | bnPart.weight.data.copy_(bnPart_weights) 292 | bnPart.running_mean.copy_(bnPart_running_mean) 293 | bnPart.running_var.copy_(bnPart_running_var) 294 | 295 | else: 296 | biasCount = convPart.bias.numel() 297 | 298 | convBias = torch.from_numpy(weights[tracker: tracker + biasCount]) 299 | tracker = tracker + biasCount 300 | 301 | convBias = convBias.view_as(convPart.bias.data) 302 | 303 | convPart.bias.data.copy_(convBias) 304 | 305 | 306 | weightCount = convPart.weight.numel() 307 | 308 | convWeight = torch.from_numpy(weights[tracker:tracker+weightCount]) 309 | tracker = tracker + weightCount 310 | 311 | convWeight = convWeight.view_as(convPart.weight.data) 312 | convPart.weight.data.copy_(convWeight) 313 | ''' 314 | #Test CFG: 315 | construct = construct_cfg('cfg/yolov3.cfg') 316 | print(construct,"\n constructed from cfg file") 317 | ''' 318 | 319 | #Test model: 320 | 321 | num_classes = 80 322 | classes = load_classes('data/coco.names') 323 | 324 | model = net('cfg/yolov3.cfg') 325 | model.load_weights("yolov3.weights") 326 | print("Network loaded") 327 | 328 | test_data = torch.randn(1,3,256,256,dtype = torch.float) 329 | test_output = model(test_data,False) 330 | 331 | print(test_output.size()) 332 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Ayush Chaurasia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files 
(the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the complementary code for the video series that explains the implementation of YOLOv3 from scratch, [available here](https://www.youtube.com/playlist?list=PLbMqOoYQ3MxxArhAqvki_WoWBTCc8fDHG). 2 | 3 | Check out [my channel](http://www.youtube.com/channel/UCgpckFNtZEOSjPFpQf-Kn8w) for cutting-edge deep learning projects. 4 | 5 | # Yolo-V3 6 | 7 | Pre-trained COCO weights for yolov3.cfg (236 MB, requires about 4 GB of GPU RAM): https://pjreddie.com/media/files/yolov3.weights 8 | 9 | # Test: 10 | Run the following command with optional command-line arguments to perform detections on the images in the 'images' folder. By default, the output is stored in the 'result' folder. 
11 | ``` 12 | python detect.py 13 | ``` 14 | Run the following command with optional commandline arguments to perform detections on videos 15 | ``` 16 | python detect_video.py 17 | ``` 18 | 19 | # Understand and implement the network from scratch (Video) 20 | [![](http://img.youtube.com/vi/chVamXQp9so&list=PLbMqOoYQ3MxxArhAqvki_WoWBTCc8fDHG/0.jpg)](http://www.youtube.com/watch?v=chVamXQp9so&list=PLbMqOoYQ3MxxArhAqvki_WoWBTCc8fDHG) 21 | 24 | 25 | # Some Outputs: 26 | ![](https://github.com/AyushExel/Yolo-V3/blob/master/result/det_messi.jpg) 27 | ![](https://github.com/AyushExel/Yolo-V3/blob/master/result/det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg) 28 | ![](https://github.com/AyushExel/Yolo-V3/blob/master/result/det_person.jpg) 29 | 30 | -------------------------------------------------------------------------------- /cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 
22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 
244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | 
[convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width= 320 9 | height = 320 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | #route 114 | 115 | # Downsample 116 | 117 | [convolutional] 118 | batch_normalize=1 119 | filters=256 120 | size=3 121 | stride=2 122 | pad=1 123 | activation=leaky 124 | 125 | [convolutional] 126 | batch_normalize=1 127 | filters=128 128 | size=1 129 | stride=1 130 | pad=1 131 | activation=leaky 132 | 133 | [convolutional] 134 | batch_normalize=1 135 | filters=256 136 | size=3 137 | stride=1 138 | pad=1 139 | activation=leaky 140 | 141 | [shortcut] 142 | from=-3 143 | activation=linear 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | 
activation=leaky 152 | 153 | [convolutional] 154 | batch_normalize=1 155 | filters=256 156 | size=3 157 | stride=1 158 | pad=1 159 | activation=leaky 160 | 161 | [shortcut] 162 | from=-3 163 | activation=linear 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=128 168 | size=1 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=256 176 | size=3 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [shortcut] 182 | from=-3 183 | activation=linear 184 | 185 | [convolutional] 186 | batch_normalize=1 187 | filters=128 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=leaky 192 | 193 | [convolutional] 194 | batch_normalize=1 195 | filters=256 196 | size=3 197 | stride=1 198 | pad=1 199 | activation=leaky 200 | 201 | [shortcut] 202 | from=-3 203 | activation=linear 204 | 205 | 206 | [convolutional] 207 | batch_normalize=1 208 | filters=128 209 | size=1 210 | stride=1 211 | pad=1 212 | activation=leaky 213 | 214 | [convolutional] 215 | batch_normalize=1 216 | filters=256 217 | size=3 218 | stride=1 219 | pad=1 220 | activation=leaky 221 | 222 | [shortcut] 223 | from=-3 224 | activation=linear 225 | 226 | [convolutional] 227 | batch_normalize=1 228 | filters=128 229 | size=1 230 | stride=1 231 | pad=1 232 | activation=leaky 233 | 234 | [convolutional] 235 | batch_normalize=1 236 | filters=256 237 | size=3 238 | stride=1 239 | pad=1 240 | activation=leaky 241 | 242 | [shortcut] 243 | from=-3 244 | activation=linear 245 | 246 | [convolutional] 247 | batch_normalize=1 248 | filters=128 249 | size=1 250 | stride=1 251 | pad=1 252 | activation=leaky 253 | 254 | [convolutional] 255 | batch_normalize=1 256 | filters=256 257 | size=3 258 | stride=1 259 | pad=1 260 | activation=leaky 261 | 262 | [shortcut] 263 | from=-3 264 | activation=linear 265 | 266 | [convolutional] 267 | batch_normalize=1 268 | filters=128 269 | size=1 270 | stride=1 271 | pad=1 272 | activation=leaky 273 | 274 | [convolutional] 275 | batch_normalize=1 276 | filters=256 277 | size=3 278 | stride=1 279 | pad=1 280 | activation=leaky 281 | 282 | [shortcut] 283 | from=-3 284 | activation=linear 285 | 286 | # Downsample 287 | 288 | [convolutional] 289 | batch_normalize=1 290 | filters=512 291 | size=3 292 | stride=2 293 | pad=1 294 | activation=leaky 295 | 296 | [convolutional] 297 | batch_normalize=1 298 | filters=256 299 | size=1 300 | stride=1 301 | pad=1 302 | activation=leaky 303 | 304 | [convolutional] 305 | batch_normalize=1 306 | filters=512 307 | size=3 308 | stride=1 309 | pad=1 310 | activation=leaky 311 | 312 | [shortcut] 313 | from=-3 314 | activation=linear 315 | 316 | 317 | [convolutional] 318 | batch_normalize=1 319 | filters=256 320 | size=1 321 | stride=1 322 | pad=1 323 | activation=leaky 324 | 325 | [convolutional] 326 | batch_normalize=1 327 | filters=512 328 | size=3 329 | stride=1 330 | pad=1 331 | activation=leaky 332 | 333 | [shortcut] 334 | from=-3 335 | activation=linear 336 | 337 | 338 | [convolutional] 339 | batch_normalize=1 340 | filters=256 341 | size=1 342 | stride=1 343 | pad=1 344 | activation=leaky 345 | 346 | [convolutional] 347 | batch_normalize=1 348 | filters=512 349 | size=3 350 | stride=1 351 | pad=1 352 | activation=leaky 353 | 354 | [shortcut] 355 | from=-3 356 | activation=linear 357 | 358 | 359 | [convolutional] 360 | batch_normalize=1 361 | filters=256 362 | size=1 363 | stride=1 364 | pad=1 365 | activation=leaky 366 | 367 | [convolutional] 368 | batch_normalize=1 369 | filters=512 370 | size=3 
371 | stride=1 372 | pad=1 373 | activation=leaky 374 | 375 | [shortcut] 376 | from=-3 377 | activation=linear 378 | 379 | [convolutional] 380 | batch_normalize=1 381 | filters=256 382 | size=1 383 | stride=1 384 | pad=1 385 | activation=leaky 386 | 387 | [convolutional] 388 | batch_normalize=1 389 | filters=512 390 | size=3 391 | stride=1 392 | pad=1 393 | activation=leaky 394 | 395 | [shortcut] 396 | from=-3 397 | activation=linear 398 | 399 | 400 | [convolutional] 401 | batch_normalize=1 402 | filters=256 403 | size=1 404 | stride=1 405 | pad=1 406 | activation=leaky 407 | 408 | [convolutional] 409 | batch_normalize=1 410 | filters=512 411 | size=3 412 | stride=1 413 | pad=1 414 | activation=leaky 415 | 416 | [shortcut] 417 | from=-3 418 | activation=linear 419 | 420 | 421 | [convolutional] 422 | batch_normalize=1 423 | filters=256 424 | size=1 425 | stride=1 426 | pad=1 427 | activation=leaky 428 | 429 | [convolutional] 430 | batch_normalize=1 431 | filters=512 432 | size=3 433 | stride=1 434 | pad=1 435 | activation=leaky 436 | 437 | [shortcut] 438 | from=-3 439 | activation=linear 440 | 441 | [convolutional] 442 | batch_normalize=1 443 | filters=256 444 | size=1 445 | stride=1 446 | pad=1 447 | activation=leaky 448 | 449 | [convolutional] 450 | batch_normalize=1 451 | filters=512 452 | size=3 453 | stride=1 454 | pad=1 455 | activation=leaky 456 | 457 | [shortcut] 458 | from=-3 459 | activation=linear 460 | 461 | # Downsample 462 | 463 | [convolutional] 464 | batch_normalize=1 465 | filters=1024 466 | size=3 467 | stride=2 468 | pad=1 469 | activation=leaky 470 | 471 | [convolutional] 472 | batch_normalize=1 473 | filters=512 474 | size=1 475 | stride=1 476 | pad=1 477 | activation=leaky 478 | 479 | [convolutional] 480 | batch_normalize=1 481 | filters=1024 482 | size=3 483 | stride=1 484 | pad=1 485 | activation=leaky 486 | 487 | [shortcut] 488 | from=-3 489 | activation=linear 490 | 491 | [convolutional] 492 | batch_normalize=1 493 | filters=512 494 | size=1 495 | stride=1 496 | pad=1 497 | activation=leaky 498 | 499 | [convolutional] 500 | batch_normalize=1 501 | filters=1024 502 | size=3 503 | stride=1 504 | pad=1 505 | activation=leaky 506 | 507 | [shortcut] 508 | from=-3 509 | activation=linear 510 | 511 | [convolutional] 512 | batch_normalize=1 513 | filters=512 514 | size=1 515 | stride=1 516 | pad=1 517 | activation=leaky 518 | 519 | [convolutional] 520 | batch_normalize=1 521 | filters=1024 522 | size=3 523 | stride=1 524 | pad=1 525 | activation=leaky 526 | 527 | [shortcut] 528 | from=-3 529 | activation=linear 530 | 531 | [convolutional] 532 | batch_normalize=1 533 | filters=512 534 | size=1 535 | stride=1 536 | pad=1 537 | activation=leaky 538 | 539 | [convolutional] 540 | batch_normalize=1 541 | filters=1024 542 | size=3 543 | stride=1 544 | pad=1 545 | activation=leaky 546 | 547 | [shortcut] 548 | from=-3 549 | activation=linear 550 | 551 | ###################### 552 | 553 | [convolutional] 554 | batch_normalize=1 555 | filters=512 556 | size=1 557 | stride=1 558 | pad=1 559 | activation=leaky 560 | 561 | [convolutional] 562 | batch_normalize=1 563 | size=3 564 | stride=1 565 | pad=1 566 | filters=1024 567 | activation=leaky 568 | 569 | [convolutional] 570 | batch_normalize=1 571 | filters=512 572 | size=1 573 | stride=1 574 | pad=1 575 | activation=leaky 576 | 577 | [convolutional] 578 | batch_normalize=1 579 | size=3 580 | stride=1 581 | pad=1 582 | filters=1024 583 | activation=leaky 584 | 585 | [convolutional] 586 | batch_normalize=1 587 | filters=512 588 | size=1 
589 | stride=1 590 | pad=1 591 | activation=leaky 592 | 593 | [convolutional] 594 | batch_normalize=1 595 | size=3 596 | stride=1 597 | pad=1 598 | filters=1024 599 | activation=leaky 600 | 601 | [convolutional] 602 | size=1 603 | stride=1 604 | pad=1 605 | filters=255 606 | activation=linear 607 | 608 | 609 | [yolo] 610 | mask = 6,7,8 611 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 612 | classes=80 613 | num=9 614 | jitter=.3 615 | ignore_thresh = .5 616 | truth_thresh = 1 617 | random=1 618 | 619 | 620 | [route] 621 | layers = -4 622 | 623 | [convolutional] 624 | batch_normalize=1 625 | filters=256 626 | size=1 627 | stride=1 628 | pad=1 629 | activation=leaky 630 | 631 | [upsample] 632 | stride=2 633 | 634 | [route] 635 | layers = -1, 61 636 | 637 | 638 | 639 | [convolutional] 640 | batch_normalize=1 641 | filters=256 642 | size=1 643 | stride=1 644 | pad=1 645 | activation=leaky 646 | 647 | [convolutional] 648 | batch_normalize=1 649 | size=3 650 | stride=1 651 | pad=1 652 | filters=512 653 | activation=leaky 654 | 655 | [convolutional] 656 | batch_normalize=1 657 | filters=256 658 | size=1 659 | stride=1 660 | pad=1 661 | activation=leaky 662 | 663 | [convolutional] 664 | batch_normalize=1 665 | size=3 666 | stride=1 667 | pad=1 668 | filters=512 669 | activation=leaky 670 | 671 | [convolutional] 672 | batch_normalize=1 673 | filters=256 674 | size=1 675 | stride=1 676 | pad=1 677 | activation=leaky 678 | 679 | [convolutional] 680 | batch_normalize=1 681 | size=3 682 | stride=1 683 | pad=1 684 | filters=512 685 | activation=leaky 686 | 687 | [convolutional] 688 | size=1 689 | stride=1 690 | pad=1 691 | filters=255 692 | activation=linear 693 | 694 | 695 | [yolo] 696 | mask = 3,4,5 697 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 698 | classes=80 699 | num=9 700 | jitter=.3 701 | ignore_thresh = .5 702 | truth_thresh = 1 703 | random=1 704 | 705 | 706 | 707 | [route] 708 | layers = -4 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | filters=128 713 | size=1 714 | stride=1 715 | pad=1 716 | activation=leaky 717 | 718 | [upsample] 719 | stride=2 720 | 721 | [route] 722 | layers = -1, 36 723 | 724 | #yolo 725 | 726 | [convolutional] 727 | batch_normalize=1 728 | filters=128 729 | size=1 730 | stride=1 731 | pad=1 732 | activation=leaky 733 | 734 | [convolutional] 735 | batch_normalize=1 736 | size=3 737 | stride=1 738 | pad=1 739 | filters=256 740 | activation=leaky 741 | 742 | [convolutional] 743 | batch_normalize=1 744 | filters=128 745 | size=1 746 | stride=1 747 | pad=1 748 | activation=leaky 749 | 750 | [convolutional] 751 | batch_normalize=1 752 | size=3 753 | stride=1 754 | pad=1 755 | filters=256 756 | activation=leaky 757 | 758 | [convolutional] 759 | batch_normalize=1 760 | filters=128 761 | size=1 762 | stride=1 763 | pad=1 764 | activation=leaky 765 | 766 | [convolutional] 767 | batch_normalize=1 768 | size=3 769 | stride=1 770 | pad=1 771 | filters=256 772 | activation=leaky 773 | 774 | [convolutional] 775 | size=1 776 | stride=1 777 | pad=1 778 | filters=255 779 | activation=linear 780 | 781 | 782 | [yolo] 783 | mask = 0,1,2 784 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 785 | classes=80 786 | num=9 787 | jitter=.3 788 | ignore_thresh = .5 789 | truth_thresh = 1 790 | random=1 791 | 792 | -------------------------------------------------------------------------------- /data/coco.names: 
-------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | import argparse 10 | import os 11 | import os.path as osp 12 | from DNModel import net 13 | from img_process import preprocess_img, inp_to_image 14 | import pandas as pd 15 | import random 16 | import pickle as pkl 17 | 18 | 19 | 20 | 21 | def arg_parse(): 22 | 23 | parser = argparse.ArgumentParser(description='YOLOv3 ') 24 | 25 | parser.add_argument("--images", dest = 'images', help = 26 | "Image / Directory containing input images", 27 | default = "images", type = str) 28 | parser.add_argument("--result", dest = 'result', help = 29 | " Directory to store results ", 30 | default = "result", type = str) 31 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 32 | 33 | parser.add_argument("--bs", dest = "bs", help = "Batch size", default = 1) 34 | parser.add_argument("--confidence", dest = "confidence", help = "Detection Confidence ", default = 0.5) 35 | parser.add_argument("--cfg", dest = 'configfile', help = 36 | "Config file", 37 | default = "cfg/yolov3.cfg", type = str) 38 | parser.add_argument("--weights", dest = 'weightsfile', help = 39 | "weightsfile", 40 | default = "yolov3.weights", type = str) 41 | parser.add_argument("--reso", dest = 'resolution', help = 42 | "Input resolution of the network", 43 | default = "256", type = str) 44 | parser.add_argument("--scales", dest = "scales", help = "Scales to use for detection", 45 | default = "1,2,3", type = str) 46 | 47 | return parser.parse_args() 48 | 49 | if __name__ == '__main__': 50 | args = arg_parse() 51 | 52 | scales = args.scales 53 | 54 | 55 | images = args.images 56 | batch_size = int(args.bs) 57 | confidence = float(args.confidence) 
58 | nms_thesh = float(args.nms_thresh) 59 | start = 0 60 | 61 | CUDA = torch.cuda.is_available() 62 | 63 | num_classes = 80 64 | classes = load_classes('data/coco.names') 65 | 66 | model = net(args.configfile) 67 | model.load_weights(args.weightsfile) 68 | print("Network loaded") 69 | 70 | model.DNInfo["height"] = args.resolution 71 | in_dim = int(model.DNInfo["height"]) 72 | 73 | 74 | if CUDA: 75 | model.cuda() 76 | model.eval() 77 | 78 | read_dir = time.time() 79 | try: 80 | imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images) if os.path.splitext(img)[1] == '.png' or os.path.splitext(img)[1] =='.jpeg' or os.path.splitext(img)[1] =='.jpg'] 81 | except NotADirectoryError: 82 | imlist = [] 83 | imlist.append(osp.join(osp.realpath('.'), images)) 84 | except FileNotFoundError: 85 | print ("No with the name {}".format(images)) 86 | exit() 87 | 88 | if not os.path.exists(args.result): 89 | os.makedirs(args.result) 90 | 91 | batches = list(map(preprocess_img, imlist, [in_dim for x in range(len(imlist))])) 92 | im_batches = [x[0] for x in batches] 93 | orig_ims = [x[1] for x in batches] 94 | im_dim_list = [x[2] for x in batches] 95 | #Explain 96 | im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2) 97 | 98 | 99 | 100 | if CUDA: 101 | im_dim_list = im_dim_list.cuda() 102 | 103 | leftover = 0 104 | 105 | if (len(im_dim_list) % batch_size): 106 | leftover = 1 107 | 108 | 109 | i = 0 110 | 111 | 112 | write = False 113 | 114 | 115 | objs = {} 116 | 117 | 118 | 119 | for batch in im_batches: 120 | if CUDA: 121 | batch = batch.cuda() 122 | #print('batch size => ', batch.size()) 123 | with torch.no_grad(): 124 | prediction = model(batch, CUDA) 125 | 126 | 127 | 128 | prediction = write_results(prediction, confidence, num_classes, nms = True, nms_conf = nms_thesh) 129 | 130 | 131 | if type(prediction) == int: 132 | i += 1 133 | continue 134 | 135 | 136 | #Add the current batch number 137 | prediction[:,0] += i*batch_size 138 | 139 | 140 | 141 | 142 | if not write: 143 | output = prediction 144 | write = 1 145 | else: 146 | output = torch.cat((output,prediction)) 147 | 148 | 149 | 150 | 151 | for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]): 152 | im_id = i*batch_size + im_num 153 | objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id] 154 | print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs))) 155 | print("----------------------------------------------------------") 156 | i += 1 157 | 158 | 159 | if CUDA: 160 | torch.cuda.synchronize() 161 | 162 | try: 163 | output 164 | except NameError: 165 | print("No detections were made") 166 | exit() 167 | 168 | im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long()) 169 | 170 | scaling_factor = torch.min(in_dim/im_dim_list,1)[0].view(-1,1) 171 | 172 | 173 | output[:,[1,3]] -= (in_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2 174 | output[:,[2,4]] -= (in_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2 175 | 176 | 177 | 178 | output[:,1:5] /= scaling_factor 179 | 180 | for i in range(output.shape[0]): 181 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0]) 182 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1]) 183 | 184 | colors = pkl.load(open("pallete", "rb")) 185 | 186 | def write(x, batches, results): 187 | c1 = tuple(x[1:3].int()) 188 | c2 = tuple(x[3:5].int()) 189 | img = results[int(x[0])] 190 | cls = int(x[-1]) 191 | label = "{0}".format(classes[cls]) 192 | color = random.choice(colors) 193 
| cv2.rectangle(img, c1, c2,color, 1) 194 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 195 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 196 | cv2.rectangle(img, c1, c2,color, -1) 197 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1) 198 | return img 199 | 200 | 201 | list(map(lambda x: write(x, im_batches, orig_ims), output)) 202 | 203 | det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format(args.result,x.split("\\")[-1])) 204 | 205 | list(map(cv2.imwrite, det_names, orig_ims)) 206 | 207 | torch.cuda.empty_cache() 208 | 209 | 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /detect_video.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | from DNModel import net as Darknet 10 | from img_process import inp_to_image, custom_resize 11 | import pandas as pd 12 | import random 13 | import pickle as pkl 14 | import argparse 15 | 16 | 17 | 18 | def prepare_input(img, inp_dim): 19 | """ 20 | Prepare image for inputting to the neural network. 21 | Perform tranpose and return Tensor 22 | """ 23 | 24 | orig_im = img 25 | dim = orig_im.shape[1], orig_im.shape[0] 26 | img = (custom_resize(orig_im, (inp_dim, inp_dim))) 27 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 28 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 29 | return img_, orig_im, dim 30 | 31 | def write(x, img): 32 | c1 = tuple(x[1:3].int()) 33 | c2 = tuple(x[3:5].int()) 34 | cls = int(x[-1]) 35 | label = "{0}".format(classes[cls]) 36 | color = random.choice(colors) 37 | cv2.rectangle(img, c1, c2,color, 1) 38 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 39 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 40 | cv2.rectangle(img, c1, c2,color, -1) 41 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 42 | return img 43 | 44 | def arg_parse(): 45 | """ 46 | Parse arguements to the detect module 47 | 48 | """ 49 | 50 | 51 | parser = argparse.ArgumentParser(description='YOLO v3 Video Detection Module') 52 | 53 | parser.add_argument("--video", dest = 'video', help = 54 | "Video to run detection upon", 55 | default = "video.avi", type = str) 56 | parser.add_argument("--dataset", dest = "dataset", help = "Dataset on which the network has been trained", default = "pascal") 57 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.5) 58 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 59 | parser.add_argument("--cfg", dest = 'cfgfile', help = 60 | "Config file", 61 | default = "cfg/yolov3.cfg", type = str) 62 | parser.add_argument("--weights", dest = 'weightsfile', help = 63 | "weightsfile", 64 | default = "yolov3.weights", type = str) 65 | parser.add_argument("--reso", dest = 'reso', help = 66 | "Input resolution of the network. Increase to increase accuracy. 
Decrease to increase speed", 67 | default = "128", type = str) 68 | return parser.parse_args() 69 | 70 | 71 | if __name__ == '__main__': 72 | args = arg_parse() 73 | confidence = float(args.confidence) 74 | nms_thesh = float(args.nms_thresh) 75 | start = 0 76 | 77 | CUDA = torch.cuda.is_available() 78 | 79 | num_classes = 80 80 | 81 | bbox_attrs = 5 + num_classes 82 | 83 | print("Loading network") 84 | model = Darknet(args.cfgfile) 85 | model.load_weights(args.weightsfile) 86 | print("Network loaded") 87 | classes = load_classes('data/coco.names') 88 | colors = pkl.load(open("pallete", "rb")) 89 | model.DNInfo["height"] = args.reso 90 | inp_dim = int(model.DNInfo["height"]) 91 | 92 | 93 | if CUDA: 94 | model.cuda() 95 | 96 | model.eval() 97 | 98 | videofile = args.video 99 | 100 | cap = cv2.VideoCapture(videofile) 101 | 102 | assert cap.isOpened(), 'Cannot capture source' 103 | 104 | while cap.isOpened(): 105 | 106 | ret, frame = cap.read() 107 | if ret: 108 | 109 | 110 | img, orig_im, dim = prepare_input(frame, inp_dim) 111 | 112 | im_dim = torch.FloatTensor(dim).repeat(1,2) 113 | 114 | 115 | if CUDA: 116 | im_dim = im_dim.cuda() 117 | img = img.cuda() 118 | 119 | with torch.no_grad(): 120 | output = model(Variable(img), CUDA) 121 | output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh) 122 | 123 | if type(output) == int: 124 | cv2.imshow("frame", orig_im) 125 | key = cv2.waitKey(1) 126 | if key & 0xFF == ord('x'): 127 | break 128 | continue 129 | 130 | 131 | im_dim = im_dim.repeat(output.size(0), 1) 132 | scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1) 133 | 134 | output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1))/2 135 | output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1))/2 136 | 137 | output[:,1:5] /= scaling_factor 138 | 139 | for i in range(output.shape[0]): 140 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim[i,0]) 141 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim[i,1]) 142 | 143 | 144 | 145 | list(map(lambda x: write(x, orig_im), output)) 146 | 147 | 148 | cv2.imshow("frame", orig_im) 149 | key = cv2.waitKey(1) 150 | if key & 0xFF == ord('x'): 151 | break 152 | else: 153 | break 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /images/1480611559-palm-beach-home-living-room.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/1480611559-palm-beach-home-living-room.jpg -------------------------------------------------------------------------------- /images/62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/dog.jpg -------------------------------------------------------------------------------- /images/eagle.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/eagle.jpg -------------------------------------------------------------------------------- /images/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/giraffe.jpg -------------------------------------------------------------------------------- /images/herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/herd_of_horses.jpg -------------------------------------------------------------------------------- /images/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img1.jpg -------------------------------------------------------------------------------- /images/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img2.jpg -------------------------------------------------------------------------------- /images/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img3.jpg -------------------------------------------------------------------------------- /images/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img4.jpg -------------------------------------------------------------------------------- /images/messi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/messi.jpg -------------------------------------------------------------------------------- /images/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/person.jpg -------------------------------------------------------------------------------- /img_process.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import numpy as np 8 | import cv2 9 | import matplotlib.pyplot as plt 10 | from util import convert2cpu as cpu 11 | from PIL import Image, ImageDraw 12 | 13 | 14 | def custom_resize(img, inp_dim): 15 | '''resize without changing aspect ratio''' 16 | img_w, img_h = img.shape[1], img.shape[0] 17 | w, h = inp_dim 18 | new_w = int(img_w * min(w/img_w, h/img_h)) 19 | new_h = int(img_h * min(w/img_w, h/img_h)) 20 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC) 21 | 22 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 23 | 24 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image 25 | 26 | return canvas 27 
| 28 | 29 | 30 | def preprocess_img(img, inp_dim): 31 | """ 32 | Preprocess the image for the neural network. 33 | 34 | Returns a tensor 35 | """ 36 | 37 | orig_im = cv2.imread(img) 38 | dim = orig_im.shape[1], orig_im.shape[0] 39 | img = (custom_resize(orig_im, (inp_dim, inp_dim))) 40 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 41 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 42 | return img_, orig_im, dim 43 | 44 | 45 | def inp_to_image(inp): 46 | inp = inp.cpu().squeeze() 47 | inp = inp*255 48 | try: 49 | inp = inp.data.numpy() 50 | except RuntimeError: 51 | inp = inp.numpy() 52 | inp = inp.transpose(1,2,0) 53 | 54 | inp = inp[:,:,::-1] 55 | return inp 56 | 57 | 58 | -------------------------------------------------------------------------------- /pallete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/pallete -------------------------------------------------------------------------------- /result/det_1480611559-palm-beach-home-living-room.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_1480611559-palm-beach-home-living-room.jpg -------------------------------------------------------------------------------- /result/det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg -------------------------------------------------------------------------------- /result/det_dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_dog.jpg -------------------------------------------------------------------------------- /result/det_eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_eagle.jpg -------------------------------------------------------------------------------- /result/det_giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_giraffe.jpg -------------------------------------------------------------------------------- /result/det_herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_herd_of_horses.jpg -------------------------------------------------------------------------------- /result/det_img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img1.jpg -------------------------------------------------------------------------------- /result/det_img2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img2.jpg
--------------------------------------------------------------------------------
/result/det_img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img3.jpg
--------------------------------------------------------------------------------
/result/det_img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img4.jpg
--------------------------------------------------------------------------------
/result/det_messi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_messi.jpg
--------------------------------------------------------------------------------
/result/det_person.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_person.jpg
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 | 
2 | from __future__ import division
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import numpy as np
9 | import cv2
10 | import matplotlib.pyplot as plt
11 | 
12 | def count_parameters(model):
13 |     return sum(p.numel() for p in model.parameters())
14 | 
15 | def count_learnable_parameters(model):
16 |     return sum(p.numel() for p in model.parameters() if p.requires_grad)
17 | 
18 | def convert2cpu(matrix):
19 |     if matrix.is_cuda:
20 |         return torch.FloatTensor(matrix.size()).copy_(matrix)
21 |     else:
22 |         return matrix
23 | 
24 | def bbox_iou(box1, box2):
25 |     """
26 |     Returns the IoU of two bounding boxes.
27 |     box1, box2: tensors whose first four columns are the corner
28 |     coordinates x1, y1, x2, y2.
29 |     """
30 |     # Get the coordinates of the bounding boxes
31 |     b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3]
32 |     b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3]
33 | 
34 |     # Get the coordinates of the intersection rectangle
35 |     inter_rect_x1 = torch.max(b1_x1, b2_x1)
36 |     inter_rect_y1 = torch.max(b1_y1, b2_y1)
37 |     inter_rect_x2 = torch.min(b1_x2, b2_x2)
38 |     inter_rect_y2 = torch.min(b1_y2, b2_y2)
39 | 
40 |     # Intersection area
41 |     if torch.cuda.is_available():
42 |         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda())
43 |     else:
44 |         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape))
45 | 
46 |     # Union area
47 |     b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
48 |     b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
49 | 
50 |     iou = inter_area / (b1_area + b2_area - inter_area)
51 | 
52 |     return iou
53 | 
54 | 
55 | def transformOutput(prediction, inp_dim, anchors, num_classes, CUDA = True):
56 |     batch_size = prediction.size(0)
57 |     stride = inp_dim // prediction.size(2)
58 |     grid_size = inp_dim // stride
59 |     bbox_attrs = 5 + num_classes
60 |     num_anchors = len(anchors)
61 | 
62 |     anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
63 | 
64 | 
65 | 
66 |     prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
67 |     prediction = prediction.transpose(1,2).contiguous()
68 |     prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
69 | 
70 | 
71 |     # Sigmoid the centre_X, centre_Y and the object confidence
72 |     prediction[:,:,0] = torch.sigmoid(prediction[:,:,0])
73 |     prediction[:,:,1] = torch.sigmoid(prediction[:,:,1])
74 |     prediction[:,:,4] = torch.sigmoid(prediction[:,:,4])
75 | 
76 | 
77 | 
78 |     # Add the center offsets
79 |     grid_len = np.arange(grid_size)
80 |     a,b = np.meshgrid(grid_len, grid_len)
81 | 
82 |     x_offset = torch.FloatTensor(a).view(-1,1)
83 |     y_offset = torch.FloatTensor(b).view(-1,1)
84 | 
85 |     if CUDA:
86 |         x_offset = x_offset.cuda()
87 |         y_offset = y_offset.cuda()
88 | 
89 |     x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0)
90 | 
91 |     prediction[:,:,:2] += x_y_offset
92 | 
93 |     anchors = torch.FloatTensor(anchors)
94 | 
95 |     if CUDA:
96 |         anchors = anchors.cuda()
97 | 
98 |     # Apply the log-space transform from the paper: exponentiate the predicted width/height and scale by the anchors
99 |     anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
100 |     prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors
101 | 
102 |     # Sigmoid the class scores (YOLOv3 uses independent logistic classifiers rather than a softmax)
103 |     prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes]))
104 | 
105 |     prediction[:,:,:4] *= stride
106 | 
107 | 
108 |     return prediction
109 | 
110 | def load_classes(namesfile):
111 |     fp = open(namesfile, "r")
112 |     names = fp.read().split("\n")[:-1]
113 |     return names
114 | 
115 | def get_im_dim(im):
116 |     im = cv2.imread(im)
117 |     w,h = im.shape[1], im.shape[0]
118 |     return w,h
119 | 
120 | def unique(tensor):
121 |     tensor_np = tensor.cpu().numpy()
122 |     unique_np = np.unique(tensor_np)
123 |     unique_tensor = torch.from_numpy(unique_np)
124 | 
125 |     tensor_res = tensor.new(unique_tensor.shape)
126 |     tensor_res.copy_(unique_tensor)
127 |     return tensor_res
128 | 
129 | def write_results(prediction, confidence, num_classes, nms = True, nms_conf = 0.4):
130 |     conf_mask = (prediction[:,:,4] > confidence).float().unsqueeze(2)
131 |     prediction = prediction*conf_mask
132 | 
133 | 
134 |     try:
135 |         ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous()
136 |     except:
137 |         return 0
138 | 
139 |     # Convert (centre x, centre y, width, height) to corner coordinates
140 |     box_a = prediction.new(prediction.shape)
141 |     box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
142 |     box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
143 |     box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
144 |     box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
145 |     prediction[:,:,:4] = box_a[:,:,:4]
146 | 
147 | 
148 | 
149 |     batch_size = prediction.size(0)
150 | 
151 |     output = prediction.new(1, prediction.size(2) + 1)
152 |     write = False
153 | 
154 | 
155 |     for ind in range(batch_size):
156 |         # Select the image from the batch
157 |         image_pred = prediction[ind]
158 | 
159 | 
160 | 
161 |         # Get the highest class score and the index of the class that achieves it
162 |         # (torch.max returns values first, then indices), then replace the
163 |         # num_classes class scores with just that score and that class index
164 |         max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1)
165 |         max_conf = max_conf.float().unsqueeze(1)
166 |         max_conf_score = max_conf_score.float().unsqueeze(1)
167 |         seq = (image_pred[:,:5], max_conf, max_conf_score)
168 |         image_pred = torch.cat(seq, 1)
169 | 
170 | 
171 | 
172 |         # Get rid of the zero entries
173 |         non_zero_ind = (torch.nonzero(image_pred[:,4]))
174 | 
175 | 
176 |         image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7)
177 | 
178 |         # Get the various classes detected in the image
179 |         try:
180 |             img_classes = unique(image_pred_[:,-1])
181 |         except:
182 |             continue
183 |         # We will do NMS class-wise
184 |         for cls in img_classes:
185 |             # Get the detections belonging to one particular class
186 |             cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1)
187 |             class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze()
188 | 
189 | 
190 |             image_pred_class = image_pred_[class_mask_ind].view(-1,7)
191 | 
192 | 
193 | 
194 |             # Sort the detections such that the entry with the maximum objectness
195 |             # confidence is at the top
196 |             conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1]
197 |             image_pred_class = image_pred_class[conf_sort_index]
198 |             idx = image_pred_class.size(0)
199 | 
200 |             # If NMS has to be done
201 |             if nms:
202 |                 # For each detection
203 |                 for i in range(idx):
204 |                     # Get the IoUs of all boxes that come after the one we are looking at
205 |                     # in the loop
206 |                     try:
207 |                         ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
208 |                     except ValueError:
209 |                         break
210 | 
211 |                     except IndexError:
212 |                         break
213 | 
214 |                     # Zero out all the detections that have IoU > threshold
215 |                     iou_mask = (ious < nms_conf).float().unsqueeze(1)
216 |                     image_pred_class[i+1:] *= iou_mask
217 | 
218 |                     # Remove the zeroed-out entries
219 |                     non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze()
220 |                     image_pred_class = image_pred_class[non_zero_ind].view(-1,7)
221 | 
222 | 
223 | 
224 |             # Concatenate the batch_id of the image to the detection;
225 |             # this helps us identify which image the detection corresponds to.
226 |             # We use a flat structure to hold ALL the detections from the batch:
227 |             # the batch dimension is flattened and each detection is
228 |             # identified by an extra batch-index column
229 | 
230 | 
231 |             batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
232 |             seq = batch_ind, image_pred_class
233 |             if not write:
234 |                 output = torch.cat(seq,1)
235 |                 write = True
236 |             else:
237 |                 out = torch.cat(seq,1)
238 |                 output = torch.cat((output,out))
239 | 
240 |     # If nothing was ever written, return 0 (same convention as the early return above)
241 |     return output if write else 0
242 | 
243 | 
244 | 
245 | 
--------------------------------------------------------------------------------
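The helpers above are consumed by detect.py and detect_video.py. As a rough, self-contained sketch of how they fit together, the snippet below letterboxes an image, decodes the output of one YOLO detection scale with transformOutput, and filters it with write_results. The 416x416 input size, the three example anchors, the 13x13 grid, the random tensor standing in for a real forward pass through the network built in DNModel.py, and the 0.5/0.4 thresholds are illustrative assumptions, not values read from this repository's cfg files.

import torch

from img_process import preprocess_img
from util import load_classes, transformOutput, write_results

inp_dim = 416                                    # assumed square network input size
anchors = [(116, 90), (156, 198), (373, 326)]    # example anchors for a single 13x13 detection scale
num_classes = 80                                 # COCO classes shipped in data/coco.names
classes = load_classes("data/coco.names")

# Letterbox an image from the repository's images/ folder.
img_tensor, orig_im, orig_dim = preprocess_img("images/dog.jpg", inp_dim)
print("original (w, h):", orig_dim)

# In a full run, img_tensor would be passed through the Darknet model from DNModel.py.
# Here a random tensor stands in for the raw output of one YOLO detection layer:
# shape (batch, num_anchors * (5 + num_classes), grid, grid).
raw = torch.randn(1, len(anchors) * (5 + num_classes), 13, 13)

# Decode the raw map into box attributes on the input-image scale,
# then threshold on objectness and apply class-wise NMS.
pred = transformOutput(raw, inp_dim, anchors, num_classes, CUDA=False)
dets = write_results(pred, confidence=0.5, num_classes=num_classes, nms=True, nms_conf=0.4)

if isinstance(dets, int):
    print("no detections above the confidence threshold")
else:
    # Each row: [batch index, x1, y1, x2, y2, objectness, class score, class index]
    for d in dets:
        print(classes[int(d[-1])], float(d[-2]))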