├── DNModel.py ├── LICENSE ├── README.md ├── cfg ├── tiny-yolo-voc.cfg ├── yolo-voc.cfg ├── yolo.cfg └── yolov3.cfg ├── data ├── coco.names └── voc.names ├── detect.py ├── detect_video.py ├── images ├── 1480611559-palm-beach-home-living-room.jpg ├── 62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg ├── dog.jpg ├── eagle.jpg ├── giraffe.jpg ├── herd_of_horses.jpg ├── img1.jpg ├── img2.jpg ├── img3.jpg ├── img4.jpg ├── messi.jpg └── person.jpg ├── img_process.py ├── pallete ├── result ├── det_1480611559-palm-beach-home-living-room.jpg ├── det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg ├── det_dog.jpg ├── det_eagle.jpg ├── det_giraffe.jpg ├── det_herd_of_horses.jpg ├── det_img1.jpg ├── det_img2.jpg ├── det_img3.jpg ├── det_img4.jpg ├── det_messi.jpg └── det_person.jpg └── util.py /DNModel.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | 10 | 11 | 12 | class dummyLayer(nn.Module): 13 | def __init__(self): 14 | super(dummyLayer, self).__init__() 15 | 16 | 17 | class detector(nn.Module): 18 | def __init__(self, anchors): 19 | super(detector, self).__init__() 20 | self.anchors = anchors 21 | 22 | 23 | def construct_cfg(configFile): 24 | ''' 25 | Build the network blocks using the configuration file. 26 | Pre-process it to form easy to manupulate using pytorch. 27 | ''' 28 | 29 | # Read and pre-process the configuration file 30 | 31 | config = open(configFile,'r') 32 | file = config.read().split('\n') 33 | 34 | file = [line for line in file if len(line) > 0 and line[0]!= '#'] 35 | file = [line.lstrip().rstrip() for line in file] 36 | 37 | 38 | #Separate network blocks in a list 39 | 40 | networkBlocks = [] 41 | networkBlock = {} 42 | 43 | for x in file: 44 | if x[0] == '[': 45 | if len(networkBlock) != 0: 46 | networkBlocks.append(networkBlock) 47 | networkBlock = {} 48 | networkBlock["type"] = x[1:-1].rstrip() 49 | else: 50 | entity , value = x.split('=') 51 | networkBlock[entity.rstrip()] = value.lstrip() 52 | networkBlocks.append(networkBlock) 53 | 54 | return networkBlocks 55 | 56 | 57 | def buildNetwork(networkBlocks): 58 | DNInfo = networkBlocks[0] 59 | modules = nn.ModuleList([]) 60 | channels = 3 61 | filterTracker = [] 62 | 63 | for i,x in enumerate(networkBlocks[1:]): 64 | seqModule = nn.Sequential() 65 | if (x["type"] == "convolutional"): 66 | 67 | filters= int(x["filters"]) 68 | pad = int(x["pad"]) 69 | kernelSize = int(x["size"]) 70 | stride = int(x["stride"]) 71 | 72 | if pad: 73 | padding = (kernelSize - 1) // 2 74 | else: 75 | padding = 0 76 | 77 | activation = x["activation"] 78 | try: 79 | bn = int(x["batch_normalize"]) 80 | bias = False 81 | except: 82 | bn = 0 83 | bias = True 84 | 85 | conv = nn.Conv2d(channels, filters, kernelSize, stride, padding, bias = bias) 86 | seqModule.add_module("conv_{0}".format(i), conv) 87 | 88 | if bn: 89 | bn = nn.BatchNorm2d(filters) 90 | seqModule.add_module("batch_norm_{0}".format(i), bn) 91 | 92 | if activation == "leaky": 93 | activn = nn.LeakyReLU(0.1, inplace = True) 94 | seqModule.add_module("leaky_{0}".format(i), activn) 95 | 96 | 97 | elif (x["type"] == "upsample"): 98 | upsample = nn.Upsample(scale_factor = 2, mode = "bilinear") 99 | seqModule.add_module("upsample_{}".format(i), upsample) 100 | 101 | elif (x["type"] == "route"): 102 | x['layers'] = x["layers"].split(',') 103 | start = int(x['layers'][0]) 104 | try: 
105 | end = int(x['layers'][1]) 106 | except: 107 | end =0 108 | 109 | if start > 0: 110 | start = start - i 111 | if end > 0: 112 | end = end - i 113 | 114 | route = dummyLayer() 115 | seqModule.add_module("route_{0}".format(i),route) 116 | if end < 0: 117 | filters = filterTracker[i+start] + filterTracker[i+end] 118 | else: 119 | filters = filterTracker[i+start] 120 | elif (x["type"] == "shortcut"): 121 | shortcut = dummyLayer() 122 | seqModule.add_module("shortcut_{0}".format(i),shortcut) 123 | elif (x["type"] == "yolo"): 124 | anchors = x["anchors"].split(',') 125 | anchors = [int(a) for a in anchors] 126 | masks = x["mask"].split(',') 127 | masks = [int(a) for a in masks] 128 | anchors = [(anchors[j],anchors[j+1]) for j in range(0,len(anchors),2)] 129 | anchors = [anchors[j] for j in masks] 130 | detectorLayer = detector(anchors) 131 | 132 | seqModule.add_module("Detection_{0}".format(i),detectorLayer) 133 | 134 | modules.append(seqModule) 135 | channels = filters 136 | filterTracker.append(filters) 137 | return (DNInfo, modules) 138 | 139 | 140 | 141 | class net(nn.Module): 142 | def __init__(self, cfgfile): 143 | super(net, self).__init__() 144 | self.netBlocks = construct_cfg(cfgfile) 145 | self.DNInfo, self.moduleList = buildNetwork(self.netBlocks) 146 | self.header = torch.IntTensor([0,0,0,0]) 147 | self.seen = 0 148 | 149 | def forward(self, x, CUDA): 150 | detections = [] 151 | modules = self.netBlocks[1:] 152 | layerOutputs = {} 153 | 154 | 155 | written_output = 0 156 | #Iterate through each module 157 | for i in range(len(modules)): 158 | 159 | module_type = (modules[i]["type"]) 160 | #Convolutional and upsample layers are applied directly 161 | if module_type == "convolutional" or module_type == "upsample" : 162 | 163 | x = self.moduleList[i](x) 164 | layerOutputs[i] = x 165 | 166 | #Concatenate outputs from earlier layers at this layer 167 | elif module_type == "route": 168 | layers = modules[i]["layers"] 169 | layers = [int(a) for a in layers] 170 | 171 | #If the absolute layer index is given instead of an offset relative to the current layer 172 | if (layers[0]) > 0: 173 | layers[0] = layers[0] - i 174 | 175 | if len(layers) == 1: 176 | x = layerOutputs[i + (layers[0])] 177 | 178 | else: 179 | #If the absolute layer index is given instead of an offset relative to the current layer 180 | if (layers[1]) > 0: 181 | layers[1] = layers[1] - i 182 | 183 | map1 = layerOutputs[i + layers[0]] 184 | map2 = layerOutputs[i + layers[1]] 185 | 186 | 187 | x = torch.cat((map1, map2), 1) 188 | layerOutputs[i] = x 189 | 190 | #Shortcut is a residual connection, as in ResNet 191 | elif module_type == "shortcut": 192 | from_ = int(modules[i]["from"]) 193 | x = layerOutputs[i-1] + layerOutputs[i+from_] 194 | layerOutputs[i] = x 195 | 196 | 197 | 198 | elif module_type == 'yolo': 199 | 200 | anchors = self.moduleList[i][0].anchors 201 | #Get the input dimensions 202 | inp_dim = int (self.DNInfo["height"]) 203 | 204 | #Get the number of classes 205 | num_classes = int (modules[i]["classes"]) 206 | 207 | #Output the result 208 | x = x.data 209 | print("Size before transform => " ,x.size()) 210 | 211 | #Convert the output to 2D (batch x grids x bounding box attributes) 212 | x = transformOutput(x, inp_dim, anchors, num_classes, CUDA) 213 | print("Size after transform => " ,x.size()) 214 | 215 | 216 | #If no detections were made 217 | if type(x) == int: 218 | continue 219 | 220 | 221 | if not written_output: 222 | detections = x 223 | written_output = 1 224 | 225 | else: 226 | detections = torch.cat((detections, 
x), 1) 227 | 228 | layerOutputs[i] = layerOutputs[i-1] 229 | 230 | 231 | try: 232 | return detections 233 | except: 234 | return 0 235 | 236 | 237 | def load_weights(self, weightfile): 238 | 239 | fp = open(weightfile, "rb") 240 | 241 | #The first 5 values are header information 242 | # 1. Major version number 243 | # 2. Minor version number 244 | # 3. Subversion number 245 | # 4. Images seen during training (an int64, so it occupies two int32 slots) 246 | header = np.fromfile(fp, dtype = np.int32, count = 5) 247 | self.header = torch.from_numpy(header) 248 | self.seen = self.header[3] 249 | 250 | 251 | weights = np.fromfile(fp, dtype = np.float32) 252 | 253 | tracker = 0 254 | for i in range(len(self.moduleList)): 255 | module_type = self.netBlocks[i + 1]["type"] 256 | 257 | if module_type == "convolutional": 258 | model = self.moduleList[i] 259 | try: 260 | batch_normalize = int(self.netBlocks[i+1]["batch_normalize"]) 261 | except: 262 | batch_normalize = 0 263 | 264 | convPart = model[0] 265 | 266 | if (batch_normalize): 267 | #Weights file configuration => bn biases -> bn weights -> running mean -> running var 268 | #The weights are arranged in the above mentioned order 269 | bnPart = model[1] 270 | 271 | biasCount = bnPart.bias.numel() 272 | 273 | bnBias = torch.from_numpy(weights[tracker:tracker + biasCount]) 274 | tracker += biasCount 275 | 276 | bnPart_weights = torch.from_numpy(weights[tracker: tracker + biasCount]) 277 | tracker += biasCount 278 | 279 | bnPart_running_mean = torch.from_numpy(weights[tracker: tracker + biasCount]) 280 | tracker += biasCount 281 | 282 | bnPart_running_var = torch.from_numpy(weights[tracker: tracker + biasCount]) 283 | tracker += biasCount 284 | 285 | bnBias = bnBias.view_as(bnPart.bias.data) 286 | bnPart_weights = bnPart_weights.view_as(bnPart.weight.data) 287 | bnPart_running_mean = bnPart_running_mean.view_as(bnPart.running_mean) 288 | bnPart_running_var = bnPart_running_var.view_as(bnPart.running_var) 289 | 290 | bnPart.bias.data.copy_(bnBias) 291 | bnPart.weight.data.copy_(bnPart_weights) 292 | bnPart.running_mean.copy_(bnPart_running_mean) 293 | bnPart.running_var.copy_(bnPart_running_var) 294 | 295 | else: 296 | biasCount = convPart.bias.numel() 297 | 298 | convBias = torch.from_numpy(weights[tracker: tracker + biasCount]) 299 | tracker = tracker + biasCount 300 | 301 | convBias = convBias.view_as(convPart.bias.data) 302 | 303 | convPart.bias.data.copy_(convBias) 304 | 305 | 306 | weightCount = convPart.weight.numel() 307 | 308 | convWeight = torch.from_numpy(weights[tracker:tracker+weightCount]) 309 | tracker = tracker + weightCount 310 | 311 | convWeight = convWeight.view_as(convPart.weight.data) 312 | convPart.weight.data.copy_(convWeight) 313 | ''' 314 | #Test CFG: 315 | construct = construct_cfg('cfg/yolov3.cfg') 316 | print(construct,"\n constructed from cfg file") 317 | ''' 318 | 319 | #Test model: 320 | 321 | num_classes = 80 322 | classes = load_classes('data/coco.names') 323 | 324 | model = net('cfg/yolov3.cfg') 325 | model.load_weights("yolov3.weights") 326 | print("Network loaded") 327 | 328 | test_data = torch.randn(1,3,256,256,dtype = torch.float) 329 | test_output = model(test_data,False) 330 | 331 | print(test_output.size()) 332 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Ayush Chaurasia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files 
(the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the complementary code for the video series that explains the implementation of YOLOv3 from scratch, [available here](https://www.youtube.com/playlist?list=PLbMqOoYQ3MxxArhAqvki_WoWBTCc8fDHG). 2 | 3 | Check out [my channel](http://www.youtube.com/channel/UCgpckFNtZEOSjPFpQf-Kn8w) for cutting-edge deep learning projects. 4 | 5 | # Yolo-V3 6 | 7 | Pre-trained COCO weights for yolov3.cfg (236 MB, requires about 4 GB of GPU RAM): https://pjreddie.com/media/files/yolov3.weights 8 | 9 | # Test: 10 | Run the following command with optional command-line arguments to perform detections on the images in the 'images' folder. By default, the output is stored in the 'result' folder. 
11 | ``` 12 | python detect.py 13 | ``` 14 | Run the following command with optional commandline arguments to perform detections on videos 15 | ``` 16 | python detect_video.py 17 | ``` 18 | 19 | # Understand and implement the network from scratch (Video) 20 | [![](http://img.youtube.com/vi/chVamXQp9so&list=PLbMqOoYQ3MxxArhAqvki_WoWBTCc8fDHG/0.jpg)](http://www.youtube.com/watch?v=chVamXQp9so&list=PLbMqOoYQ3MxxArhAqvki_WoWBTCc8fDHG) 21 | 24 | 25 | # Some Outputs: 26 | ![](https://github.com/AyushExel/Yolo-V3/blob/master/result/det_messi.jpg) 27 | ![](https://github.com/AyushExel/Yolo-V3/blob/master/result/det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg) 28 | ![](https://github.com/AyushExel/Yolo-V3/blob/master/result/det_person.jpg) 29 | 30 | -------------------------------------------------------------------------------- /cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 
22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 
244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | 
[convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width= 320 9 | height = 320 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | #route 114 | 115 | # Downsample 116 | 117 | [convolutional] 118 | batch_normalize=1 119 | filters=256 120 | size=3 121 | stride=2 122 | pad=1 123 | activation=leaky 124 | 125 | [convolutional] 126 | batch_normalize=1 127 | filters=128 128 | size=1 129 | stride=1 130 | pad=1 131 | activation=leaky 132 | 133 | [convolutional] 134 | batch_normalize=1 135 | filters=256 136 | size=3 137 | stride=1 138 | pad=1 139 | activation=leaky 140 | 141 | [shortcut] 142 | from=-3 143 | activation=linear 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | 
activation=leaky 152 | 153 | [convolutional] 154 | batch_normalize=1 155 | filters=256 156 | size=3 157 | stride=1 158 | pad=1 159 | activation=leaky 160 | 161 | [shortcut] 162 | from=-3 163 | activation=linear 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=128 168 | size=1 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=256 176 | size=3 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [shortcut] 182 | from=-3 183 | activation=linear 184 | 185 | [convolutional] 186 | batch_normalize=1 187 | filters=128 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=leaky 192 | 193 | [convolutional] 194 | batch_normalize=1 195 | filters=256 196 | size=3 197 | stride=1 198 | pad=1 199 | activation=leaky 200 | 201 | [shortcut] 202 | from=-3 203 | activation=linear 204 | 205 | 206 | [convolutional] 207 | batch_normalize=1 208 | filters=128 209 | size=1 210 | stride=1 211 | pad=1 212 | activation=leaky 213 | 214 | [convolutional] 215 | batch_normalize=1 216 | filters=256 217 | size=3 218 | stride=1 219 | pad=1 220 | activation=leaky 221 | 222 | [shortcut] 223 | from=-3 224 | activation=linear 225 | 226 | [convolutional] 227 | batch_normalize=1 228 | filters=128 229 | size=1 230 | stride=1 231 | pad=1 232 | activation=leaky 233 | 234 | [convolutional] 235 | batch_normalize=1 236 | filters=256 237 | size=3 238 | stride=1 239 | pad=1 240 | activation=leaky 241 | 242 | [shortcut] 243 | from=-3 244 | activation=linear 245 | 246 | [convolutional] 247 | batch_normalize=1 248 | filters=128 249 | size=1 250 | stride=1 251 | pad=1 252 | activation=leaky 253 | 254 | [convolutional] 255 | batch_normalize=1 256 | filters=256 257 | size=3 258 | stride=1 259 | pad=1 260 | activation=leaky 261 | 262 | [shortcut] 263 | from=-3 264 | activation=linear 265 | 266 | [convolutional] 267 | batch_normalize=1 268 | filters=128 269 | size=1 270 | stride=1 271 | pad=1 272 | activation=leaky 273 | 274 | [convolutional] 275 | batch_normalize=1 276 | filters=256 277 | size=3 278 | stride=1 279 | pad=1 280 | activation=leaky 281 | 282 | [shortcut] 283 | from=-3 284 | activation=linear 285 | 286 | # Downsample 287 | 288 | [convolutional] 289 | batch_normalize=1 290 | filters=512 291 | size=3 292 | stride=2 293 | pad=1 294 | activation=leaky 295 | 296 | [convolutional] 297 | batch_normalize=1 298 | filters=256 299 | size=1 300 | stride=1 301 | pad=1 302 | activation=leaky 303 | 304 | [convolutional] 305 | batch_normalize=1 306 | filters=512 307 | size=3 308 | stride=1 309 | pad=1 310 | activation=leaky 311 | 312 | [shortcut] 313 | from=-3 314 | activation=linear 315 | 316 | 317 | [convolutional] 318 | batch_normalize=1 319 | filters=256 320 | size=1 321 | stride=1 322 | pad=1 323 | activation=leaky 324 | 325 | [convolutional] 326 | batch_normalize=1 327 | filters=512 328 | size=3 329 | stride=1 330 | pad=1 331 | activation=leaky 332 | 333 | [shortcut] 334 | from=-3 335 | activation=linear 336 | 337 | 338 | [convolutional] 339 | batch_normalize=1 340 | filters=256 341 | size=1 342 | stride=1 343 | pad=1 344 | activation=leaky 345 | 346 | [convolutional] 347 | batch_normalize=1 348 | filters=512 349 | size=3 350 | stride=1 351 | pad=1 352 | activation=leaky 353 | 354 | [shortcut] 355 | from=-3 356 | activation=linear 357 | 358 | 359 | [convolutional] 360 | batch_normalize=1 361 | filters=256 362 | size=1 363 | stride=1 364 | pad=1 365 | activation=leaky 366 | 367 | [convolutional] 368 | batch_normalize=1 369 | filters=512 370 | size=3 
371 | stride=1 372 | pad=1 373 | activation=leaky 374 | 375 | [shortcut] 376 | from=-3 377 | activation=linear 378 | 379 | [convolutional] 380 | batch_normalize=1 381 | filters=256 382 | size=1 383 | stride=1 384 | pad=1 385 | activation=leaky 386 | 387 | [convolutional] 388 | batch_normalize=1 389 | filters=512 390 | size=3 391 | stride=1 392 | pad=1 393 | activation=leaky 394 | 395 | [shortcut] 396 | from=-3 397 | activation=linear 398 | 399 | 400 | [convolutional] 401 | batch_normalize=1 402 | filters=256 403 | size=1 404 | stride=1 405 | pad=1 406 | activation=leaky 407 | 408 | [convolutional] 409 | batch_normalize=1 410 | filters=512 411 | size=3 412 | stride=1 413 | pad=1 414 | activation=leaky 415 | 416 | [shortcut] 417 | from=-3 418 | activation=linear 419 | 420 | 421 | [convolutional] 422 | batch_normalize=1 423 | filters=256 424 | size=1 425 | stride=1 426 | pad=1 427 | activation=leaky 428 | 429 | [convolutional] 430 | batch_normalize=1 431 | filters=512 432 | size=3 433 | stride=1 434 | pad=1 435 | activation=leaky 436 | 437 | [shortcut] 438 | from=-3 439 | activation=linear 440 | 441 | [convolutional] 442 | batch_normalize=1 443 | filters=256 444 | size=1 445 | stride=1 446 | pad=1 447 | activation=leaky 448 | 449 | [convolutional] 450 | batch_normalize=1 451 | filters=512 452 | size=3 453 | stride=1 454 | pad=1 455 | activation=leaky 456 | 457 | [shortcut] 458 | from=-3 459 | activation=linear 460 | 461 | # Downsample 462 | 463 | [convolutional] 464 | batch_normalize=1 465 | filters=1024 466 | size=3 467 | stride=2 468 | pad=1 469 | activation=leaky 470 | 471 | [convolutional] 472 | batch_normalize=1 473 | filters=512 474 | size=1 475 | stride=1 476 | pad=1 477 | activation=leaky 478 | 479 | [convolutional] 480 | batch_normalize=1 481 | filters=1024 482 | size=3 483 | stride=1 484 | pad=1 485 | activation=leaky 486 | 487 | [shortcut] 488 | from=-3 489 | activation=linear 490 | 491 | [convolutional] 492 | batch_normalize=1 493 | filters=512 494 | size=1 495 | stride=1 496 | pad=1 497 | activation=leaky 498 | 499 | [convolutional] 500 | batch_normalize=1 501 | filters=1024 502 | size=3 503 | stride=1 504 | pad=1 505 | activation=leaky 506 | 507 | [shortcut] 508 | from=-3 509 | activation=linear 510 | 511 | [convolutional] 512 | batch_normalize=1 513 | filters=512 514 | size=1 515 | stride=1 516 | pad=1 517 | activation=leaky 518 | 519 | [convolutional] 520 | batch_normalize=1 521 | filters=1024 522 | size=3 523 | stride=1 524 | pad=1 525 | activation=leaky 526 | 527 | [shortcut] 528 | from=-3 529 | activation=linear 530 | 531 | [convolutional] 532 | batch_normalize=1 533 | filters=512 534 | size=1 535 | stride=1 536 | pad=1 537 | activation=leaky 538 | 539 | [convolutional] 540 | batch_normalize=1 541 | filters=1024 542 | size=3 543 | stride=1 544 | pad=1 545 | activation=leaky 546 | 547 | [shortcut] 548 | from=-3 549 | activation=linear 550 | 551 | ###################### 552 | 553 | [convolutional] 554 | batch_normalize=1 555 | filters=512 556 | size=1 557 | stride=1 558 | pad=1 559 | activation=leaky 560 | 561 | [convolutional] 562 | batch_normalize=1 563 | size=3 564 | stride=1 565 | pad=1 566 | filters=1024 567 | activation=leaky 568 | 569 | [convolutional] 570 | batch_normalize=1 571 | filters=512 572 | size=1 573 | stride=1 574 | pad=1 575 | activation=leaky 576 | 577 | [convolutional] 578 | batch_normalize=1 579 | size=3 580 | stride=1 581 | pad=1 582 | filters=1024 583 | activation=leaky 584 | 585 | [convolutional] 586 | batch_normalize=1 587 | filters=512 588 | size=1 
589 | stride=1 590 | pad=1 591 | activation=leaky 592 | 593 | [convolutional] 594 | batch_normalize=1 595 | size=3 596 | stride=1 597 | pad=1 598 | filters=1024 599 | activation=leaky 600 | 601 | [convolutional] 602 | size=1 603 | stride=1 604 | pad=1 605 | filters=255 606 | activation=linear 607 | 608 | 609 | [yolo] 610 | mask = 6,7,8 611 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 612 | classes=80 613 | num=9 614 | jitter=.3 615 | ignore_thresh = .5 616 | truth_thresh = 1 617 | random=1 618 | 619 | 620 | [route] 621 | layers = -4 622 | 623 | [convolutional] 624 | batch_normalize=1 625 | filters=256 626 | size=1 627 | stride=1 628 | pad=1 629 | activation=leaky 630 | 631 | [upsample] 632 | stride=2 633 | 634 | [route] 635 | layers = -1, 61 636 | 637 | 638 | 639 | [convolutional] 640 | batch_normalize=1 641 | filters=256 642 | size=1 643 | stride=1 644 | pad=1 645 | activation=leaky 646 | 647 | [convolutional] 648 | batch_normalize=1 649 | size=3 650 | stride=1 651 | pad=1 652 | filters=512 653 | activation=leaky 654 | 655 | [convolutional] 656 | batch_normalize=1 657 | filters=256 658 | size=1 659 | stride=1 660 | pad=1 661 | activation=leaky 662 | 663 | [convolutional] 664 | batch_normalize=1 665 | size=3 666 | stride=1 667 | pad=1 668 | filters=512 669 | activation=leaky 670 | 671 | [convolutional] 672 | batch_normalize=1 673 | filters=256 674 | size=1 675 | stride=1 676 | pad=1 677 | activation=leaky 678 | 679 | [convolutional] 680 | batch_normalize=1 681 | size=3 682 | stride=1 683 | pad=1 684 | filters=512 685 | activation=leaky 686 | 687 | [convolutional] 688 | size=1 689 | stride=1 690 | pad=1 691 | filters=255 692 | activation=linear 693 | 694 | 695 | [yolo] 696 | mask = 3,4,5 697 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 698 | classes=80 699 | num=9 700 | jitter=.3 701 | ignore_thresh = .5 702 | truth_thresh = 1 703 | random=1 704 | 705 | 706 | 707 | [route] 708 | layers = -4 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | filters=128 713 | size=1 714 | stride=1 715 | pad=1 716 | activation=leaky 717 | 718 | [upsample] 719 | stride=2 720 | 721 | [route] 722 | layers = -1, 36 723 | 724 | #yolo 725 | 726 | [convolutional] 727 | batch_normalize=1 728 | filters=128 729 | size=1 730 | stride=1 731 | pad=1 732 | activation=leaky 733 | 734 | [convolutional] 735 | batch_normalize=1 736 | size=3 737 | stride=1 738 | pad=1 739 | filters=256 740 | activation=leaky 741 | 742 | [convolutional] 743 | batch_normalize=1 744 | filters=128 745 | size=1 746 | stride=1 747 | pad=1 748 | activation=leaky 749 | 750 | [convolutional] 751 | batch_normalize=1 752 | size=3 753 | stride=1 754 | pad=1 755 | filters=256 756 | activation=leaky 757 | 758 | [convolutional] 759 | batch_normalize=1 760 | filters=128 761 | size=1 762 | stride=1 763 | pad=1 764 | activation=leaky 765 | 766 | [convolutional] 767 | batch_normalize=1 768 | size=3 769 | stride=1 770 | pad=1 771 | filters=256 772 | activation=leaky 773 | 774 | [convolutional] 775 | size=1 776 | stride=1 777 | pad=1 778 | filters=255 779 | activation=linear 780 | 781 | 782 | [yolo] 783 | mask = 0,1,2 784 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 785 | classes=80 786 | num=9 787 | jitter=.3 788 | ignore_thresh = .5 789 | truth_thresh = 1 790 | random=1 791 | 792 | -------------------------------------------------------------------------------- /data/coco.names: 
-------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | import argparse 10 | import os 11 | import os.path as osp 12 | from DNModel import net 13 | from img_process import preprocess_img, inp_to_image 14 | import pandas as pd 15 | import random 16 | import pickle as pkl 17 | 18 | 19 | 20 | 21 | def arg_parse(): 22 | 23 | parser = argparse.ArgumentParser(description='YOLOv3 ') 24 | 25 | parser.add_argument("--images", dest = 'images', help = 26 | "Image / Directory containing input images", 27 | default = "images", type = str) 28 | parser.add_argument("--result", dest = 'result', help = 29 | " Directory to store results ", 30 | default = "result", type = str) 31 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 32 | 33 | parser.add_argument("--bs", dest = "bs", help = "Batch size", default = 1) 34 | parser.add_argument("--confidence", dest = "confidence", help = "Detection Confidence ", default = 0.5) 35 | parser.add_argument("--cfg", dest = 'configfile', help = 36 | "Config file", 37 | default = "cfg/yolov3.cfg", type = str) 38 | parser.add_argument("--weights", dest = 'weightsfile', help = 39 | "weightsfile", 40 | default = "yolov3.weights", type = str) 41 | parser.add_argument("--reso", dest = 'resolution', help = 42 | "Input resolution of the network", 43 | default = "256", type = str) 44 | parser.add_argument("--scales", dest = "scales", help = "Scales to use for detection", 45 | default = "1,2,3", type = str) 46 | 47 | return parser.parse_args() 48 | 49 | if __name__ == '__main__': 50 | args = arg_parse() 51 | 52 | scales = args.scales 53 | 54 | 55 | images = args.images 56 | batch_size = int(args.bs) 57 | confidence = float(args.confidence) 
58 | nms_thesh = float(args.nms_thresh) 59 | start = 0 60 | 61 | CUDA = torch.cuda.is_available() 62 | 63 | num_classes = 80 64 | classes = load_classes('data/coco.names') 65 | 66 | model = net(args.configfile) 67 | model.load_weights(args.weightsfile) 68 | print("Network loaded") 69 | 70 | model.DNInfo["height"] = args.resolution 71 | in_dim = int(model.DNInfo["height"]) 72 | 73 | 74 | if CUDA: 75 | model.cuda() 76 | model.eval() 77 | 78 | read_dir = time.time() 79 | try: 80 | imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images) if os.path.splitext(img)[1] == '.png' or os.path.splitext(img)[1] =='.jpeg' or os.path.splitext(img)[1] =='.jpg'] 81 | except NotADirectoryError: 82 | imlist = [] 83 | imlist.append(osp.join(osp.realpath('.'), images)) 84 | except FileNotFoundError: 85 | print ("No with the name {}".format(images)) 86 | exit() 87 | 88 | if not os.path.exists(args.result): 89 | os.makedirs(args.result) 90 | 91 | batches = list(map(preprocess_img, imlist, [in_dim for x in range(len(imlist))])) 92 | im_batches = [x[0] for x in batches] 93 | orig_ims = [x[1] for x in batches] 94 | im_dim_list = [x[2] for x in batches] 95 | #Explain 96 | im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2) 97 | 98 | 99 | 100 | if CUDA: 101 | im_dim_list = im_dim_list.cuda() 102 | 103 | leftover = 0 104 | 105 | if (len(im_dim_list) % batch_size): 106 | leftover = 1 107 | 108 | 109 | i = 0 110 | 111 | 112 | write = False 113 | 114 | 115 | objs = {} 116 | 117 | 118 | 119 | for batch in im_batches: 120 | if CUDA: 121 | batch = batch.cuda() 122 | #print('batch size => ', batch.size()) 123 | with torch.no_grad(): 124 | prediction = model(batch, CUDA) 125 | 126 | 127 | 128 | prediction = write_results(prediction, confidence, num_classes, nms = True, nms_conf = nms_thesh) 129 | 130 | 131 | if type(prediction) == int: 132 | i += 1 133 | continue 134 | 135 | 136 | #Add the current batch number 137 | prediction[:,0] += i*batch_size 138 | 139 | 140 | 141 | 142 | if not write: 143 | output = prediction 144 | write = 1 145 | else: 146 | output = torch.cat((output,prediction)) 147 | 148 | 149 | 150 | 151 | for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]): 152 | im_id = i*batch_size + im_num 153 | objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id] 154 | print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs))) 155 | print("----------------------------------------------------------") 156 | i += 1 157 | 158 | 159 | if CUDA: 160 | torch.cuda.synchronize() 161 | 162 | try: 163 | output 164 | except NameError: 165 | print("No detections were made") 166 | exit() 167 | 168 | im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long()) 169 | 170 | scaling_factor = torch.min(in_dim/im_dim_list,1)[0].view(-1,1) 171 | 172 | 173 | output[:,[1,3]] -= (in_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2 174 | output[:,[2,4]] -= (in_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2 175 | 176 | 177 | 178 | output[:,1:5] /= scaling_factor 179 | 180 | for i in range(output.shape[0]): 181 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0]) 182 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1]) 183 | 184 | colors = pkl.load(open("pallete", "rb")) 185 | 186 | def write(x, batches, results): 187 | c1 = tuple(x[1:3].int()) 188 | c2 = tuple(x[3:5].int()) 189 | img = results[int(x[0])] 190 | cls = int(x[-1]) 191 | label = "{0}".format(classes[cls]) 192 | color = random.choice(colors) 193 
| cv2.rectangle(img, c1, c2,color, 1) 194 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 195 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 196 | cv2.rectangle(img, c1, c2,color, -1) 197 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1) 198 | return img 199 | 200 | 201 | list(map(lambda x: write(x, im_batches, orig_ims), output)) 202 | 203 | det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format(args.result,x.split("\\")[-1])) 204 | 205 | list(map(cv2.imwrite, det_names, orig_ims)) 206 | 207 | torch.cuda.empty_cache() 208 | 209 | 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /detect_video.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | from DNModel import net as Darknet 10 | from img_process import inp_to_image, custom_resize 11 | import pandas as pd 12 | import random 13 | import pickle as pkl 14 | import argparse 15 | 16 | 17 | 18 | def prepare_input(img, inp_dim): 19 | """ 20 | Prepare image for inputting to the neural network. 21 | Perform tranpose and return Tensor 22 | """ 23 | 24 | orig_im = img 25 | dim = orig_im.shape[1], orig_im.shape[0] 26 | img = (custom_resize(orig_im, (inp_dim, inp_dim))) 27 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 28 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 29 | return img_, orig_im, dim 30 | 31 | def write(x, img): 32 | c1 = tuple(x[1:3].int()) 33 | c2 = tuple(x[3:5].int()) 34 | cls = int(x[-1]) 35 | label = "{0}".format(classes[cls]) 36 | color = random.choice(colors) 37 | cv2.rectangle(img, c1, c2,color, 1) 38 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 39 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 40 | cv2.rectangle(img, c1, c2,color, -1) 41 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 42 | return img 43 | 44 | def arg_parse(): 45 | """ 46 | Parse arguements to the detect module 47 | 48 | """ 49 | 50 | 51 | parser = argparse.ArgumentParser(description='YOLO v3 Video Detection Module') 52 | 53 | parser.add_argument("--video", dest = 'video', help = 54 | "Video to run detection upon", 55 | default = "video.avi", type = str) 56 | parser.add_argument("--dataset", dest = "dataset", help = "Dataset on which the network has been trained", default = "pascal") 57 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.5) 58 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 59 | parser.add_argument("--cfg", dest = 'cfgfile', help = 60 | "Config file", 61 | default = "cfg/yolov3.cfg", type = str) 62 | parser.add_argument("--weights", dest = 'weightsfile', help = 63 | "weightsfile", 64 | default = "yolov3.weights", type = str) 65 | parser.add_argument("--reso", dest = 'reso', help = 66 | "Input resolution of the network. Increase to increase accuracy. 
Decrease to increase speed", 67 | default = "128", type = str) 68 | return parser.parse_args() 69 | 70 | 71 | if __name__ == '__main__': 72 | args = arg_parse() 73 | confidence = float(args.confidence) 74 | nms_thesh = float(args.nms_thresh) 75 | start = 0 76 | 77 | CUDA = torch.cuda.is_available() 78 | 79 | num_classes = 80 80 | 81 | bbox_attrs = 5 + num_classes 82 | 83 | print("Loading network") 84 | model = Darknet(args.cfgfile) 85 | model.load_weights(args.weightsfile) 86 | print("Network loaded") 87 | classes = load_classes('data/coco.names') 88 | colors = pkl.load(open("pallete", "rb")) 89 | model.DNInfo["height"] = args.reso 90 | inp_dim = int(model.DNInfo["height"]) 91 | 92 | 93 | if CUDA: 94 | model.cuda() 95 | 96 | model.eval() 97 | 98 | videofile = args.video 99 | 100 | cap = cv2.VideoCapture(videofile) 101 | 102 | assert cap.isOpened(), 'Cannot capture source' 103 | 104 | while cap.isOpened(): 105 | 106 | ret, frame = cap.read() 107 | if ret: 108 | 109 | 110 | img, orig_im, dim = prepare_input(frame, inp_dim) 111 | 112 | im_dim = torch.FloatTensor(dim).repeat(1,2) 113 | 114 | 115 | if CUDA: 116 | im_dim = im_dim.cuda() 117 | img = img.cuda() 118 | 119 | with torch.no_grad(): 120 | output = model(Variable(img), CUDA) 121 | output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh) 122 | 123 | if type(output) == int: 124 | cv2.imshow("frame", orig_im) 125 | key = cv2.waitKey(1) 126 | if key & 0xFF == ord('x'): 127 | break 128 | continue 129 | 130 | 131 | im_dim = im_dim.repeat(output.size(0), 1) 132 | scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1) 133 | 134 | output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1))/2 135 | output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1))/2 136 | 137 | output[:,1:5] /= scaling_factor 138 | 139 | for i in range(output.shape[0]): 140 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim[i,0]) 141 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim[i,1]) 142 | 143 | 144 | 145 | list(map(lambda x: write(x, orig_im), output)) 146 | 147 | 148 | cv2.imshow("frame", orig_im) 149 | key = cv2.waitKey(1) 150 | if key & 0xFF == ord('x'): 151 | break 152 | else: 153 | break 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /images/1480611559-palm-beach-home-living-room.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/1480611559-palm-beach-home-living-room.jpg -------------------------------------------------------------------------------- /images/62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/dog.jpg -------------------------------------------------------------------------------- /images/eagle.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/eagle.jpg -------------------------------------------------------------------------------- /images/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/giraffe.jpg -------------------------------------------------------------------------------- /images/herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/herd_of_horses.jpg -------------------------------------------------------------------------------- /images/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img1.jpg -------------------------------------------------------------------------------- /images/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img2.jpg -------------------------------------------------------------------------------- /images/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img3.jpg -------------------------------------------------------------------------------- /images/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/img4.jpg -------------------------------------------------------------------------------- /images/messi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/messi.jpg -------------------------------------------------------------------------------- /images/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/images/person.jpg -------------------------------------------------------------------------------- /img_process.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import numpy as np 8 | import cv2 9 | import matplotlib.pyplot as plt 10 | from util import convert2cpu as cpu 11 | from PIL import Image, ImageDraw 12 | 13 | 14 | def custom_resize(img, inp_dim): 15 | '''resize without changing aspect ratio''' 16 | img_w, img_h = img.shape[1], img.shape[0] 17 | w, h = inp_dim 18 | new_w = int(img_w * min(w/img_w, h/img_h)) 19 | new_h = int(img_h * min(w/img_w, h/img_h)) 20 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC) 21 | 22 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 23 | 24 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image 25 | 26 | return canvas 27 
| 28 | 29 | 30 | def preprocess_img(img, inp_dim): 31 | """ 32 | Preprocess the image for the neural network. 33 | 34 | Returns a tensor 35 | """ 36 | 37 | orig_im = cv2.imread(img) 38 | dim = orig_im.shape[1], orig_im.shape[0] 39 | img = (custom_resize(orig_im, (inp_dim, inp_dim))) 40 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 41 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 42 | return img_, orig_im, dim 43 | 44 | 45 | def inp_to_image(inp): 46 | inp = inp.cpu().squeeze() 47 | inp = inp*255 48 | try: 49 | inp = inp.data.numpy() 50 | except RuntimeError: 51 | inp = inp.numpy() 52 | inp = inp.transpose(1,2,0) 53 | 54 | inp = inp[:,:,::-1] 55 | return inp 56 | 57 | 58 | -------------------------------------------------------------------------------- /pallete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/pallete -------------------------------------------------------------------------------- /result/det_1480611559-palm-beach-home-living-room.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_1480611559-palm-beach-home-living-room.jpg -------------------------------------------------------------------------------- /result/det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_62bddd2a-89ab-11e7-8a03-f21d91374892-780x429.jpg -------------------------------------------------------------------------------- /result/det_dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_dog.jpg -------------------------------------------------------------------------------- /result/det_eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_eagle.jpg -------------------------------------------------------------------------------- /result/det_giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_giraffe.jpg -------------------------------------------------------------------------------- /result/det_herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_herd_of_horses.jpg -------------------------------------------------------------------------------- /result/det_img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img1.jpg -------------------------------------------------------------------------------- /result/det_img2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img2.jpg
--------------------------------------------------------------------------------
/result/det_img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img3.jpg
--------------------------------------------------------------------------------
/result/det_img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_img4.jpg
--------------------------------------------------------------------------------
/result/det_messi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_messi.jpg
--------------------------------------------------------------------------------
/result/det_person.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AyushExel/Detectx-Yolo-V3/bb3dcd75741131f22fc17337e5e8fe9fe9a3bd6d/result/det_person.jpg
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 | 
2 | from __future__ import division
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import numpy as np
9 | import cv2
10 | import matplotlib.pyplot as plt
11 | 
12 | def count_parameters(model):
13 |     return sum(p.numel() for p in model.parameters())
14 | 
15 | def count_learnable_parameters(model):
16 |     return sum(p.numel() for p in model.parameters() if p.requires_grad)
17 | 
18 | def convert2cpu(matrix):
19 |     if matrix.is_cuda:
20 |         return torch.FloatTensor(matrix.size()).copy_(matrix)
21 |     else:
22 |         return matrix
23 | 
24 | def bbox_iou(box1, box2):
25 |     """
26 |     Returns the IoU of two bounding boxes.
27 |     box1, box2: tensors whose first four columns are the corner
28 |     coordinates x1, y1, x2, y2.
29 |     """
30 |     # Get the coordinates of the bounding boxes
31 |     b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3]
32 |     b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3]
33 | 
34 |     # Get the coordinates of the intersection rectangle
35 |     inter_rect_x1 = torch.max(b1_x1, b2_x1)
36 |     inter_rect_y1 = torch.max(b1_y1, b2_y1)
37 |     inter_rect_x2 = torch.min(b1_x2, b2_x2)
38 |     inter_rect_y2 = torch.min(b1_y2, b2_y2)
39 | 
40 |     # Intersection area
41 |     if torch.cuda.is_available():
42 |         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda())
43 |     else:
44 |         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape))
45 | 
46 |     # Union area
47 |     b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
48 |     b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
49 | 
50 |     iou = inter_area / (b1_area + b2_area - inter_area)
51 | 
52 |     return iou
53 | 
54 | 
55 | def transformOutput(prediction, inp_dim, anchors, num_classes, CUDA = True):
56 |     batch_size = prediction.size(0)
57 |     stride = inp_dim // prediction.size(2)
58 |     grid_size = inp_dim // stride
59 |     bbox_attrs = 5 + num_classes
60 |     num_anchors = len(anchors)
61 | 
62 |     anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
63 | 
64 | 
65 | 
66 |     prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
67 |     prediction = prediction.transpose(1,2).contiguous()
68 |     prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
69 | 
70 | 
71 |     # Sigmoid the centre_X, centre_Y and the object confidence
72 |     prediction[:,:,0] = torch.sigmoid(prediction[:,:,0])
73 |     prediction[:,:,1] = torch.sigmoid(prediction[:,:,1])
74 |     prediction[:,:,4] = torch.sigmoid(prediction[:,:,4])
75 | 
76 | 
77 | 
78 |     # Add the center offsets
79 |     grid_len = np.arange(grid_size)
80 |     a,b = np.meshgrid(grid_len, grid_len)
81 | 
82 |     x_offset = torch.FloatTensor(a).view(-1,1)
83 |     y_offset = torch.FloatTensor(b).view(-1,1)
84 | 
85 |     if CUDA:
86 |         x_offset = x_offset.cuda()
87 |         y_offset = y_offset.cuda()
88 | 
89 |     x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0)
90 | 
91 |     prediction[:,:,:2] += x_y_offset
92 | 
93 |     anchors = torch.FloatTensor(anchors)
94 | 
95 |     if CUDA:
96 |         anchors = anchors.cuda()
97 | 
98 |     # Apply the log-space transform from the paper: exponentiate the predicted width/height and scale by the anchors
99 |     anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
100 |     prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors
101 | 
102 |     # Sigmoid the class scores (YOLOv3 uses independent logistic classifiers rather than a softmax)
103 |     prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes]))
104 | 
105 |     prediction[:,:,:4] *= stride
106 | 
107 | 
108 |     return prediction
109 | 
110 | def load_classes(namesfile):
111 |     fp = open(namesfile, "r")
112 |     names = fp.read().split("\n")[:-1]
113 |     return names
114 | 
115 | def get_im_dim(im):
116 |     im = cv2.imread(im)
117 |     w,h = im.shape[1], im.shape[0]
118 |     return w,h
119 | 
120 | def unique(tensor):
121 |     tensor_np = tensor.cpu().numpy()
122 |     unique_np = np.unique(tensor_np)
123 |     unique_tensor = torch.from_numpy(unique_np)
124 | 
125 |     tensor_res = tensor.new(unique_tensor.shape)
126 |     tensor_res.copy_(unique_tensor)
127 |     return tensor_res
128 | 
129 | def write_results(prediction, confidence, num_classes, nms = True, nms_conf = 0.4):
130 |     conf_mask = (prediction[:,:,4] > confidence).float().unsqueeze(2)
131 |     prediction = prediction*conf_mask
132 | 
133 | 
134 |     try:
135 |         ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous()
136 |     except:
137 |         return 0
138 | 
139 |     # Convert (centre x, centre y, width, height) to corner coordinates
140 |     box_a = prediction.new(prediction.shape)
141 |     box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
142 |     box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
143 |     box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
144 |     box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
145 |     prediction[:,:,:4] = box_a[:,:,:4]
146 | 
147 | 
148 | 
149 |     batch_size = prediction.size(0)
150 | 
151 |     output = prediction.new(1, prediction.size(2) + 1)
152 |     write = False
153 | 
154 | 
155 |     for ind in range(batch_size):
156 |         # Select the image from the batch
157 |         image_pred = prediction[ind]
158 | 
159 | 
160 | 
161 |         # Get the highest class score and the index of the class that achieves it
162 |         # (torch.max returns values first, then indices), then replace the
163 |         # num_classes class scores with just that score and that class index
164 |         max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1)
165 |         max_conf = max_conf.float().unsqueeze(1)
166 |         max_conf_score = max_conf_score.float().unsqueeze(1)
167 |         seq = (image_pred[:,:5], max_conf, max_conf_score)
168 |         image_pred = torch.cat(seq, 1)
169 | 
170 | 
171 | 
172 |         # Get rid of the zero entries
173 |         non_zero_ind = (torch.nonzero(image_pred[:,4]))
174 | 
175 | 
176 |         image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7)
177 | 
178 |         # Get the various classes detected in the image
179 |         try:
180 |             img_classes = unique(image_pred_[:,-1])
181 |         except:
182 |             continue
183 |         # We will do NMS class-wise
184 |         for cls in img_classes:
185 |             # Get the detections belonging to one particular class
186 |             cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1)
187 |             class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze()
188 | 
189 | 
190 |             image_pred_class = image_pred_[class_mask_ind].view(-1,7)
191 | 
192 | 
193 | 
194 |             # Sort the detections such that the entry with the maximum objectness
195 |             # confidence is at the top
196 |             conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1]
197 |             image_pred_class = image_pred_class[conf_sort_index]
198 |             idx = image_pred_class.size(0)
199 | 
200 |             # If NMS has to be done
201 |             if nms:
202 |                 # For each detection
203 |                 for i in range(idx):
204 |                     # Get the IoUs of all boxes that come after the one we are looking at
205 |                     # in the loop
206 |                     try:
207 |                         ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
208 |                     except ValueError:
209 |                         break
210 | 
211 |                     except IndexError:
212 |                         break
213 | 
214 |                     # Zero out all the detections that have IoU > threshold
215 |                     iou_mask = (ious < nms_conf).float().unsqueeze(1)
216 |                     image_pred_class[i+1:] *= iou_mask
217 | 
218 |                     # Remove the zeroed-out entries
219 |                     non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze()
220 |                     image_pred_class = image_pred_class[non_zero_ind].view(-1,7)
221 | 
222 | 
223 | 
224 |             # Concatenate the batch_id of the image to the detection;
225 |             # this helps us identify which image the detection corresponds to.
226 |             # We use a flat structure to hold ALL the detections from the batch:
227 |             # the batch dimension is flattened and each detection is
228 |             # identified by an extra batch-index column
229 | 
230 | 
231 |             batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
232 |             seq = batch_ind, image_pred_class
233 |             if not write:
234 |                 output = torch.cat(seq,1)
235 |                 write = True
236 |             else:
237 |                 out = torch.cat(seq,1)
238 |                 output = torch.cat((output,out))
239 | 
240 |     # If nothing was ever written, return 0 (same convention as the early return above)
241 |     return output if write else 0
242 | 
243 | 
244 | 
245 | 
--------------------------------------------------------------------------------
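The helpers above are consumed by detect.py and detect_video.py. As a rough, self-contained sketch of how they fit together, the snippet below letterboxes an image, decodes the output of one YOLO detection scale with transformOutput, and filters it with write_results. The 416x416 input size, the three example anchors, the 13x13 grid, the random tensor standing in for a real forward pass through the network built in DNModel.py, and the 0.5/0.4 thresholds are illustrative assumptions, not values read from this repository's cfg files.

import torch

from img_process import preprocess_img
from util import load_classes, transformOutput, write_results

inp_dim = 416                                    # assumed square network input size
anchors = [(116, 90), (156, 198), (373, 326)]    # example anchors for a single 13x13 detection scale
num_classes = 80                                 # COCO classes shipped in data/coco.names
classes = load_classes("data/coco.names")

# Letterbox an image from the repository's images/ folder.
img_tensor, orig_im, orig_dim = preprocess_img("images/dog.jpg", inp_dim)
print("original (w, h):", orig_dim)

# In a full run, img_tensor would be passed through the Darknet model from DNModel.py.
# Here a random tensor stands in for the raw output of one YOLO detection layer:
# shape (batch, num_anchors * (5 + num_classes), grid, grid).
raw = torch.randn(1, len(anchors) * (5 + num_classes), 13, 13)

# Decode the raw map into box attributes on the input-image scale,
# then threshold on objectness and apply class-wise NMS.
pred = transformOutput(raw, inp_dim, anchors, num_classes, CUDA=False)
dets = write_results(pred, confidence=0.5, num_classes=num_classes, nms=True, nms_conf=0.4)

if isinstance(dets, int):
    print("no detections above the confidence threshold")
else:
    # Each row: [batch index, x1, y1, x2, y2, objectness, class score, class index]
    for d in dets:
        print(classes[int(d[-1])], float(d[-2]))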