├── .gitignore
├── LICENSE
├── README.md
├── yolo.py
├── yolo_utils.py
└── yolov3-coco
    ├── coco-labels
    ├── get_model.sh
    └── yolov3.cfg

/.gitignore:
--------------------------------------------------------------------------------
*.weights
*.swp
images/
__pycache__/
*.avi
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Arunava

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# YOLOv3-Object-Detection-with-OpenCV

This project implements an image and video object detector using pretrained YOLOv3 models.
The pretrained models come from the official YOLOv3 paper, released in 2018, and the reference implementation is [darknet](https://github.com/pjreddie/darknet). The project also supports real-time detection on a webcam feed.

## How to use?

1) Clone the repository

```
git clone https://github.com/iArunava/YOLOv3-Object-Detection-with-OpenCV.git
```

2) Move into the directory
```
cd YOLOv3-Object-Detection-with-OpenCV
```

3) To infer on an image stored on your local machine
```
python3 yolo.py --image-path='/path/to/image/'
```
4) To infer on a video stored on your local machine
```
python3 yolo.py --video-path='/path/to/video/'
```
5) To infer in real time on a webcam
```
python3 yolo.py
```

Note: This assumes that the `weights` and `config` files are present in the `yolov3-coco` directory.

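For a first run you also need the pretrained weights, which are deliberately not checked into the repository (see `.gitignore`). One possible sequence, assuming you use the bundled `get_model.sh` (note that `wget` saves `yolov3.weights` into whatever directory the script is invoked from, so run it from inside `yolov3-coco/`):

```
cd yolov3-coco
sh get_model.sh
cd ..
python3 yolo.py --image-path='/path/to/image.jpg'
```
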
If the files are located somewhere else, pass their paths when calling `yolo.py`. For more details, run
```
python3 yolo.py --help
```

## Inference on Images

![yolo_img_infer_1](https://user-images.githubusercontent.com/26242097/48849319-15d21180-edcc-11e8-892f-7d894be8d1a6.png)
![yolo_img_infer_2](https://user-images.githubusercontent.com/26242097/48850723-41a2c680-edcf-11e8-8940-aec302cd8aa3.png)
![yolo_infer_3](https://user-images.githubusercontent.com/26242097/48850729-449db700-edcf-11e8-853d-9f3eca6233c9.png)
![yolo_img_infer_4](https://user-images.githubusercontent.com/26242097/48850735-47001100-edcf-11e8-80d6-b474e54fd7af.png)

## Inference on Video

[![yolov3-video](https://user-images.githubusercontent.com/26242097/48851021-0785f480-edd0-11e8-8ce4-cdfb78e8a849.png)](https://www.youtube.com/watch?v=AzmCYs5fAn0)
Click on the image to play the video on YouTube.

## Inference in Real-time

[![yolov3-video](https://user-images.githubusercontent.com/26242097/48862668-0ca56c80-eded-11e8-9482-31d795105983.png)](https://youtu.be/QaxEtpRwmtI)
Click on the image to play the video on YouTube.

## References

1) [PyImageSearch YOLOv3 Object Detection with OpenCV Blog](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/)

## License

The code in this project is distributed under the MIT License.
--------------------------------------------------------------------------------
/yolo.py:
--------------------------------------------------------------------------------
import numpy as np
import argparse
import cv2 as cv
import subprocess
import time
import os
from yolo_utils import infer_image, show_image

FLAGS = []

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '--model-path',
        type=str,
        default='./yolov3-coco/',
        help='The directory where the model weights and configuration files are.')

    parser.add_argument('-w', '--weights',
        type=str,
        default='./yolov3-coco/yolov3.weights',
        help='Path to the file which contains the weights for YOLOv3.')

    parser.add_argument('-cfg', '--config',
        type=str,
        default='./yolov3-coco/yolov3.cfg',
        help='Path to the configuration file for the YOLOv3 model.')

    parser.add_argument('-i', '--image-path',
        type=str,
        help='The path to the image file.')

    parser.add_argument('-v', '--video-path',
        type=str,
        help='The path to the video file.')

    parser.add_argument('-vo', '--video-output-path',
        type=str,
        default='./output.avi',
        help='The path of the output video file.')

    parser.add_argument('-l', '--labels',
        type=str,
        default='./yolov3-coco/coco-labels',
        help='Path to the file containing the class labels, one per line.')

    parser.add_argument('-c', '--confidence',
        type=float,
        default=0.5,
        help='Detections with a probability below this value are rejected. Default: 0.5')
    parser.add_argument('-th', '--threshold',
        type=float,
        default=0.3,
        help='The threshold to use when applying Non-Max Suppression. Default: 0.3')

    parser.add_argument('--download-model',
        type=bool,
        default=False,
        help='Set to True if the model weights and configuration are not present on your local machine.')

    parser.add_argument('-t', '--show-time',
        type=bool,
        default=False,
        help='Show the time taken to infer each image.')

    FLAGS, unparsed = parser.parse_known_args()

    # Download the YOLOv3 model weights if needed
    if FLAGS.download_model:
        subprocess.call(['./yolov3-coco/get_model.sh'])

    # Get the labels
    labels = open(FLAGS.labels).read().strip().split('\n')

    # Initializing colors to represent each label uniquely
    colors = np.random.randint(0, 255, size=(len(labels), 3), dtype='uint8')

    # Load the weights and configuration to form the pretrained YOLOv3 model
    net = cv.dnn.readNetFromDarknet(FLAGS.config, FLAGS.weights)

    # Get the output layer names of the model
    layer_names = net.getLayerNames()
    layer_names = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]

    # If neither an image path nor a video path is given, fall back to the webcam
    if FLAGS.image_path is None and FLAGS.video_path is None:
        print('Neither a path to an image nor a path to a video was provided')
        print('Starting Inference on Webcam')

    # Do inference on the given image
    if FLAGS.image_path:
        # Read the image
        try:
            img = cv.imread(FLAGS.image_path)
            height, width = img.shape[:2]
        except:
            raise Exception('Image cannot be loaded! Please check the path provided!')

        else:
            img, _, _, _, _ = infer_image(net, layer_names, height, width, img, colors, labels, FLAGS)
            show_image(img)

    elif FLAGS.video_path:
        # Read the video
        try:
            vid = cv.VideoCapture(FLAGS.video_path)
            height, width = None, None
            writer = None
        except:
            raise Exception('Video cannot be loaded! Please check the path provided!')

        else:
            while True:
                grabbed, frame = vid.read()

                # Checking if the complete video has been read
                if not grabbed:
                    break

                if width is None or height is None:
                    height, width = frame.shape[:2]

                frame, _, _, _, _ = infer_image(net, layer_names, height, width, frame, colors, labels, FLAGS)

                if writer is None:
                    # Initialize the video writer (the output is written at a fixed 30 FPS)
                    fourcc = cv.VideoWriter_fourcc(*"MJPG")
                    writer = cv.VideoWriter(FLAGS.video_output_path, fourcc, 30,
                                            (frame.shape[1], frame.shape[0]), True)

                writer.write(frame)

            print("[INFO] Cleaning up...")
            writer.release()
            vid.release()

    else:
        # Infer real-time on webcam
        count = 0

        vid = cv.VideoCapture(0)
        while True:
            _, frame = vid.read()
            height, width = frame.shape[:2]

            if count == 0:
                # Run a full forward pass on this frame
                frame, boxes, confidences, classids, idxs = infer_image(net, layer_names,
                    height, width, frame, colors, labels, FLAGS)
                count += 1
            else:
                # Reuse the detections from the last forward pass and only redraw them;
                # a fresh inference is run every sixth frame
                frame, boxes, confidences, classids, idxs = infer_image(net, layer_names,
                    height, width, frame, colors, labels, FLAGS, boxes, confidences, classids, idxs, infer=False)
                count = (count + 1) % 6

            cv.imshow('webcam', frame)

            if cv.waitKey(1) & 0xFF == ord('q'):
                break
        vid.release()
        cv.destroyAllWindows()
--------------------------------------------------------------------------------
/yolo_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import argparse
import cv2 as cv
import subprocess
import time
import os

def show_image(img):
    cv.imshow("Image", img)
    cv.waitKey(0)

def draw_labels_and_boxes(img, boxes, confidences, classids, idxs, colors, labels):
    # If there are any detections
    if len(idxs) > 0:
        for i in idxs.flatten():
            # Get the bounding box coordinates
            x, y = boxes[i][0], boxes[i][1]
            w, h = boxes[i][2], boxes[i][3]

            # Get the unique color for this class
            color = [int(c) for c in colors[classids[i]]]

            # Draw the bounding box rectangle and label on the image
            cv.rectangle(img, (x, y), (x+w, y+h), color, 2)
            text = "{}: {:.4f}".format(labels[classids[i]], confidences[i])
            cv.putText(img, text, (x, y-5), cv.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return img


def generate_boxes_confidences_classids(outs, height, width, tconf):
    boxes = []
    confidences = []
    classids = []

    for out in outs:
        for detection in out:
            #print (detection)
            #a = input('GO!')

            # Get the scores, classid, and the confidence of the prediction
            scores = detection[5:]
            classid = np.argmax(scores)
            confidence = scores[classid]

            # Consider only the predictions that are above a certain confidence level
            if confidence > tconf:
                # TODO Check detection
                box = detection[0:4] * np.array([width, height, width, height])
                centerX, centerY, bwidth, bheight = box.astype('int')

                # Using the center x, y coordinates to derive the top
                # and the left corner of the bounding box
                x = int(centerX - (bwidth / 2))
                y = int(centerY - (bheight / 2))

                # Append to list
                boxes.append([x, y, int(bwidth), int(bheight)])
                confidences.append(float(confidence))
                classids.append(classid)

    return boxes, confidences, classids

def infer_image(net, layer_names, height, width, img, colors, labels, FLAGS,
                boxes=None, confidences=None, classids=None, idxs=None, infer=True):

    if infer:
        # Constructing a blob from the input image
        blob = cv.dnn.blobFromImage(img, 1 / 255.0, (416, 416),
                                    swapRB=True, crop=False)

        # Perform a forward pass of the YOLO object detector
        net.setInput(blob)

        # Getting the outputs from the output layers
        start = time.time()
        outs = net.forward(layer_names)
        end = time.time()

        if FLAGS.show_time:
            print("[INFO] YOLOv3 took {:.6f} seconds".format(end - start))

        # Generate the boxes, confidences, and classIDs
        boxes, confidences, classids = generate_boxes_confidences_classids(outs, height, width, FLAGS.confidence)

        # Apply Non-Maxima Suppression to suppress overlapping bounding boxes
        idxs = cv.dnn.NMSBoxes(boxes, confidences, FLAGS.confidence, FLAGS.threshold)

    if boxes is None or confidences is None or idxs is None or classids is None:
        raise Exception('[ERROR] Required variables are set to None before drawing boxes on images.')

    # Draw labels and boxes on the image
    img = draw_labels_and_boxes(img, boxes, confidences, classids, idxs, colors, labels)

    return img, boxes, confidences, classids, idxs
--------------------------------------------------------------------------------
/yolov3-coco/coco-labels:
--------------------------------------------------------------------------------
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
--------------------------------------------------------------------------------
/yolov3-coco/get_model.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# The model here is the YOLOv3 model trained by the official
# authors of the model using the DarkNet framework
# and is made available from their website:
# http://pjreddie.com/yolo/
#
# NOTE: wget saves yolov3.weights into the directory this script is
# invoked from, so run it from inside yolov3-coco/ (or move the file
# there afterwards) so that yolo.py's default --weights path works.

echo 'Getting the YOLOv3 model'
echo 'Starting Download...'
wget --no-check-certificate https://pjreddie.com/media/files/yolov3.weights
echo 'Download completed successfully!!'
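
Taken together, `yolo_utils.py` and the model files in `yolov3-coco/` are enough to run a detection without going through the `yolo.py` CLI. Below is a minimal sketch of that, not part of the original repository: it assumes the weights have already been downloaded, uses a `types.SimpleNamespace` as a stand-in for the parsed `FLAGS` object that `infer_image` expects, and reuses the same output-layer indexing as `yolo.py` (which targets the OpenCV version the repository was written against). The test image name `dog.jpg` is hypothetical.

```
import numpy as np
import cv2 as cv
from types import SimpleNamespace

from yolo_utils import infer_image, show_image

# Stand-in for the argparse FLAGS namespace used throughout yolo.py
FLAGS = SimpleNamespace(confidence=0.5, threshold=0.3, show_time=True)

# Class labels and one random color per class
labels = open('./yolov3-coco/coco-labels').read().strip().split('\n')
colors = np.random.randint(0, 255, size=(len(labels), 3), dtype='uint8')

# Load the pretrained Darknet model and collect its output layer names
net = cv.dnn.readNetFromDarknet('./yolov3-coco/yolov3.cfg',
                                './yolov3-coco/yolov3.weights')
layer_names = net.getLayerNames()
layer_names = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]

# Run detection on a single image and display the annotated result
img = cv.imread('dog.jpg')  # hypothetical test image
height, width = img.shape[:2]
img, boxes, confidences, classids, idxs = infer_image(
    net, layer_names, height, width, img, colors, labels, FLAGS)
show_image(img)
```
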
-------------------------------------------------------------------------------- /yolov3-coco/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | 
[convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | 
pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 
649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | --------------------------------------------------------------------------------
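
A note on the configuration above (not part of the original repository files): each of the three `[yolo]` detection heads is preceded by a 1x1 `[convolutional]` layer with `filters=255`. That number follows from the three anchors assigned to each scale (each `mask` selects 3 of the 9 `anchors`) and the 80 COCO classes, since every anchor predicts 4 box offsets, 1 objectness score, and one score per class:

```
filters = anchors_per_scale * (4 + 1 + classes)
        = 3 * (4 + 1 + 80)
        = 255
```

If the model is retrained on a dataset with a different number of classes, `classes=` and this `filters=` value have to be updated in all three heads.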