├── .gitignore
├── LICENSE
├── README.md
├── yolo.py
├── yolo_utils.py
└── yolov3-coco
├── coco-labels
├── get_model.sh
└── yolov3.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | *.weights
2 | *.swp
3 | images/
4 | __pycache__/
5 | *.avi
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Arunava
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # YOLOv3-Object-Detection-with-OpenCV
2 |
3 | This project implements image and video object detection using pretrained YOLOv3 models and OpenCV.
4 | The models come from the official YOLOv3 paper, released in 2018, with weights trained in [darknet](https://github.com/pjreddie/darknet), the reference implementation. The project also supports real-time detection on a webcam feed.
5 |
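Under the hood, the pipeline is a handful of OpenCV `dnn` calls (see `yolo.py` and `yolo_utils.py` for the full version). A minimal sketch, assuming the model files are already downloaded and using a placeholder image name:

```
import cv2 as cv

# Load the Darknet config and weights into OpenCV's dnn module
net = cv.dnn.readNetFromDarknet('yolov3-coco/yolov3.cfg',
                                'yolov3-coco/yolov3.weights')
layer_names = net.getLayerNames()
out_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers().flatten()]

img = cv.imread('example.jpg')  # placeholder image path
# Scale pixels to [0, 1], resize to 416x416 and swap BGR -> RGB
blob = cv.dnn.blobFromImage(img, 1 / 255.0, (416, 416), swapRB=True, crop=False)
net.setInput(blob)
outs = net.forward(out_layers)  # one output array per YOLO detection layer
```
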
6 | ## How to use?
7 |
8 | 1) Clone the repository
9 |
10 | ```
11 | git clone https://github.com/iArunava/YOLOv3-Object-Detection-with-OpenCV.git
12 | ```
13 |
14 | 2) Move to the directory
15 | ```
16 | cd YOLOv3-Object-Detection-with-OpenCV
17 | ```
18 |
19 | 3) To infer on an image that is stored on your local machine
20 | ```
21 | python3 yolo.py --image-path='/path/to/image/'
22 | ```
23 | 4) To infer on a video that is stored on your local machine
24 | ```
25 | python3 yolo.py --video-path='/path/to/video/'
26 | ```
27 | 5) To infer real-time on webcam
28 | ```
29 | python3 yolo.py
30 | ```
31 |
32 | Note: This assumes the `weights` and `config` files are present in the `yolov3-coco` directory.
33 |
34 | If the files are located somewhere else, pass their paths when calling `yolo.py`. For more details, run
35 | ```
36 | python3 yolo.py --help
37 | ```
38 |
39 | ## Inference on images
40 |
41 |
42 | 
43 | 
44 | 
45 | 
46 |
47 | ## Inference on Video
48 |
49 | [](https://www.youtube.com/watch?v=AzmCYs5fAn0)
50 | Click the image to play the video on YouTube.
51 |
52 | ## Inference in Real-time
53 |
54 | [](https://youtu.be/QaxEtpRwmtI)
55 | Click the image to play the video on YouTube.
56 |
57 | ## References
58 |
59 | 1) [PyImageSearch YOLOv3 Object Detection with OpenCV Blog](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/)
60 |
61 | ## License
62 |
63 | The code in this project is distributed under the MIT License.
64 |
--------------------------------------------------------------------------------
/yolo.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | import cv2 as cv
4 | import subprocess
5 | import time
6 | import os
7 | from yolo_utils import infer_image, show_image
8 |
9 | FLAGS = []
10 |
11 | if __name__ == '__main__':
12 | parser = argparse.ArgumentParser()
13 |
14 | parser.add_argument('-m', '--model-path',
15 | type=str,
16 | default='./yolov3-coco/',
17 | help='The directory where the model weights and \
18 | configuration files are.')
19 |
20 | parser.add_argument('-w', '--weights',
21 | type=str,
22 | default='./yolov3-coco/yolov3.weights',
23 | help='Path to the file which contains the weights \
24 | for YOLOv3.')
25 |
26 | parser.add_argument('-cfg', '--config',
27 | type=str,
28 | default='./yolov3-coco/yolov3.cfg',
29 | help='Path to the configuration file for the YOLOv3 model.')
30 |
31 | parser.add_argument('-i', '--image-path',
32 | type=str,
33 | help='The path to the image file')
34 |
35 | parser.add_argument('-v', '--video-path',
36 | type=str,
37 | help='The path to the video file')
38 |
39 |
40 | parser.add_argument('-vo', '--video-output-path',
41 | type=str,
42 | default='./output.avi',
43 | help='The path of the output video file')
44 |
45 | parser.add_argument('-l', '--labels',
46 | type=str,
47 | default='./yolov3-coco/coco-labels',
48 | help='Path to the file containing the class \
49 | labels, one per line.')
50 |
51 | parser.add_argument('-c', '--confidence',
52 | type=float,
53 | default=0.5,
54 | help='The model will reject detections with a \
55 | probability less than the confidence value. \
56 | default: 0.5')
57 |
58 | parser.add_argument('-th', '--threshold',
59 | type=float,
60 | default=0.3,
61 | help='The threshold (IoU) to use when applying \
62 | Non-Max Suppression')
63 |
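# Note: --confidence above filters weak detections by class probability;
# --threshold is the IoU value cv.dnn.NMSBoxes uses to suppress
# overlapping boxes of the same object.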
64 | parser.add_argument('--download-model',
65 | action='store_true',  # type=bool would treat any non-empty string as True
66 | default=False,
67 | help='Pass this flag if the model weights and configuration \
68 | files are not present on your local machine.')
69 |
70 | parser.add_argument('-t', '--show-time',
71 | action='store_true',
72 | default=False,
73 | help='Show the time taken to infer each image.')
74 |
75 | FLAGS, unparsed = parser.parse_known_args()
76 |
77 | # Download the YOLOv3 models if needed
78 | if FLAGS.download_model:
79 | subprocess.call(['./yolov3-coco/get_model.sh'])
80 |
81 | # Get the labels
82 | labels = open(FLAGS.labels).read().strip().split('\n')
83 |
84 | # Initializing colors to represent each label uniquely
85 | colors = np.random.randint(0, 255, size=(len(labels), 3), dtype='uint8')
86 |
87 | # Load the weights and configuration to form the pretrained YOLOv3 model
88 | net = cv.dnn.readNetFromDarknet(FLAGS.config, FLAGS.weights)
89 |
90 | # Get the output layer names of the model
91 | layer_names = net.getLayerNames()
92 | layer_names = [layer_names[i - 1] for i in net.getUnconnectedOutLayers().flatten()]  # flatten() handles both old and new OpenCV return shapes
93 |
94 | # If neither an image nor a video path is given, infer on the webcam
95 | if FLAGS.image_path is None and FLAGS.video_path is None:
96 | print ('Neither path to an image nor path to a video provided')
97 | print ('Starting Inference on Webcam')
98 |
99 | # Do inference with given image
100 | if FLAGS.image_path:
101 | # Read the image (cv.imread returns None on failure instead of raising)
102 | img = cv.imread(FLAGS.image_path)
103 | if img is None:
104 |     raise ValueError('Image cannot be loaded! '
105 |                      'Please check the path provided!')
106 |
107 | height, width = img.shape[:2]
108 |
109 | # Run inference and display the result
110 | img, _, _, _, _ = infer_image(net, layer_names, height, width, img, colors, labels, FLAGS)
111 | show_image(img)
112 |
113 | elif FLAGS.video_path:
114 | # Read the video (VideoCapture does not raise on a bad path,
115 | # so check isOpened() instead)
116 | vid = cv.VideoCapture(FLAGS.video_path)
117 | if not vid.isOpened():
118 |     raise ValueError('Video cannot be loaded! '
119 |                      'Please check the path provided!')
120 |
121 | height, width = None, None
122 | writer = None
123 |
124 | while True:
125 | grabbed, frame = vid.read()
126 |
127 | # Checking if the complete video is read
128 | if not grabbed:
129 | break
130 |
131 | if width is None or height is None:
132 | height, width = frame.shape[:2]
133 |
134 | frame, _, _, _, _ = infer_image(net, layer_names, height, width, frame, colors, labels, FLAGS)
135 |
136 | if writer is None:
137 | # Initialize the video writer
138 | fourcc = cv.VideoWriter_fourcc(*"MJPG")
139 | fps = vid.get(cv.CAP_PROP_FPS) or 30  # fall back to 30 if the source FPS is unknown
140 | writer = cv.VideoWriter(FLAGS.video_output_path, fourcc, fps, (frame.shape[1], frame.shape[0]), True)
141 |
142 |
143 | writer.write(frame)
144 |
145 | print ("[INFO] Cleaning up...")
146 | writer.release()
147 | vid.release()
148 |
149 |
150 | else:
151 | # Infer real-time on webcam
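# Run the network only on every 6th frame; for the frames in between,
# infer_image is called with infer=False so the previous detections are
# simply redrawn, keeping the webcam feed responsive.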
152 | count = 0
153 |
154 | vid = cv.VideoCapture(0)
155 | while True:
156 | grabbed, frame = vid.read()
157 | if not grabbed: break  # stop if the camera frame could not be read
158 | height, width = frame.shape[:2]
159 | if count == 0:
160 | frame, boxes, confidences, classids, idxs = infer_image(net, layer_names, \
161 | height, width, frame, colors, labels, FLAGS)
162 | count += 1
163 | else:
164 | frame, boxes, confidences, classids, idxs = infer_image(net, layer_names, \
165 | height, width, frame, colors, labels, FLAGS, boxes, confidences, classids, idxs, infer=False)
166 | count = (count + 1) % 6
167 |
168 | cv.imshow('webcam', frame)
169 |
170 | if cv.waitKey(1) & 0xFF == ord('q'):
171 | break
172 | vid.release()
173 | cv.destroyAllWindows()
174 |
--------------------------------------------------------------------------------
/yolo_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | import cv2 as cv
4 | import subprocess
5 | import time
6 | import os
7 |
8 | def show_image(img):
9 | cv.imshow("Image", img)
10 | cv.waitKey(0)
11 |
12 | def draw_labels_and_boxes(img, boxes, confidences, classids, idxs, colors, labels):
13 | # If there are any detections
14 | if len(idxs) > 0:
15 | for i in idxs.flatten():
16 | # Get the bounding box coordinates
17 | x, y = boxes[i][0], boxes[i][1]
18 | w, h = boxes[i][2], boxes[i][3]
19 |
20 | # Get the unique color for this class
21 | color = [int(c) for c in colors[classids[i]]]
22 |
23 | # Draw the bounding box rectangle and label on the image
24 | cv.rectangle(img, (x, y), (x+w, y+h), color, 2)
25 | text = "{}: {:.4f}".format(labels[classids[i]], confidences[i])
26 | cv.putText(img, text, (x, y-5), cv.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
27 |
28 | return img
29 |
30 |
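# YOLO predicts box centres and sizes normalised to [0, 1]. As a worked
# example: on a 416x416 frame, a detection of (0.5, 0.5, 0.25, 0.25)
# decodes to a 104x104 box centred at (208, 208), i.e. a top-left corner
# of x = 208 - 104/2 = 156 and y = 156 -- which is what the helper below
# computes for every detection above the confidence threshold.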
31 | def generate_boxes_confidences_classids(outs, height, width, tconf):
32 | boxes = []
33 | confidences = []
34 | classids = []
35 |
36 | for out in outs:
37 | for detection in out:
38 | # Each detection vector: [center_x, center_y, width, height,
39 | #                          objectness, one score per class]
40 |
41 | # Get the scores, classid, and the confidence of the prediction
42 | scores = detection[5:]
43 | classid = np.argmax(scores)
44 | confidence = scores[classid]
45 |
46 | # Consider only the predictions that are above a certain confidence level
47 | if confidence > tconf:
48 | # TODO Check detection
49 | box = detection[0:4] * np.array([width, height, width, height])
50 | centerX, centerY, bwidth, bheight = box.astype('int')
51 |
52 | # Use the center x, y coordinates to derive the
53 | # top-left corner of the bounding box
54 | x = int(centerX - (bwidth / 2))
55 | y = int(centerY - (bheight / 2))
56 |
57 | # Append to list
58 | boxes.append([x, y, int(bwidth), int(bheight)])
59 | confidences.append(float(confidence))
60 | classids.append(classid)
61 |
62 | return boxes, confidences, classids
63 |
64 | def infer_image(net, layer_names, height, width, img, colors, labels, FLAGS,
65 | boxes=None, confidences=None, classids=None, idxs=None, infer=True):
66 |
67 | if infer:
68 | # Construct a blob from the input image: scale pixels to [0, 1],
69 | # resize to 416x416 and swap BGR to RGB
70 | blob = cv.dnn.blobFromImage(img, 1 / 255.0, (416, 416), swapRB=True, crop=False)
71 |
72 | # Perform a forward pass of the YOLO object detector
73 | net.setInput(blob)
74 |
75 | # Getting the outputs from the output layers
76 | start = time.time()
77 | outs = net.forward(layer_names)
78 | end = time.time()
79 |
80 | if FLAGS.show_time:
81 | print ("[INFO] YOLOv3 took {:6f} seconds".format(end - start))
82 |
83 |
84 | # Generate the boxes, confidences, and classIDs
85 | boxes, confidences, classids = generate_boxes_confidences_classids(outs, height, width, FLAGS.confidence)
86 |
87 | # Apply Non-Maxima Suppression to suppress overlapping bounding boxes
88 | idxs = cv.dnn.NMSBoxes(boxes, confidences, FLAGS.confidence, FLAGS.threshold)
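# NMSBoxes returns the indices of the detections that survive suppression;
# only those boxes are drawn below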
89 |
90 | if boxes is None or confidences is None or idxs is None or classids is None:
91 |     raise ValueError('[ERROR] Required variables are set to None before drawing boxes on images.')
92 |
93 | # Draw labels and boxes on the image
94 | img = draw_labels_and_boxes(img, boxes, confidences, classids, idxs, colors, labels)
95 |
96 | return img, boxes, confidences, classids, idxs
97 |
--------------------------------------------------------------------------------
/yolov3-coco/coco-labels:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorbike
5 | aeroplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 |
--------------------------------------------------------------------------------
/yolov3-coco/get_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # The weights below belong to the YOLOv3 model trained by the original
3 | # authors using the Darknet framework and made available on their
4 | # website: http://pjreddie.com/yolo/
5 |
6 | cd "$(dirname "$0")"  # download next to this script, where yolo.py expects the weights
7 | echo 'Getting the YOLOv3 model...'
8 | wget --no-check-certificate https://pjreddie.com/media/files/yolov3.weights
9 | echo 'Download complete.'
--------------------------------------------------------------------------------
/yolov3-coco/yolov3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
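# Note: yolo.py feeds the network 416x416 blobs regardless of the
# width/height above; OpenCV's Darknet importer accepts input sizes
# other than the ones in this file, as long as they are multiples of 32.
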
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 |
790 |
--------------------------------------------------------------------------------