├── .gitignore
├── README.md
├── highway.gif
├── input
│   └── highway.mp4
├── main.py
├── output
│   └── .gitkeep
├── sort.py
└── yolo-coco
    ├── coco.names
    └── yolov3.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | __pycache__/
3 | output/*
4 | !output/.gitkeep
5 | *.pyc
6 | yolo-coco/yolov3.weights
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python Traffic Counter
2 |
3 | The purpose of this project is to detect and track vehicles on a video stream and count those going through a defined line.
4 |
5 | 
6 |
7 | It uses:
8 |
9 | * [YOLO](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv) to detect objects on each of the video frames.
10 |
11 | * [SORT](https://github.com/abewley/sort) to track those objects over different frames (a minimal usage sketch follows this list).
12 |
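The snippet below is a minimal, self-contained sketch of how `main.py` hands detections to SORT. The bounding boxes are made-up values purely to show the data flow; in the real script each `[x1, y1, x2, y2, score]` row comes from YOLO after non-maxima suppression.

```python
import numpy as np
from sort import Sort  # sort.py in this repository

tracker = Sort()

# Two consecutive frames of made-up detections: [x1, y1, x2, y2, score]
frame1 = np.array([[100, 200, 180, 260, 0.9],
                   [400, 220, 470, 280, 0.8]])
frame2 = np.array([[110, 205, 190, 265, 0.9],
                   [405, 225, 475, 285, 0.8]])

for dets in (frame1, frame2):
    tracks = tracker.update(dets)
    print(tracks)  # each row: [x1, y1, x2, y2, track_id]
```
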
13 | Once the objects are detected and tracked over different frames, a simple geometric check is applied: a vehicle is counted each time the segment between its previous and current positions crosses a defined line.
14 |
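That check is a standard segment-intersection test (the `ccw`/`intersect` helpers in `main.py`); the sketch below shows it in isolation, with the two centroids made up for the example.

```python
def ccw(A, B, C):
    # True when the triple (A, B, C) turns counter-clockwise
    return (C[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * (C[0] - A[0])

def intersect(A, B, C, D):
    # Segments AB and CD cross iff A and B lie on opposite sides of CD
    # and C and D lie on opposite sides of AB
    return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D)

line = [(43, 543), (550, 655)]    # counting line used in main.py
p0 = (300, 700)                   # made-up previous centroid of a track
p1 = (320, 560)                   # made-up current centroid of the same track
if intersect(p0, p1, line[0], line[1]):
    print("crossed the line -> counter += 1")
```
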
15 | This prototype builds on the code structure developed by Adrian Rosebrock for his article [YOLO object detection with OpenCV](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv).
16 |
17 | ## Quick Start
18 |
19 | 1. Download the code to your computer.
20 | 2. [Download yolov3.weights](https://www.dropbox.com/s/99mm7olr1ohtjbq/yolov3.weights?dl=0) and place it in `/yolo-coco`.
21 | 3. Make sure you have Python 3.7.0 and [OpenCV 3.4.2](https://www.pyimagesearch.com/opencv-tutorials-resources-guides/) installed (a sketch of the remaining Python dependencies follows this list).
22 | 4. Run:
23 | ```
24 | $ python main.py --input input/highway.mp4 --output output/highway.avi --yolo yolo-coco
25 | ```
26 |
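The repository does not ship a requirements file, so the list below is inferred from the imports in `main.py` and `sort.py`; the version pins are suggestions rather than tested constraints.

```
numpy
imutils
opencv-python==3.4.2.17    # or an OpenCV 3.4.2 build, as in the guide linked above
numba
filterpy
scikit-learn<0.23          # sort.py imports sklearn.utils.linear_assignment_, removed in 0.23
```
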
27 | ## Citation
28 |
29 | ### YOLO
30 |
31 | ```
32 | @article{redmon2016yolo9000,
33 |   title={YOLO9000: Better, Faster, Stronger},
34 |   author={Redmon, Joseph and Farhadi, Ali},
35 |   journal={arXiv preprint arXiv:1612.08242},
36 |   year={2016}
37 | }
38 | ```
39 |
40 | ### SORT
41 |
42 | ```
43 | @inproceedings{Bewley2016_sort,
44 |   author={Bewley, Alex and Ge, Zongyuan and Ott, Lionel and Ramos, Fabio and Upcroft, Ben},
45 |   booktitle={2016 IEEE International Conference on Image Processing (ICIP)},
46 |   title={Simple online and realtime tracking},
47 |   year={2016},
48 |   pages={3464-3468},
49 |   keywords={Benchmark testing;Complexity theory;Detectors;Kalman filters;Target tracking;Visualization;Computer Vision;Data Association;Detection;Multiple Object Tracking},
50 |   doi={10.1109/ICIP.2016.7533003}
51 | }
52 | ```
--------------------------------------------------------------------------------
/highway.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/highway.gif
--------------------------------------------------------------------------------
/input/highway.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/input/highway.mp4
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # import the necessary packages
2 | import numpy as np
3 | import argparse
4 | import imutils
5 | import time
6 | import cv2
7 | import os
8 | import glob
9 |
10 | files = glob.glob('output/*.png')
11 | for f in files:
12 |     os.remove(f)
13 |
14 | from sort import *
15 | tracker = Sort()
16 | memory = {}
17 | line = [(43, 543), (550, 655)]  # two endpoints (x, y) of the counting line, in pixel coordinates
18 | counter = 0
19 |
20 | # construct the argument parse and parse the arguments
21 | ap = argparse.ArgumentParser()
22 | ap.add_argument("-i", "--input", required=True,
23 | help="path to input video")
24 | ap.add_argument("-o", "--output", required=True,
25 | help="path to output video")
26 | ap.add_argument("-y", "--yolo", required=True,
27 | help="base path to YOLO directory")
28 | ap.add_argument("-c", "--confidence", type=float, default=0.5,
29 | help="minimum probability to filter weak detections")
30 | ap.add_argument("-t", "--threshold", type=float, default=0.3,
31 | help="threshold when applyong non-maxima suppression")
32 | args = vars(ap.parse_args())
33 |
34 | # Return true if line segments AB and CD intersect
35 | def intersect(A,B,C,D):
36 |     return ccw(A,C,D) != ccw(B,C,D) and ccw(A,B,C) != ccw(A,B,D)
37 |
38 | def ccw(A,B,C):
39 |     return (C[1]-A[1]) * (B[0]-A[0]) > (B[1]-A[1]) * (C[0]-A[0])
40 |
41 | # load the COCO class labels our YOLO model was trained on
42 | labelsPath = os.path.sep.join([args["yolo"], "coco.names"])
43 | LABELS = open(labelsPath).read().strip().split("\n")
44 |
45 | # initialize a list of colors to represent each possible class label
46 | np.random.seed(42)
47 | COLORS = np.random.randint(0, 255, size=(200, 3),
48 | dtype="uint8")
49 |
50 | # derive the paths to the YOLO weights and model configuration
51 | weightsPath = os.path.sep.join([args["yolo"], "yolov3.weights"])
52 | configPath = os.path.sep.join([args["yolo"], "yolov3.cfg"])
53 |
54 | # load our YOLO object detector trained on COCO dataset (80 classes)
55 | # and determine only the *output* layer names that we need from YOLO
56 | print("[INFO] loading YOLO from disk...")
57 | net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
58 | ln = net.getLayerNames()
59 | ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]
60 |
61 | # initialize the video stream, pointer to output video file, and
62 | # frame dimensions
63 | vs = cv2.VideoCapture(args["input"])
64 | writer = None
65 | (W, H) = (None, None)
66 |
67 | frameIndex = 0
68 |
69 | # try to determine the total number of frames in the video file
70 | try:
71 |     prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
72 |         else cv2.CAP_PROP_FRAME_COUNT
73 |     total = int(vs.get(prop))
74 |     print("[INFO] {} total frames in video".format(total))
75 |
76 | # an error occurred while trying to determine the total
77 | # number of frames in the video file
78 | except:
79 |     print("[INFO] could not determine # of frames in video")
80 |     print("[INFO] no approx. completion time can be provided")
81 |     total = -1
82 |
83 | # loop over frames from the video file stream
84 | while True:
85 |     # read the next frame from the file
86 |     (grabbed, frame) = vs.read()
87 |
88 |     # if the frame was not grabbed, then we have reached the end
89 |     # of the stream
90 |     if not grabbed:
91 |         break
92 |
93 |     # if the frame dimensions are empty, grab them
94 |     if W is None or H is None:
95 |         (H, W) = frame.shape[:2]
96 |
97 |     # construct a blob from the input frame and then perform a forward
98 |     # pass of the YOLO object detector, giving us our bounding boxes
99 |     # and associated probabilities
100 |     blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416),
101 |         swapRB=True, crop=False)
102 |     net.setInput(blob)
103 |     start = time.time()
104 |     layerOutputs = net.forward(ln)
105 |     end = time.time()
106 |
107 |     # initialize our lists of detected bounding boxes, confidences,
108 |     # and class IDs, respectively
109 |     boxes = []
110 |     confidences = []
111 |     classIDs = []
112 |
113 |     # loop over each of the layer outputs
114 |     for output in layerOutputs:
115 |         # loop over each of the detections
116 |         for detection in output:
117 |             # extract the class ID and confidence (i.e., probability)
118 |             # of the current object detection
119 |             scores = detection[5:]
120 |             classID = np.argmax(scores)
121 |             confidence = scores[classID]
122 |
123 |             # filter out weak predictions by ensuring the detected
124 |             # probability is greater than the minimum probability
125 |             if confidence > args["confidence"]:
126 |                 # scale the bounding box coordinates back relative to
127 |                 # the size of the image, keeping in mind that YOLO
128 |                 # actually returns the center (x, y)-coordinates of
129 |                 # the bounding box followed by the boxes' width and
130 |                 # height
131 |                 box = detection[0:4] * np.array([W, H, W, H])
132 |                 (centerX, centerY, width, height) = box.astype("int")
133 |
134 |                 # use the center (x, y)-coordinates to derive the top
135 |                 # and left corner of the bounding box
136 |                 x = int(centerX - (width / 2))
137 |                 y = int(centerY - (height / 2))
138 |
139 |                 # update our list of bounding box coordinates,
140 |                 # confidences, and class IDs
141 |                 boxes.append([x, y, int(width), int(height)])
142 |                 confidences.append(float(confidence))
143 |                 classIDs.append(classID)
144 |
145 |     # apply non-maxima suppression to suppress weak, overlapping
146 |     # bounding boxes
147 |     idxs = cv2.dnn.NMSBoxes(boxes, confidences, args["confidence"], args["threshold"])
148 |
149 |     dets = []
150 |     if len(idxs) > 0:
151 |         # loop over the indexes we are keeping
152 |         for i in idxs.flatten():
153 |             (x, y) = (boxes[i][0], boxes[i][1])
154 |             (w, h) = (boxes[i][2], boxes[i][3])
155 |             dets.append([x, y, x+w, y+h, confidences[i]])
156 |
157 |     np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
158 |     dets = np.asarray(dets)
159 |     tracks = tracker.update(dets)
160 |
161 |     boxes = []
162 |     indexIDs = []
163 |     c = []
164 |     previous = memory.copy()
165 |     memory = {}
166 |
167 |     for track in tracks:
168 |         boxes.append([track[0], track[1], track[2], track[3]])
169 |         indexIDs.append(int(track[4]))
170 |         memory[indexIDs[-1]] = boxes[-1]
171 |
172 |     if len(boxes) > 0:
173 |         i = int(0)
174 |         for box in boxes:
175 |             # extract the bounding box coordinates
176 |             (x, y) = (int(box[0]), int(box[1]))
177 |             (w, h) = (int(box[2]), int(box[3]))
178 |
179 |             # draw a bounding box rectangle and label on the image
180 |             # color = [int(c) for c in COLORS[classIDs[i]]]
181 |             # cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
182 |
183 |             color = [int(c) for c in COLORS[indexIDs[i] % len(COLORS)]]
184 |             cv2.rectangle(frame, (x, y), (w, h), color, 2)
185 |
186 |             if indexIDs[i] in previous:
187 |                 previous_box = previous[indexIDs[i]]
188 |                 (x2, y2) = (int(previous_box[0]), int(previous_box[1]))
189 |                 (w2, h2) = (int(previous_box[2]), int(previous_box[3]))
190 |                 p0 = (int(x + (w-x)/2), int(y + (h-y)/2))
191 |                 p1 = (int(x2 + (w2-x2)/2), int(y2 + (h2-y2)/2))
192 |                 cv2.line(frame, p0, p1, color, 3)
193 |
194 |                 if intersect(p0, p1, line[0], line[1]):
195 |                     counter += 1
196 |
197 |             # text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
198 |             text = "{}".format(indexIDs[i])
199 |             cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
200 |             i += 1
201 |
202 |     # draw line
203 |     cv2.line(frame, line[0], line[1], (0, 255, 255), 5)
204 |
205 |     # draw counter
206 |     cv2.putText(frame, str(counter), (100,200), cv2.FONT_HERSHEY_DUPLEX, 5.0, (0, 255, 255), 10)
207 |     # counter += 1
208 |
209 |     # saves image file
210 |     cv2.imwrite("output/frame-{}.png".format(frameIndex), frame)
211 |
212 |     # check if the video writer is None
213 |     if writer is None:
214 |         # initialize our video writer
215 |         fourcc = cv2.VideoWriter_fourcc(*"MJPG")
216 |         writer = cv2.VideoWriter(args["output"], fourcc, 30,
217 |             (frame.shape[1], frame.shape[0]), True)
218 |
219 |         # some information on processing single frame
220 |         if total > 0:
221 |             elap = (end - start)
222 |             print("[INFO] single frame took {:.4f} seconds".format(elap))
223 |             print("[INFO] estimated total time to finish: {:.4f}".format(
224 |                 elap * total))
225 |
226 |     # write the output frame to disk
227 |     writer.write(frame)
228 |
229 |     # increase frame index
230 |     frameIndex += 1
231 |
232 |     if frameIndex >= 4000:
233 |         print("[INFO] cleaning up...")
234 |         writer.release()
235 |         vs.release()
236 |         exit()
237 |
238 | # release the file pointers
239 | print("[INFO] cleaning up...")
240 | writer.release()
241 | vs.release()
--------------------------------------------------------------------------------
/output/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/output/.gitkeep
--------------------------------------------------------------------------------
/sort.py:
--------------------------------------------------------------------------------
1 | """
2 | SORT: A Simple, Online and Realtime Tracker
3 | Copyright (C) 2016 Alex Bewley alex@dynamicdetection.com
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation, either version 3 of the License, or
8 | (at your option) any later version.
9 |
10 | This program is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License
16 | along with this program. If not, see .
17 | """
18 | from __future__ import print_function
19 |
20 | from numba import jit
21 | import numpy as np
22 | from sklearn.utils.linear_assignment_ import linear_assignment
23 | from filterpy.kalman import KalmanFilter
24 |
25 | @jit
26 | def iou(bb_test,bb_gt):
27 |   """
28 |   Computes IOU between two bboxes in the form [x1,y1,x2,y2]
29 |   """
30 |   xx1 = np.maximum(bb_test[0], bb_gt[0])
31 |   yy1 = np.maximum(bb_test[1], bb_gt[1])
32 |   xx2 = np.minimum(bb_test[2], bb_gt[2])
33 |   yy2 = np.minimum(bb_test[3], bb_gt[3])
34 |   w = np.maximum(0., xx2 - xx1)
35 |   h = np.maximum(0., yy2 - yy1)
36 |   wh = w * h
37 |   o = wh / ((bb_test[2]-bb_test[0])*(bb_test[3]-bb_test[1])
38 |     + (bb_gt[2]-bb_gt[0])*(bb_gt[3]-bb_gt[1]) - wh)
39 |   return(o)
40 |
41 | def convert_bbox_to_z(bbox):
42 |   """
43 |   Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form
44 |   [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is
45 |   the aspect ratio
46 |   """
47 |   w = bbox[2]-bbox[0]
48 |   h = bbox[3]-bbox[1]
49 |   x = bbox[0]+w/2.
50 |   y = bbox[1]+h/2.
51 |   s = w*h    #scale is just area
52 |   r = w/float(h)
53 |   return np.array([x,y,s,r]).reshape((4,1))
54 |
55 | def convert_x_to_bbox(x,score=None):
56 |   """
57 |   Takes a bounding box in the centre form [x,y,s,r] and returns it in the form
58 |   [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right
59 |   """
60 |   w = np.sqrt(x[2]*x[3])
61 |   h = x[2]/w
62 |   if(score==None):
63 |     return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.]).reshape((1,4))
64 |   else:
65 |     return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.,score]).reshape((1,5))
66 |
67 | class KalmanBoxTracker(object):
68 |   """
69 |   This class represents the internal state of individual tracked objects observed as bbox.
70 |   """
71 |   count = 0
72 |   def __init__(self,bbox):
73 |     """
74 |     Initialises a tracker using initial bounding box.
75 |     """
76 |     #define constant velocity model
77 |     self.kf = KalmanFilter(dim_x=7, dim_z=4)
78 |     self.kf.F = np.array([[1,0,0,0,1,0,0],[0,1,0,0,0,1,0],[0,0,1,0,0,0,1],[0,0,0,1,0,0,0], [0,0,0,0,1,0,0],[0,0,0,0,0,1,0],[0,0,0,0,0,0,1]])
79 |     self.kf.H = np.array([[1,0,0,0,0,0,0],[0,1,0,0,0,0,0],[0,0,1,0,0,0,0],[0,0,0,1,0,0,0]])
80 |
81 |     self.kf.R[2:,2:] *= 10.
82 |     self.kf.P[4:,4:] *= 1000. #give high uncertainty to the unobservable initial velocities
83 |     self.kf.P *= 10.
84 |     self.kf.Q[-1,-1] *= 0.01
85 |     self.kf.Q[4:,4:] *= 0.01
86 |
87 |     self.kf.x[:4] = convert_bbox_to_z(bbox)
88 |     self.time_since_update = 0
89 |     self.id = KalmanBoxTracker.count
90 |     KalmanBoxTracker.count += 1
91 |     self.history = []
92 |     self.hits = 0
93 |     self.hit_streak = 0
94 |     self.age = 0
95 |
96 |   def update(self,bbox):
97 |     """
98 |     Updates the state vector with observed bbox.
99 |     """
100 |     self.time_since_update = 0
101 |     self.history = []
102 |     self.hits += 1
103 |     self.hit_streak += 1
104 |     self.kf.update(convert_bbox_to_z(bbox))
105 |
106 |   def predict(self):
107 |     """
108 |     Advances the state vector and returns the predicted bounding box estimate.
109 |     """
110 |     if((self.kf.x[6]+self.kf.x[2])<=0):
111 |       self.kf.x[6] *= 0.0
112 |     self.kf.predict()
113 |     self.age += 1
114 |     if(self.time_since_update>0):
115 |       self.hit_streak = 0
116 |     self.time_since_update += 1
117 |     self.history.append(convert_x_to_bbox(self.kf.x))
118 |     return self.history[-1]
119 |
120 |   def get_state(self):
121 |     """
122 |     Returns the current bounding box estimate.
123 |     """
124 |     return convert_x_to_bbox(self.kf.x)
125 |
126 | def associate_detections_to_trackers(detections,trackers,iou_threshold = 0.3):
127 |   """
128 |   Assigns detections to tracked object (both represented as bounding boxes)
129 |
130 |   Returns 3 lists of matches, unmatched_detections and unmatched_trackers
131 |   """
132 |   if(len(trackers)==0) or (len(detections)==0):
133 |     return np.empty((0,2),dtype=int), np.arange(len(detections)), np.empty((0,5),dtype=int)
134 |   iou_matrix = np.zeros((len(detections),len(trackers)),dtype=np.float32)
135 |
136 |   for d,det in enumerate(detections):
137 |     for t,trk in enumerate(trackers):
138 |       iou_matrix[d,t] = iou(det,trk)
139 |   matched_indices = linear_assignment(-iou_matrix)
140 |
141 |   unmatched_detections = []
142 |   for d,det in enumerate(detections):
143 |     if(d not in matched_indices[:,0]):
144 |       unmatched_detections.append(d)
145 |   unmatched_trackers = []
146 |   for t,trk in enumerate(trackers):
147 |     if(t not in matched_indices[:,1]):
148 |       unmatched_trackers.append(t)
149 |
150 |   #filter out matched with low IOU
151 |   matches = []
152 |   for m in matched_indices:
153 |     if(iou_matrix[m[0],m[1]]<iou_threshold):
154 |       unmatched_detections.append(m[0])
155 |       unmatched_trackers.append(m[1])
156 |     else:
157 |       matches.append(m.reshape(1,2))
158 |   if(len(matches)==0):
159 |     matches = np.empty((0,2),dtype=int)
160 |   else:
161 |     matches = np.concatenate(matches,axis=0)
162 |
163 |   return matches, np.array(unmatched_detections), np.array(unmatched_trackers)
164 |
165 |
166 | class Sort(object):
167 |   def __init__(self,max_age=1,min_hits=3):
168 |     """
169 |     Sets key parameters for SORT
170 |     """
171 |     self.max_age = max_age
172 |     self.min_hits = min_hits
173 |     self.trackers = []
174 |     self.frame_count = 0
175 |
176 |   def update(self,dets):
177 |     """
178 |     Params:
179 |       dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...]
180 |     Requires: this method must be called once for each frame even with empty detections.
181 |     Returns a similar array, where the last column is the object ID.
182 |
183 |     NOTE: The number of objects returned may differ from the number of detections provided.
184 |     """
185 |     self.frame_count += 1
186 |     #get predicted locations from existing trackers.
187 |     trks = np.zeros((len(self.trackers),5))
188 |     to_del = []
189 |     ret = []
190 |     for t,trk in enumerate(trks):
191 |       pos = self.trackers[t].predict()[0]
192 |       trk[:] = [pos[0], pos[1], pos[2], pos[3], 0]
193 |       if(np.any(np.isnan(pos))):
194 |         to_del.append(t)
195 |     trks = np.ma.compress_rows(np.ma.masked_invalid(trks))
196 |     for t in reversed(to_del):
197 |       self.trackers.pop(t)
198 |     matched, unmatched_dets, unmatched_trks = associate_detections_to_trackers(dets,trks)
199 |
200 |     #update matched trackers with assigned detections
201 |     for t,trk in enumerate(self.trackers):
202 |       if(t not in unmatched_trks):
203 |         d = matched[np.where(matched[:,1]==t)[0],0]
204 |         trk.update(dets[d,:][0])
205 |
206 |     #create and initialise new trackers for unmatched detections
207 |     for i in unmatched_dets:
208 |       trk = KalmanBoxTracker(dets[i,:])
209 |       self.trackers.append(trk)
210 |     i = len(self.trackers)
211 |     for trk in reversed(self.trackers):
212 |       d = trk.get_state()[0]
213 |       if((trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits)):
214 |         ret.append(np.concatenate((d,[trk.id+1])).reshape(1,-1)) # +1 as MOT benchmark requires positive
215 |       i -= 1
216 |       #remove dead tracklet
217 |       if(trk.time_since_update > self.max_age):
218 |         self.trackers.pop(i)
219 |     if(len(ret)>0):
220 |       return np.concatenate(ret)
221 |     return np.empty((0,5))
222 |
--------------------------------------------------------------------------------
/yolo-coco/coco.names:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorbike
5 | aeroplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
--------------------------------------------------------------------------------
/yolo-coco/yolov3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 |
790 |
--------------------------------------------------------------------------------