├── .gitignore
├── README.md
├── highway.gif
├── input
│   └── highway.mp4
├── main.py
├── output
│   └── .gitkeep
├── sort.py
└── yolo-coco
    ├── coco.names
    └── yolov3.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | __pycache__/
3 | output/*
4 | !output/.gitkeep
5 | *.pyc
6 | yolo-coco/yolov3.weights
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python Traffic Counter
2 | 
3 | The purpose of this project is to detect and track vehicles in a video stream and count the ones that cross a defined line.
4 | 
5 | ![highway.gif](highway.gif)
6 | 
7 | It uses:
8 | 
9 | * [YOLO](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv) to detect objects in each video frame.
10 | 
11 | * [SORT](https://github.com/abewley/sort) to track those objects across frames.
12 | 
13 | Once the objects are detected and tracked across frames, a simple line-segment intersection test is applied between each vehicle's previous and current positions and the defined counting line; every crossing increments the counter (a short sketch of this test is given below).
14 | 
15 | This prototype builds on the code structure developed by Adrian Rosebrock for his article [YOLO object detection with OpenCV](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv).
16 | 
17 | ## Quick Start
18 | 
19 | 1. Download the code to your computer.
20 | 2. [Download yolov3.weights](https://www.dropbox.com/s/99mm7olr1ohtjbq/yolov3.weights?dl=0) and place it in `/yolo-coco`.
21 | 3. Make sure you have Python 3.7.0 and [OpenCV 3.4.2](https://www.pyimagesearch.com/opencv-tutorials-resources-guides/) installed.
22 | 4. Run:
23 | ```
24 | $ python main.py --input input/highway.mp4 --output output/highway.avi --yolo yolo-coco
25 | ```
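
## How the counting works

Counting is a plain line-segment intersection test: for every tracked vehicle, the segment joining its previous and current box centres is checked against the counting line, and the counter is incremented on a crossing. The test is the classic counter-clockwise (ccw) orientation check implemented in `main.py`; in the sketch below the counting line is the one hard-coded in `main.py`, while `p0` and `p1` are made-up example coordinates.

```
def ccw(A, B, C):
    # True when points A, B, C are in counter-clockwise order
    return (C[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * (C[0] - A[0])

def intersect(A, B, C, D):
    # segments AB and CD cross iff A and B lie on opposite sides of CD
    # and C and D lie on opposite sides of AB
    return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D)

line = [(43, 543), (550, 655)]   # counting line used in main.py
p0, p1 = (300, 700), (320, 500)  # example: a vehicle centre moving between two frames
if intersect(p0, p1, line[0], line[1]):
    print("vehicle counted")     # this crossing would increment the counter
```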
26 | 
27 | ## Citation
28 | 
29 | ### YOLO:
30 | 
31 | @article{redmon2016yolo9000,
32 |   title={YOLO9000: Better, Faster, Stronger},
33 |   author={Redmon, Joseph and Farhadi, Ali},
34 |   journal={arXiv preprint arXiv:1612.08242},
35 |   year={2016}
36 | }
37 | 
38 | ### SORT:
39 | 
40 | @inproceedings{Bewley2016_sort,
41 |   author={Bewley, Alex and Ge, Zongyuan and Ott, Lionel and Ramos, Fabio and Upcroft, Ben},
42 |   booktitle={2016 IEEE International Conference on Image Processing (ICIP)},
43 |   title={Simple online and realtime tracking},
44 |   year={2016},
45 |   pages={3464-3468},
46 |   keywords={Benchmark testing;Complexity theory;Detectors;Kalman filters;Target tracking;Visualization;Computer Vision;Data Association;Detection;Multiple Object Tracking},
47 |   doi={10.1109/ICIP.2016.7533003}
48 | }
--------------------------------------------------------------------------------
/highway.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/highway.gif
--------------------------------------------------------------------------------
/input/highway.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/input/highway.mp4
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # import the necessary packages
2 | import numpy as np
3 | import argparse
4 | import imutils
5 | import time
6 | import cv2
7 | import os
8 | import glob
9 | 
10 | files = glob.glob('output/*.png')
11 | for f in files:
12 |     os.remove(f)
13 | 
14 | from sort import *
15 | tracker = Sort()
16 | memory = {}
17 | line = [(43, 543), (550, 655)]
18 | counter = 0
19 | 
20 | # construct the argument parse and parse the arguments
21 | ap = argparse.ArgumentParser()
22 | ap.add_argument("-i", "--input", required=True,
23 |     help="path to input video")
24 | ap.add_argument("-o", "--output", required=True,
25 |     help="path to output video")
26 | ap.add_argument("-y", "--yolo", required=True,
27 |     help="base path to YOLO directory")
28 | ap.add_argument("-c", "--confidence", type=float, default=0.5,
29 |     help="minimum probability to filter weak detections")
30 | ap.add_argument("-t", "--threshold", type=float, default=0.3,
31 |     help="threshold when applying non-maxima suppression")
32 | args = vars(ap.parse_args())
33 | 
34 | # Return True if line segments AB and CD intersect
35 | def intersect(A,B,C,D):
36 |     return ccw(A,C,D) != ccw(B,C,D) and ccw(A,B,C) != ccw(A,B,D)
37 | 
38 | def ccw(A,B,C):
39 |     return (C[1]-A[1]) * (B[0]-A[0]) > (B[1]-A[1]) * (C[0]-A[0])
40 | 
41 | # load the COCO class labels our YOLO model was trained on
42 | labelsPath = os.path.sep.join([args["yolo"], "coco.names"])
43 | LABELS = open(labelsPath).read().strip().split("\n")
44 | 
45 | # initialize a list of colors to represent each possible class label
46 | np.random.seed(42)
47 | COLORS = np.random.randint(0, 255, size=(200, 3),
48 |     dtype="uint8")
49 | 
50 | # derive the paths to the YOLO weights and model configuration
51 | weightsPath = os.path.sep.join([args["yolo"], "yolov3.weights"])
52 | configPath = os.path.sep.join([args["yolo"], "yolov3.cfg"])
53 | 
54 | # load our YOLO object detector trained on COCO dataset (80 classes)
55 | # and determine only the *output* layer names that we need from YOLO
56 | print("[INFO] loading YOLO from disk...")
57 | net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
58 | ln = net.getLayerNames()
59 | ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]  # OpenCV 3.x index format; use ln[i - 1] on OpenCV 4.x
60 | 
61 | # initialize the video stream, pointer to output video file, and
62 | # frame dimensions
63 | vs = cv2.VideoCapture(args["input"])
64 | writer = None
65 | (W, H) = (None, None)
66 | 
67 | frameIndex = 0
68 | 
69 | # try to determine the total number of frames in the video file
70 | try:
71 |     prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
72 |         else cv2.CAP_PROP_FRAME_COUNT
73 |     total = int(vs.get(prop))
74 |     print("[INFO] {} total frames in video".format(total))
75 | 
76 | # an error occurred while trying to determine the total
77 | # number of frames in the video file
78 | except:
79 |     print("[INFO] could not determine # of frames in video")
80 |     print("[INFO] no approx. 
completion time can be provided") 81 | total = -1 82 | 83 | # loop over frames from the video file stream 84 | while True: 85 | # read the next frame from the file 86 | (grabbed, frame) = vs.read() 87 | 88 | # if the frame was not grabbed, then we have reached the end 89 | # of the stream 90 | if not grabbed: 91 | break 92 | 93 | # if the frame dimensions are empty, grab them 94 | if W is None or H is None: 95 | (H, W) = frame.shape[:2] 96 | 97 | # construct a blob from the input frame and then perform a forward 98 | # pass of the YOLO object detector, giving us our bounding boxes 99 | # and associated probabilities 100 | blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), 101 | swapRB=True, crop=False) 102 | net.setInput(blob) 103 | start = time.time() 104 | layerOutputs = net.forward(ln) 105 | end = time.time() 106 | 107 | # initialize our lists of detected bounding boxes, confidences, 108 | # and class IDs, respectively 109 | boxes = [] 110 | confidences = [] 111 | classIDs = [] 112 | 113 | # loop over each of the layer outputs 114 | for output in layerOutputs: 115 | # loop over each of the detections 116 | for detection in output: 117 | # extract the class ID and confidence (i.e., probability) 118 | # of the current object detection 119 | scores = detection[5:] 120 | classID = np.argmax(scores) 121 | confidence = scores[classID] 122 | 123 | # filter out weak predictions by ensuring the detected 124 | # probability is greater than the minimum probability 125 | if confidence > args["confidence"]: 126 | # scale the bounding box coordinates back relative to 127 | # the size of the image, keeping in mind that YOLO 128 | # actually returns the center (x, y)-coordinates of 129 | # the bounding box followed by the boxes' width and 130 | # height 131 | box = detection[0:4] * np.array([W, H, W, H]) 132 | (centerX, centerY, width, height) = box.astype("int") 133 | 134 | # use the center (x, y)-coordinates to derive the top 135 | # and and left corner of the bounding box 136 | x = int(centerX - (width / 2)) 137 | y = int(centerY - (height / 2)) 138 | 139 | # update our list of bounding box coordinates, 140 | # confidences, and class IDs 141 | boxes.append([x, y, int(width), int(height)]) 142 | confidences.append(float(confidence)) 143 | classIDs.append(classID) 144 | 145 | # apply non-maxima suppression to suppress weak, overlapping 146 | # bounding boxes 147 | idxs = cv2.dnn.NMSBoxes(boxes, confidences, args["confidence"], args["threshold"]) 148 | 149 | dets = [] 150 | if len(idxs) > 0: 151 | # loop over the indexes we are keeping 152 | for i in idxs.flatten(): 153 | (x, y) = (boxes[i][0], boxes[i][1]) 154 | (w, h) = (boxes[i][2], boxes[i][3]) 155 | dets.append([x, y, x+w, y+h, confidences[i]]) 156 | 157 | np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)}) 158 | dets = np.asarray(dets) 159 | tracks = tracker.update(dets) 160 | 161 | boxes = [] 162 | indexIDs = [] 163 | c = [] 164 | previous = memory.copy() 165 | memory = {} 166 | 167 | for track in tracks: 168 | boxes.append([track[0], track[1], track[2], track[3]]) 169 | indexIDs.append(int(track[4])) 170 | memory[indexIDs[-1]] = boxes[-1] 171 | 172 | if len(boxes) > 0: 173 | i = int(0) 174 | for box in boxes: 175 | # extract the bounding box coordinates 176 | (x, y) = (int(box[0]), int(box[1])) 177 | (w, h) = (int(box[2]), int(box[3])) 178 | 179 | # draw a bounding box rectangle and label on the image 180 | # color = [int(c) for c in COLORS[classIDs[i]]] 181 | # cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2) 
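# The boxes returned by SORT are in [x1, y1, x2, y2] form, so the variables named
# w and h above actually hold the bottom-right corner; that is why cv2.rectangle
# below is called with (x, y) and (w, h) directly, and why the box centre is
# computed as (x + (w - x) / 2, y + (h - y) / 2). Each track is drawn in a colour
# keyed to its SORT ID, and the counter is incremented whenever the segment
# between the previous and current centres crosses the counting line.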
182 | 183 | color = [int(c) for c in COLORS[indexIDs[i] % len(COLORS)]] 184 | cv2.rectangle(frame, (x, y), (w, h), color, 2) 185 | 186 | if indexIDs[i] in previous: 187 | previous_box = previous[indexIDs[i]] 188 | (x2, y2) = (int(previous_box[0]), int(previous_box[1])) 189 | (w2, h2) = (int(previous_box[2]), int(previous_box[3])) 190 | p0 = (int(x + (w-x)/2), int(y + (h-y)/2)) 191 | p1 = (int(x2 + (w2-x2)/2), int(y2 + (h2-y2)/2)) 192 | cv2.line(frame, p0, p1, color, 3) 193 | 194 | if intersect(p0, p1, line[0], line[1]): 195 | counter += 1 196 | 197 | # text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i]) 198 | text = "{}".format(indexIDs[i]) 199 | cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) 200 | i += 1 201 | 202 | # draw line 203 | cv2.line(frame, line[0], line[1], (0, 255, 255), 5) 204 | 205 | # draw counter 206 | cv2.putText(frame, str(counter), (100,200), cv2.FONT_HERSHEY_DUPLEX, 5.0, (0, 255, 255), 10) 207 | # counter += 1 208 | 209 | # saves image file 210 | cv2.imwrite("output/frame-{}.png".format(frameIndex), frame) 211 | 212 | # check if the video writer is None 213 | if writer is None: 214 | # initialize our video writer 215 | fourcc = cv2.VideoWriter_fourcc(*"MJPG") 216 | writer = cv2.VideoWriter(args["output"], fourcc, 30, 217 | (frame.shape[1], frame.shape[0]), True) 218 | 219 | # some information on processing single frame 220 | if total > 0: 221 | elap = (end - start) 222 | print("[INFO] single frame took {:.4f} seconds".format(elap)) 223 | print("[INFO] estimated total time to finish: {:.4f}".format( 224 | elap * total)) 225 | 226 | # write the output frame to disk 227 | writer.write(frame) 228 | 229 | # increase frame index 230 | frameIndex += 1 231 | 232 | if frameIndex >= 4000: 233 | print("[INFO] cleaning up...") 234 | writer.release() 235 | vs.release() 236 | exit() 237 | 238 | # release the file pointers 239 | print("[INFO] cleaning up...") 240 | writer.release() 241 | vs.release() -------------------------------------------------------------------------------- /output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/output/.gitkeep -------------------------------------------------------------------------------- /sort.py: -------------------------------------------------------------------------------- 1 | """ 2 | SORT: A Simple, Online and Realtime Tracker 3 | Copyright (C) 2016 Alex Bewley alex@dynamicdetection.com 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | """ 18 | from __future__ import print_function 19 | 20 | from numba import jit 21 | import numpy as np 22 | from sklearn.utils.linear_assignment_ import linear_assignment 23 | from filterpy.kalman import KalmanFilter 24 | 25 | @jit 26 | def iou(bb_test,bb_gt): 27 | """ 28 | Computes IUO between two bboxes in the form [x1,y1,x2,y2] 29 | """ 30 | xx1 = np.maximum(bb_test[0], bb_gt[0]) 31 | yy1 = np.maximum(bb_test[1], bb_gt[1]) 32 | xx2 = np.minimum(bb_test[2], bb_gt[2]) 33 | yy2 = np.minimum(bb_test[3], bb_gt[3]) 34 | w = np.maximum(0., xx2 - xx1) 35 | h = np.maximum(0., yy2 - yy1) 36 | wh = w * h 37 | o = wh / ((bb_test[2]-bb_test[0])*(bb_test[3]-bb_test[1]) 38 | + (bb_gt[2]-bb_gt[0])*(bb_gt[3]-bb_gt[1]) - wh) 39 | return(o) 40 | 41 | def convert_bbox_to_z(bbox): 42 | """ 43 | Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form 44 | [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is 45 | the aspect ratio 46 | """ 47 | w = bbox[2]-bbox[0] 48 | h = bbox[3]-bbox[1] 49 | x = bbox[0]+w/2. 50 | y = bbox[1]+h/2. 51 | s = w*h #scale is just area 52 | r = w/float(h) 53 | return np.array([x,y,s,r]).reshape((4,1)) 54 | 55 | def convert_x_to_bbox(x,score=None): 56 | """ 57 | Takes a bounding box in the centre form [x,y,s,r] and returns it in the form 58 | [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right 59 | """ 60 | w = np.sqrt(x[2]*x[3]) 61 | h = x[2]/w 62 | if(score==None): 63 | return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.]).reshape((1,4)) 64 | else: 65 | return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.,score]).reshape((1,5)) 66 | 67 | class KalmanBoxTracker(object): 68 | """ 69 | This class represents the internel state of individual tracked objects observed as bbox. 70 | """ 71 | count = 0 72 | def __init__(self,bbox): 73 | """ 74 | Initialises a tracker using initial bounding box. 75 | """ 76 | #define constant velocity model 77 | self.kf = KalmanFilter(dim_x=7, dim_z=4) 78 | self.kf.F = np.array([[1,0,0,0,1,0,0],[0,1,0,0,0,1,0],[0,0,1,0,0,0,1],[0,0,0,1,0,0,0], [0,0,0,0,1,0,0],[0,0,0,0,0,1,0],[0,0,0,0,0,0,1]]) 79 | self.kf.H = np.array([[1,0,0,0,0,0,0],[0,1,0,0,0,0,0],[0,0,1,0,0,0,0],[0,0,0,1,0,0,0]]) 80 | 81 | self.kf.R[2:,2:] *= 10. 82 | self.kf.P[4:,4:] *= 1000. #give high uncertainty to the unobservable initial velocities 83 | self.kf.P *= 10. 84 | self.kf.Q[-1,-1] *= 0.01 85 | self.kf.Q[4:,4:] *= 0.01 86 | 87 | self.kf.x[:4] = convert_bbox_to_z(bbox) 88 | self.time_since_update = 0 89 | self.id = KalmanBoxTracker.count 90 | KalmanBoxTracker.count += 1 91 | self.history = [] 92 | self.hits = 0 93 | self.hit_streak = 0 94 | self.age = 0 95 | 96 | def update(self,bbox): 97 | """ 98 | Updates the state vector with observed bbox. 99 | """ 100 | self.time_since_update = 0 101 | self.history = [] 102 | self.hits += 1 103 | self.hit_streak += 1 104 | self.kf.update(convert_bbox_to_z(bbox)) 105 | 106 | def predict(self): 107 | """ 108 | Advances the state vector and returns the predicted bounding box estimate. 109 | """ 110 | if((self.kf.x[6]+self.kf.x[2])<=0): 111 | self.kf.x[6] *= 0.0 112 | self.kf.predict() 113 | self.age += 1 114 | if(self.time_since_update>0): 115 | self.hit_streak = 0 116 | self.time_since_update += 1 117 | self.history.append(convert_x_to_bbox(self.kf.x)) 118 | return self.history[-1] 119 | 120 | def get_state(self): 121 | """ 122 | Returns the current bounding box estimate. 
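    The box is returned in [x1,y1,x2,y2] corner form (see convert_x_to_bbox).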
123 | """ 124 | return convert_x_to_bbox(self.kf.x) 125 | 126 | def associate_detections_to_trackers(detections,trackers,iou_threshold = 0.3): 127 | """ 128 | Assigns detections to tracked object (both represented as bounding boxes) 129 | 130 | Returns 3 lists of matches, unmatched_detections and unmatched_trackers 131 | """ 132 | if(len(trackers)==0) or (len(detections)==0): 133 | return np.empty((0,2),dtype=int), np.arange(len(detections)), np.empty((0,5),dtype=int) 134 | iou_matrix = np.zeros((len(detections),len(trackers)),dtype=np.float32) 135 | 136 | for d,det in enumerate(detections): 137 | for t,trk in enumerate(trackers): 138 | iou_matrix[d,t] = iou(det,trk) 139 | matched_indices = linear_assignment(-iou_matrix) 140 | 141 | unmatched_detections = [] 142 | for d,det in enumerate(detections): 143 | if(d not in matched_indices[:,0]): 144 | unmatched_detections.append(d) 145 | unmatched_trackers = [] 146 | for t,trk in enumerate(trackers): 147 | if(t not in matched_indices[:,1]): 148 | unmatched_trackers.append(t) 149 | 150 | #filter out matched with low IOU 151 | matches = [] 152 | for m in matched_indices: 153 | if(iou_matrix[m[0],m[1]]= self.min_hits or self.frame_count <= self.min_hits)): 213 | ret.append(np.concatenate((d,[trk.id+1])).reshape(1,-1)) # +1 as MOT benchmark requires positive 214 | i -= 1 215 | #remove dead tracklet 216 | if(trk.time_since_update > self.max_age): 217 | self.trackers.pop(i) 218 | if(len(ret)>0): 219 | return np.concatenate(ret) 220 | return np.empty((0,5)) 221 | -------------------------------------------------------------------------------- /yolo-coco/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /yolo-coco/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 
| activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 
264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 
| 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | 
num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | --------------------------------------------------------------------------------