├── .gitignore
├── README.md
├── highway.gif
├── input
│   └── highway.mp4
├── main.py
├── output
│   └── .gitkeep
├── sort.py
└── yolo-coco
    ├── coco.names
    └── yolov3.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | __pycache__/
3 | output/*
4 | !output/.gitkeep
5 | *.pyc
6 | yolo-coco/yolov3.weights
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python Traffic Counter
2 | 
3 | The purpose of this project is to detect and track vehicles in a video stream and count the ones that cross a defined line.
4 | 
5 | ![highway.gif](highway.gif)
6 | 
7 | It uses:
8 | 
9 | * [YOLO](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv) to detect objects in each video frame.
10 | 
11 | * [SORT](https://github.com/abewley/sort) to track those objects across frames.
12 | 
13 | Once the objects are detected and tracked across frames, a simple line-segment intersection test is applied between each vehicle's previous and current positions and the defined counting line; every crossing increments the counter (a short sketch of this test is given below).
14 | 
15 | This prototype builds on the code structure developed by Adrian Rosebrock for his article [YOLO object detection with OpenCV](https://www.pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv).
16 | 
17 | ## Quick Start
18 | 
19 | 1. Download the code to your computer.
20 | 2. [Download yolov3.weights](https://www.dropbox.com/s/99mm7olr1ohtjbq/yolov3.weights?dl=0) and place it in `/yolo-coco`.
21 | 3. Make sure you have Python 3.7.0 and [OpenCV 3.4.2](https://www.pyimagesearch.com/opencv-tutorials-resources-guides/) installed.
22 | 4. Run:
23 | ```
24 | $ python main.py --input input/highway.mp4 --output output/highway.avi --yolo yolo-coco
25 | ```
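
## How the counting works

Counting is a plain line-segment intersection test: for every tracked vehicle, the segment joining its previous and current box centres is checked against the counting line, and the counter is incremented on a crossing. The test is the classic counter-clockwise (ccw) orientation check implemented in `main.py`; in the sketch below the counting line is the one hard-coded in `main.py`, while `p0` and `p1` are made-up example coordinates.

```
def ccw(A, B, C):
    # True when points A, B, C are in counter-clockwise order
    return (C[1] - A[1]) * (B[0] - A[0]) > (B[1] - A[1]) * (C[0] - A[0])

def intersect(A, B, C, D):
    # segments AB and CD cross iff A and B lie on opposite sides of CD
    # and C and D lie on opposite sides of AB
    return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D)

line = [(43, 543), (550, 655)]   # counting line used in main.py
p0, p1 = (300, 700), (320, 500)  # example: a vehicle centre moving between two frames
if intersect(p0, p1, line[0], line[1]):
    print("vehicle counted")     # this crossing would increment the counter
```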
26 | 
27 | ## Citation
28 | 
29 | ### YOLO:
30 | 
31 | @article{redmon2016yolo9000,
32 |   title={YOLO9000: Better, Faster, Stronger},
33 |   author={Redmon, Joseph and Farhadi, Ali},
34 |   journal={arXiv preprint arXiv:1612.08242},
35 |   year={2016}
36 | }
37 | 
38 | ### SORT:
39 | 
40 | @inproceedings{Bewley2016_sort,
41 |   author={Bewley, Alex and Ge, Zongyuan and Ott, Lionel and Ramos, Fabio and Upcroft, Ben},
42 |   booktitle={2016 IEEE International Conference on Image Processing (ICIP)},
43 |   title={Simple online and realtime tracking},
44 |   year={2016},
45 |   pages={3464-3468},
46 |   keywords={Benchmark testing;Complexity theory;Detectors;Kalman filters;Target tracking;Visualization;Computer Vision;Data Association;Detection;Multiple Object Tracking},
47 |   doi={10.1109/ICIP.2016.7533003}
48 | }
--------------------------------------------------------------------------------
/highway.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/highway.gif
--------------------------------------------------------------------------------
/input/highway.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/input/highway.mp4
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # import the necessary packages
2 | import numpy as np
3 | import argparse
4 | import imutils
5 | import time
6 | import cv2
7 | import os
8 | import glob
9 | 
10 | files = glob.glob('output/*.png')
11 | for f in files:
12 |     os.remove(f)
13 | 
14 | from sort import *
15 | tracker = Sort()
16 | memory = {}
17 | line = [(43, 543), (550, 655)]
18 | counter = 0
19 | 
20 | # construct the argument parse and parse the arguments
21 | ap = argparse.ArgumentParser()
22 | ap.add_argument("-i", "--input", required=True,
23 |     help="path to input video")
24 | ap.add_argument("-o", "--output", required=True,
25 |     help="path to output video")
26 | ap.add_argument("-y", "--yolo", required=True,
27 |     help="base path to YOLO directory")
28 | ap.add_argument("-c", "--confidence", type=float, default=0.5,
29 |     help="minimum probability to filter weak detections")
30 | ap.add_argument("-t", "--threshold", type=float, default=0.3,
31 |     help="threshold when applying non-maxima suppression")
32 | args = vars(ap.parse_args())
33 | 
34 | # Return True if line segments AB and CD intersect
35 | def intersect(A,B,C,D):
36 |     return ccw(A,C,D) != ccw(B,C,D) and ccw(A,B,C) != ccw(A,B,D)
37 | 
38 | def ccw(A,B,C):
39 |     return (C[1]-A[1]) * (B[0]-A[0]) > (B[1]-A[1]) * (C[0]-A[0])
40 | 
41 | # load the COCO class labels our YOLO model was trained on
42 | labelsPath = os.path.sep.join([args["yolo"], "coco.names"])
43 | LABELS = open(labelsPath).read().strip().split("\n")
44 | 
45 | # initialize a list of colors to represent each possible class label
46 | np.random.seed(42)
47 | COLORS = np.random.randint(0, 255, size=(200, 3),
48 |     dtype="uint8")
49 | 
50 | # derive the paths to the YOLO weights and model configuration
51 | weightsPath = os.path.sep.join([args["yolo"], "yolov3.weights"])
52 | configPath = os.path.sep.join([args["yolo"], "yolov3.cfg"])
53 | 
54 | # load our YOLO object detector trained on COCO dataset (80 classes)
55 | # and determine only the *output* layer names that we need from YOLO
56 | print("[INFO] loading YOLO from disk...")
57 | net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
58 | ln = net.getLayerNames()
59 | ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]  # OpenCV 3.x index format; use ln[i - 1] on OpenCV 4.x
60 | 
61 | # initialize the video stream, pointer to output video file, and
62 | # frame dimensions
63 | vs = cv2.VideoCapture(args["input"])
64 | writer = None
65 | (W, H) = (None, None)
66 | 
67 | frameIndex = 0
68 | 
69 | # try to determine the total number of frames in the video file
70 | try:
71 |     prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
72 |         else cv2.CAP_PROP_FRAME_COUNT
73 |     total = int(vs.get(prop))
74 |     print("[INFO] {} total frames in video".format(total))
75 | 
76 | # an error occurred while trying to determine the total
77 | # number of frames in the video file
78 | except:
79 |     print("[INFO] could not determine # of frames in video")
80 |     print("[INFO] no approx. 
completion time can be provided") 81 | total = -1 82 | 83 | # loop over frames from the video file stream 84 | while True: 85 | # read the next frame from the file 86 | (grabbed, frame) = vs.read() 87 | 88 | # if the frame was not grabbed, then we have reached the end 89 | # of the stream 90 | if not grabbed: 91 | break 92 | 93 | # if the frame dimensions are empty, grab them 94 | if W is None or H is None: 95 | (H, W) = frame.shape[:2] 96 | 97 | # construct a blob from the input frame and then perform a forward 98 | # pass of the YOLO object detector, giving us our bounding boxes 99 | # and associated probabilities 100 | blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), 101 | swapRB=True, crop=False) 102 | net.setInput(blob) 103 | start = time.time() 104 | layerOutputs = net.forward(ln) 105 | end = time.time() 106 | 107 | # initialize our lists of detected bounding boxes, confidences, 108 | # and class IDs, respectively 109 | boxes = [] 110 | confidences = [] 111 | classIDs = [] 112 | 113 | # loop over each of the layer outputs 114 | for output in layerOutputs: 115 | # loop over each of the detections 116 | for detection in output: 117 | # extract the class ID and confidence (i.e., probability) 118 | # of the current object detection 119 | scores = detection[5:] 120 | classID = np.argmax(scores) 121 | confidence = scores[classID] 122 | 123 | # filter out weak predictions by ensuring the detected 124 | # probability is greater than the minimum probability 125 | if confidence > args["confidence"]: 126 | # scale the bounding box coordinates back relative to 127 | # the size of the image, keeping in mind that YOLO 128 | # actually returns the center (x, y)-coordinates of 129 | # the bounding box followed by the boxes' width and 130 | # height 131 | box = detection[0:4] * np.array([W, H, W, H]) 132 | (centerX, centerY, width, height) = box.astype("int") 133 | 134 | # use the center (x, y)-coordinates to derive the top 135 | # and and left corner of the bounding box 136 | x = int(centerX - (width / 2)) 137 | y = int(centerY - (height / 2)) 138 | 139 | # update our list of bounding box coordinates, 140 | # confidences, and class IDs 141 | boxes.append([x, y, int(width), int(height)]) 142 | confidences.append(float(confidence)) 143 | classIDs.append(classID) 144 | 145 | # apply non-maxima suppression to suppress weak, overlapping 146 | # bounding boxes 147 | idxs = cv2.dnn.NMSBoxes(boxes, confidences, args["confidence"], args["threshold"]) 148 | 149 | dets = [] 150 | if len(idxs) > 0: 151 | # loop over the indexes we are keeping 152 | for i in idxs.flatten(): 153 | (x, y) = (boxes[i][0], boxes[i][1]) 154 | (w, h) = (boxes[i][2], boxes[i][3]) 155 | dets.append([x, y, x+w, y+h, confidences[i]]) 156 | 157 | np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)}) 158 | dets = np.asarray(dets) 159 | tracks = tracker.update(dets) 160 | 161 | boxes = [] 162 | indexIDs = [] 163 | c = [] 164 | previous = memory.copy() 165 | memory = {} 166 | 167 | for track in tracks: 168 | boxes.append([track[0], track[1], track[2], track[3]]) 169 | indexIDs.append(int(track[4])) 170 | memory[indexIDs[-1]] = boxes[-1] 171 | 172 | if len(boxes) > 0: 173 | i = int(0) 174 | for box in boxes: 175 | # extract the bounding box coordinates 176 | (x, y) = (int(box[0]), int(box[1])) 177 | (w, h) = (int(box[2]), int(box[3])) 178 | 179 | # draw a bounding box rectangle and label on the image 180 | # color = [int(c) for c in COLORS[classIDs[i]]] 181 | # cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2) 
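# The boxes returned by SORT are in [x1, y1, x2, y2] form, so the variables named
# w and h above actually hold the bottom-right corner; that is why cv2.rectangle
# below is called with (x, y) and (w, h) directly, and why the box centre is
# computed as (x + (w - x) / 2, y + (h - y) / 2). Each track is drawn in a colour
# keyed to its SORT ID, and the counter is incremented whenever the segment
# between the previous and current centres crosses the counting line.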
182 | 183 | color = [int(c) for c in COLORS[indexIDs[i] % len(COLORS)]] 184 | cv2.rectangle(frame, (x, y), (w, h), color, 2) 185 | 186 | if indexIDs[i] in previous: 187 | previous_box = previous[indexIDs[i]] 188 | (x2, y2) = (int(previous_box[0]), int(previous_box[1])) 189 | (w2, h2) = (int(previous_box[2]), int(previous_box[3])) 190 | p0 = (int(x + (w-x)/2), int(y + (h-y)/2)) 191 | p1 = (int(x2 + (w2-x2)/2), int(y2 + (h2-y2)/2)) 192 | cv2.line(frame, p0, p1, color, 3) 193 | 194 | if intersect(p0, p1, line[0], line[1]): 195 | counter += 1 196 | 197 | # text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i]) 198 | text = "{}".format(indexIDs[i]) 199 | cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) 200 | i += 1 201 | 202 | # draw line 203 | cv2.line(frame, line[0], line[1], (0, 255, 255), 5) 204 | 205 | # draw counter 206 | cv2.putText(frame, str(counter), (100,200), cv2.FONT_HERSHEY_DUPLEX, 5.0, (0, 255, 255), 10) 207 | # counter += 1 208 | 209 | # saves image file 210 | cv2.imwrite("output/frame-{}.png".format(frameIndex), frame) 211 | 212 | # check if the video writer is None 213 | if writer is None: 214 | # initialize our video writer 215 | fourcc = cv2.VideoWriter_fourcc(*"MJPG") 216 | writer = cv2.VideoWriter(args["output"], fourcc, 30, 217 | (frame.shape[1], frame.shape[0]), True) 218 | 219 | # some information on processing single frame 220 | if total > 0: 221 | elap = (end - start) 222 | print("[INFO] single frame took {:.4f} seconds".format(elap)) 223 | print("[INFO] estimated total time to finish: {:.4f}".format( 224 | elap * total)) 225 | 226 | # write the output frame to disk 227 | writer.write(frame) 228 | 229 | # increase frame index 230 | frameIndex += 1 231 | 232 | if frameIndex >= 4000: 233 | print("[INFO] cleaning up...") 234 | writer.release() 235 | vs.release() 236 | exit() 237 | 238 | # release the file pointers 239 | print("[INFO] cleaning up...") 240 | writer.release() 241 | vs.release() -------------------------------------------------------------------------------- /output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HodenX/python-traffic-counter-with-yolo-and-sort/580426ce0992095311b8145aeb2a65fb80336b68/output/.gitkeep -------------------------------------------------------------------------------- /sort.py: -------------------------------------------------------------------------------- 1 | """ 2 | SORT: A Simple, Online and Realtime Tracker 3 | Copyright (C) 2016 Alex Bewley alex@dynamicdetection.com 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | """ 18 | from __future__ import print_function 19 | 20 | from numba import jit 21 | import numpy as np 22 | from sklearn.utils.linear_assignment_ import linear_assignment 23 | from filterpy.kalman import KalmanFilter 24 | 25 | @jit 26 | def iou(bb_test,bb_gt): 27 | """ 28 | Computes IUO between two bboxes in the form [x1,y1,x2,y2] 29 | """ 30 | xx1 = np.maximum(bb_test[0], bb_gt[0]) 31 | yy1 = np.maximum(bb_test[1], bb_gt[1]) 32 | xx2 = np.minimum(bb_test[2], bb_gt[2]) 33 | yy2 = np.minimum(bb_test[3], bb_gt[3]) 34 | w = np.maximum(0., xx2 - xx1) 35 | h = np.maximum(0., yy2 - yy1) 36 | wh = w * h 37 | o = wh / ((bb_test[2]-bb_test[0])*(bb_test[3]-bb_test[1]) 38 | + (bb_gt[2]-bb_gt[0])*(bb_gt[3]-bb_gt[1]) - wh) 39 | return(o) 40 | 41 | def convert_bbox_to_z(bbox): 42 | """ 43 | Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form 44 | [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is 45 | the aspect ratio 46 | """ 47 | w = bbox[2]-bbox[0] 48 | h = bbox[3]-bbox[1] 49 | x = bbox[0]+w/2. 50 | y = bbox[1]+h/2. 51 | s = w*h #scale is just area 52 | r = w/float(h) 53 | return np.array([x,y,s,r]).reshape((4,1)) 54 | 55 | def convert_x_to_bbox(x,score=None): 56 | """ 57 | Takes a bounding box in the centre form [x,y,s,r] and returns it in the form 58 | [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right 59 | """ 60 | w = np.sqrt(x[2]*x[3]) 61 | h = x[2]/w 62 | if(score==None): 63 | return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.]).reshape((1,4)) 64 | else: 65 | return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.,score]).reshape((1,5)) 66 | 67 | class KalmanBoxTracker(object): 68 | """ 69 | This class represents the internel state of individual tracked objects observed as bbox. 70 | """ 71 | count = 0 72 | def __init__(self,bbox): 73 | """ 74 | Initialises a tracker using initial bounding box. 75 | """ 76 | #define constant velocity model 77 | self.kf = KalmanFilter(dim_x=7, dim_z=4) 78 | self.kf.F = np.array([[1,0,0,0,1,0,0],[0,1,0,0,0,1,0],[0,0,1,0,0,0,1],[0,0,0,1,0,0,0], [0,0,0,0,1,0,0],[0,0,0,0,0,1,0],[0,0,0,0,0,0,1]]) 79 | self.kf.H = np.array([[1,0,0,0,0,0,0],[0,1,0,0,0,0,0],[0,0,1,0,0,0,0],[0,0,0,1,0,0,0]]) 80 | 81 | self.kf.R[2:,2:] *= 10. 82 | self.kf.P[4:,4:] *= 1000. #give high uncertainty to the unobservable initial velocities 83 | self.kf.P *= 10. 84 | self.kf.Q[-1,-1] *= 0.01 85 | self.kf.Q[4:,4:] *= 0.01 86 | 87 | self.kf.x[:4] = convert_bbox_to_z(bbox) 88 | self.time_since_update = 0 89 | self.id = KalmanBoxTracker.count 90 | KalmanBoxTracker.count += 1 91 | self.history = [] 92 | self.hits = 0 93 | self.hit_streak = 0 94 | self.age = 0 95 | 96 | def update(self,bbox): 97 | """ 98 | Updates the state vector with observed bbox. 99 | """ 100 | self.time_since_update = 0 101 | self.history = [] 102 | self.hits += 1 103 | self.hit_streak += 1 104 | self.kf.update(convert_bbox_to_z(bbox)) 105 | 106 | def predict(self): 107 | """ 108 | Advances the state vector and returns the predicted bounding box estimate. 109 | """ 110 | if((self.kf.x[6]+self.kf.x[2])<=0): 111 | self.kf.x[6] *= 0.0 112 | self.kf.predict() 113 | self.age += 1 114 | if(self.time_since_update>0): 115 | self.hit_streak = 0 116 | self.time_since_update += 1 117 | self.history.append(convert_x_to_bbox(self.kf.x)) 118 | return self.history[-1] 119 | 120 | def get_state(self): 121 | """ 122 | Returns the current bounding box estimate. 
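    The box is returned in [x1,y1,x2,y2] corner form (see convert_x_to_bbox).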
123 | """ 124 | return convert_x_to_bbox(self.kf.x) 125 | 126 | def associate_detections_to_trackers(detections,trackers,iou_threshold = 0.3): 127 | """ 128 | Assigns detections to tracked object (both represented as bounding boxes) 129 | 130 | Returns 3 lists of matches, unmatched_detections and unmatched_trackers 131 | """ 132 | if(len(trackers)==0) or (len(detections)==0): 133 | return np.empty((0,2),dtype=int), np.arange(len(detections)), np.empty((0,5),dtype=int) 134 | iou_matrix = np.zeros((len(detections),len(trackers)),dtype=np.float32) 135 | 136 | for d,det in enumerate(detections): 137 | for t,trk in enumerate(trackers): 138 | iou_matrix[d,t] = iou(det,trk) 139 | matched_indices = linear_assignment(-iou_matrix) 140 | 141 | unmatched_detections = [] 142 | for d,det in enumerate(detections): 143 | if(d not in matched_indices[:,0]): 144 | unmatched_detections.append(d) 145 | unmatched_trackers = [] 146 | for t,trk in enumerate(trackers): 147 | if(t not in matched_indices[:,1]): 148 | unmatched_trackers.append(t) 149 | 150 | #filter out matched with low IOU 151 | matches = [] 152 | for m in matched_indices: 153 | if(iou_matrix[m[0],m[1]]= self.min_hits or self.frame_count <= self.min_hits)): 213 | ret.append(np.concatenate((d,[trk.id+1])).reshape(1,-1)) # +1 as MOT benchmark requires positive 214 | i -= 1 215 | #remove dead tracklet 216 | if(trk.time_since_update > self.max_age): 217 | self.trackers.pop(i) 218 | if(len(ret)>0): 219 | return np.concatenate(ret) 220 | return np.empty((0,5)) 221 | -------------------------------------------------------------------------------- /yolo-coco/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /yolo-coco/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 
| activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 
264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 
| 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | 
num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | --------------------------------------------------------------------------------