├── DeepSORT_Tracker.py
└── README.md

/DeepSORT_Tracker.py:
--------------------------------------------------------------------------------
import numpy as np
import os
from time import time

import cv2
import torch
import ultralytics
from ultralytics import YOLO
import supervision as sv
from deep_sort_realtime.deepsort_tracker import DeepSort


class Tracker:
    def __init__(self, capture_index):
        self.capture_index = capture_index

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print("Using Device: ", self.device)

        self.model = self.load_model()
        self.CLASS_NAMES_DICT = self.model.model.names

        # Annotator used to draw boxes and track-ID labels on each frame
        self.box_annotator = sv.BoxAnnotator(color=sv.Color.green(), thickness=2, text_thickness=1, text_scale=0.5)

        # DeepSORT tracker: a track is confirmed after 2 consecutive hits (n_init),
        # dropped after 5 missed frames (max_age), and appearance embeddings come
        # from the built-in MobileNet embedder
        self.object_tracker = DeepSort(max_age=5,
                                       n_init=2,
                                       nms_max_overlap=1.0,
                                       max_cosine_distance=0.3,
                                       nn_budget=None,
                                       override_track_class=None,
                                       embedder="mobilenet",
                                       half=True,
                                       bgr=True,
                                       embedder_gpu=True,
                                       embedder_model_name=None,
                                       embedder_wts=None,
                                       polygon=False,
                                       today=None)

        self.thr = 0.3  # detection confidence threshold

    def load_model(self):
        model = YOLO("yolov8m.pt")
        model.fuse()
        return model

    def predict(self, frame):
        results = self.model(frame)
        return results

    def generate_bboxes(self, results):
        '''
        Transforms coordinates from the YOLO detector format [x1, y1, x2, y2]
        to the DeepSORT tracker format [left, top, w, h].
        Returns a list of detections, each a tuple of ([left, top, w, h], confidence, detection_class).
        '''
        cords = results[0].boxes.xyxy.tolist()
        conf = results[0].boxes.conf.tolist()
        classes = results[0].boxes.cls.tolist()

        detections = []

        for i in range(len(results[0])):
            if conf[i] > self.thr:
                # transforming coordinates
                cords[i][2] = int(cords[i][2] - cords[i][0])  # width = x2 - x1
                cords[i][3] = int(cords[i][3] - cords[i][1])  # height = y2 - y1
                # generating detections list
                detections.append((cords[i], conf[i], classes[i]))

        return detections

    def plot_tracks(self, tracks, frame):
        # keep only confirmed tracks
        tracks = [track for track in tracks if track.is_confirmed()]

        if tracks:  # if there are no confirmed tracks to plot, return the original frame as is
            cords = np.array([track.to_ltrb() for track in tracks])
            ids = np.array([track.track_id for track in tracks]).astype(int)  # ensure integer ids for supervision

            # Wrap the tracks as supervision Detections for visualization
            tracks_sv = sv.Detections(
                xyxy=cords,
                tracker_id=ids,
            )

            # Format labels
            self.labels = [f"ID: {tracker_id}" for tracker_id in tracks_sv.tracker_id]

            # Annotate frame
            frame = self.box_annotator.annotate(scene=frame, detections=tracks_sv, labels=self.labels)

        return frame

    def __call__(self):
        if self.capture_index:
            # Non-zero capture_index: read the sample video and write annotated output to disk
            video_path = os.path.join('.', 'data', 'people.mp4')
            video_out_path = os.path.join('.', 'out1.mp4')
            cap = cv2.VideoCapture(video_path)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            cap_out = cv2.VideoWriter(video_out_path, cv2.VideoWriter_fourcc(*'mp4v'), cap.get(cv2.CAP_PROP_FPS),
                                      (width, height))
        else:
            # capture_index == 0: real-time webcam stream
            cap = cv2.VideoCapture(self.capture_index)
            assert cap.isOpened()
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        while True:
            start_time = time()

            ret, frame = cap.read()
            if not ret:
                break  # end of video file or lost stream

            # Detect, convert to DeepSORT format, then update tracks
            results = self.predict(frame)
            detections = self.generate_bboxes(results)
            tracks = self.object_tracker.update_tracks(detections, frame=frame)

            # Draw confirmed tracks onto the frame
            frame = self.plot_tracks(tracks, frame)

            end_time = time()
            fps = 1 / max(end_time - start_time, 1e-6)  # avoid division by zero on very fast frames

            cv2.putText(frame, f'FPS: {int(fps)}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 2)
            cv2.imshow('YOLOv8 Detection', frame)

            if self.capture_index:
                cap_out.write(frame)  # write annotated frame to the output file

            if cv2.waitKey(5) & 0xFF == 27:  # 27 -> escape key
                break

        cap.release()
        if self.capture_index:
            cap_out.release()
        cv2.destroyAllWindows()


if __name__ == '__main__':
    ultralytics.checks()
    tracker = Tracker(capture_index=1)
    tracker()
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# DeepSORT

An object tracker based on the DeepSORT algorithm and a YOLOv8 detector.

## Dependencies

- opencv-python
- PyTorch
- Supervision
- YOLOv8m model from Ultralytics
- deep_sort_realtime library

## Introduction

The leading approach in multiple object tracking is tracking-by-detection, which utilizes object detection techniques. Typically, object trajectories are determined through global optimization problems that process entire video batches at once. Various frameworks, such as flow network formulations and probabilistic graphical models, have been adopted for this purpose. However, these methods are not suitable for online scenarios where the identification of targets is required at each time step.
In such cases, more traditional methods like Multiple Hypothesis Tracking (MHT) and the Joint Probabilistic Data Association Filter (JPDAF) are used, performing data association frame by frame. Despite the recent revival of MHT and JPDAF for tracking-by-detection, both remain computationally expensive. On the other hand, Simple Online and Realtime Tracking (SORT) offers a simpler approach using Kalman filtering and frame-by-frame data association with the Hungarian method.

SORT demonstrates favorable performance at high frame rates, especially when paired with a state-of-the-art people detector. However, SORT has limitations with tracking through occlusions due to its association metric, which relies on low state estimation uncertainty. To address this issue, DeepSORT introduces a more informed metric that combines motion and appearance information, specifically a deep appearance descriptor based on a convolutional neural network trained on a large-scale person re-identification dataset. This integration improves robustness against occlusions while maintaining ease of implementation and efficiency, making it suitable for online scenarios.

## Track Handling and State Estimation

The track handling and Kalman filtering framework is mostly identical to the original formulation in SORT. The authors assume a very general tracking scenario where the camera is uncalibrated and no ego-motion information is available. While these circumstances pose a challenge to the filtering framework, this is the most common setup considered in recent multiple object tracking benchmarks.

Tracking is defined on an eight-dimensional state space consisting of the bounding box center position, aspect ratio, height, and their respective velocities in image coordinates:

$$(u, v, \gamma, h, \dot{u}, \dot{v}, \dot{\gamma}, \dot{h})$$

The bounding box coordinates are taken as direct observations of the object state:

$$(u, v, \gamma, h)$$
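To make this parameterization concrete, the sketch below (an illustration only, not code from this repository; the function name `xyxy_to_measurement` and the example box are made up) converts a detector box in `[x1, y1, x2, y2]` format into the $(u, v, \gamma, h)$ measurement the Kalman filter observes; the velocity components are typically initialized to zero when a track is created.

```python
# Minimal illustration (not from this repository): converting a detection in
# [x1, y1, x2, y2] format into the (u, v, gamma, h) measurement.
def xyxy_to_measurement(box):
    x1, y1, x2, y2 = box
    w, h = x2 - x1, y2 - y1
    u = x1 + w / 2.0      # bounding box center x
    v = y1 + h / 2.0      # bounding box center y
    gamma = w / h         # aspect ratio (width / height)
    return u, v, gamma, h


print(xyxy_to_measurement([100.0, 50.0, 180.0, 250.0]))  # (140.0, 150.0, 0.4, 200.0)
```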
## Assignment Problem

A conventional way to solve the association between the predicted Kalman states and newly arrived measurements is to build an assignment problem that can be solved using the Hungarian algorithm. Into this problem formulation we integrate motion and appearance information through a combination of two appropriate metrics.

To incorporate motion information we use the (squared) Mahalanobis distance between predicted Kalman states and newly arrived measurements:

$$
d^{(1)}(i, j)=\left(\boldsymbol{d}_j-\boldsymbol{y}_i\right)^{\mathrm{T}} \boldsymbol{S}_i^{-1}\left(\boldsymbol{d}_j-\boldsymbol{y}_i\right)
$$

The Mahalanobis distance takes state estimation uncertainty into account by measuring how many standard deviations the detection lies from the mean track location. Further, this metric makes it possible to exclude unlikely associations by thresholding the Mahalanobis distance at the 95% confidence interval computed from the $\chi^2$ distribution. We denote this decision with an indicator:

$$
b_{i, j}^{(1)}=\mathbb{1}\left[d^{(1)}(i, j) \leq t^{(1)}\right]
$$

The Mahalanobis distance is usually effective for associating objects when motion uncertainty is low. However, in this image-space formulation the predicted state distribution obtained from the Kalman filtering framework provides only a rough estimate of object location. Camera motion can introduce rapid displacements in the image plane, making the Mahalanobis distance less reliable for tracking objects through occlusions. Therefore, we incorporate a second metric into the assignment problem: an appearance descriptor is computed for each bounding box detection, and a gallery of the most recent $L_k$ associated appearance descriptors is maintained for each track. This second metric measures the smallest cosine distance between the $i$-th track and the $j$-th detection in appearance space:

$$
d^{(2)}(i, j)=\min \left\lbrace 1-\boldsymbol{r}_j^{\mathrm{T}} \boldsymbol{r}_k^{(i)} \mid \boldsymbol{r}_k^{(i)} \in \mathcal{R}_i\right\rbrace
$$

$$
b_{i, j}^{(2)}=\mathbb{1}\left[d^{(2)}(i, j) \leq t^{(2)}\right]
$$
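To make the gating concrete, here is a small NumPy sketch (illustrative only, not code from this repository; the gallery layout and helper names are assumptions). It computes $d^{(2)}$ against a track's descriptor gallery and combines the two gates; the cosine threshold mirrors the `max_cosine_distance=0.3` used in this repository's `DeepSort` configuration.

```python
# Illustrative sketch (not code from this repository) of the appearance metric
# and the combined gate. Descriptors are assumed to be L2-normalized vectors.
import numpy as np


def smallest_cosine_distance(gallery, r_j):
    """d2(i, j) = min over the track's gallery of (1 - r_k . r_j)."""
    gallery = np.asarray(gallery)   # shape (L_k, D): stored descriptors of track i
    r_j = np.asarray(r_j)           # shape (D,): descriptor of detection j
    return float(np.min(1.0 - gallery @ r_j))


def admissible(d1, d2, t1=9.4877, t2=0.3):
    """b(i, j): both metrics must fall below their thresholds.
    t1 is the 95% chi-squared quantile for the 4-dimensional measurement space;
    t2 matches the max_cosine_distance used in DeepSORT_Tracker.py."""
    return (d1 <= t1) and (d2 <= t2)
```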
In combination, both metrics complement each other by serving different aspects of the assignment problem. On the one hand, the Mahalanobis distance provides information about possible object locations based on motion, which is particularly useful for short-term predictions. On the other hand, the cosine distance considers appearance information, which is particularly useful for recovering identities after long-term occlusions, when motion is less discriminative. To build the association problem we combine both metrics using a weighted sum:

$$
c_{i, j}=\lambda d^{(1)}(i, j)+(1-\lambda) d^{(2)}(i, j)
$$

where we call an association admissible if it is within the gating region of both metrics:

$$
b_{i, j}=\prod_{m=1}^2 b_{i, j}^{(m)}
$$

## Matching Cascade

To address the challenges posed by occlusions and uncertain object locations, the authors propose an alternative approach to solving measurement-to-track associations. Instead of solving a global assignment problem, they introduce a cascade that solves a series of subproblems. The motivation for this approach arises from the observation that when an object remains occluded for a prolonged period, the uncertainty associated with its location increases over time. As a result, the probability distribution in the state space becomes more spread out, and the observation likelihood becomes less concentrated. The association metric should account for this spread by increasing the measurement-to-track distance. Counterintuitively, however, the conventional Mahalanobis distance favors larger uncertainties, because larger covariance reduces the distance in terms of standard deviations between any detection and the projected track mean. This undesired behavior can lead to increased track fragmentation and unstable tracks. To address this issue, a matching cascade is proposed that prioritizes objects that are seen more frequently, thereby encoding how probability mass spreads in the association likelihood.
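The cascade can be summarized in a short sketch (illustrative only, not code from this repository). Here `match_fn` stands in for a gated linear-assignment step using the combined cost $c_{i,j}$ and the gate $b_{i,j}$ above, and track objects are assumed to expose a `time_since_update` counter, as in the reference implementation.

```python
# Illustrative sketch of the matching cascade (not code from this repository).
# Recently updated tracks are matched first, so they take priority over tracks
# whose state uncertainty has grown during occlusion.
def matching_cascade(tracks, detections, max_age, match_fn):
    """match_fn(track_subset, detections) is assumed to solve one gated
    linear-assignment subproblem (e.g. with the Hungarian algorithm) and to
    return a list of (track, detection) pairs."""
    matches = []
    unmatched_detections = list(detections)

    for age in range(1, max_age + 1):
        if not unmatched_detections:
            break
        # Tracks whose last successful update was exactly `age` frames ago.
        tracks_at_age = [t for t in tracks if t.time_since_update == age]
        if not tracks_at_age:
            continue
        new_matches = match_fn(tracks_at_age, unmatched_detections)
        matches.extend(new_matches)
        matched_dets = {id(det) for _, det in new_matches}
        unmatched_detections = [d for d in unmatched_detections if id(d) not in matched_dets]

    matched_tracks = {id(trk) for trk, _ in matches}
    unmatched_tracks = [t for t in tracks if id(t) not in matched_tracks]
    return matches, unmatched_tracks, unmatched_detections
```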