├── DeepSORT_Tracker.py
└── README.md

/DeepSORT_Tracker.py:
--------------------------------------------------------------------------------
import numpy as np
import os
from time import time

import cv2
import torch
import ultralytics
from ultralytics import YOLO
import supervision as sv
from deep_sort_realtime.deepsort_tracker import DeepSort


class Tracker:
    def __init__(self, capture_index):
        self.capture_index = capture_index

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print("Using Device: ", self.device)

        self.model = self.load_model()
        self.CLASS_NAMES_DICT = self.model.model.names

        # Annotator used to draw boxes and track-ID labels on each frame
        self.box_annotator = sv.BoxAnnotator(color=sv.Color.green(), thickness=2, text_thickness=1, text_scale=0.5)

        # DeepSORT tracker: a track is confirmed after 2 consecutive hits (n_init),
        # dropped after 5 missed frames (max_age), and appearance embeddings come
        # from the built-in MobileNet embedder
        self.object_tracker = DeepSort(max_age=5,
                                       n_init=2,
                                       nms_max_overlap=1.0,
                                       max_cosine_distance=0.3,
                                       nn_budget=None,
                                       override_track_class=None,
                                       embedder="mobilenet",
                                       half=True,
                                       bgr=True,
                                       embedder_gpu=True,
                                       embedder_model_name=None,
                                       embedder_wts=None,
                                       polygon=False,
                                       today=None)

        self.thr = 0.3  # detection confidence threshold

    def load_model(self):
        model = YOLO("yolov8m.pt")
        model.fuse()
        return model

    def predict(self, frame):
        results = self.model(frame)
        return results

    def generate_bboxes(self, results):
        '''
        Transforms coordinates from the YOLO detector format [x1, y1, x2, y2]
        to the DeepSORT tracker format [left, top, w, h].
        Returns a list of detections, each a tuple of ([left, top, w, h], confidence, detection_class).
        '''
        cords = results[0].boxes.xyxy.tolist()
        conf = results[0].boxes.conf.tolist()
        classes = results[0].boxes.cls.tolist()

        detections = []

        for i in range(len(results[0])):
            if conf[i] > self.thr:
                # transforming coordinates
                cords[i][2] = int(cords[i][2] - cords[i][0])  # width = x2 - x1
                cords[i][3] = int(cords[i][3] - cords[i][1])  # height = y2 - y1
                # generating detections list
                detections.append((cords[i], conf[i], classes[i]))

        return detections

    def plot_tracks(self, tracks, frame):
        # keep only confirmed tracks
        tracks = [track for track in tracks if track.is_confirmed()]

        if tracks:  # if there are no confirmed tracks to plot, return the original frame as is
            cords = np.array([track.to_ltrb() for track in tracks])
            ids = np.array([track.track_id for track in tracks]).astype(int)  # ensure integer ids for supervision

            # Wrap the tracks as supervision Detections for visualization
            tracks_sv = sv.Detections(
                xyxy=cords,
                tracker_id=ids,
            )

            # Format labels
            self.labels = [f"ID: {tracker_id}" for tracker_id in tracks_sv.tracker_id]

            # Annotate frame
            frame = self.box_annotator.annotate(scene=frame, detections=tracks_sv, labels=self.labels)

        return frame

    def __call__(self):
        if self.capture_index:
            # Non-zero capture_index: read the sample video and write annotated output to disk
            video_path = os.path.join('.', 'data', 'people.mp4')
            video_out_path = os.path.join('.', 'out1.mp4')
            cap = cv2.VideoCapture(video_path)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            cap_out = cv2.VideoWriter(video_out_path, cv2.VideoWriter_fourcc(*'mp4v'), cap.get(cv2.CAP_PROP_FPS),
                                      (width, height))
        else:
            # capture_index == 0: real-time webcam stream
            cap = cv2.VideoCapture(self.capture_index)
            assert cap.isOpened()
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        while True:
            start_time = time()

            ret, frame = cap.read()
            if not ret:
                break  # end of video file or lost stream

            # Detect, convert to DeepSORT format, then update tracks
            results = self.predict(frame)
            detections = self.generate_bboxes(results)
            tracks = self.object_tracker.update_tracks(detections, frame=frame)

            # Draw confirmed tracks onto the frame
            frame = self.plot_tracks(tracks, frame)

            end_time = time()
            fps = 1 / max(end_time - start_time, 1e-6)  # avoid division by zero on very fast frames

            cv2.putText(frame, f'FPS: {int(fps)}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 2)
            cv2.imshow('YOLOv8 Detection', frame)

            if self.capture_index:
                cap_out.write(frame)  # write annotated frame to the output file

            if cv2.waitKey(5) & 0xFF == 27:  # 27 -> escape key
                break

        cap.release()
        if self.capture_index:
            cap_out.release()
        cv2.destroyAllWindows()


if __name__ == '__main__':
    ultralytics.checks()
    tracker = Tracker(capture_index=1)
    tracker()
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# DeepSORT

An object tracker based on the DeepSORT algorithm and a YOLOv8 detector.

## Dependencies

- opencv-python
- PyTorch
- Supervision
- YOLOv8m model from Ultralytics
- deep_sort_realtime library

## Introduction

The leading approach in multiple object tracking is tracking-by-detection, which utilizes object detection techniques. Typically, object trajectories are determined through global optimization problems that process entire video batches at once. Various frameworks, such as flow network formulations and probabilistic graphical models, have been adopted for this purpose. However, these methods are not suitable for online scenarios where the identification of targets is required at each time step.
In such cases, more traditional methods like Multiple Hypothesis Tracking (MHT) and the Joint Probabilistic Data Association Filter (JPDAF) are used, performing data association frame by frame. Despite the recent revival of MHT and JPDAF for tracking-by-detection, both remain computationally expensive. On the other hand, Simple Online and Realtime Tracking (SORT) offers a simpler approach using Kalman filtering and frame-by-frame data association with the Hungarian method.

SORT demonstrates favorable performance at high frame rates, especially when paired with a state-of-the-art people detector. However, SORT has limitations with tracking through occlusions due to its association metric, which relies on low state estimation uncertainty. To address this issue, DeepSORT introduces a more informed metric that combines motion and appearance information, specifically a deep appearance descriptor based on a convolutional neural network trained on a large-scale person re-identification dataset. This integration improves robustness against occlusions while maintaining ease of implementation and efficiency, making it suitable for online scenarios.

## Track Handling and State Estimation

The track handling and Kalman filtering framework is mostly identical to the original formulation in SORT. The authors assume a very general tracking scenario where the camera is uncalibrated and no ego-motion information is available. While these circumstances pose a challenge to the filtering framework, this is the most common setup considered in recent multiple object tracking benchmarks.

Tracking is defined on an eight-dimensional state space consisting of the bounding box center position, aspect ratio, height, and their respective velocities in image coordinates:

$$(u, v, \gamma, h, \dot{u}, \dot{v}, \dot{\gamma}, \dot{h})$$

The bounding box coordinates are taken as direct observations of the object state:

$$(u, v, \gamma, h)$$
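To make this parameterization concrete, the sketch below (an illustration only, not code from this repository; the function name `xyxy_to_measurement` and the example box are made up) converts a detector box in `[x1, y1, x2, y2]` format into the $(u, v, \gamma, h)$ measurement the Kalman filter observes; the velocity components are typically initialized to zero when a track is created.

```python
# Minimal illustration (not from this repository): converting a detection in
# [x1, y1, x2, y2] format into the (u, v, gamma, h) measurement.
def xyxy_to_measurement(box):
    x1, y1, x2, y2 = box
    w, h = x2 - x1, y2 - y1
    u = x1 + w / 2.0      # bounding box center x
    v = y1 + h / 2.0      # bounding box center y
    gamma = w / h         # aspect ratio (width / height)
    return u, v, gamma, h


print(xyxy_to_measurement([100.0, 50.0, 180.0, 250.0]))  # (140.0, 150.0, 0.4, 200.0)
```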
## Assignment Problem

A conventional way to solve the association between the predicted Kalman states and newly arrived measurements is to build an assignment problem that can be solved using the Hungarian algorithm. Into this problem formulation we integrate motion and appearance information through a combination of two appropriate metrics.

To incorporate motion information we use the (squared) Mahalanobis distance between predicted Kalman states and newly arrived measurements:

$$
d^{(1)}(i, j)=\left(\boldsymbol{d}_j-\boldsymbol{y}_i\right)^{\mathrm{T}} \boldsymbol{S}_i^{-1}\left(\boldsymbol{d}_j-\boldsymbol{y}_i\right)
$$

The Mahalanobis distance takes state estimation uncertainty into account by measuring how many standard deviations the detection lies from the mean track location. Further, this metric makes it possible to exclude unlikely associations by thresholding the Mahalanobis distance at the 95% confidence interval computed from the $\chi^2$ distribution. We denote this decision with an indicator:

$$
b_{i, j}^{(1)}=\mathbb{1}\left[d^{(1)}(i, j) \leq t^{(1)}\right]
$$

The Mahalanobis distance is usually effective for associating objects when motion uncertainty is low. However, in this image-space formulation the predicted state distribution obtained from the Kalman filtering framework provides only a rough estimate of object location. Camera motion can introduce rapid displacements in the image plane, making the Mahalanobis distance less reliable for tracking objects through occlusions. Therefore, we incorporate a second metric into the assignment problem: an appearance descriptor is computed for each bounding box detection, and a gallery of the most recent $L_k$ associated appearance descriptors is maintained for each track. This second metric measures the smallest cosine distance between the $i$-th track and the $j$-th detection in appearance space:

$$
d^{(2)}(i, j)=\min \left\lbrace 1-\boldsymbol{r}_j^{\mathrm{T}} \boldsymbol{r}_k^{(i)} \mid \boldsymbol{r}_k^{(i)} \in \mathcal{R}_i\right\rbrace
$$

$$
b_{i, j}^{(2)}=\mathbb{1}\left[d^{(2)}(i, j) \leq t^{(2)}\right]
$$
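To make the gating concrete, here is a small NumPy sketch (illustrative only, not code from this repository; the gallery layout and helper names are assumptions). It computes $d^{(2)}$ against a track's descriptor gallery and combines the two gates; the cosine threshold mirrors the `max_cosine_distance=0.3` used in this repository's `DeepSort` configuration.

```python
# Illustrative sketch (not code from this repository) of the appearance metric
# and the combined gate. Descriptors are assumed to be L2-normalized vectors.
import numpy as np


def smallest_cosine_distance(gallery, r_j):
    """d2(i, j) = min over the track's gallery of (1 - r_k . r_j)."""
    gallery = np.asarray(gallery)   # shape (L_k, D): stored descriptors of track i
    r_j = np.asarray(r_j)           # shape (D,): descriptor of detection j
    return float(np.min(1.0 - gallery @ r_j))


def admissible(d1, d2, t1=9.4877, t2=0.3):
    """b(i, j): both metrics must fall below their thresholds.
    t1 is the 95% chi-squared quantile for the 4-dimensional measurement space;
    t2 matches the max_cosine_distance used in DeepSORT_Tracker.py."""
    return (d1 <= t1) and (d2 <= t2)
```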
In combination, both metrics complement each other by serving different aspects of the assignment problem. On the one hand, the Mahalanobis distance provides information about possible object locations based on motion, which is particularly useful for short-term predictions. On the other hand, the cosine distance considers appearance information, which is particularly useful for recovering identities after long-term occlusions, when motion is less discriminative. To build the association problem we combine both metrics using a weighted sum:

$$
c_{i, j}=\lambda d^{(1)}(i, j)+(1-\lambda) d^{(2)}(i, j)
$$

where we call an association admissible if it is within the gating region of both metrics:

$$
b_{i, j}=\prod_{m=1}^2 b_{i, j}^{(m)}
$$

## Matching Cascade

To address the challenges posed by occlusions and uncertain object locations, the authors propose an alternative approach to solving measurement-to-track associations. Instead of solving a global assignment problem, they introduce a cascade that solves a series of subproblems. The motivation for this approach arises from the observation that when an object remains occluded for a prolonged period, the uncertainty associated with its location increases over time. As a result, the probability distribution in the state space becomes more spread out, and the observation likelihood becomes less concentrated. The association metric should account for this spread by increasing the measurement-to-track distance. Counterintuitively, however, the conventional Mahalanobis distance favors larger uncertainties, because larger covariance reduces the distance in terms of standard deviations between any detection and the projected track mean. This undesired behavior can lead to increased track fragmentation and unstable tracks. To address this issue, a matching cascade is proposed that prioritizes objects that are seen more frequently, thereby encoding how probability mass spreads in the association likelihood.
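The cascade can be summarized in a short sketch (illustrative only, not code from this repository). Here `match_fn` stands in for a gated linear-assignment step using the combined cost $c_{i,j}$ and the gate $b_{i,j}$ above, and track objects are assumed to expose a `time_since_update` counter, as in the reference implementation.

```python
# Illustrative sketch of the matching cascade (not code from this repository).
# Recently updated tracks are matched first, so they take priority over tracks
# whose state uncertainty has grown during occlusion.
def matching_cascade(tracks, detections, max_age, match_fn):
    """match_fn(track_subset, detections) is assumed to solve one gated
    linear-assignment subproblem (e.g. with the Hungarian algorithm) and to
    return a list of (track, detection) pairs."""
    matches = []
    unmatched_detections = list(detections)

    for age in range(1, max_age + 1):
        if not unmatched_detections:
            break
        # Tracks whose last successful update was exactly `age` frames ago.
        tracks_at_age = [t for t in tracks if t.time_since_update == age]
        if not tracks_at_age:
            continue
        new_matches = match_fn(tracks_at_age, unmatched_detections)
        matches.extend(new_matches)
        matched_dets = {id(det) for _, det in new_matches}
        unmatched_detections = [d for d in unmatched_detections if id(d) not in matched_dets]

    matched_tracks = {id(trk) for trk, _ in matches}
    unmatched_tracks = [t for t in tracks if id(t) not in matched_tracks]
    return matches, unmatched_tracks, unmatched_detections
```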