├── src ├── __init__.py ├── tracker │ ├── encoder.py │ ├── solver.py │ ├── similarities.py │ ├── geometry.py │ ├── supertrack.py │ └── tracker.py ├── utils │ ├── iotools.py │ ├── utils.py │ └── evaluate.py └── datasets │ └── dataset.py ├── conf ├── encoder │ └── precomputed.yaml ├── dataset │ ├── Synthehicle.yaml │ ├── Synthehicle-bev.yaml │ └── CityFlow.yaml ├── experiment │ ├── CityFlow.yaml │ └── Synthehicle.yaml └── config.yaml ├── pyproject.toml ├── .github └── ISSUE_TEMPLATE │ └── bug_report.md ├── setup.py ├── LICENSE ├── tools └── track.py ├── .gitignore └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conf/encoder/precomputed.yaml: -------------------------------------------------------------------------------- 1 | name: precomputed 2 | -------------------------------------------------------------------------------- /conf/dataset/Synthehicle.yaml: -------------------------------------------------------------------------------- 1 | name: Synthehicle 2 | scene_path: ./test/Town06-O-dawn/ 3 | camera_pattern: C0* 4 | img_path: ./out_rgb/ 5 | gt_path: ./gt/gt.txt 6 | calibration_path: calibration.json -------------------------------------------------------------------------------- /conf/dataset/Synthehicle-bev.yaml: -------------------------------------------------------------------------------- 1 | name: Synthehicle 2 | scene_path: ./test/Town06-O-dawn/ 3 | camera_pattern: C0* 4 | img_path: ./out_rgb/ 5 | gt_path: ./gt/gt_bev.txt 6 | calibration_path: calibration.json -------------------------------------------------------------------------------- /conf/dataset/CityFlow.yaml: -------------------------------------------------------------------------------- 1 | name: AICITY 2 | scene_path: ./validation/S02 3 | camera_pattern: c00* 4 | img_path: ./img1/ 5 | img_ext: jpg 6 | offsets: [0, 0, 3, 8] 7 | calibration_path: calibration.json 8 | roi_path: "./data/AICITY/eval/ROIs/validation" 9 | -------------------------------------------------------------------------------- /src/tracker/encoder.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import warnings 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torchvision 7 | 8 | 9 | class Precomputed: 10 | def __init__(self, cfg): 11 | self.cfg = cfg 12 | 13 | def __call__(self, x): 14 | features = x["annotations"][:, 11:] 15 | return F.normalize(features, p=2, dim=1) 16 | 17 | 18 | def create_encoder(cfg, device): 19 | print(cfg) 20 | if cfg.name == "precomputed": 21 | return Precomputed(cfg) 22 | else: 23 | raise ValueError(f"Encoder {cfg.name} not found.") 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # this ruff configuration is taken (mostly) from hugging face 2 | [tool.ruff] 3 | line-length = 119 4 | 5 | [tool.ruff.lint] 6 | # Never enforce `E501` (line length violations). 7 | ignore = ["C901", "E501", "E741", "F402", "F823", "E402", "F401", "F403", "F811"] 8 | select = ["C", "E", "F", "I", "W"] 9 | 10 | [tool.ruff.lint.isort] 11 | lines-after-imports = 2 12 | known-first-party = ["stmc"] 13 | 14 | [tool.ruff.format] 15 | # Like Black, use double quotes for strings. 16 | quote-style = "double" 17 | 18 | # Like Black, indent with spaces, rather than tabs. 
19 | indent-style = "space" 20 | 21 | # Like Black, respect magic trailing commas. 22 | skip-magic-trailing-comma = false 23 | 24 | # Like Black, automatically detect the appropriate line ending. 25 | line-ending = "auto" 26 | 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Environment** 27 | Please provide the following information: 28 | - Output of `pip freeze` and or `conda info` 29 | - Output of `python -m torch.utils.collect_env` 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /conf/experiment/CityFlow.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /dataset: CityFlow 5 | - override /encoder: precomputed 6 | 7 | dataset_path: ./data/AICITY/ 8 | 9 | resources: 10 | reid: LCFractal 11 | detector: YOLOX 12 | 13 | tracker: 14 | matching: 15 | distance_threshold: 0.001 16 | rescale_threshold: 0.7 17 | reid_decay: 0.7 18 | rescale_weight: 0.9 19 | confidence_thresh: 0.70 20 | low_confidence_thresh: null 21 | patience: 0 22 | memory: 160 23 | fdim: 2048 24 | prematching: 25 | enabled: false 26 | iou_bias: 0.50 27 | iou_threshold: 0.70 28 | prune_remaining: false 29 | 30 | preprocess: 31 | nms_thresh: 0.7 32 | roi_filter: true 33 | 34 | postprocess: 35 | expand_boxes: 36 | enable: true 37 | factor: 1.4 38 | remove_borders: 39 | enable: true 40 | border_size: 0 41 | size_filter: 42 | enable: true 43 | min_size: 6000 44 | max_size: 600000 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | setup( 5 | name="stmc", 6 | version="0.1.0", 7 | packages=find_packages(), 8 | install_requires=[ 9 | "hydra-core", 10 | "torch", 11 | "wandb", 12 | "loguru", 13 | "omegaconf", 14 | "qqdm", 15 | "pillow", 16 | "ramapy", 17 | ], 18 | entry_points={ 19 | "console_scripts": [ 20 | "track=tools.track:main", 21 | ], 22 | }, 23 | author="Fabian Herzog", 24 | author_email="fabian.herzog@tum.de", 25 | description="Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking", 26 | long_description=open("README.md").read(), 27 | long_description_content_type="text/markdown", 28 | url="https://github.com/fubel/stmc", 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "License :: OSI Approved :: MIT License", 32 | "Operating System :: OS Independent", 33 | ], 34 | python_requires=">=3.8", 35 | ) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 
1 | MIT License 2 | 3 | Copyright (c) 2024 Fabian Herzog 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conf/experiment/Synthehicle.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /dataset: Synthehicle 5 | - override /encoder: precomputed 6 | 7 | dataset_path: ./data/Synthehicle/ 8 | 9 | resources: 10 | reid: LightMBN 11 | detector: YOLOX 12 | 13 | tracker: 14 | matching: 15 | distance_threshold: 8 16 | rescale_threshold: 0.8 17 | reid_decay: 0.7 18 | rescale_weight: 0.3 19 | distance_weight: 0.7 20 | confidence_thresh: 0.6 21 | patience: 0 22 | memory: 15 23 | fdim: 3584 24 | prematching: 25 | enabled: true 26 | iou_bias: 0.60 27 | iou_threshold: 0.70 28 | prune_remaining: true 29 | 30 | preprocess: 31 | nms_thresh: 0.9 32 | roi_filter: true 33 | bottom: false 34 | box_projection_centers: 35 | alpha_w: 0.15 36 | alpha_h: 0.85 37 | 38 | postprocess: 39 | expand_boxes: 40 | enable: false 41 | factor: 1.4 42 | remove_borders: 43 | enable: false 44 | border_size: 5 45 | size_filter: 46 | enable: false 47 | min_size: 0 48 | max_size: 0 49 | 50 | evaluation: 51 | inplace: true 52 | evaluate_standard: true 53 | evaluate_hota: false 54 | evaluate_bev: false 55 | evaluate_external: false -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | # config.yaml 2 | hydra/hydra_logging: null 3 | 4 | defaults: 5 | - dataset: CityFlow 6 | - encoder: precomputed 7 | 8 | dataset_path: ./data/AICITY/ 9 | output_path: ./outputs/ 10 | 11 | device: cuda 12 | 13 | logging: 14 | wandb: 15 | enable: false 16 | project: ggmc 17 | upload_results: false 18 | tags: null 19 | tensorboard: 20 | enable: false 21 | 22 | resources: 23 | path: ./resources/ 24 | detector: YOLOX 25 | reid: null 26 | 27 | visuals: 28 | plot_interval: 1 29 | plot_results: false 30 | plot_ground_truth: false 31 | plot_to_tensorboard: false 32 | grid_rows: 2 33 | store_files: true 34 | border_size: 3 35 | 36 | solver: 37 | backend: PD 38 | 39 | tracker: 40 | matching: 41 | distance_threshold: 0.02 42 | rescale_threshold: 0.65 43 | reid_decay: 1.0 44 | rescale_weight: 0.5 45 | distance_weight: 0.5 46 | confidence_thresh: 0.7 47 | low_confidence_thresh: null 48 | patience: 1 49 | memory: 15 50 | fdim: 512 
51 | enable_accumulator: true 52 | prematching: 53 | enabled: true 54 | iou_bias: 0.60 55 | iou_threshold: 0.50 56 | prune_remaining: false 57 | 58 | preprocess: 59 | nms_thresh: null 60 | roi_filter: true 61 | bottom: true 62 | box_projection_centers: 63 | alpha_w: null 64 | alpha_h: null 65 | 66 | postprocess: 67 | expand_boxes: 68 | enable: true 69 | factor: 1.4 70 | remove_borders: 71 | enable: true 72 | border_size: 5 73 | size_filter: 74 | enable: true 75 | min_size: 6220 76 | max_size: 622080 77 | 78 | evaluation: 79 | inplace: true 80 | evaluate_standard: true 81 | evaluate_hota: false 82 | evaluate_bev: false 83 | evaluate_external: true 84 | -------------------------------------------------------------------------------- /src/tracker/solver.py: -------------------------------------------------------------------------------- 1 | import rama_py 2 | import torch 3 | 4 | 5 | def multicut(edge_index, edge_weights, opts): 6 | """Solves a multicut problem based on the RAMA algorithm. 7 | 8 | The edge_index is expected in the usual torch_geometric format. 9 | Note that RAMA requires u < v for each edge (u, v) in the graph. 10 | 11 | Args: 12 | edge_index (LongTensor): 2xE LongTensor of edge indices. 13 | edge_weights (LongTensor): E LongTensor of edge weights. 14 | 15 | Returns: 16 | LongTensor: N LongTensor of node labels, where N is the number 17 | of nodes in the graph. 18 | """ 19 | if (edge_index[0] > edge_index[1]).any(): 20 | raise ValueError("Solver expects u < v for each edge (u, v) in the graph.") 21 | if edge_index.device.index is None: 22 | raise ValueError("Solver runs on CUDA device only. Please move data to CUDA.") 23 | if edge_index.shape[1] == 0: 24 | return torch.empty(0).to("cuda") 25 | i = edge_index[0].to(torch.int32) 26 | j = edge_index[1].to(torch.int32) 27 | costs = edge_weights.to(torch.float32) 28 | num_nodes = torch.max(edge_index) + 1 29 | num_edges = edge_index.shape[1] 30 | node_labels = torch.ones(num_nodes, device=i.device).to(torch.int32) 31 | rama_py.rama_cuda_gpu_pointers( 32 | i.data_ptr(), 33 | j.data_ptr(), 34 | costs.data_ptr(), 35 | node_labels.data_ptr(), 36 | num_nodes, 37 | num_edges, 38 | i.device.index, 39 | opts, 40 | ) 41 | return node_labels 42 | 43 | 44 | def scale_weights(weights, threshold=0.7): 45 | """Scales the given weights to the range [-1, 1] based on the given threshold. 46 | 47 | Args: 48 | weights (FloatTensor): LongTensor of edge weights. 49 | threshold (float, optional): Threshold for scaling. Defaults to 0.4. 50 | 51 | Returns: 52 | FloatTensor: LongTensor of scaled edge weights. 53 | """ 54 | y = weights.clone() 55 | z = weights.clone() 56 | z[y == threshold] = 0.0 57 | z[y > threshold] = (y[y > threshold] - threshold) / (1 - threshold) 58 | z[y < threshold] = (y[y < threshold] - threshold) / (threshold) 59 | return z 60 | 61 | 62 | def create_solver(backend): 63 | opts = rama_py.multicut_solver_options(backend) 64 | opts.verbose = False 65 | return opts 66 | -------------------------------------------------------------------------------- /src/tracker/similarities.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops import box_iou 3 | 4 | 5 | def cosine_similarity(a, b, eps=1e-8): 6 | """ 7 | Compute pairwise appearance distance between features. 
8 | from https://stackoverflow.com/a/58144658 9 | """ 10 | a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None] 11 | a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) 12 | b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) 13 | sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1)) 14 | return sim_mt 15 | 16 | 17 | def batch_cosine_similarity(a, b, eps=1e-8): 18 | """Compute batched pairwise appearance distance between features. 19 | 20 | Args: 21 | a (torch.Tensor): (B, N, feature_dim) tensor. 22 | b (torch.Tensor): (B, N, feature_dim) tensor. 23 | eps (float, optional): Epsilon to prevent division by zero. Defaults to 1e-8. 24 | 25 | Returns: 26 | torch.Tensor: (B, N, N) tensor of pairwise similarities. 27 | """ 28 | # Compute norms along feature dimension and add new dimensions needed for broadcasting 29 | a_n = a.norm(dim=2)[:, :, None] 30 | b_n = b.norm(dim=2)[:, :, None] 31 | 32 | # Perform normalization and prevent division by zero. 33 | a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) 34 | b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) 35 | 36 | # Compute similarity matrix using batch matrix multiplication. 37 | sim_mt = torch.bmm(a_norm, b_norm.transpose(1, 2)) 38 | return sim_mt 39 | 40 | 41 | def batched_box_iou(boxes): 42 | """Compute batched pairwise IoU between boxes. 43 | 44 | Args: 45 | boxes (torch.Tensor): (B, N, 4) tensor of boxes. 46 | 47 | Returns: 48 | torch.Tensor: (B, N, N) tensor of pairwise IoU. 49 | """ 50 | ious = [] 51 | for sub_boxes in boxes: 52 | ious.append(box_iou(sub_boxes, sub_boxes)) 53 | return torch.stack(ious) 54 | 55 | 56 | def bev_distance(bev_positions): 57 | """Compute distance between positions on ground plane. 58 | 59 | Args: 60 | bev_positions (torch.Tensor): (N, 2) tensor of positions. 61 | 62 | Returns: 63 | torch.Tensor: (N, N) tensor of pairwise similarities. 64 | """ 65 | return torch.norm(bev_positions[:, None] - bev_positions[None, :], dim=2) 66 | 67 | 68 | def batch_bev_distance(bev_positions): 69 | """Compute batched distance similarity between positions on ground plane. 70 | 71 | Args: 72 | bev_positions (torch.Tensor): (B, N, 2) tensor of positions. 73 | 74 | Returns: 75 | torch.Tensor: (B, N, N) tensor of pairwise similarities. 76 | """ 77 | # Subtract positions across the batch, adding extra dimensions for broadcasting 78 | diff = bev_positions[:, :, None] - bev_positions[:, None, :] 79 | 80 | # Compute norm along the last dimension (x and y coordinates) 81 | norm = torch.norm(diff, dim=-1) 82 | 83 | # Return similarity 84 | return norm 85 | -------------------------------------------------------------------------------- /tools/track.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from subprocess import PIPE, run 4 | 5 | import hydra 6 | import torch 7 | from loguru import logger 8 | from omegaconf import DictConfig, OmegaConf 9 | from qqdm import format_str, qqdm 10 | 11 | import wandb 12 | from src.datasets.dataset import create_dataloader 13 | from src.tracker.encoder import create_encoder 14 | from src.tracker.solver import create_solver 15 | from src.tracker.tracker import create_tracker 16 | from src.utils.evaluate import evaluate_tracker 17 | from src.utils.iotools import ResultsWriter 18 | 19 | 20 | @hydra.main(version_base=None, config_path="../conf", config_name="config") 21 | def main(cfg: DictConfig) -> None: 22 | if cfg.device == "cpu" or not torch.cuda.is_available(): 23 | raise ValueError("This code runs on CUDA only. 
Please set device to 'cuda'.") 24 | else: 25 | device = torch.device(cfg.device) 26 | logger.info(f"🚀 Using device: {device}") 27 | 28 | cfg.tracker.matching.distance_weight = 1 - cfg.tracker.matching.rescale_weight 29 | 30 | # create output directories 31 | output_path = os.path.join(cfg.output_path) 32 | os.makedirs(output_path, exist_ok=True) 33 | output_path = os.path.join(output_path, cfg.dataset.name) 34 | logger.info(f"📂 Writing to output path: {output_path}") 35 | 36 | # Initialize wandb and tensorboard 37 | if cfg.logging.wandb.enable: 38 | wandb.init(project=cfg.logging.wandb.project) 39 | wandb.config.update(OmegaConf.to_container(cfg)) 40 | if cfg.logging.wandb.tags is not None: 41 | wandb.run.tags = cfg.logging.wandb.tags 42 | 43 | # Initialize solver 44 | solver_opts = create_solver(cfg.solver.backend) 45 | logger.info(f"✨ Initialized solver, using backend: {cfg.solver.backend}") 46 | 47 | # Initialize dataset and dataloader 48 | dataloader = create_dataloader(cfg) 49 | logger.info("✨ Created dataloader.") 50 | 51 | # Initialize encoder 52 | encoder = create_encoder(cfg.encoder, device) 53 | logger.info("✨ Created encoder.") 54 | 55 | tracker = create_tracker(cfg, solver_opts, encoder, len(dataloader.dataset.camera_names), device) 56 | logger.info("✨ Initialized tracker.") 57 | 58 | results_writer = ResultsWriter( 59 | output_path=output_path, 60 | cfg=cfg, 61 | normalization=dataloader.dataset._norm_factors, 62 | camera_names=dataloader.dataset.camera_names, 63 | ) 64 | 65 | tw = qqdm(range(len(dataloader)), desc=format_str("bold", "Description")) 66 | for i, batch in enumerate(dataloader): 67 | results, _ = tracker.step(batch) 68 | results_writer.add(results) 69 | stats = tracker.stats 70 | tw.set_infos(stats) 71 | tw.update() 72 | 73 | if cfg.logging.wandb.enable: 74 | _stats_str_to_float = {k: float(v) for k, v in stats.items()} 75 | wandb.log(_stats_str_to_float, step=i) 76 | 77 | logger.info(f"🕒 Cumulative execution time of tracker {tracker.cumulative_execution_time * 10}") 78 | logger.info(f"🕒 Average time per frame {tracker.cumulative_execution_time / tracker.frame}") 79 | 80 | results_writer.save() 81 | 82 | logger.info("🚀 Tracking completed.") 83 | logger.info( 84 | f"📈 Results saved to {results_writer.results_file}. " 85 | "Use the official evaluation script of the dataset for evaluation." 86 | ) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /src/tracker/geometry.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | 6 | 7 | class Projector: 8 | def __init__(self, calibration_path: str): 9 | """ 10 | Initialize a Projector object. The projector is used to project points between image and world coordinates. 11 | 12 | Args: 13 | calibration_path (str): Path to the calibration file (JSON). 14 | 15 | Raises: 16 | FileNotFoundError: If the calibration file is not found. 17 | ValueError: If the homography is not found in the calibration file. 
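Note: The calibration file is a JSON file storing a 3x3 homography under one of the keys "homography", "H", "homography_matrix", or "homography matrix"; image-to-world projection uses its inverse.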
18 | """ 19 | if os.path.exists(calibration_path) is False: 20 | raise FileNotFoundError(f"Calibration file not found at path: {calibration_path}") 21 | self.calibration_path = calibration_path 22 | 23 | with open(calibration_path, "r") as f: 24 | calibration = json.load(f) 25 | try: 26 | homography_keys = [ 27 | "homography", 28 | "H", 29 | "homography_matrix", 30 | "homography matrix", 31 | ] 32 | valid_homography_key = set(homography_keys).intersection(set(calibration.keys())).pop() 33 | except KeyError: 34 | raise ValueError("Homography not found in calibration file.") 35 | self._homography = torch.Tensor(calibration[valid_homography_key]) 36 | self._inverse_homography = torch.inverse(self._homography) 37 | 38 | def image_to_world(self, points: torch.Tensor) -> torch.Tensor: 39 | """Projects image points to world coordinates. 40 | 41 | Args: 42 | points (torch.Tensor): Image points Nx2. 43 | 44 | Returns: 45 | torch.Tensor: World points Nx3. 46 | """ 47 | if points.dim() != 2: 48 | points = points.view(-1, 2) 49 | if points.size(1) != 2: 50 | raise ValueError(f"Expected image points to be of shape (N, 2), but got {points.shape}.") 51 | return self._homography_image_to_world(points) 52 | 53 | def world_to_image(self, points: torch.Tensor) -> torch.Tensor: 54 | """Projects world points to image coordinates. 55 | 56 | Args: 57 | points (torch.Tensor): World points Nx3. 58 | 59 | Returns: 60 | torch.Tensor: Image points Nx2. 61 | """ 62 | if points.dim() != 2: 63 | points = points.view(-1, 3) 64 | if points.size(1) != 3: 65 | points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) 66 | return self._homography_world_to_image(points) 67 | 68 | def _homography_image_to_world(self, points: torch.Tensor) -> torch.Tensor: 69 | points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) 70 | device = points.device 71 | homography = self._inverse_homography.to(device) 72 | projected_points = torch.matmul(homography, points.t()).t() 73 | projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) 74 | return projected_points 75 | 76 | def _homography_world_to_image(self, points: torch.Tensor) -> torch.Tensor: 77 | device = points.device 78 | homography = self._homography.to(device) 79 | projected_points = torch.matmul(homography, points.t()).t() 80 | projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) 81 | return projected_points 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project-specific 2 | data/ 3 | eval/ 4 | resources/ 5 | outputs/ 6 | wandb/ 7 | 8 | .ruff_cache 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 119 | .pdm.toml 120 | .pdm-python 121 | .pdm-build/ 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
171 | #.idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking 2 | [![arXiv Badge](https://img.shields.io/badge/Paper-arXiv.2410.02638-b31b1b.svg)](https://arxiv.org/abs/2410.02638) 3 | [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) 4 | 5 | ## Authors 6 | 7 | [Fabian Herzog](https://github.com/fubel), [Johannes Gilg](https://github.com/Blueblue4), [Philipp Wolters](https://github.com/phi-wol), [Torben Teepe](https://github.com/tteepe/), and Gerhard Rigoll 8 | 9 | ## Installation 10 | 11 | Only tested with Python 3.8, CUDA 11.8, GCC >= 9.4.0 on NVIDIA RTX 3090, PyTorch 2.0.1 on Ubuntu 22.04. 12 | 13 | ```bash 14 | # Setup with miniconda 15 | conda create -n stmc python=3.8 16 | conda activate stmc 17 | 18 | # Setup torch 19 | conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia 20 | 21 | # Setup RAMA 22 | # (cf. https://github.com/pawelswoboda/RAMA/) 23 | git clone git@github.com:pawelswoboda/RAMA.git 24 | mkdir -p RAMA/build && cd RAMA/build 25 | cmake .. 26 | make -j 4 27 | 28 | # Setup Python bindings 29 | python -m pip install git+https://github.com/pawelswoboda/RAMA.git 30 | 31 | # Install remaining dependencies 32 | python -m pip install -r requirements.txt 33 | ``` 34 | 35 | ## Data Setup 36 | 37 | The config files assume the datasets are stored in `./data/`. You can set up a symlink to a different location or adjust the paths in the config. The datasets are available at: 38 | 39 | * [CityFlow](https://www.aicitychallenge.org) 40 | * [Synthehicle](https://github.com/fubel/synthehicle) 41 | 42 | You need to provide the camera calibrations in `calibration.json` files. They are available in the releases. 43 | 44 | ## Running the Code 45 | 46 | For a multi-camera scene, adjust the `config.yaml`. To track the Synthehicle scene `Town06-O-dawn`, run 47 | 48 | ```bash 49 | # for Synthehicle, Town06-O-dawn 50 | python -m tools.track +experiment=Synthehicle dataset.scene_path=./test/Town06-O-dawn/ 51 | ``` 52 | 53 | To track the CityFlow scene S02, run 54 | 55 | ```bash 56 | # for CityFlow, scene S02 57 | python -m tools.track +experiment=CityFlow 58 | ``` 59 | 60 | ❗️ We'll provide all pre-extracted detections and features soon! 61 | 62 | ## Features and Detections 63 | 64 | Our resources are formatted in the MOT-Challenge format, with the addition that the last N columns of a resource file store the appearance feature vector of that object. Detections and features are available in the releases. 65 | 66 | ❗️ We'll provide all pre-extracted detections and features soon! 67 | 68 | ## Evaluation 69 | 70 | The results are saved in the output directory specified in the config. 71 | 72 | **🚨 Please use the evaluation scripts provided by the respective datasets to evaluate the final results!** 73 | 74 | Our in-built evaluation follows the evaluation protocol of Synthehicle, which differs from the CityFlow official evaluation script (our eval does not filter single-cam trajectories, for instance).
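The `calibration.json` files mentioned in the Data Setup section are read by the `Projector` class in `src/tracker/geometry.py`, which looks for a 3x3 homography under any of the keys `homography`, `H`, `homography_matrix`, or `homography matrix` and uses its inverse to project image points onto the ground plane. The snippet below is only a sketch for custom scenes; the identity matrix is a placeholder, not a real calibration:

```python
import json

import torch

from src.tracker.geometry import Projector

# Placeholder 3x3 homography (identity), for illustration only.
calibration = {"homography": [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]}
with open("calibration.json", "w") as f:
    json.dump(calibration, f)

projector = Projector("calibration.json")
points = torch.tensor([[960.0, 540.0]])   # (N, 2) image points
print(projector.image_to_world(points))   # (N, 2) ground-plane coordinates
```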
75 | 76 | ## Acknowledgements 77 | 78 | We'd like to thank the authors of the following repositories for providing code used in our work: 79 | 80 | * We use the [RAMA](https://github.com/pawelswoboda/RAMA.git) solver which enables fast multi-cuts on the GPU. 81 | * The features for CityFlow are from [LCFractal](https://github.com/LCFractal/AIC21-MTMC). 82 | 83 | ## Citation 84 | 85 | ``` 86 | @article{herzog2024spatial, 87 | title={{Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking}}, 88 | author={Herzog, Fabian and Gilg, Johannes and Wolters, Philipp and Teepe, Torben and Rigoll, Gerhard}, 89 | journal={arXiv preprint arXiv:2410.02638}, 90 | year={2024} 91 | } 92 | ``` 93 | 94 | ## License 95 | 96 | The original code in this repository is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. This project relies on various third-party libraries and dependencies, each with their own licensing terms. These dependencies may not be included in the MIT License that covers my original code. Users of this project must ensure they comply with all licenses of the required dependencies for their specific use case. Some dependencies may have more restrictive terms than the MIT License. 97 | -------------------------------------------------------------------------------- /src/utils/iotools.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | from torch.utils.tensorboard import SummaryWriter 6 | 7 | from .utils import expand_boxes, remove_border_boxes, size_filter 8 | 9 | 10 | class ResultsWriter: 11 | def __init__(self, output_path, cfg, normalization=None, camera_names=None): 12 | self._results = [] 13 | 14 | self.cfg = cfg 15 | self.output_path = output_path 16 | self._norm_factors = normalization 17 | 18 | self.rows = cfg.visuals.grid_rows 19 | self.plot_results = cfg.visuals.plot_results 20 | self.plot_every = cfg.visuals.plot_interval 21 | 22 | self.camera_names = camera_names 23 | 24 | self.writer = None 25 | 26 | if cfg.logging.tensorboard.enable: 27 | self.writer = SummaryWriter() 28 | 29 | self.store_files = cfg.visuals.store_files 30 | self.results_file = os.path.join(output_path, "results.txt") 31 | 32 | self.offsets = cfg.dataset.offsets if hasattr(cfg.dataset, "offsets") else [0] * len(camera_names) 33 | 34 | self.on_bev = True if cfg.dataset.name == "WildTrack" else False 35 | 36 | self._save_function = self.get_save_function(cfg) 37 | 38 | if os.path.exists(self.results_file): 39 | os.remove(self.results_file) 40 | 41 | os.makedirs(output_path, exist_ok=True) 42 | 43 | @property 44 | def results(self): 45 | results = torch.cat(self._results, dim=0) 46 | for i, offset in enumerate(self.offsets): 47 | results[results[:, 0] == i, 2] -= offset 48 | # multiply camera column by (-1) 49 | results[:, 0] *= -1 50 | for i, name in enumerate(self.camera_names): 51 | # this is a bit hacky if camera does not start with letter 52 | try: 53 | name_int = int(name[1:]) 54 | except ValueError: 55 | # fallback to index of camera 56 | name_int = i 57 | results[results[:, 0] == -i, 0] = name_int 58 | if self.cfg.postprocess.expand_boxes.enable: 59 | factor = self.cfg.postprocess.expand_boxes.factor 60 | results[:, 3:7] = expand_boxes(results[:, 3:7], factor) 61 | if self.cfg.postprocess.remove_borders.enable: 62 | boxes = results[:, 3:7] 63 | border = self.cfg.postprocess.remove_borders.border_size 64 | keep = remove_border_boxes(boxes, border) 65 | results = 
results[keep] 66 | if self.cfg.postprocess.size_filter.enable: 67 | boxes = results[:, 3:7] 68 | keep = size_filter( 69 | boxes, self.cfg.postprocess.size_filter.min_size, self.cfg.postprocess.size_filter.max_size 70 | ) 71 | results = results[keep] 72 | return results 73 | 74 | def add(self, result): 75 | _result = result.clone() 76 | if self._norm_factors is not None: 77 | _result = self.denormalize_bev(_result[:, 7:9]) 78 | self._results.append(result) 79 | 80 | def save(self): 81 | if self._results: 82 | self._save_function(self.results.cpu().numpy()) 83 | 84 | def _to_aicity19(self, result): 85 | # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 86 | np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") 87 | 88 | def _to_aicity24(self, result): 89 | # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 90 | np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") 91 | 92 | def _to_synthehicle(self, result): 93 | # CAMERA, FRAME, ID, X, Y, W, H, SCORE, X_BEV, Y_BEV 94 | np.savetxt(self.results_file, result[:, [2, 1]], fmt="%d", delimiter=",") 95 | 96 | def get_save_function(self, cfg): 97 | if "WildTrack" in cfg.dataset.name: 98 | return self._to_wildtrack 99 | elif "AICITY24" in cfg.dataset.name: 100 | return self._to_aicity19 101 | elif "AICITY" in cfg.dataset.name or "CityFlow" in cfg.dataset.name: 102 | return self._to_aicity24 103 | else: 104 | return self._to_synthehicle 105 | 106 | def denormalize_bev(self, positions): 107 | min_x, min_y, max_x, max_y = self._norm_factors 108 | return positions * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor([min_x, min_y]) 109 | 110 | def squeeze_batch(self, x: torch.Tensor): 111 | if x.dim() == 4 and x.size(0) == 1: 112 | return x.squeeze(0) 113 | return x 114 | -------------------------------------------------------------------------------- /src/utils/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from typing import List, Optional, Tuple 4 | 5 | import matplotlib.pyplot as plt 6 | import torch 7 | from torch.utils.tensorboard import SummaryWriter 8 | from torchvision import transforms 9 | from torchvision.io import write_jpeg 10 | from torchvision.utils import draw_bounding_boxes, make_grid 11 | 12 | 13 | def resize_transform(img, size=(256, 128)): 14 | """ 15 | Resize a torch image to the specified size. 16 | Used before passing the image to reid model. 17 | """ 18 | transform = transforms.Compose( 19 | [ 20 | transforms.ToPILImage(), 21 | transforms.Resize((size[0], size[1])), 22 | transforms.ToTensor(), 23 | ] 24 | ) 25 | return transform(img) 26 | 27 | 28 | def compute_centers(boxes, bottom=True, box_projection_centers=None): 29 | """ 30 | Compute the 2D centers of a torch tensor of bounding boxes. 
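Boxes are expected in MOT-style (x, y, w, h) format. For example, with the default bottom=True, compute_centers(torch.tensor([[0.0, 0.0, 10.0, 20.0]])) returns the bottom-center [[5., 20.]].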
31 | """ 32 | if bottom is True and box_projection_centers is not None: 33 | raise ValueError("Cannot project boxes to bottom and use box_projection_centers simultaneously.") 34 | centers = torch.zeros((boxes.shape[0], 2)) 35 | centers[:, 0] = boxes[:, 0] + boxes[:, 2] / 2 36 | if box_projection_centers is not None: 37 | alpha_w, alpha_h = box_projection_centers 38 | centers[:, 1] = boxes[:, 1] + alpha_h * boxes[:, 3] 39 | elif bottom: 40 | centers[:, 1] = boxes[:, 1] + boxes[:, 3] 41 | else: 42 | centers[:, 1] = boxes[:, 1] + boxes[:, 3] / 2 43 | return centers 44 | 45 | 46 | def tlwh_to_xyah(tlwh): 47 | """ 48 | Convert bounding box to format `(center x, center y, aspect ratio, 49 | height)`, where the aspect ratio is `width / height`. 50 | """ 51 | ret = tlwh.clone() 52 | if ret.dim() == 1: 53 | ret = ret.unsqueeze(0) 54 | ret[:, :2] += ret[:, 2:] / 2 55 | ret[:, 2] /= ret[:, 3] 56 | return ret 57 | 58 | 59 | def xyah_to_tlwh(xyah): 60 | """Get current position in bounding box format `(top left x, top left y, 61 | width, height)`. 62 | """ 63 | ret = xyah.clone() 64 | if ret.dim() == 1: 65 | ret = ret.unsqueeze(0) 66 | ret[:, 2] *= ret[:, 3] 67 | ret[:, :2] -= ret[:, 2:] / 2 68 | return ret 69 | 70 | 71 | def tlwh_to_tlbr(tlwh): 72 | """Convert bounding box to format `(top left x, top left y, bottom right 73 | x, bottom right y)`. 74 | """ 75 | ret = tlwh.clone() 76 | if ret.dim() == 1: 77 | ret = ret.unsqueeze(0) 78 | ret[:, 2:] += ret[:, :2] 79 | return ret 80 | 81 | 82 | def expand_boxes(in_boxes, factor): 83 | boxes = in_boxes.clone() 84 | cx, cy = boxes[:, 0] + boxes[:, 2] / 2, boxes[:, 1] + boxes[:, 3] / 2 85 | w, h = boxes[:, 2] * factor, boxes[:, 3] * factor 86 | boxes[:, 0] = cx - w / 2 87 | boxes[:, 1] = cy - h / 2 88 | boxes[:, 2] = w 89 | boxes[:, 3] = h 90 | return boxes 91 | 92 | 93 | def remove_border_boxes(boxes, border): 94 | xy1x2y2 = tlwh_to_tlbr(boxes) 95 | keep = ( 96 | (xy1x2y2[:, 0] > border) 97 | & (xy1x2y2[:, 1] > border) 98 | & (xy1x2y2[:, 2] < (1920 - border)) 99 | & (xy1x2y2[:, 3] < (1080 - border)) 100 | ) 101 | return keep 102 | 103 | 104 | def size_filter(boxes, size_min, size_max): 105 | sizes = boxes[:, 2] * boxes[:, 3] 106 | keep = (sizes >= size_min) & (sizes <= size_max) 107 | return keep 108 | 109 | 110 | def mpl_cmap_to_rgb(cmap_name: str, seed: int = 0) -> List[Tuple[int, int, int]]: 111 | """Returns a list of RGB values from a matplotlib colormap.""" 112 | cmap = plt.get_cmap(cmap_name) 113 | colors = [] 114 | for i in range(cmap.N): 115 | rgb = cmap(i)[:3] 116 | colors.append(tuple(int(255 * c) for c in rgb)) 117 | random.seed(seed) 118 | random.shuffle(colors) 119 | return colors 120 | 121 | 122 | def render_image_grid(images: List[torch.Tensor], *args, **kwargs) -> torch.Tensor: 123 | """Renders a grid of images. 124 | 125 | Args: 126 | images (List[torch.Tensor]): List of N images of shape (C, H, W). 127 | *args: Additional arguments to pass to the make_grid function. 128 | **kwargs: Additional keyword arguments to pass to the make_grid function. 129 | 130 | Returns: 131 | torch.Tensor: Image grid of shape (C, H, W). 
132 | """ 133 | images = torch.stack(images) 134 | nrow = math.ceil(math.sqrt(len(images))) 135 | return make_grid(images, nrow=nrow, *args, **kwargs) 136 | 137 | 138 | def render_images_with_boxes( 139 | image: torch.Tensor, 140 | boxes: Optional[torch.Tensor] = None, 141 | labels: Optional[torch.Tensor] = None, 142 | confs: Optional[torch.Tensor] = None, 143 | colors: Optional[List[Tuple[int, int, int]]] = None, 144 | *args, 145 | **kwargs, 146 | ) -> List[torch.Tensor]: 147 | """Render image with bounding boxes. Colors correspond to the label index. Boxes are 148 | expected to be in MOT-format, i.e., (bb_left, bb_top, bb_widht, bb_height). 149 | 150 | Args: 151 | images (torch.Tensor): Image of shape (C, H, W). 152 | boxes (torch.Tensor): Boxes of shape (K, 4). 153 | labels (torch.Tensor): Label of shape (K,). 154 | colors (Optional[List[Tuple[int, int, int]]]): List of RGB colors. Defaults to None. 155 | *args: Additional arguments to pass to the draw_bounding_boxes function. 156 | **kwargs: Additional keyword arguments to pass to the draw_bounding_boxes function. 157 | 158 | Returns: 159 | torch.Tensor: Image with bounding boxes. 160 | """ 161 | if boxes is None: 162 | return image 163 | 164 | if colors is None: 165 | colors = mpl_cmap_to_rgb("rainbow") 166 | 167 | if labels is None: 168 | labels = torch.zeros(boxes.size(0)) 169 | 170 | color_palette = [colors[label % len(colors)] for label in labels] 171 | 172 | _labels = [str(label.item()) for i, label in enumerate(labels)] 173 | 174 | if confs is not None: 175 | _labels = [f"{label} ({conf.item():.2f})" for label, conf in zip(_labels, confs)] 176 | 177 | img = image.clone() 178 | bxs = boxes.clone() 179 | bxs[:, 2:] += bxs[:, :2] 180 | 181 | img = draw_bounding_boxes( 182 | img, 183 | bxs, 184 | labels=_labels, 185 | colors=color_palette, 186 | *args, 187 | **kwargs, 188 | ) 189 | return img 190 | 191 | 192 | def normalize_features(x): 193 | # shape of x: (C, N, F) 194 | # normalize features per channelg 195 | mean = x.mean(dim=2, keepdim=True) 196 | std = x.std(dim=2, keepdim=True) + 1e-8 197 | return (x - mean) / std 198 | 199 | 200 | def nanmax(x, dim=None): 201 | """Function like torch.nanmean for max.""" 202 | mask = torch.isnan(x) 203 | x_masked = torch.where(mask, torch.tensor(float("-inf")).to(x.device), x) 204 | max_vals, _ = torch.max(x_masked, dim=dim) 205 | 206 | # Restore NaN values if max is -inf (because all were NaN along dimension) 207 | max_vals = torch.where(max_vals == float("-inf"), torch.tensor(float("nan")).to(x.device), max_vals) 208 | return max_vals 209 | -------------------------------------------------------------------------------- /src/tracker/supertrack.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | 3 | import torch 4 | 5 | from ..utils.utils import tlwh_to_tlbr 6 | 7 | 8 | class TrackState(IntEnum): 9 | CREATED = 0 # Track is created but not confirmed yet 10 | ACTIVE = 1 # Track is confirmed and active 11 | LOST = 3 # Track is lost and not tracked, but kept in memory 12 | KILLED = 4 # Track is killed (e.g. 
due to merging with another track) 13 | 14 | 15 | class SuperTrack: 16 | def __init__( 17 | self, 18 | frame, 19 | features, 20 | boxes, 21 | positions_2d, 22 | positions_3d, 23 | confidence=None, 24 | ): 25 | self.frame = frame 26 | self.last_update = frame 27 | 28 | self.n_cams = features.size(0) 29 | self.features = features 30 | self.boxes = boxes 31 | self.positions_2d = positions_2d 32 | self.positions_3d = positions_3d 33 | 34 | self.label = None 35 | self.__state = TrackState.CREATED # private state variable 36 | 37 | # inactivity counter: how many frames since last update at each camera 38 | self.inactive_since = torch.zeros(self.n_cams, device=features.device) 39 | 40 | self.lost_since = 0 41 | 42 | # where to continue tracking: if False, track is not continued in this camera 43 | self.track_where = torch.ones(self.n_cams, device=features.device).bool() 44 | self.track_where[torch.isnan(features).any(dim=1)] = False 45 | 46 | # cams the track hasn't been seen in yet 47 | self.queries = torch.ones(self.n_cams, device=features.device).bool() 48 | 49 | # count updates for each camera 50 | self.ticks = torch.ones(self.n_cams, device=features.device) 51 | 52 | self.confidence = confidence 53 | 54 | self.velocities_2d = torch.zeros((self.n_cams, 4), device=features.device) 55 | self.velocities_3d = torch.zeros((self.n_cams, 2), device=features.device) 56 | 57 | @classmethod 58 | def empty(cls, n_cams, fdim, device): 59 | return cls( 60 | frame=None, 61 | features=torch.full((n_cams, fdim), float("nan"), device=device), 62 | boxes=torch.full((n_cams, 4), float("nan"), device=device), 63 | positions_2d=torch.full((n_cams, 2), float("nan"), device=device), 64 | positions_3d=torch.full((n_cams, 3), float("nan"), device=device), 65 | ) 66 | 67 | def activate(self): 68 | self.__state = TrackState.ACTIVE 69 | 70 | def deactivate(self): 71 | self.__state = TrackState.LOST 72 | 73 | def kill(self): 74 | self.__state = TrackState.KILLED 75 | 76 | def reset(self, cams=None): 77 | if cams is None: 78 | cams = range(self.n_cams) 79 | for cam in cams: 80 | self.track_where[cam] = False 81 | # self.inactive_since[cam] = 0 82 | 83 | def set_label(self, label): 84 | if self.label is not None: 85 | raise ValueError(f"Track {self} is already labeled.") 86 | self.label = label 87 | 88 | @property 89 | def keys(self): 90 | return ~self.queries 91 | 92 | @property 93 | def state(self): 94 | return self.__state 95 | 96 | @property 97 | def tlbr(self): 98 | return tlwh_to_tlbr(self.boxes) 99 | 100 | def is_complete(self): 101 | return ~torch.isnan(self.features).any() 102 | 103 | @property 104 | def p_features(self): 105 | return self.phantomize(self.features) 106 | 107 | @property 108 | def p_positions(self): 109 | return self.phantomize(self.positions_3d) 110 | 111 | @property 112 | def mean_positions_3d(self): 113 | return torch.nanmean(self.positions_3d, dim=0) 114 | 115 | @staticmethod 116 | def phantomize(tensor): 117 | """ 118 | Given a (B, n_cams, f_dim) tensor, replace nans with the average of 119 | the non-nan values along the cam axis. 
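For example, for a two-camera feature matrix, phantomize(torch.tensor([[1.0, float("nan")], [3.0, 4.0]])) yields [[1., 4.], [3., 4.]]; the missing entry is filled with the nanmean of its column.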
120 | """ 121 | return torch.where(torch.isnan(tensor), torch.nanmean(tensor, dim=0, keepdim=True), tensor) 122 | 123 | def update(self, other): 124 | n_cams = self.features.size(0) 125 | if self.frame == other.frame: 126 | for cam in range(n_cams): 127 | if torch.isnan(self.features[cam]).any(): 128 | if torch.isnan(other.features[cam]).any(): 129 | continue 130 | self.features[cam] = other.features[cam] 131 | self.boxes[cam] = other.boxes[cam] 132 | self.positions_2d[cam] = other.positions_2d[cam] 133 | self.positions_3d[cam] = other.positions_3d[cam] 134 | self.inactive_since[cam] = 0 135 | self.track_where[cam] = True 136 | self.queries[cam] = False 137 | self.ticks[cam] = other.ticks[cam] 138 | else: 139 | if not torch.isnan(other.features[cam]).any(): 140 | raise ValueError(f"Found violation of constraints for track update with {self}.") 141 | elif self.frame < other.frame: 142 | for cam in range(n_cams): 143 | if not torch.isnan(other.features[cam]).any(): 144 | if not torch.isnan(self.features[cam]).any(): 145 | if self.velocities_2d[cam].sum() == 0: 146 | w = 1.0 147 | else: 148 | w = 0.8 149 | self.velocities_2d[cam] = ( 150 | w * (other.boxes[cam] - self.boxes[cam]) / (other.frame - self.frame) 151 | + (1 - w) * self.velocities_2d[cam] 152 | ) 153 | self.velocities_3d[cam] = ( 154 | w * (other.positions_3d[cam] - self.positions_3d[cam]) / (other.frame - self.frame) 155 | + (1 - w) * self.velocities_3d[cam] 156 | ) 157 | self.features[cam] = 0.9 * self.features[cam] + 0.1 * other.features[cam] 158 | self.boxes[cam] = other.boxes[cam] 159 | self.positions_2d[cam] = other.positions_2d[cam] 160 | self.positions_3d[cam] = other.positions_3d[cam] 161 | self.inactive_since[cam] = 0 162 | self.track_where[cam] = True 163 | self.queries[cam] = False 164 | self.ticks[cam] += 1 165 | else: 166 | self.features[cam] = other.features[cam] 167 | self.boxes[cam] = other.boxes[cam] 168 | self.positions_2d[cam] = other.positions_2d[cam] 169 | self.positions_3d[cam] = other.positions_3d[cam] 170 | self.inactive_since[cam] = 0 171 | self.track_where[cam] = True 172 | self.queries[cam] = False 173 | self.ticks[cam] = other.ticks[cam] 174 | else: 175 | if self.track_where[cam]: 176 | self.inactive_since[cam] += 1 177 | else: 178 | raise ValueError( 179 | f"Frame of other must be greater or equal to frame of self, but got {self.frame} and {other.frame}." 180 | ) 181 | self.last_update = other.frame 182 | self.frame = other.frame 183 | 184 | if self.state == TrackState.LOST: 185 | self.activate() 186 | 187 | def predict(self): 188 | for cam in range(self.n_cams): 189 | if ~self.track_where[cam]: 190 | continue 191 | prd_box = self.boxes[cam] + self.velocities_2d[cam] 192 | prd_pos = self.positions_3d[cam] + self.velocities_3d[cam] 193 | if prd_box[2] <= 0 or prd_box[3] <= 0: 194 | prd_box = self.boxes[cam] 195 | prd_pos = self.positions_3d[cam] 196 | self.boxes[cam] = prd_box 197 | self.positions_3d[cam] = prd_pos 198 | 199 | def merge(self, other): 200 | if other.state == TrackState.KILLED or self.state == TrackState.KILLED: 201 | raise ValueError("Cannot merge killed tracks.") 202 | if other.frame < self.frame: 203 | raise ValueError( 204 | f"Other track must not be older than self, but " 205 | f"self is at frame {self.frame} and other at frame {other.frame}." 
206 | ) 207 | self.update( 208 | other.frame, 209 | other.features, 210 | other.boxes, 211 | other.positions_2d, 212 | other.positions_3d, 213 | ) 214 | # other was merged into self, so it is killed 215 | other.kill() 216 | 217 | def split(self, where: torch.Tensor): 218 | # keep the cams where "where" is True 219 | other_features = self.features.clone() 220 | other_boxes = self.boxes.clone() 221 | other_positions_2d = self.positions_2d.clone() 222 | other_positions_3d = self.positions_3d.clone() 223 | for w in where: 224 | if not w: 225 | self.features[w] = torch.nan 226 | self.boxes[w] = torch.nan 227 | self.positions_2d[w] = torch.nan 228 | self.positions_3d[w] = torch.nan 229 | else: 230 | other_features[w] = torch.nan 231 | other_boxes[w] = torch.nan 232 | other_positions_2d[w] = torch.nan 233 | other_positions_3d[w] = torch.nan 234 | return SuperTrack( 235 | frame=self.frame, 236 | features=other_features, 237 | boxes=other_boxes, 238 | positions_2d=other_positions_2d, 239 | positions_3d=other_positions_3d, 240 | ) 241 | 242 | def __repr__(self): 243 | return f"Track {self.label}" 244 | 245 | def to_tensor(self): 246 | output = [] 247 | if self.state == TrackState.LOST: 248 | return torch.Tensor(output) 249 | for i, box in enumerate(self.boxes): 250 | if ~self.track_where[i]: 251 | continue 252 | row = [i, self.label, self.frame, *box, *self.mean_positions_3d] 253 | output.append(row) 254 | return torch.Tensor(output) 255 | -------------------------------------------------------------------------------- /src/utils/evaluate.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import os 3 | import pathlib 4 | from typing import Dict, List, Optional, Union 5 | 6 | import motmetrics as mm 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | from sklearn import metrics 11 | 12 | 13 | GT_COLUMNS = [ 14 | "frame", 15 | "id", 16 | "bb_left", 17 | "bb_top", 18 | "bb_width", 19 | "bb_height", 20 | "conf", 21 | "x", 22 | "y", 23 | "z", 24 | ] 25 | 26 | 27 | def get_hota_setup(): 28 | metrics = ["deta_alpha", "assa_alpha", "hota_alpha"] 29 | namemap = mm.io.motchallenge_metric_names 30 | namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) 31 | return metrics, namemap 32 | 33 | 34 | def evaluate_tracker(tracker_results, dataloader, hota_mode=False, bev_mode=False): 35 | gt_dfs = [pd.DataFrame(gt, columns=GT_COLUMNS) for gt in dataloader.dataset._ground_truths] 36 | ht_dfs = results_to_dfs(tracker_results) 37 | 38 | n_frames = [int(df["frame"].max()) for df in gt_dfs] 39 | 40 | gt_dfs = [mot_to_mm(df) for df in gt_dfs] 41 | ht_dfs = [mot_to_mm(df) for df in ht_dfs] 42 | 43 | gt_df = combine_dataframes(gt_dfs, n_frames) 44 | ht_df = combine_dataframes(ht_dfs, n_frames) 45 | 46 | # put column "x" to "X" 47 | if bev_mode: 48 | ht_df["X"] = ht_df["x"] 49 | ht_df["Y"] = ht_df["y"] 50 | gt_df["X"] = gt_df["x"] 51 | gt_df["Y"] = gt_df["y"] 52 | 53 | return evaluate_single_scene(ht_df, gt_df, hota_mode=hota_mode, bev_mode=bev_mode) 54 | 55 | 56 | def results_to_dfs(tracker_results: torch.Tensor) -> List[pd.DataFrame]: 57 | """Converts a tensor of results to a list of dataframes. Input tensor has format 58 | 59 | CAM_ID, OBJ_ID, FRAME_ID, X, Y, W, H, X_WORLD, Y_WORLD 60 | 61 | and resulting (n_cams) dataframes have columns 62 | 63 | frame, id, bb_left, bb_top, bb_width, bb_height, conf, x, y, z 64 | 65 | Args: 66 | tracker_results (torch.Tensor): Results tensor. 
67 | Returns: 68 | List[pd.DataFrame]: List of dataframes. 69 | """ 70 | results = tracker_results.clone() 71 | results[:, [1, 2]] = results[:, [2, 1]] 72 | results = torch.cat((results[:, :7], torch.ones(results.shape[0], 1), results[:, 7:]), dim=1) 73 | results = torch.cat((results, -torch.ones(results.shape[0], 1)), dim=1) 74 | cam_res = [results[results[:, 0] == c][:, 1:] for c in torch.unique(results[:, 0]).cpu().numpy()] 75 | return [pd.DataFrame(res, columns=GT_COLUMNS) for res in cam_res] 76 | 77 | 78 | def evaluate_multi_scene(prediction_dfs, ground_truth_dfs, names=None, hota_mode=False, bev_mode=False): 79 | """Takes prediction and ground truth dataframes and runs motmetrics evaluation 80 | on a multiple scenes. For evaluation of multi-camera scenes, first combine a 81 | list of single-camera predictions and ground truths using `combine_dataframes` 82 | Args: 83 | prediction_dfs (_type_): _description_ 84 | ground_truth_dfs (_type_): _description_ 85 | names (_type_, optional): _description_. Defaults to None. 86 | Returns: 87 | _type_: _description_ 88 | """ 89 | if names is None: 90 | names = ["Untitled %s" % (i + 1) for i in range(len(prediction_dfs))] 91 | ground_truths = dict(zip(names, ground_truth_dfs)) 92 | predictions = dict(zip(names, prediction_dfs)) 93 | accs = [] 94 | names = [] 95 | 96 | if bev_mode: 97 | distfields = ["X", "Y"] 98 | dist = "seuc" 99 | distth = 1.0 100 | else: 101 | distfields = ["X", "Y", "Width", "Height"] 102 | dist = "iou" 103 | distth = 0.5 104 | 105 | for name, prediction in predictions.items(): 106 | if hota_mode: 107 | raise NotImplementedError 108 | else: 109 | accs.append( 110 | mm.utils.compare_to_groundtruth( 111 | ground_truths[name], prediction, dist=dist, distfields=distfields, distth=distth 112 | ) 113 | ) 114 | metrics = mm.metrics.motchallenge_metrics 115 | namemap = mm.io.motchallenge_metric_names 116 | names.append(name) 117 | 118 | mh = mm.metrics.create() 119 | 120 | summary = mh.compute_many( 121 | accs, 122 | names=names, 123 | metrics=metrics, 124 | generate_overall=True, 125 | ) 126 | namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) 127 | print(mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap)) 128 | strsummary = mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap) 129 | return summary, strsummary 130 | 131 | 132 | def evaluate_single_scene(prediction_df, ground_truth_df, hota_mode=False, bev_mode=False, name=None) -> pd.DataFrame: 133 | """Takes a prediction and ground truth dataframe and runs motmetrics evaluation 134 | on a single scene. For evaluation of multi-camera scenes, first combine a list 135 | of single-camera predictions and ground truths using `combine_dataframes`. 136 | Args: 137 | prediction_df (_type_): Multi-camera predictions. 138 | ground_truth_df (_type_): Multi-camera ground truth. 139 | name (str): Scene name. Defaults to None. 140 | """ 141 | return evaluate_multi_scene([prediction_df], [ground_truth_df], [name], hota_mode, bev_mode) 142 | 143 | 144 | def mot_to_mm(df: pd.DataFrame) -> pd.DataFrame: 145 | """Takes a MOT-style dataframe (with named columns [frame, id, ...]) 146 | and converts it to a dataframe with column names required by motmetrics. 147 | Args: 148 | df (pd.DataFrame): Input MOT-style dataframe. 149 | Returns: 150 | pd.DataFrame: Output dataframe ready to use in motmetrics evaluation. 
151 | """ 152 | _df = df.rename( 153 | columns={ 154 | "frame": "FrameId", 155 | "id": "Id", 156 | "bb_left": "X", 157 | "bb_top": "Y", 158 | "bb_width": "Width", 159 | "bb_height": "Height", 160 | "conf": "Confidence", 161 | } 162 | ) 163 | columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] 164 | columns_to_float = ["Confidence"] 165 | _df[columns_to_int] = _df[columns_to_int].astype(int) 166 | _df[columns_to_float] = _df[columns_to_float].astype(float) 167 | return _df 168 | 169 | 170 | def read_txt(path: Union[str, pathlib.Path]) -> pd.DataFrame: 171 | _df = pd.read_csv(path, names=GT_COLUMNS) 172 | _df = _df.rename( 173 | columns={ 174 | "frame": "FrameId", 175 | "id": "Id", 176 | "bb_left": "X", 177 | "bb_top": "Y", 178 | "bb_width": "Width", 179 | "bb_height": "Height", 180 | "conf": "Confidence", 181 | } 182 | ) 183 | columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] 184 | columns_to_float = ["Confidence"] 185 | _df[columns_to_int] = _df[columns_to_int].astype(int) 186 | _df[columns_to_float] = _df[columns_to_float].astype(float) 187 | return _df 188 | 189 | 190 | def read_seqinfo(path: Union[str, pathlib.Path]) -> Dict: 191 | parser = configparser.ConfigParser() 192 | parser.read(path) 193 | return dict(parser["Sequence"]) 194 | 195 | 196 | def combine_dataframes(dataframes: List[pd.DataFrame], n_frames: Optional[List[int]] = None) -> pd.DataFrame: 197 | """Takes a list of single-camera dataframes and combines them for 198 | multi-camera evaluation. 199 | Args: 200 | dataframes (List[pd.DataFrame]): List of single-camera dataframes. 201 | n_frames (Optional[List[int]], optional): Defaults to None. 202 | Returns: 203 | pd.DataFrame: Multi-camera dataframe. 204 | """ 205 | if n_frames is None: 206 | n_frames = [int(df["FrameId"].max()) for df in dataframes] 207 | count_frames = 0 208 | dfs = [] 209 | for j, df in enumerate(dataframes): 210 | df["FrameId"] += count_frames 211 | count_frames += int(n_frames[j]) 212 | dfs.append(df) 213 | return pd.concat(dfs).set_index(["FrameId", "Id"]) 214 | 215 | 216 | def evaluate_mtmc( 217 | data_paths: List[Union[str, pathlib.Path]], 218 | prediction_path: Union[str, pathlib.Path], 219 | scene_name: str, 220 | hota_mode=False, 221 | bev_mode=False, 222 | ): 223 | seqinfos = [read_seqinfo(os.path.join(path, "seqinfo.ini")) for path in data_paths] 224 | ground_truths = [read_txt(os.path.join(path, "gt", "gt.txt")) for path in data_paths] 225 | prediction_paths = [os.path.join(prediction_path, seqinfo["name"] + ".txt") for seqinfo in seqinfos] 226 | predictions = [read_txt(path) for path in prediction_paths] 227 | ground_truth_df = combine_dataframes(ground_truths, [seqinfo["seqlength"] for seqinfo in seqinfos]) 228 | prediction_df = combine_dataframes(predictions, [seqinfo["seqlength"] for seqinfo in seqinfos]) 229 | 230 | ground_truths = {scene_name: ground_truth_df} 231 | predictions = {scene_name: prediction_df} 232 | 233 | 234 | def evaluate_synthehicle_json(prediction, ground_truth): 235 | preds_to_eval = [] 236 | truths_to_eval = [] 237 | names = [] 238 | for scene in ground_truth.keys(): 239 | if scene in prediction.keys(): 240 | gcams = ground_truth[scene] 241 | pcams = prediction[scene] 242 | preds_to_combine = [] 243 | truths_to_combine = [] 244 | for cam in gcams.keys(): 245 | if cam not in pcams.keys(): 246 | prediction[scene][cam] = [[1, 1, 0, 0, 0, 0, 1, -1, -1, -1]] 247 | preds_to_combine.append(mot_to_mm(pd.DataFrame(prediction[scene][cam], columns=GT_COLUMNS))) 248 | 
truths_to_combine.append(mot_to_mm(pd.DataFrame(ground_truth[scene][cam], columns=GT_COLUMNS))) 249 | names.append(scene) 250 | preds_to_eval.append(combine_dataframes(preds_to_combine, n_frames=[1800] * len(preds_to_combine))) 251 | truths_to_eval.append(combine_dataframes(truths_to_combine, n_frames=[1800] * len(truths_to_combine))) 252 | return evaluate_multi_scene(preds_to_eval, truths_to_eval, names) 253 | 254 | 255 | def clustering_performance(y_true, y_pred): 256 | y_t, y_p = y_true.cpu().numpy(), y_pred.cpu().numpy() 257 | return { 258 | "ARI": metrics.adjusted_rand_score(y_t, y_p), 259 | "AMI": metrics.adjusted_mutual_info_score(y_t, y_p), 260 | } 261 | -------------------------------------------------------------------------------- /src/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import pathlib 4 | import warnings 5 | from enum import IntEnum 6 | from typing import List, Optional 7 | 8 | import numpy as np 9 | import torch 10 | from loguru import logger 11 | from torch.utils.data import DataLoader 12 | from torchvision.io import ImageReadMode, read_image 13 | from torchvision.ops import nms 14 | 15 | from ..tracker.geometry import Projector 16 | from ..utils.utils import compute_centers, resize_transform, tlwh_to_tlbr 17 | 18 | 19 | class Annotation(IntEnum): 20 | CAM_ID = 0 21 | OBJ_ID = 1 22 | FRAME_ID = 2 23 | XMIN = 3 24 | YMIN = 4 25 | WIDTH = 5 26 | HEIGHT = 6 27 | CONF = 7 28 | XWORLD = 8 29 | YWORLD = 9 30 | 31 | 32 | class NMSTransform: 33 | def __init__(self, iou_threshold: float): 34 | """Initialize the NMSTransform which applied non-maximum suppression to the 35 | input annotations based on the specified IoU threshold. 36 | 37 | Args: 38 | iou_threshold (float): The Intersection over Union (IoU) threshold for NMS. 39 | Bounding boxes with IoU greater than this threshold will be suppressed. 40 | """ 41 | self.iou_threshold = iou_threshold 42 | 43 | def __call__(self, annotations: torch.Tensor) -> torch.Tensor: 44 | boxes = tlwh_to_tlbr(annotations[:, Annotation.XMIN : Annotation.HEIGHT + 1]) 45 | scores = annotations[:, Annotation.CONF] 46 | keep = nms(boxes, scores, self.iou_threshold) 47 | return keep 48 | 49 | 50 | class ROIFilter: 51 | def __init__(self, roi_path: str): 52 | """Initialize the ROIFilter. 53 | 54 | Args: 55 | roi_path (str): Path to the ROI image file. 56 | 57 | The ROI (Region of Interest) image is loaded as a binary mask, 58 | where 1 indicates areas of interest and 0 indicates areas to be filtered out. 
59 | """ 60 | self.roi = read_image(roi_path, ImageReadMode.GRAY).squeeze(0).bool() 61 | self.size = self.roi.size() 62 | 63 | def __call__(self, annotations: torch.Tensor) -> torch.Tensor: 64 | centers = compute_centers(annotations[:, Annotation.XMIN - 1 : Annotation.HEIGHT]).int() 65 | centers[:, 0] = torch.clamp(centers[:, 0], 0, self.size[1] - 1) 66 | centers[:, 1] = torch.clamp(centers[:, 1], 0, self.size[0] - 1) 67 | keep = self.roi[centers[:, 1], centers[:, 0]] == 1 68 | return keep 69 | 70 | 71 | class MultiCamDataset: 72 | def __init__( 73 | self, 74 | annotation_paths: List[str], 75 | image_paths: List[str], 76 | calibration_paths: List[str], 77 | camera_names: List[int], 78 | ground_truth_paths: Optional[List[str]] = None, 79 | precomputed: bool = False, 80 | nms_threshold: Optional[float] = 0.9, 81 | time_offsets: Optional[List[int]] = None, 82 | roi_paths: Optional[List[str]] = None, 83 | normalize_bev: bool = False, 84 | bottom: bool = True, 85 | box_projection_centers=None, 86 | ): 87 | """Initialize the MultiCamDataset for data loading. 88 | 89 | Args: 90 | annotation_paths (List[str]): Paths to annotation files for each camera. 91 | image_paths (List[str]): Paths to image directories for each camera. 92 | calibration_paths (List[str]): Paths to calibration files for each camera. 93 | camera_names (List[int]): Names or IDs of the cameras. 94 | ground_truth_paths (Optional[List[str]], optional): Paths to ground truth files. Defaults to None. 95 | precomputed (bool, optional): Whether to use precomputed features. Defaults to False. 96 | nms_threshold (Optional[float], optional): Non-maximum suppression threshold. Defaults to 0.9. 97 | time_offsets (Optional[List[int]], optional): Time offsets for each camera. Defaults to None. 98 | roi_paths (Optional[List[str]], optional): Paths to region of interest mask images. Defaults to None. 99 | normalize_bev (bool, optional): Whether to normalize bird's-eye view coordinates. Defaults to False. 100 | bottom (bool, optional): Whether to use bottom of bounding box for projection. Defaults to True. 101 | box_projection_centers (Optional[Tuple[float, float]], optional): Projection centers for bounding boxes. Defaults to None. 102 | """ 103 | if time_offsets is None: 104 | self.time_offsets = [0] * len(image_paths) 105 | else: 106 | self.time_offsets = time_offsets 107 | 108 | self.annotation_paths = annotation_paths 109 | self.image_paths = image_paths 110 | self.calibration_paths = calibration_paths 111 | self.camera_names = camera_names 112 | self.precomputed = precomputed 113 | self.nms_transform = NMSTransform(nms_threshold) if nms_threshold is not None else None 114 | self.box_projection_centers = box_projection_centers 115 | self.bottom = bottom 116 | 117 | self.normalize_bev = normalize_bev 118 | 119 | if roi_paths is not None: 120 | self.roi_filters = [ROIFilter(roi_path) for roi_path in roi_paths] 121 | else: 122 | self.roi_filters = None 123 | 124 | self._load_calibrations() 125 | self._load_annotations() 126 | 127 | if ground_truth_paths is not None: 128 | self._load_ground_truth(ground_truth_paths) 129 | else: 130 | self._ground_truths = None 131 | self.gts = None 132 | 133 | self.length = max([len(list(pathlib.Path(image_path).glob("*.jpg"))) for image_path in self.image_paths]) 134 | 135 | if self.length == 0: 136 | warnings.warn("No images found. 
Visualization tools will not be available.")
137 | 
138 |             self.length = 2110  # hard-coded fallback sequence length used when no frame images are found
139 | 
140 |         self._filtered_by_nms = 0
141 |         self._filtered_by_size = 0
142 |         self._filtered_by_roi = 0
143 | 
144 |     def _load_ground_truth(self, ground_truth_paths):
145 |         self._ground_truths = [
146 |             torch.from_numpy(np.loadtxt(ground_truth_path, delimiter=",", dtype=np.float32))
147 |             for ground_truth_path in ground_truth_paths
148 |         ]
149 | 
150 |         for i, gt in enumerate(self._ground_truths):
151 |             if gt.shape[1] == 9:
152 |                 # append a confidence column of ones (write back into the list; reassigning the loop variable alone has no effect)
153 |                 self._ground_truths[i] = torch.cat((gt, torch.ones(gt.shape[0], 1)), dim=1)
154 | 
155 |         _cat_gts = [g.clone() for g in self._ground_truths]
156 |         for i, gt in enumerate(_cat_gts):
157 |             col = torch.ones((gt.shape[0], 1)) * i
158 |             _cat_gts[i] = torch.cat((col, gt), dim=1)
159 |             _cat_gts[i][:, 1] += self.time_offsets[i]
160 | 
161 |         self.gts = torch.cat(_cat_gts, dim=0)
162 |         self.gts[:, [1, 2]] = self.gts[:, [2, 1]]
163 | 
164 |     def _load_calibrations(self):
165 |         self._projectors = [Projector(calibration_path) for calibration_path in self.calibration_paths]
166 | 
167 |     def _load_annotations(self):
168 |         anns = [
169 |             torch.from_numpy(np.loadtxt(annotation_path, delimiter=",", dtype=np.float32))
170 |             for annotation_path in self.annotation_paths
171 |         ]
172 | 
173 |         # todo: add to preprocess config
174 |         for i, ann in enumerate(anns):
175 |             keep = (ann[:, Annotation.WIDTH - 1] * ann[:, Annotation.HEIGHT - 1]) >= 1200  # cam id column not yet prepended, hence the -1 offsets
176 |             anns[i] = ann[keep]
177 | 
178 |             # filter roi images
179 |             if self.roi_filters is not None:
180 |                 keep = self.roi_filters[i](anns[i])
181 |                 anns[i] = anns[i][keep]
182 |                 logger.info(f"🔥 Filtered {keep.size(0) - keep.sum().item()} annotations by ROI.")
183 | 
184 |         for i, ann in enumerate(anns):
185 |             col = torch.ones((ann.shape[0], 1)) * i
186 |             anns[i] = torch.cat((col, ann), dim=1)
187 |             anns[i][:, 1] += self.time_offsets[i]
188 | 
189 |         positions_2d = []
190 |         for i, ann in enumerate(anns):
191 |             pos2d = compute_centers(
192 |                 ann[:, Annotation.XMIN : Annotation.HEIGHT + 1], self.bottom, self.box_projection_centers
193 |             )
194 |             positions_2d.append(pos2d)
195 | 
196 |         positions_3d = []
197 |         for i, pos2d in enumerate(positions_2d):
198 |             pos3d = self._projectors[i].image_to_world(pos2d)
199 |             positions_3d.append(pos3d)
200 | 
201 |         anns = torch.cat(anns, dim=0)
202 |         positions_2d = torch.cat(positions_2d, dim=0)
203 |         positions_3d = torch.cat(positions_3d, dim=0)
204 | 
205 |         if anns.shape[1] == 9:
206 |             # loaded from ground truth, append column of 1s as 7th column
207 |             anns = torch.cat(
208 |                 (
209 |                     anns[:, :6],
210 |                     torch.ones(anns.shape[0], 1),
211 |                     anns[:, 6:],
212 |                 ),
213 |                 dim=1,
214 |             )
215 |         # swap columns frame and obj_id
216 |         anns[:, [1, 2]] = anns[:, [2, 1]]
217 | 
218 |         self._annotations = anns
219 |         self._positions_2d = positions_2d
220 |         self._positions_3d = positions_3d
221 | 
222 |         if self.normalize_bev:
223 |             self.apply_bev_norm()
224 |         else:
225 |             self._norm_factors = None
226 | 
227 |         self._annotations.to("cuda")  # note: Tensor.to() is not in-place, so these three calls have no effect; samples are moved to the device later in the tracker
228 |         self._positions_2d.to("cuda")
229 |         self._positions_3d.to("cuda")
230 | 
231 |     def get_bev_ticks(self):
232 |         return [
233 |             float(torch.min(self._positions_3d[:, 0])),
234 |             float(torch.max(self._positions_3d[:, 0])),
235 |             float(torch.min(self._positions_3d[:, 1])),
236 |             float(torch.max(self._positions_3d[:, 1])),
237 |         ]
238 | 
239 |     def get_crops(self, frame_annotations, frame_images):
240 |         crops = []
241 |         for ann in frame_annotations:
242 |             cam_id = int(ann[Annotation.CAM_ID])
243 |             x, y, w, h = ann[Annotation.XMIN : Annotation.HEIGHT + 1].int()
244 |             # clamp to the image dimensions (images are C x H x W, so size(2) is the width and size(1) the height)
245 |             x = torch.clamp(x, 0, frame_images[cam_id].size(2) - 1)
246 |             y = torch.clamp(y, 0, frame_images[cam_id].size(1) - 1)
247 |             w = torch.clamp(w, 0, frame_images[cam_id].size(2) - x)
248 |             h = torch.clamp(h, 0, frame_images[cam_id].size(1) - y)
249 |             crops.append(resize_transform(frame_images[cam_id][:, y : y + h, x : x + w]))
250 |         if len(crops) == 0:
251 |             return torch.empty(0)
252 |         return torch.stack(crops)
253 | 
254 |     def apply_bev_norm(self):
255 |         # normalize BEV positions to [0, 1]
256 |         logger.info("📏 Normalizing BEV positions to [0, 1].")
257 |         min_x, min_y = torch.min(self._positions_3d, dim=0)[0]
258 |         max_x, max_y = torch.max(self._positions_3d, dim=0)[0]
259 |         self._norm_factors = torch.tensor([min_x, min_y, max_x, max_y])
260 |         self._positions_3d = (self._positions_3d - torch.tensor([min_x, min_y])) / torch.tensor(
261 |             [max_x - min_x, max_y - min_y]
262 |         )
263 | 
264 |     def __len__(self):
265 |         return self.length
266 | 
267 |     def __getitem__(self, idx):
268 |         frame = idx + 1
269 | 
270 |         annotations = self._annotations[self._annotations[:, Annotation.FRAME_ID] == frame]
271 |         positions_2d = self._positions_2d[self._annotations[:, Annotation.FRAME_ID] == frame]
272 |         positions_3d = self._positions_3d[self._annotations[:, Annotation.FRAME_ID] == frame]
273 | 
274 |         if self.gts is not None:
275 |             ground_truth = self.gts[self.gts[:, Annotation.FRAME_ID] == frame]
276 |         else:
277 |             ground_truth = torch.empty(0)
278 | 
279 |         if self.nms_transform is not None:
280 |             keep = self.nms_transform(annotations)
281 |         else:
282 |             keep = torch.arange(annotations.size(0))
283 | 
284 |         annotations = annotations[keep]
285 |         positions_2d = positions_2d[keep]
286 |         positions_3d = positions_3d[keep]
287 | 
288 |         frame_images = []
289 |         for img_path, offset in zip(self.image_paths, self.time_offsets):
290 |             try:
291 |                 frame_images.append(read_image(str(pathlib.Path(img_path) / f"{(frame - offset):06d}.jpg")))
292 |             except Exception:
293 |                 frame_images.append(torch.zeros(3, 1080, 1920).to(torch.uint8))  # blank fallback frame if the image is missing
294 | 
295 |         if not self.precomputed:
296 |             frame_crops = self.get_crops(annotations, frame_images)
297 |         else:
298 |             frame_crops = torch.empty(0)
299 | 
300 |         return {
301 |             "annotations": annotations,
302 |             "positions_2d": positions_2d,
303 |             "positions_3d": positions_3d,
304 |             "images": frame_images,
305 |             "crops": frame_crops,
306 |             "ground_truth": ground_truth,
307 |         }
308 | 
309 | 
310 | def create_dataloader(cfg):
311 |     scene_path = os.path.join(cfg.dataset_path, cfg.dataset.scene_path)
312 |     cameras = [
313 |         os.path.basename(f)
314 |         for f in sorted(glob.glob(os.path.join(scene_path, cfg.dataset.camera_pattern)))
315 |         if os.path.isdir(f)
316 |     ]
317 | 
318 |     img_paths = [
319 |         os.path.join(cfg.dataset_path, cfg.dataset.scene_path, camera, cfg.dataset.img_path) for camera in cameras
320 |     ]
321 |     calibration_paths = [
322 |         os.path.join(
323 |             cfg.dataset_path,
324 |             cfg.dataset.scene_path,
325 |             camera,
326 |             cfg.dataset.calibration_path,
327 |         )
328 |         for camera in cameras
329 |     ]
330 |     annotation_paths = []
331 |     for camera in cameras:
332 |         if cfg.resources.reid is not None:
333 |             scene_path = "-".join(pathlib.Path(cfg.dataset.scene_path).parts)
334 |             if scene_path[-1] == "-":
335 |                 scene_path = scene_path[:-1]
336 |             resource_name = (
337 |                 f"{cfg.dataset.name}_{scene_path}-{camera}_{cfg.resources.detector}_{cfg.resources.reid}.txt"
338 |             )
339 |         else:
340 |             resource_name = 
f"{cfg.dataset.name}-{camera}_{cfg.resources.detector}.txt" 341 | annotation_paths.append(os.path.join(cfg.resources.path, resource_name)) 342 | 343 | if cfg.preprocess.nms_thresh is not None: 344 | nms_threshold = cfg.preprocess.nms_thresh 345 | else: 346 | nms_threshold = None 347 | 348 | if cfg.preprocess.roi_filter is not None and "roi_path" in cfg.dataset: 349 | roi_paths = [os.path.join(cfg.dataset.roi_path, camera, "roi.jpg") for camera in cameras] 350 | else: 351 | roi_paths = None 352 | 353 | ground_truth_paths = None 354 | 355 | time_offsets = None 356 | if "offsets" in cfg.dataset: 357 | if cfg.dataset.offsets is not None: 358 | time_offsets = cfg.dataset.offsets 359 | 360 | box_projection_centers = [ 361 | cfg.preprocess.box_projection_centers.alpha_w, 362 | cfg.preprocess.box_projection_centers.alpha_h, 363 | ] 364 | 365 | if box_projection_centers[0] is None: 366 | box_projection_centers = None 367 | elif box_projection_centers[1] is None: 368 | box_projection_centers[1] = 1 - box_projection_centers[0] 369 | 370 | dataset = MultiCamDataset( 371 | annotation_paths=annotation_paths, 372 | image_paths=img_paths, 373 | calibration_paths=calibration_paths, 374 | camera_names=cameras, 375 | ground_truth_paths=ground_truth_paths, 376 | precomputed=cfg.encoder.name == "precomputed", 377 | nms_threshold=nms_threshold, 378 | time_offsets=time_offsets, 379 | roi_paths=roi_paths, 380 | bottom=cfg.preprocess.bottom, 381 | box_projection_centers=box_projection_centers, 382 | ) 383 | dataloader = DataLoader( 384 | dataset, 385 | batch_size=1, 386 | shuffle=False, 387 | num_workers=8, 388 | ) 389 | return dataloader 390 | -------------------------------------------------------------------------------- /src/tracker/tracker.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | import time 3 | from typing import Any, List, Optional, Tuple 4 | 5 | import motmetrics as mm 6 | import torch 7 | from omegaconf import DictConfig 8 | from scipy.optimize import linear_sum_assignment 9 | from torchvision.ops import box_iou 10 | 11 | from .similarities import batch_bev_distance, batch_cosine_similarity, batched_box_iou 12 | from .solver import multicut, scale_weights 13 | from .supertrack import SuperTrack, TrackState 14 | 15 | 16 | class Tracker: 17 | def __init__( 18 | self, 19 | solver_opts: Any, 20 | cfg: DictConfig, 21 | n_cams: int, 22 | feature_extractor: Optional[torch.nn.Module] = None, 23 | device: Optional[torch.device] = "cpu", 24 | ): 25 | """ 26 | Initialize the Tracker. 27 | 28 | Args: 29 | solver_opts: Options for the solver. 30 | cfg: Configuration dictionary. 31 | n_cams: Number of cameras. 32 | feature_extractor: Feature extractor module. 33 | device: Device to run the tracker on. 34 | """ 35 | self.feature_extractor = feature_extractor 36 | self.solver_opts = solver_opts 37 | self.device = device 38 | 39 | self.current_data = None 40 | 41 | self.feature_dim = cfg.tracker.fdim 42 | self.n_cams = n_cams 43 | self.cfg = cfg.tracker 44 | 45 | self.tracks: List[SuperTrack] = [] 46 | 47 | self.frame = 0 48 | self.free_id = 1 49 | 50 | self.latency = [] 51 | 52 | self.update_interval = 1 53 | self.stats = { 54 | "# Killed": 0, 55 | "Latency": 0, 56 | } 57 | 58 | self.cumulative_execution_time = 0 59 | 60 | def step(self, sample): 61 | """ 62 | Perform a single step of tracking. 63 | 64 | Args: 65 | sample: Input sample containing detections and features. 
 66 | 
 67 |         Returns:
 68 |             tuple: A tuple containing current results and predicted results.
 69 |         """
 70 |         # move sample to device and remove batch dimension
 71 |         t0 = time.time()
 72 |         for key in sample.keys():
 73 |             if key != "images":
 74 |                 sample[key] = sample[key].to(self.device).squeeze(0)
 75 |         self.frame += 1
 76 |         if self.frame % self.update_interval == 0:
 77 |             if sample["annotations"].size(0) > 0:
 78 |                 matched, unmatched = self.update(sample)
 79 |                 self._handle_unmatched(unmatched)
 80 | 
 81 |         t1 = time.time()
 82 |         self.cumulative_execution_time += t1 - t0
 83 |         self.latency.append(t1 - t0)
 84 | 
 85 |         self._sanitize()
 86 | 
 87 |         rresults = self.get_result()
 88 | 
 89 |         self.predict()
 90 | 
 91 |         presults = self.get_result()
 92 | 
 93 |         return rresults, presults
 94 | 
 95 |     def update(self, sample):
 96 |         """
 97 |         Update the tracker with new detections and features.
 98 | 
 99 |         Args:
100 |             sample: Input sample containing detections and features.
101 | 
102 |         Returns:
103 |             tuple: A tuple containing matched and unmatched tracks.
104 |         """
105 |         features = self.feature_extractor(sample)
106 |         all_superboxes = self._new_superboxes_from_data(sample, features)
107 |         superboxes = [s for s in all_superboxes if s.confidence >= self.cfg.confidence_thresh]
108 | 
109 |         relevant_tracks = self.tracks + superboxes
110 |         _track_indices = torch.arange(len(self.tracks)).to(self.device)
111 |         _superbox_indices = torch.arange(len(self.tracks), len(relevant_tracks)).to(self.device)
112 | 
113 |         low_conf_indices = None
114 | 
115 |         if self.cfg.low_confidence_thresh is not None:
116 |             c1 = self.cfg.low_confidence_thresh
117 |             c2 = self.cfg.confidence_thresh
118 |             low_conf_superboxes = [s for s in all_superboxes if c1 <= s.confidence < c2]  # taken from the unfiltered detections (the thresholded list above can never contain these)
119 | 
120 |             if len(low_conf_superboxes) > 0:
121 |                 n_relevant = len(relevant_tracks)
122 |                 relevant_tracks += low_conf_superboxes
123 |                 low_conf_indices = torch.arange(n_relevant, n_relevant + len(low_conf_superboxes))
124 | 
125 |         if len(relevant_tracks) == 0:
126 |             return [], []
127 | 
128 |         features = torch.stack([track.p_features for track in relevant_tracks])  # (n_tracks, n_cams, feature_dim)
129 |         positions = torch.stack([track.p_positions for track in relevant_tracks])  # (n_tracks, n_cams, 2)
130 |         boxes = torch.stack([track.tlbr for track in relevant_tracks])  # (n_tracks, n_cams, 4)
131 | 
132 |         # compute (n_tracks) x (n_tracks) similarity matrix
133 |         similarities = self._compute_similarities(features, positions, boxes)
134 | 
135 |         # compute weighted graph
136 |         rescale_thresh = self.cfg.matching.rescale_threshold
137 |         dist_thresh = self.cfg.matching.distance_threshold
138 |         iou_bias = self.cfg.prematching.iou_bias if self.cfg.prematching.enabled else 0
139 |         edge_index, edge_weights = self._build_weighted_graph(
140 |             relevant_tracks,
141 |             similarities,
142 |             rescale_thresh,
143 |             dist_thresh,
144 |             iou_bias,
145 |             reid_decay=self.cfg.matching.reid_decay,
146 |         )
147 |         labels = multicut(edge_index, edge_weights, self.solver_opts)
148 | 
149 |         matched_tracks, unmatched_tracks = self._match(relevant_tracks, labels, low_conf_indices=low_conf_indices)
150 | 
151 |         self.tracks = matched_tracks + unmatched_tracks
152 |         return matched_tracks, unmatched_tracks
153 | 
154 |     def _handle_unmatched(self, unmatched_tracks):
155 |         """
156 |         Handle unmatched tracks by updating their inactive status.
157 | 
158 |         Args:
159 |             unmatched_tracks: List of unmatched tracks.
160 | """ 161 | for track in unmatched_tracks: 162 | for cam in range(self.n_cams): 163 | if track.track_where[cam]: 164 | track.inactive_since[cam] += 1 165 | 166 | def predict(self): 167 | """ 168 | Project existing tracks into the future. 169 | """ 170 | for track in self.tracks: 171 | track.predict() 172 | 173 | def _new_superboxes_from_data(self, sample, sample_features): 174 | """ 175 | Create new superboxes from detections and features. 176 | 177 | Args: 178 | sample: Input sample containing detection information. 179 | sample_features: Extracted features from the sample. 180 | 181 | Returns: 182 | list: List of new SuperTrack objects. 183 | """ 184 | n_rows = sample_features.shape[0] 185 | 186 | features = torch.full((n_rows, self.n_cams, self.feature_dim), float("nan"), device=self.device) 187 | boxes = torch.full((n_rows, self.n_cams, 4), float("nan"), device=self.device) 188 | positions_2d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) 189 | positions_3d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) 190 | 191 | cam_ids = sample["annotations"][:, 0].int() 192 | features[torch.arange(n_rows), cam_ids] = sample_features 193 | boxes[torch.arange(n_rows), cam_ids] = sample["annotations"][:, 3:7] 194 | positions_2d[torch.arange(n_rows), cam_ids] = sample["positions_2d"] 195 | positions_3d[torch.arange(n_rows), cam_ids] = sample["positions_3d"] 196 | confidences = sample["annotations"][:, 7] 197 | 198 | superboxes = [ 199 | SuperTrack( 200 | frame=self.frame, 201 | features=features[row], 202 | boxes=boxes[row], 203 | positions_2d=positions_2d[row], 204 | positions_3d=positions_3d[row], 205 | confidence=confidences[row], 206 | ) 207 | for row in range(n_rows) 208 | ] 209 | 210 | return superboxes 211 | 212 | def _merge_tracks(self, tracks): 213 | """ 214 | Merge multiple tracks into a single track. 215 | 216 | Args: 217 | tracks: List of tracks to merge. 218 | 219 | Returns: 220 | SuperTrack: Merged track. 
221 | """ 222 | _frames = sorted({track.frame for track in tracks}) 223 | 224 | newest_frame = _frames[-1] 225 | if len(_frames) > 1: 226 | penult_frame = _frames[-2] 227 | 228 | assert tracks[-1].frame == newest_frame 229 | 230 | newest_evidence = [track for track in tracks if track.frame == newest_frame] 231 | 232 | features = (torch.ones(self.n_cams, self.feature_dim) * (torch.nan)).to(self.device) 233 | boxes = (torch.ones(self.n_cams, 4) * (torch.nan)).to(self.device) 234 | positions_2d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) 235 | positions_3d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) 236 | track_where = torch.zeros(self.n_cams, dtype=torch.bool).to(self.device) 237 | 238 | for cam_id in range(self.n_cams): 239 | for track in newest_evidence: 240 | if not torch.isnan(track.features[cam_id]).any(): 241 | features[cam_id] = track.features[cam_id] 242 | boxes[cam_id] = track.boxes[cam_id] 243 | positions_2d[cam_id] = track.positions_2d[cam_id] 244 | positions_3d[cam_id] = track.positions_3d[cam_id] 245 | track_where[cam_id] = True 246 | break 247 | 248 | merged_track = SuperTrack( 249 | frame=newest_frame, 250 | features=features, 251 | boxes=boxes, 252 | positions_2d=positions_2d, 253 | positions_3d=positions_3d, 254 | ) 255 | 256 | if len(_frames) > 1: 257 | penult_track = [track for track in tracks if track.frame == penult_frame][0] 258 | penult_track.update(merged_track) 259 | merged_track = penult_track 260 | 261 | return merged_track 262 | 263 | def _match(self, tracks, labels, low_conf_indices=None): 264 | """ 265 | Match superboxes with superboxes, and merged superboxes with existing supertracks. 266 | 267 | Args: 268 | tracks: List of tracks to match. 269 | labels: Labels for each track. 270 | low_conf_indices: Indices of low confidence detections. 271 | 272 | Returns: 273 | tuple: A tuple containing new tracks and unmatched tracks. 274 | """ 275 | new_tracks = [] 276 | unmatched_tracks = [] 277 | 278 | for label in torch.unique(labels): 279 | track_indices = torch.where(labels == label)[0].tolist() 280 | if len(track_indices) == 1: 281 | track = tracks[track_indices[0]] 282 | if low_conf_indices is not None and track_indices[0] in low_conf_indices: 283 | continue 284 | if track.state == TrackState.CREATED: 285 | new_tracks.append(track) 286 | else: 287 | unmatched_tracks.append(track) 288 | else: 289 | if low_conf_indices is None: 290 | relevant_tracks = sorted([tracks[i] for i in track_indices], key=lambda x: x.frame) 291 | else: 292 | relevant_tracks = sorted( 293 | [tracks[i] for i in track_indices if i not in low_conf_indices], key=lambda x: x.frame 294 | ) 295 | merged_track = self._merge_tracks(relevant_tracks) 296 | if low_conf_indices is not None and not merged_track.is_complete(): 297 | relevant_low_conf_tracks = [tracks[i] for i in track_indices if i in low_conf_indices] 298 | merged_track = self._merge_tracks([merged_track] + relevant_low_conf_tracks) 299 | new_tracks.append(merged_track) 300 | 301 | return new_tracks, unmatched_tracks 302 | 303 | @staticmethod 304 | def _compute_similarities(features, positions, boxes): 305 | """ 306 | Compute similarity matrices for features, positions, and boxes. 307 | 308 | Args: 309 | features: Tensor of track features. 310 | positions: Tensor of track positions. 311 | boxes: Tensor of track bounding boxes. 312 | 313 | Returns: 314 | tuple: A tuple containing similarity matrices for features, positions, and IoU. 
315 | """ 316 | # permute to (n_cams, n_tracks, feature_dim), (n_cams, n_tracks, 2), (n_cams, n_tracks, 4) 317 | features = features.permute(1, 0, 2) 318 | positions = positions.permute(1, 0, 2) 319 | boxes = boxes.permute(1, 0, 2) 320 | 321 | # compute pairwise similarities (n_cams, n_tracks, n_tracks) 322 | feature_sim = batch_cosine_similarity(features, features) 323 | position_dist = batch_bev_distance(positions) 324 | iou_sim = batched_box_iou(boxes) 325 | 326 | # average-pool similarities to (n_tracks, n_tracks) 327 | feature_sim = torch.nanmean(feature_sim, dim=0) 328 | position_dist = torch.nanmean(position_dist, dim=0) 329 | iou_sim = torch.nanmean(iou_sim, dim=0) 330 | 331 | return feature_sim, position_dist, iou_sim 332 | 333 | def _build_weighted_graph( 334 | self, 335 | tracks: List[SuperTrack], 336 | similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], 337 | rescale_thresh: float, 338 | dist_thresh: float, 339 | iou_bias: float, 340 | reid_decay: float = 1, 341 | penalty: float = -100, 342 | ) -> Tuple[torch.Tensor, torch.Tensor]: 343 | """ 344 | Build a weighted graph from tracks and similarity matrices. 345 | 346 | Args: 347 | tracks: List of tracks. 348 | similarities: Tuple of similarity matrices. 349 | rescale_thresh: Threshold for rescaling weights. 350 | dist_thresh: Distance threshold for feasibility. 351 | iou_bias: Bias to add for IoU-based matching. 352 | reid_decay: Decay factor for ReID scores. 353 | penalty: Penalty for infeasible edges. 354 | 355 | Returns: 356 | tuple: A tuple containing edge indices and edge weights of the graph. 357 | """ 358 | adj = self._initialize_adjacency_matrix(similarities, tracks, reid_decay, rescale_thresh, dist_thresh) 359 | 360 | if self.cfg.prematching.enabled: 361 | adj = self._apply_prematching(adj, tracks, iou_bias) 362 | 363 | adj = self._finalize_adjacency_matrix(adj, penalty, tracks) 364 | 365 | return self._get_edge_index_and_weights(adj) 366 | 367 | def _initialize_adjacency_matrix( 368 | self, 369 | similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], 370 | tracks: List[SuperTrack], 371 | reid_decay: float, 372 | rescale_thresh: float, 373 | dist_thresh: float, 374 | ) -> torch.Tensor: 375 | """ 376 | Initialize the adjacency matrix for the graph. 377 | 378 | Args: 379 | similarities: Tuple of similarity matrices. 380 | tracks: List of tracks. 381 | reid_decay: Decay factor for ReID scores. 382 | rescale_thresh: Threshold for rescaling weights. 383 | dist_thresh: Distance threshold for feasibility. 384 | 385 | Returns: 386 | torch.Tensor: Initialized adjacency matrix. 
387 | """ 388 | appearance_sim, position_dist, _ = similarities 389 | device = appearance_sim.device 390 | 391 | frame_support_pairs = [(track.frame, track.track_where) for track in tracks] 392 | frames, supports = zip(*frame_support_pairs) 393 | 394 | times = torch.tensor(frames, dtype=torch.int, device=device) 395 | lost = torch.tensor([track.state == TrackState.LOST for track in tracks], device=device) 396 | lost_since = torch.tensor([track.lost_since for track in tracks], device=device) 397 | 398 | appearance_sim = appearance_sim * reid_decay**lost_since 399 | appearance_sim = scale_weights(appearance_sim, rescale_thresh) 400 | 401 | combined_sim = self.cfg.matching.rescale_weight * appearance_sim + self.cfg.matching.distance_weight * ( 402 | 1 - position_dist / dist_thresh 403 | ) 404 | 405 | adj = torch.zeros_like(appearance_sim) 406 | lmask = lost[:, None] | lost[None, :] 407 | same_time = times[:, None] == times[None, :] 408 | feasible = (position_dist < dist_thresh) | lmask 409 | 410 | adj[same_time & feasible] = torch.clip(combined_sim[same_time & feasible], min=0, max=1) 411 | adj[~same_time] = combined_sim[~same_time] 412 | adj[lmask] = combined_sim[lmask] 413 | 414 | return adj 415 | 416 | def _apply_prematching(self, adj: torch.Tensor, tracks: List[SuperTrack], iou_bias: float) -> torch.Tensor: 417 | """ 418 | Apply prematching to the adjacency matrix. 419 | 420 | Args: 421 | adj: Adjacency matrix. 422 | tracks: List of tracks. 423 | iou_bias: Bias to add for IoU-based matching. 424 | 425 | Returns: 426 | torch.Tensor: Updated adjacency matrix after prematching. 427 | """ 428 | cur_frame = max(track.frame for track in tracks) 429 | pen_frame = cur_frame - 1 430 | cur_track_idx_by_cam = [[] for _ in range(self.n_cams)] 431 | pen_track_idx_by_cam = [[] for _ in range(self.n_cams)] 432 | 433 | for i, track in enumerate(tracks): 434 | if track.frame == cur_frame: 435 | for cam in range(self.n_cams): 436 | if not torch.isnan(track.boxes[cam]).any(): 437 | cur_track_idx_by_cam[cam].append(i) 438 | elif track.frame == pen_frame: 439 | for cam in range(self.n_cams): 440 | if not torch.isnan(track.boxes[cam]).any(): 441 | pen_track_idx_by_cam[cam].append(i) 442 | 443 | for cam in range(self.n_cams): 444 | cur_boxes_cam = [tracks[i].tlbr[cam] for i in cur_track_idx_by_cam[cam]] 445 | pen_boxes_cam = [tracks[i].tlbr[cam] for i in pen_track_idx_by_cam[cam]] 446 | 447 | if not cur_boxes_cam or not pen_boxes_cam: 448 | continue 449 | 450 | iou_dist = 1 - box_iou(torch.stack(cur_boxes_cam), torch.stack(pen_boxes_cam)) 451 | row_ind, col_ind = linear_sum_assignment(iou_dist.cpu().numpy()) 452 | 453 | for r, c in zip(row_ind, col_ind): 454 | if iou_dist[r, c] > self.cfg.prematching.iou_threshold: 455 | continue 456 | cur_idx = cur_track_idx_by_cam[cam][r] 457 | if self.cfg.prematching.prune_remaining: 458 | adj[cur_idx] = 0 459 | adj[:, cur_idx] = 0 460 | adj[cur_idx, pen_track_idx_by_cam[cam][c]] += iou_bias 461 | adj[pen_track_idx_by_cam[cam][c], cur_idx] += iou_bias 462 | 463 | return adj 464 | 465 | def _finalize_adjacency_matrix(self, adj: torch.Tensor, penalty: float, tracks: List[SuperTrack]) -> torch.Tensor: 466 | """ 467 | Finalize the adjacency matrix by applying penalties. 468 | 469 | Args: 470 | adj: Adjacency matrix. 471 | penalty: Penalty value for infeasible edges. 472 | tracks: List of tracks. 473 | 474 | Returns: 475 | torch.Tensor: Finalized adjacency matrix. 
476 | """ 477 | frame_support_pairs = [(track.frame, track.track_where) for track in tracks] 478 | frames, supports = zip(*frame_support_pairs) 479 | 480 | times = torch.tensor(frames, dtype=torch.int, device=adj.device) 481 | supps = torch.stack(supports).to(adj.device) 482 | 483 | same_time = times[:, None] == times[None, :] 484 | same_supp = (supps[:, None] & supps[None, :]).any(dim=2) 485 | 486 | adj[same_time & same_supp] = penalty 487 | adj = adj * torch.triu(torch.ones_like(adj), diagonal=1) 488 | 489 | return adj 490 | 491 | def _get_edge_index_and_weights(self, adj: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 492 | """ 493 | Extract edge indices and weights from the adjacency matrix. 494 | 495 | Args: 496 | adj: Adjacency matrix. 497 | 498 | Returns: 499 | tuple: A tuple containing edge indices and edge weights. 500 | """ 501 | edge_index = torch.nonzero(adj).t().long() 502 | edge_weights = adj[edge_index[0], edge_index[1]] 503 | return edge_index, edge_weights 504 | 505 | def _sanitize(self): 506 | """ 507 | Sanitize the tracker by updating track states and removing killed tracks. 508 | """ 509 | keep = [] 510 | for k, track in enumerate(self.tracks): 511 | if track.state is TrackState.CREATED: 512 | track.activate() 513 | if track.label is None: 514 | track.set_label(self.free_id) 515 | self.free_id += 1 516 | if torch.all(~track.track_where): 517 | if torch.all(track.inactive_since[track.inactive_since > 0] > self.cfg.patience): 518 | track.deactivate() 519 | if track.state is TrackState.LOST: 520 | if track.lost_since > self.cfg.memory: 521 | track.kill() 522 | else: 523 | track.lost_since += 1 524 | if track.state is not TrackState.KILLED: 525 | keep.append(track) 526 | for cam in range(self.n_cams): 527 | if track.inactive_since[cam] > self.cfg.patience: 528 | track.reset([cam]) 529 | killed = len(self.tracks) - len(keep) 530 | self.tracks = keep 531 | self.stats["# Tracks"] = len(self.tracks) 532 | self.stats["# Lost"] = len([track for track in self.tracks if track.state == TrackState.LOST]) 533 | self.stats["# Killed"] += killed 534 | 535 | latency = statistics.mean(self.latency) if len(self.latency) > 0 else 0 536 | self.stats["FPS"] = int(1 / latency) if latency > 0 else 0 537 | 538 | def _get_active_tracks(self): 539 | """ 540 | Get a list of active tracks. 541 | 542 | Returns: 543 | list: List of active tracks. 544 | """ 545 | return [track for track in self.tracks if track.state != TrackState.KILLED] 546 | 547 | def get_result(self, normalization=None, scale=1.0): 548 | """ 549 | Get the current online state of the tracker. 550 | 551 | Args: 552 | normalization: Optional normalization parameters. 553 | scale: Scale factor for the results. 554 | 555 | Returns: 556 | torch.Tensor: Tensor containing the current tracker state. 557 | """ 558 | to_stack = [track.to_tensor() for track in self.tracks if track.state == TrackState.ACTIVE] 559 | if len(to_stack) > 0: 560 | result = torch.cat(to_stack) 561 | else: 562 | result = torch.empty(0) 563 | if result.size(0) > 0: 564 | if normalization is not None: 565 | min_x, min_y, max_x, max_y = normalization 566 | result[:, 7:9] = result[:, 7:9] * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor( 567 | [min_x, min_y] 568 | ) 569 | result[:, 7:9] *= scale 570 | return result 571 | 572 | def _get_index_by_id(self, tid): 573 | """ 574 | Get the index of a track by its ID. 575 | 576 | Args: 577 | tid: Track ID to search for. 578 | 579 | Returns: 580 | int: Index of the track with the given ID, or None if not found. 
581 | """ 582 | for i, track in enumerate(self.tracks): 583 | if track.label == tid: 584 | return i 585 | return None 586 | 587 | 588 | def create_tracker(cfg, solver_cfg, feature_extractor, n_cams, device, writer=None): 589 | """ 590 | Create a new Tracker instance. 591 | 592 | Args: 593 | cfg: Configuration dictionary. 594 | solver_cfg: Solver configuration. 595 | feature_extractor: Feature extractor module. 596 | n_cams: Number of cameras. 597 | device: Device to run the tracker on. 598 | writer: Optional writer for logging. 599 | 600 | Returns: 601 | Tracker: A new Tracker instance. 602 | """ 603 | return Tracker( 604 | solver_opts=solver_cfg, 605 | cfg=cfg, 606 | feature_extractor=feature_extractor, 607 | n_cams=n_cams, 608 | device=device, 609 | ) 610 | --------------------------------------------------------------------------------
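Note on usage: the dump above defines the data loading (create_dataloader), the encoder factory (create_encoder), and the tracker factory (create_tracker), but the driver script tools/track.py is not reproduced here. The sketch below shows one plausible way to wire these pieces together. It is illustrative only: the import paths simply follow the src/ layout shown above, and the config wiring (cfg.encoder, a separate solver_cfg) and the loop itself are assumptions, not the repository's actual entry point.

# sketch of a minimal tracking loop (illustrative; see tools/track.py in the repository for the real driver)
import torch

from src.datasets.dataset import create_dataloader
from src.tracker.encoder import create_encoder
from src.tracker.tracker import create_tracker


def run(cfg, solver_cfg):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dataloader = create_dataloader(cfg)
    n_cams = len(dataloader.dataset.camera_names)

    # with the "precomputed" encoder, ReID features are read directly from the annotation files
    encoder = create_encoder(cfg.encoder, device)
    tracker = create_tracker(cfg, solver_cfg, encoder, n_cams, device)

    results = []
    for sample in dataloader:
        # step() moves the sample to the device, runs the multicut association,
        # and returns the current and the predicted tracker state
        online_result, _ = tracker.step(sample)
        if online_result.numel() > 0:
            results.append(online_result.cpu())

    return torch.cat(results) if results else torch.empty(0)

The returned tensor can then be split into per-camera dataframes with the helpers at the top of src/utils/evaluate.py and written to the per-sequence .txt files that evaluate_mtmc reads.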