├── src ├── __init__.py ├── tracker │ ├── encoder.py │ ├── solver.py │ ├── similarities.py │ ├── geometry.py │ ├── supertrack.py │ └── tracker.py ├── utils │ ├── iotools.py │ ├── utils.py │ └── evaluate.py └── datasets │ └── dataset.py ├── conf ├── encoder │ └── precomputed.yaml ├── dataset │ ├── Synthehicle.yaml │ ├── Synthehicle-bev.yaml │ └── CityFlow.yaml ├── experiment │ ├── CityFlow.yaml │ └── Synthehicle.yaml └── config.yaml ├── pyproject.toml ├── .github └── ISSUE_TEMPLATE │ └── bug_report.md ├── setup.py ├── LICENSE ├── tools └── track.py ├── .gitignore └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conf/encoder/precomputed.yaml: -------------------------------------------------------------------------------- 1 | name: precomputed 2 | -------------------------------------------------------------------------------- /conf/dataset/Synthehicle.yaml: -------------------------------------------------------------------------------- 1 | name: Synthehicle 2 | scene_path: ./test/Town06-O-dawn/ 3 | camera_pattern: C0* 4 | img_path: ./out_rgb/ 5 | gt_path: ./gt/gt.txt 6 | calibration_path: calibration.json -------------------------------------------------------------------------------- /conf/dataset/Synthehicle-bev.yaml: -------------------------------------------------------------------------------- 1 | name: Synthehicle 2 | scene_path: ./test/Town06-O-dawn/ 3 | camera_pattern: C0* 4 | img_path: ./out_rgb/ 5 | gt_path: ./gt/gt_bev.txt 6 | calibration_path: calibration.json -------------------------------------------------------------------------------- /conf/dataset/CityFlow.yaml: -------------------------------------------------------------------------------- 1 | name: AICITY 2 | scene_path: ./validation/S02 3 | camera_pattern: c00* 4 | img_path: ./img1/ 5 | img_ext: jpg 6 | offsets: [0, 0, 3, 8] 7 | calibration_path: calibration.json 8 | roi_path: "./data/AICITY/eval/ROIs/validation" 9 | -------------------------------------------------------------------------------- /src/tracker/encoder.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import warnings 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torchvision 7 | 8 | 9 | class Precomputed: 10 | def __init__(self, cfg): 11 | self.cfg = cfg 12 | 13 | def __call__(self, x): 14 | features = x["annotations"][:, 11:] 15 | return F.normalize(features, p=2, dim=1) 16 | 17 | 18 | def create_encoder(cfg, device): 19 | print(cfg) 20 | if cfg.name == "precomputed": 21 | return Precomputed(cfg) 22 | else: 23 | raise ValueError(f"Encoder {cfg.name} not found.") 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # this ruff configuration is taken (mostly) from hugging face 2 | [tool.ruff] 3 | line-length = 119 4 | 5 | [tool.ruff.lint] 6 | # Never enforce `E501` (line length violations). 7 | ignore = ["C901", "E501", "E741", "F402", "F823", "E402", "F401", "F403", "F811"] 8 | select = ["C", "E", "F", "I", "W"] 9 | 10 | [tool.ruff.lint.isort] 11 | lines-after-imports = 2 12 | known-first-party = ["stmc"] 13 | 14 | [tool.ruff.format] 15 | # Like Black, use double quotes for strings. 16 | quote-style = "double" 17 | 18 | # Like Black, indent with spaces, rather than tabs. 
19 | indent-style = "space" 20 | 21 | # Like Black, respect magic trailing commas. 22 | skip-magic-trailing-comma = false 23 | 24 | # Like Black, automatically detect the appropriate line ending. 25 | line-ending = "auto" 26 | 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Environment** 27 | Please provide the following information: 28 | - Output of `pip freeze` and or `conda info` 29 | - Output of `python -m torch.utils.collect_env` 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /conf/experiment/CityFlow.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /dataset: CityFlow 5 | - override /encoder: precomputed 6 | 7 | dataset_path: ./data/AICITY/ 8 | 9 | resources: 10 | reid: LCFractal 11 | detector: YOLOX 12 | 13 | tracker: 14 | matching: 15 | distance_threshold: 0.001 16 | rescale_threshold: 0.7 17 | reid_decay: 0.7 18 | rescale_weight: 0.9 19 | confidence_thresh: 0.70 20 | low_confidence_thresh: null 21 | patience: 0 22 | memory: 160 23 | fdim: 2048 24 | prematching: 25 | enabled: false 26 | iou_bias: 0.50 27 | iou_threshold: 0.70 28 | prune_remaining: false 29 | 30 | preprocess: 31 | nms_thresh: 0.7 32 | roi_filter: true 33 | 34 | postprocess: 35 | expand_boxes: 36 | enable: true 37 | factor: 1.4 38 | remove_borders: 39 | enable: true 40 | border_size: 0 41 | size_filter: 42 | enable: true 43 | min_size: 6000 44 | max_size: 600000 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | setup( 5 | name="stmc", 6 | version="0.1.0", 7 | packages=find_packages(), 8 | install_requires=[ 9 | "hydra-core", 10 | "torch", 11 | "wandb", 12 | "loguru", 13 | "omegaconf", 14 | "qqdm", 15 | "pillow", 16 | "ramapy", 17 | ], 18 | entry_points={ 19 | "console_scripts": [ 20 | "track=tools.track:main", 21 | ], 22 | }, 23 | author="Fabian Herzog", 24 | author_email="fabian.herzog@tum.de", 25 | description="Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking", 26 | long_description=open("README.md").read(), 27 | long_description_content_type="text/markdown", 28 | url="https://github.com/fubel/stmc", 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "License :: OSI Approved :: MIT License", 32 | "Operating System :: OS Independent", 33 | ], 34 | python_requires=">=3.8", 35 | ) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 
1 | MIT License 2 | 3 | Copyright (c) 2024 Fabian Herzog 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conf/experiment/Synthehicle.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /dataset: Synthehicle 5 | - override /encoder: precomputed 6 | 7 | dataset_path: ./data/Synthehicle/ 8 | 9 | resources: 10 | reid: LightMBN 11 | detector: YOLOX 12 | 13 | tracker: 14 | matching: 15 | distance_threshold: 8 16 | rescale_threshold: 0.8 17 | reid_decay: 0.7 18 | rescale_weight: 0.3 19 | distance_weight: 0.7 20 | confidence_thresh: 0.6 21 | patience: 0 22 | memory: 15 23 | fdim: 3584 24 | prematching: 25 | enabled: true 26 | iou_bias: 0.60 27 | iou_threshold: 0.70 28 | prune_remaining: true 29 | 30 | preprocess: 31 | nms_thresh: 0.9 32 | roi_filter: true 33 | bottom: false 34 | box_projection_centers: 35 | alpha_w: 0.15 36 | alpha_h: 0.85 37 | 38 | postprocess: 39 | expand_boxes: 40 | enable: false 41 | factor: 1.4 42 | remove_borders: 43 | enable: false 44 | border_size: 5 45 | size_filter: 46 | enable: false 47 | min_size: 0 48 | max_size: 0 49 | 50 | evaluation: 51 | inplace: true 52 | evaluate_standard: true 53 | evaluate_hota: false 54 | evaluate_bev: false 55 | evaluate_external: false -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | # config.yaml 2 | hydra/hydra_logging: null 3 | 4 | defaults: 5 | - dataset: CityFlow 6 | - encoder: precomputed 7 | 8 | dataset_path: ./data/AICITY/ 9 | output_path: ./outputs/ 10 | 11 | device: cuda 12 | 13 | logging: 14 | wandb: 15 | enable: false 16 | project: ggmc 17 | upload_results: false 18 | tags: null 19 | tensorboard: 20 | enable: false 21 | 22 | resources: 23 | path: ./resources/ 24 | detector: YOLOX 25 | reid: null 26 | 27 | visuals: 28 | plot_interval: 1 29 | plot_results: false 30 | plot_ground_truth: false 31 | plot_to_tensorboard: false 32 | grid_rows: 2 33 | store_files: true 34 | border_size: 3 35 | 36 | solver: 37 | backend: PD 38 | 39 | tracker: 40 | matching: 41 | distance_threshold: 0.02 42 | rescale_threshold: 0.65 43 | reid_decay: 1.0 44 | rescale_weight: 0.5 45 | distance_weight: 0.5 46 | confidence_thresh: 0.7 47 | low_confidence_thresh: null 48 | patience: 1 49 | memory: 15 50 | fdim: 512 
51 | enable_accumulator: true 52 | prematching: 53 | enabled: true 54 | iou_bias: 0.60 55 | iou_threshold: 0.50 56 | prune_remaining: false 57 | 58 | preprocess: 59 | nms_thresh: null 60 | roi_filter: true 61 | bottom: true 62 | box_projection_centers: 63 | alpha_w: null 64 | alpha_h: null 65 | 66 | postprocess: 67 | expand_boxes: 68 | enable: true 69 | factor: 1.4 70 | remove_borders: 71 | enable: true 72 | border_size: 5 73 | size_filter: 74 | enable: true 75 | min_size: 6220 76 | max_size: 622080 77 | 78 | evaluation: 79 | inplace: true 80 | evaluate_standard: true 81 | evaluate_hota: false 82 | evaluate_bev: false 83 | evaluate_external: true 84 | -------------------------------------------------------------------------------- /src/tracker/solver.py: -------------------------------------------------------------------------------- 1 | import rama_py 2 | import torch 3 | 4 | 5 | def multicut(edge_index, edge_weights, opts): 6 | """Solves a multicut problem based on the RAMA algorithm. 7 | 8 | The edge_index is expected in the usual torch_geometric format. 9 | Note that RAMA requires u < v for each edge (u, v) in the graph. 10 | 11 | Args: 12 | edge_index (LongTensor): 2xE LongTensor of edge indices. 13 | edge_weights (LongTensor): E LongTensor of edge weights. 14 | 15 | Returns: 16 | LongTensor: N LongTensor of node labels, where N is the number 17 | of nodes in the graph. 18 | """ 19 | if (edge_index[0] > edge_index[1]).any(): 20 | raise ValueError("Solver expects u < v for each edge (u, v) in the graph.") 21 | if edge_index.device.index is None: 22 | raise ValueError("Solver runs on CUDA device only. Please move data to CUDA.") 23 | if edge_index.shape[1] == 0: 24 | return torch.empty(0).to("cuda") 25 | i = edge_index[0].to(torch.int32) 26 | j = edge_index[1].to(torch.int32) 27 | costs = edge_weights.to(torch.float32) 28 | num_nodes = torch.max(edge_index) + 1 29 | num_edges = edge_index.shape[1] 30 | node_labels = torch.ones(num_nodes, device=i.device).to(torch.int32) 31 | rama_py.rama_cuda_gpu_pointers( 32 | i.data_ptr(), 33 | j.data_ptr(), 34 | costs.data_ptr(), 35 | node_labels.data_ptr(), 36 | num_nodes, 37 | num_edges, 38 | i.device.index, 39 | opts, 40 | ) 41 | return node_labels 42 | 43 | 44 | def scale_weights(weights, threshold=0.7): 45 | """Scales the given weights to the range [-1, 1] based on the given threshold. 46 | 47 | Args: 48 | weights (FloatTensor): LongTensor of edge weights. 49 | threshold (float, optional): Threshold for scaling. Defaults to 0.4. 50 | 51 | Returns: 52 | FloatTensor: LongTensor of scaled edge weights. 53 | """ 54 | y = weights.clone() 55 | z = weights.clone() 56 | z[y == threshold] = 0.0 57 | z[y > threshold] = (y[y > threshold] - threshold) / (1 - threshold) 58 | z[y < threshold] = (y[y < threshold] - threshold) / (threshold) 59 | return z 60 | 61 | 62 | def create_solver(backend): 63 | opts = rama_py.multicut_solver_options(backend) 64 | opts.verbose = False 65 | return opts 66 | -------------------------------------------------------------------------------- /src/tracker/similarities.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops import box_iou 3 | 4 | 5 | def cosine_similarity(a, b, eps=1e-8): 6 | """ 7 | Compute pairwise appearance distance between features. 
8 | from https://stackoverflow.com/a/58144658 9 | """ 10 | a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None] 11 | a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) 12 | b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) 13 | sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1)) 14 | return sim_mt 15 | 16 | 17 | def batch_cosine_similarity(a, b, eps=1e-8): 18 | """Compute batched pairwise appearance distance between features. 19 | 20 | Args: 21 | a (torch.Tensor): (B, N, feature_dim) tensor. 22 | b (torch.Tensor): (B, N, feature_dim) tensor. 23 | eps (float, optional): Epsilon to prevent division by zero. Defaults to 1e-8. 24 | 25 | Returns: 26 | torch.Tensor: (B, N, N) tensor of pairwise similarities. 27 | """ 28 | # Compute norms along feature dimension and add new dimensions needed for broadcasting 29 | a_n = a.norm(dim=2)[:, :, None] 30 | b_n = b.norm(dim=2)[:, :, None] 31 | 32 | # Perform normalization and prevent division by zero. 33 | a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) 34 | b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) 35 | 36 | # Compute similarity matrix using batch matrix multiplication. 37 | sim_mt = torch.bmm(a_norm, b_norm.transpose(1, 2)) 38 | return sim_mt 39 | 40 | 41 | def batched_box_iou(boxes): 42 | """Compute batched pairwise IoU between boxes. 43 | 44 | Args: 45 | boxes (torch.Tensor): (B, N, 4) tensor of boxes. 46 | 47 | Returns: 48 | torch.Tensor: (B, N, N) tensor of pairwise IoU. 49 | """ 50 | ious = [] 51 | for sub_boxes in boxes: 52 | ious.append(box_iou(sub_boxes, sub_boxes)) 53 | return torch.stack(ious) 54 | 55 | 56 | def bev_distance(bev_positions): 57 | """Compute distance between positions on ground plane. 58 | 59 | Args: 60 | bev_positions (torch.Tensor): (N, 2) tensor of positions. 61 | 62 | Returns: 63 | torch.Tensor: (N, N) tensor of pairwise similarities. 64 | """ 65 | return torch.norm(bev_positions[:, None] - bev_positions[None, :], dim=2) 66 | 67 | 68 | def batch_bev_distance(bev_positions): 69 | """Compute batched distance similarity between positions on ground plane. 70 | 71 | Args: 72 | bev_positions (torch.Tensor): (B, N, 2) tensor of positions. 73 | 74 | Returns: 75 | torch.Tensor: (B, N, N) tensor of pairwise similarities. 76 | """ 77 | # Subtract positions across the batch, adding extra dimensions for broadcasting 78 | diff = bev_positions[:, :, None] - bev_positions[:, None, :] 79 | 80 | # Compute norm along the last dimension (x and y coordinates) 81 | norm = torch.norm(diff, dim=-1) 82 | 83 | # Return similarity 84 | return norm 85 | -------------------------------------------------------------------------------- /tools/track.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from subprocess import PIPE, run 4 | 5 | import hydra 6 | import torch 7 | from loguru import logger 8 | from omegaconf import DictConfig, OmegaConf 9 | from qqdm import format_str, qqdm 10 | 11 | import wandb 12 | from src.datasets.dataset import create_dataloader 13 | from src.tracker.encoder import create_encoder 14 | from src.tracker.solver import create_solver 15 | from src.tracker.tracker import create_tracker 16 | from src.utils.evaluate import evaluate_tracker 17 | from src.utils.iotools import ResultsWriter 18 | 19 | 20 | @hydra.main(version_base=None, config_path="../conf", config_name="config") 21 | def main(cfg: DictConfig) -> None: 22 | if cfg.device == "cpu" or not torch.cuda.is_available(): 23 | raise ValueError("This code runs on CUDA only. 
Please set device to 'cuda'.") 24 | else: 25 | device = torch.device(cfg.device) 26 | logger.info(f"🚀 Using device: {device}") 27 | 28 | cfg.tracker.matching.distance_weight = 1 - cfg.tracker.matching.rescale_weight 29 | 30 | # create output directories 31 | output_path = os.path.join(cfg.output_path) 32 | os.makedirs(output_path, exist_ok=True) 33 | output_path = os.path.join(output_path, cfg.dataset.name) 34 | logger.info(f"📂 Writing to output path: {output_path}") 35 | 36 | # Initialize wandb and tensorboard 37 | if cfg.logging.wandb.enable: 38 | wandb.init(project=cfg.logging.wandb.project) 39 | wandb.config.update(OmegaConf.to_container(cfg)) 40 | if cfg.logging.wandb.tags is not None: 41 | wandb.run.tags = cfg.logging.wandb.tags 42 | 43 | # Initialize solver 44 | solver_opts = create_solver(cfg.solver.backend) 45 | logger.info(f"✨ Initialized solver, using backend: {cfg.solver.backend}") 46 | 47 | # Initialize dataset and dataloader 48 | dataloader = create_dataloader(cfg) 49 | logger.info("✨ Created dataloader.") 50 | 51 | # Initialize encoder 52 | encoder = create_encoder(cfg.encoder, device) 53 | logger.info("✨ Created encoder.") 54 | 55 | tracker = create_tracker(cfg, solver_opts, encoder, len(dataloader.dataset.camera_names), device) 56 | logger.info("✨ Initialized tracker.") 57 | 58 | results_writer = ResultsWriter( 59 | output_path=output_path, 60 | cfg=cfg, 61 | normalization=dataloader.dataset._norm_factors, 62 | camera_names=dataloader.dataset.camera_names, 63 | ) 64 | 65 | tw = qqdm(range(len(dataloader)), desc=format_str("bold", "Description")) 66 | for i, batch in enumerate(dataloader): 67 | results, _ = tracker.step(batch) 68 | results_writer.add(results) 69 | stats = tracker.stats 70 | tw.set_infos(stats) 71 | tw.update() 72 | 73 | if cfg.logging.wandb.enable: 74 | _stats_str_to_float = {k: float(v) for k, v in stats.items()} 75 | wandb.log(_stats_str_to_float, step=i) 76 | 77 | logger.info(f"🕒 Cumulative execution time of tracker {tracker.cumulative_execution_time * 10}") 78 | logger.info(f"🕒 Average time per frame {tracker.cumulative_execution_time / tracker.frame}") 79 | 80 | results_writer.save() 81 | 82 | logger.info("🚀 Tracking completed.") 83 | logger.info( 84 | f"📈 Results saved to {results_writer.results_file}. " 85 | "Use the official evaluation script of the dataset for evaluation." 86 | ) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /src/tracker/geometry.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | 6 | 7 | class Projector: 8 | def __init__(self, calibration_path: str): 9 | """ 10 | Initialize a Projector object. The projector is used to project points between image and world coordinates. 11 | 12 | Args: 13 | calibration_path (str): Path to the calibration file (JSON). 14 | 15 | Raises: 16 | FileNotFoundError: If the calibration file is not found. 17 | ValueError: If the homography is not found in the calibration file. 
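Note: The calibration file is a JSON file storing a 3x3 homography under one of the keys "homography", "H", "homography_matrix", or "homography matrix"; image-to-world projection uses its inverse.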
18 | """ 19 | if os.path.exists(calibration_path) is False: 20 | raise FileNotFoundError(f"Calibration file not found at path: {calibration_path}") 21 | self.calibration_path = calibration_path 22 | 23 | with open(calibration_path, "r") as f: 24 | calibration = json.load(f) 25 | try: 26 | homography_keys = [ 27 | "homography", 28 | "H", 29 | "homography_matrix", 30 | "homography matrix", 31 | ] 32 | valid_homography_key = set(homography_keys).intersection(set(calibration.keys())).pop() 33 | except KeyError: 34 | raise ValueError("Homography not found in calibration file.") 35 | self._homography = torch.Tensor(calibration[valid_homography_key]) 36 | self._inverse_homography = torch.inverse(self._homography) 37 | 38 | def image_to_world(self, points: torch.Tensor) -> torch.Tensor: 39 | """Projects image points to world coordinates. 40 | 41 | Args: 42 | points (torch.Tensor): Image points Nx2. 43 | 44 | Returns: 45 | torch.Tensor: World points Nx3. 46 | """ 47 | if points.dim() != 2: 48 | points = points.view(-1, 2) 49 | if points.size(1) != 2: 50 | raise ValueError(f"Expected image points to be of shape (N, 2), but got {points.shape}.") 51 | return self._homography_image_to_world(points) 52 | 53 | def world_to_image(self, points: torch.Tensor) -> torch.Tensor: 54 | """Projects world points to image coordinates. 55 | 56 | Args: 57 | points (torch.Tensor): World points Nx3. 58 | 59 | Returns: 60 | torch.Tensor: Image points Nx2. 61 | """ 62 | if points.dim() != 2: 63 | points = points.view(-1, 3) 64 | if points.size(1) != 3: 65 | points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) 66 | return self._homography_world_to_image(points) 67 | 68 | def _homography_image_to_world(self, points: torch.Tensor) -> torch.Tensor: 69 | points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) 70 | device = points.device 71 | homography = self._inverse_homography.to(device) 72 | projected_points = torch.matmul(homography, points.t()).t() 73 | projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) 74 | return projected_points 75 | 76 | def _homography_world_to_image(self, points: torch.Tensor) -> torch.Tensor: 77 | device = points.device 78 | homography = self._homography.to(device) 79 | projected_points = torch.matmul(homography, points.t()).t() 80 | projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) 81 | return projected_points 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project-specific 2 | data/ 3 | eval/ 4 | resources/ 5 | outputs/ 6 | wandb/ 7 | 8 | .ruff_cache 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 119 | .pdm.toml 120 | .pdm-python 121 | .pdm-build/ 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
171 | #.idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking 2 | [![arXiv Badge](https://img.shields.io/badge/Paper-arXiv.2410.02638-b31b1b.svg)](https://arxiv.org/abs/2410.02638) 3 | [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) 4 | 5 | ## Authors 6 | 7 | [Fabian Herzog](https://github.com/fubel), [Johannes Gilg](https://github.com/Blueblue4), [Philipp Wolters](https://github.com/phi-wol), [Torben Teepe](https://github.com/tteepe/), and Gerhard Rigoll 8 | 9 | ## Installation 10 | 11 | Only tested with Python 3.8, CUDA 11.8, GCC >= 9.4.0 on NVIDIA RTX 3090, PyTorch 2.0.1 on Ubuntu 22.04. 12 | 13 | ```bash 14 | # Setup with miniconda 15 | conda create -n stmc python=3.8 16 | conda activate stmc 17 | 18 | # Setup torch 19 | conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia 20 | 21 | # Setup RAMA 22 | # (cf. https://github.com/pawelswoboda/RAMA/) 23 | git clone git@github.com:pawelswoboda/RAMA.git 24 | mkdir -p RAMA/build && cd RAMA/build 25 | cmake .. 26 | make -j 4 27 | 28 | # Setup Python bindings 29 | python -m pip install git+https://github.com/pawelswoboda/RAMA.git 30 | 31 | # Install remaining dependencies 32 | python -m pip install -r requirements.txt 33 | ``` 34 | 35 | ## Data Setup 36 | 37 | The config files assume the datasets are stored in `./data/`. You can set up a symlink to a different location or adjust the paths in the config. The datasets are available at: 38 | 39 | * [CityFlow](https://www.aicitychallenge.org) 40 | * [Synthehicle](https://github.com/fubel/synthehicle) 41 | 42 | You need to provide the camera calibrations in `calibration.json` files. They are available in the releases. 43 | 44 | ## Running the Code 45 | 46 | For a multi-camera scene, adjust the `config.yaml`. To track the Synthehicle scene `Town06-O-dawn`, run 47 | 48 | ```bash 49 | # for Synthehicle, Town06-O-dawn 50 | python -m tools.track +experiment=Synthehicle dataset.scene_path=./test/Town06-O-dawn/ 51 | ``` 52 | 53 | To track the CityFlow scene S02, run 54 | 55 | ```bash 56 | # for CityFlow, scene S02 57 | python -m tools.track +experiment=CityFlow 58 | ``` 59 | 60 | ❗️ We'll provide all pre-extracted detections and features soon! 61 | 62 | ## Features and Detections 63 | 64 | Our resources are formatted in the MOT-Challenge format, with the addition that the last N columns of a resource file store the appearance feature vector of that object. Detections and features are available in the releases. 65 | 66 | ❗️ We'll provide all pre-extracted detections and features soon! 67 | 68 | ## Evaluation 69 | 70 | The results are saved in the output directory specified in the config. 71 | 72 | **🚨 Please use the evaluation scripts provided by the respective datasets to evaluate the final results!** 73 | 74 | Our in-built evaluation follows the evaluation protocol of Synthehicle, which differs from the CityFlow official evaluation script (our eval does not filter single-cam trajectories, for instance).
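The `calibration.json` files mentioned in the Data Setup section are read by the `Projector` class in `src/tracker/geometry.py`, which looks for a 3x3 homography under any of the keys `homography`, `H`, `homography_matrix`, or `homography matrix` and uses its inverse to project image points onto the ground plane. The snippet below is only a sketch for custom scenes; the identity matrix is a placeholder, not a real calibration:

```python
import json

import torch

from src.tracker.geometry import Projector

# Placeholder 3x3 homography (identity), for illustration only.
calibration = {"homography": [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]}
with open("calibration.json", "w") as f:
    json.dump(calibration, f)

projector = Projector("calibration.json")
points = torch.tensor([[960.0, 540.0]])   # (N, 2) image points
print(projector.image_to_world(points))   # (N, 2) ground-plane coordinates
```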
75 | 76 | ## Acknowledgements 77 | 78 | We'd like to thank the authors of the following repositories for providing code used in our work: 79 | 80 | * We use the [RAMA](https://github.com/pawelswoboda/RAMA.git) solver which enables fast multi-cuts on the GPU. 81 | * The features for CityFlow are from [LCFractal](https://github.com/LCFractal/AIC21-MTMC). 82 | 83 | ## Citation 84 | 85 | ``` 86 | @article{herzog2024spatial, 87 | title={{Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking}}, 88 | author={Herzog, Fabian and Gilg, Johannes and Wolters, Philipp and Teepe, Torben and Rigoll, Gerhard}, 89 | journal={arXiv preprint arXiv:2410.02638}, 90 | year={2024} 91 | } 92 | ``` 93 | 94 | ## License 95 | 96 | The original code in this repository is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. This project relies on various third-party libraries and dependencies, each with their own licensing terms. These dependencies may not be included in the MIT License that covers my original code. Users of this project must ensure they comply with all licenses of the required dependencies for their specific use case. Some dependencies may have more restrictive terms than the MIT License. 97 | -------------------------------------------------------------------------------- /src/utils/iotools.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | from torch.utils.tensorboard import SummaryWriter 6 | 7 | from .utils import expand_boxes, remove_border_boxes, size_filter 8 | 9 | 10 | class ResultsWriter: 11 | def __init__(self, output_path, cfg, normalization=None, camera_names=None): 12 | self._results = [] 13 | 14 | self.cfg = cfg 15 | self.output_path = output_path 16 | self._norm_factors = normalization 17 | 18 | self.rows = cfg.visuals.grid_rows 19 | self.plot_results = cfg.visuals.plot_results 20 | self.plot_every = cfg.visuals.plot_interval 21 | 22 | self.camera_names = camera_names 23 | 24 | self.writer = None 25 | 26 | if cfg.logging.tensorboard.enable: 27 | self.writer = SummaryWriter() 28 | 29 | self.store_files = cfg.visuals.store_files 30 | self.results_file = os.path.join(output_path, "results.txt") 31 | 32 | self.offsets = cfg.dataset.offsets if hasattr(cfg.dataset, "offsets") else [0] * len(camera_names) 33 | 34 | self.on_bev = True if cfg.dataset.name == "WildTrack" else False 35 | 36 | self._save_function = self.get_save_function(cfg) 37 | 38 | if os.path.exists(self.results_file): 39 | os.remove(self.results_file) 40 | 41 | os.makedirs(output_path, exist_ok=True) 42 | 43 | @property 44 | def results(self): 45 | results = torch.cat(self._results, dim=0) 46 | for i, offset in enumerate(self.offsets): 47 | results[results[:, 0] == i, 2] -= offset 48 | # multiply camera column by (-1) 49 | results[:, 0] *= -1 50 | for i, name in enumerate(self.camera_names): 51 | # this is a bit hacky if camera does not start with letter 52 | try: 53 | name_int = int(name[1:]) 54 | except ValueError: 55 | # fallback to index of camera 56 | name_int = i 57 | results[results[:, 0] == -i, 0] = name_int 58 | if self.cfg.postprocess.expand_boxes.enable: 59 | factor = self.cfg.postprocess.expand_boxes.factor 60 | results[:, 3:7] = expand_boxes(results[:, 3:7], factor) 61 | if self.cfg.postprocess.remove_borders.enable: 62 | boxes = results[:, 3:7] 63 | border = self.cfg.postprocess.remove_borders.border_size 64 | keep = remove_border_boxes(boxes, border) 65 | results = 
results[keep] 66 | if self.cfg.postprocess.size_filter.enable: 67 | boxes = results[:, 3:7] 68 | keep = size_filter( 69 | boxes, self.cfg.postprocess.size_filter.min_size, self.cfg.postprocess.size_filter.max_size 70 | ) 71 | results = results[keep] 72 | return results 73 | 74 | def add(self, result): 75 | _result = result.clone() 76 | if self._norm_factors is not None: 77 | _result = self.denormalize_bev(_result[:, 7:9]) 78 | self._results.append(result) 79 | 80 | def save(self): 81 | if self._results: 82 | self._save_function(self.results.cpu().numpy()) 83 | 84 | def _to_aicity19(self, result): 85 | # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 86 | np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") 87 | 88 | def _to_aicity24(self, result): 89 | # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 90 | np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") 91 | 92 | def _to_synthehicle(self, result): 93 | # CAMERA, FRAME, ID, X, Y, W, H, SCORE, X_BEV, Y_BEV 94 | np.savetxt(self.results_file, result[:, [2, 1]], fmt="%d", delimiter=",") 95 | 96 | def get_save_function(self, cfg): 97 | if "WildTrack" in cfg.dataset.name: 98 | return self._to_wildtrack 99 | elif "AICITY24" in cfg.dataset.name: 100 | return self._to_aicity19 101 | elif "AICITY" in cfg.dataset.name or "CityFlow" in cfg.dataset.name: 102 | return self._to_aicity24 103 | else: 104 | return self._to_synthehicle 105 | 106 | def denormalize_bev(self, positions): 107 | min_x, min_y, max_x, max_y = self._norm_factors 108 | return positions * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor([min_x, min_y]) 109 | 110 | def squeeze_batch(self, x: torch.Tensor): 111 | if x.dim() == 4 and x.size(0) == 1: 112 | return x.squeeze(0) 113 | return x 114 | -------------------------------------------------------------------------------- /src/utils/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from typing import List, Optional, Tuple 4 | 5 | import matplotlib.pyplot as plt 6 | import torch 7 | from torch.utils.tensorboard import SummaryWriter 8 | from torchvision import transforms 9 | from torchvision.io import write_jpeg 10 | from torchvision.utils import draw_bounding_boxes, make_grid 11 | 12 | 13 | def resize_transform(img, size=(256, 128)): 14 | """ 15 | Resize a torch image to the specified size. 16 | Used before passing the image to reid model. 17 | """ 18 | transform = transforms.Compose( 19 | [ 20 | transforms.ToPILImage(), 21 | transforms.Resize((size[0], size[1])), 22 | transforms.ToTensor(), 23 | ] 24 | ) 25 | return transform(img) 26 | 27 | 28 | def compute_centers(boxes, bottom=True, box_projection_centers=None): 29 | """ 30 | Compute the 2D centers of a torch tensor of bounding boxes. 
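Boxes are expected in MOT-style (x, y, w, h) format. For example, with the default bottom=True, compute_centers(torch.tensor([[0.0, 0.0, 10.0, 20.0]])) returns the bottom-center [[5., 20.]].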
31 | """ 32 | if bottom is True and box_projection_centers is not None: 33 | raise ValueError("Cannot project boxes to bottom and use box_projection_centers simultaneously.") 34 | centers = torch.zeros((boxes.shape[0], 2)) 35 | centers[:, 0] = boxes[:, 0] + boxes[:, 2] / 2 36 | if box_projection_centers is not None: 37 | alpha_w, alpha_h = box_projection_centers 38 | centers[:, 1] = boxes[:, 1] + alpha_h * boxes[:, 3] 39 | elif bottom: 40 | centers[:, 1] = boxes[:, 1] + boxes[:, 3] 41 | else: 42 | centers[:, 1] = boxes[:, 1] + boxes[:, 3] / 2 43 | return centers 44 | 45 | 46 | def tlwh_to_xyah(tlwh): 47 | """ 48 | Convert bounding box to format `(center x, center y, aspect ratio, 49 | height)`, where the aspect ratio is `width / height`. 50 | """ 51 | ret = tlwh.clone() 52 | if ret.dim() == 1: 53 | ret = ret.unsqueeze(0) 54 | ret[:, :2] += ret[:, 2:] / 2 55 | ret[:, 2] /= ret[:, 3] 56 | return ret 57 | 58 | 59 | def xyah_to_tlwh(xyah): 60 | """Get current position in bounding box format `(top left x, top left y, 61 | width, height)`. 62 | """ 63 | ret = xyah.clone() 64 | if ret.dim() == 1: 65 | ret = ret.unsqueeze(0) 66 | ret[:, 2] *= ret[:, 3] 67 | ret[:, :2] -= ret[:, 2:] / 2 68 | return ret 69 | 70 | 71 | def tlwh_to_tlbr(tlwh): 72 | """Convert bounding box to format `(top left x, top left y, bottom right 73 | x, bottom right y)`. 74 | """ 75 | ret = tlwh.clone() 76 | if ret.dim() == 1: 77 | ret = ret.unsqueeze(0) 78 | ret[:, 2:] += ret[:, :2] 79 | return ret 80 | 81 | 82 | def expand_boxes(in_boxes, factor): 83 | boxes = in_boxes.clone() 84 | cx, cy = boxes[:, 0] + boxes[:, 2] / 2, boxes[:, 1] + boxes[:, 3] / 2 85 | w, h = boxes[:, 2] * factor, boxes[:, 3] * factor 86 | boxes[:, 0] = cx - w / 2 87 | boxes[:, 1] = cy - h / 2 88 | boxes[:, 2] = w 89 | boxes[:, 3] = h 90 | return boxes 91 | 92 | 93 | def remove_border_boxes(boxes, border): 94 | xy1x2y2 = tlwh_to_tlbr(boxes) 95 | keep = ( 96 | (xy1x2y2[:, 0] > border) 97 | & (xy1x2y2[:, 1] > border) 98 | & (xy1x2y2[:, 2] < (1920 - border)) 99 | & (xy1x2y2[:, 3] < (1080 - border)) 100 | ) 101 | return keep 102 | 103 | 104 | def size_filter(boxes, size_min, size_max): 105 | sizes = boxes[:, 2] * boxes[:, 3] 106 | keep = (sizes >= size_min) & (sizes <= size_max) 107 | return keep 108 | 109 | 110 | def mpl_cmap_to_rgb(cmap_name: str, seed: int = 0) -> List[Tuple[int, int, int]]: 111 | """Returns a list of RGB values from a matplotlib colormap.""" 112 | cmap = plt.get_cmap(cmap_name) 113 | colors = [] 114 | for i in range(cmap.N): 115 | rgb = cmap(i)[:3] 116 | colors.append(tuple(int(255 * c) for c in rgb)) 117 | random.seed(seed) 118 | random.shuffle(colors) 119 | return colors 120 | 121 | 122 | def render_image_grid(images: List[torch.Tensor], *args, **kwargs) -> torch.Tensor: 123 | """Renders a grid of images. 124 | 125 | Args: 126 | images (List[torch.Tensor]): List of N images of shape (C, H, W). 127 | *args: Additional arguments to pass to the make_grid function. 128 | **kwargs: Additional keyword arguments to pass to the make_grid function. 129 | 130 | Returns: 131 | torch.Tensor: Image grid of shape (C, H, W). 
132 | """ 133 | images = torch.stack(images) 134 | nrow = math.ceil(math.sqrt(len(images))) 135 | return make_grid(images, nrow=nrow, *args, **kwargs) 136 | 137 | 138 | def render_images_with_boxes( 139 | image: torch.Tensor, 140 | boxes: Optional[torch.Tensor] = None, 141 | labels: Optional[torch.Tensor] = None, 142 | confs: Optional[torch.Tensor] = None, 143 | colors: Optional[List[Tuple[int, int, int]]] = None, 144 | *args, 145 | **kwargs, 146 | ) -> List[torch.Tensor]: 147 | """Render image with bounding boxes. Colors correspond to the label index. Boxes are 148 | expected to be in MOT-format, i.e., (bb_left, bb_top, bb_widht, bb_height). 149 | 150 | Args: 151 | images (torch.Tensor): Image of shape (C, H, W). 152 | boxes (torch.Tensor): Boxes of shape (K, 4). 153 | labels (torch.Tensor): Label of shape (K,). 154 | colors (Optional[List[Tuple[int, int, int]]]): List of RGB colors. Defaults to None. 155 | *args: Additional arguments to pass to the draw_bounding_boxes function. 156 | **kwargs: Additional keyword arguments to pass to the draw_bounding_boxes function. 157 | 158 | Returns: 159 | torch.Tensor: Image with bounding boxes. 160 | """ 161 | if boxes is None: 162 | return image 163 | 164 | if colors is None: 165 | colors = mpl_cmap_to_rgb("rainbow") 166 | 167 | if labels is None: 168 | labels = torch.zeros(boxes.size(0)) 169 | 170 | color_palette = [colors[label % len(colors)] for label in labels] 171 | 172 | _labels = [str(label.item()) for i, label in enumerate(labels)] 173 | 174 | if confs is not None: 175 | _labels = [f"{label} ({conf.item():.2f})" for label, conf in zip(_labels, confs)] 176 | 177 | img = image.clone() 178 | bxs = boxes.clone() 179 | bxs[:, 2:] += bxs[:, :2] 180 | 181 | img = draw_bounding_boxes( 182 | img, 183 | bxs, 184 | labels=_labels, 185 | colors=color_palette, 186 | *args, 187 | **kwargs, 188 | ) 189 | return img 190 | 191 | 192 | def normalize_features(x): 193 | # shape of x: (C, N, F) 194 | # normalize features per channelg 195 | mean = x.mean(dim=2, keepdim=True) 196 | std = x.std(dim=2, keepdim=True) + 1e-8 197 | return (x - mean) / std 198 | 199 | 200 | def nanmax(x, dim=None): 201 | """Function like torch.nanmean for max.""" 202 | mask = torch.isnan(x) 203 | x_masked = torch.where(mask, torch.tensor(float("-inf")).to(x.device), x) 204 | max_vals, _ = torch.max(x_masked, dim=dim) 205 | 206 | # Restore NaN values if max is -inf (because all were NaN along dimension) 207 | max_vals = torch.where(max_vals == float("-inf"), torch.tensor(float("nan")).to(x.device), max_vals) 208 | return max_vals 209 | -------------------------------------------------------------------------------- /src/tracker/supertrack.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | 3 | import torch 4 | 5 | from ..utils.utils import tlwh_to_tlbr 6 | 7 | 8 | class TrackState(IntEnum): 9 | CREATED = 0 # Track is created but not confirmed yet 10 | ACTIVE = 1 # Track is confirmed and active 11 | LOST = 3 # Track is lost and not tracked, but kept in memory 12 | KILLED = 4 # Track is killed (e.g. 
due to merging with another track) 13 | 14 | 15 | class SuperTrack: 16 | def __init__( 17 | self, 18 | frame, 19 | features, 20 | boxes, 21 | positions_2d, 22 | positions_3d, 23 | confidence=None, 24 | ): 25 | self.frame = frame 26 | self.last_update = frame 27 | 28 | self.n_cams = features.size(0) 29 | self.features = features 30 | self.boxes = boxes 31 | self.positions_2d = positions_2d 32 | self.positions_3d = positions_3d 33 | 34 | self.label = None 35 | self.__state = TrackState.CREATED # private state variable 36 | 37 | # inactivity counter: how many frames since last update at each camera 38 | self.inactive_since = torch.zeros(self.n_cams, device=features.device) 39 | 40 | self.lost_since = 0 41 | 42 | # where to continue tracking: if False, track is not continued in this camera 43 | self.track_where = torch.ones(self.n_cams, device=features.device).bool() 44 | self.track_where[torch.isnan(features).any(dim=1)] = False 45 | 46 | # cams the track hasn't been seen in yet 47 | self.queries = torch.ones(self.n_cams, device=features.device).bool() 48 | 49 | # count updates for each camera 50 | self.ticks = torch.ones(self.n_cams, device=features.device) 51 | 52 | self.confidence = confidence 53 | 54 | self.velocities_2d = torch.zeros((self.n_cams, 4), device=features.device) 55 | self.velocities_3d = torch.zeros((self.n_cams, 2), device=features.device) 56 | 57 | @classmethod 58 | def empty(cls, n_cams, fdim, device): 59 | return cls( 60 | frame=None, 61 | features=torch.full((n_cams, fdim), float("nan"), device=device), 62 | boxes=torch.full((n_cams, 4), float("nan"), device=device), 63 | positions_2d=torch.full((n_cams, 2), float("nan"), device=device), 64 | positions_3d=torch.full((n_cams, 3), float("nan"), device=device), 65 | ) 66 | 67 | def activate(self): 68 | self.__state = TrackState.ACTIVE 69 | 70 | def deactivate(self): 71 | self.__state = TrackState.LOST 72 | 73 | def kill(self): 74 | self.__state = TrackState.KILLED 75 | 76 | def reset(self, cams=None): 77 | if cams is None: 78 | cams = range(self.n_cams) 79 | for cam in cams: 80 | self.track_where[cam] = False 81 | # self.inactive_since[cam] = 0 82 | 83 | def set_label(self, label): 84 | if self.label is not None: 85 | raise ValueError(f"Track {self} is already labeled.") 86 | self.label = label 87 | 88 | @property 89 | def keys(self): 90 | return ~self.queries 91 | 92 | @property 93 | def state(self): 94 | return self.__state 95 | 96 | @property 97 | def tlbr(self): 98 | return tlwh_to_tlbr(self.boxes) 99 | 100 | def is_complete(self): 101 | return ~torch.isnan(self.features).any() 102 | 103 | @property 104 | def p_features(self): 105 | return self.phantomize(self.features) 106 | 107 | @property 108 | def p_positions(self): 109 | return self.phantomize(self.positions_3d) 110 | 111 | @property 112 | def mean_positions_3d(self): 113 | return torch.nanmean(self.positions_3d, dim=0) 114 | 115 | @staticmethod 116 | def phantomize(tensor): 117 | """ 118 | Given a (B, n_cams, f_dim) tensor, replace nans with the average of 119 | the non-nan values along the cam axis. 
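For example, for a two-camera feature matrix, phantomize(torch.tensor([[1.0, float("nan")], [3.0, 4.0]])) yields [[1., 4.], [3., 4.]]; the missing entry is filled with the nanmean of its column.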
120 | """ 121 | return torch.where(torch.isnan(tensor), torch.nanmean(tensor, dim=0, keepdim=True), tensor) 122 | 123 | def update(self, other): 124 | n_cams = self.features.size(0) 125 | if self.frame == other.frame: 126 | for cam in range(n_cams): 127 | if torch.isnan(self.features[cam]).any(): 128 | if torch.isnan(other.features[cam]).any(): 129 | continue 130 | self.features[cam] = other.features[cam] 131 | self.boxes[cam] = other.boxes[cam] 132 | self.positions_2d[cam] = other.positions_2d[cam] 133 | self.positions_3d[cam] = other.positions_3d[cam] 134 | self.inactive_since[cam] = 0 135 | self.track_where[cam] = True 136 | self.queries[cam] = False 137 | self.ticks[cam] = other.ticks[cam] 138 | else: 139 | if not torch.isnan(other.features[cam]).any(): 140 | raise ValueError(f"Found violation of constraints for track update with {self}.") 141 | elif self.frame < other.frame: 142 | for cam in range(n_cams): 143 | if not torch.isnan(other.features[cam]).any(): 144 | if not torch.isnan(self.features[cam]).any(): 145 | if self.velocities_2d[cam].sum() == 0: 146 | w = 1.0 147 | else: 148 | w = 0.8 149 | self.velocities_2d[cam] = ( 150 | w * (other.boxes[cam] - self.boxes[cam]) / (other.frame - self.frame) 151 | + (1 - w) * self.velocities_2d[cam] 152 | ) 153 | self.velocities_3d[cam] = ( 154 | w * (other.positions_3d[cam] - self.positions_3d[cam]) / (other.frame - self.frame) 155 | + (1 - w) * self.velocities_3d[cam] 156 | ) 157 | self.features[cam] = 0.9 * self.features[cam] + 0.1 * other.features[cam] 158 | self.boxes[cam] = other.boxes[cam] 159 | self.positions_2d[cam] = other.positions_2d[cam] 160 | self.positions_3d[cam] = other.positions_3d[cam] 161 | self.inactive_since[cam] = 0 162 | self.track_where[cam] = True 163 | self.queries[cam] = False 164 | self.ticks[cam] += 1 165 | else: 166 | self.features[cam] = other.features[cam] 167 | self.boxes[cam] = other.boxes[cam] 168 | self.positions_2d[cam] = other.positions_2d[cam] 169 | self.positions_3d[cam] = other.positions_3d[cam] 170 | self.inactive_since[cam] = 0 171 | self.track_where[cam] = True 172 | self.queries[cam] = False 173 | self.ticks[cam] = other.ticks[cam] 174 | else: 175 | if self.track_where[cam]: 176 | self.inactive_since[cam] += 1 177 | else: 178 | raise ValueError( 179 | f"Frame of other must be greater or equal to frame of self, but got {self.frame} and {other.frame}." 180 | ) 181 | self.last_update = other.frame 182 | self.frame = other.frame 183 | 184 | if self.state == TrackState.LOST: 185 | self.activate() 186 | 187 | def predict(self): 188 | for cam in range(self.n_cams): 189 | if ~self.track_where[cam]: 190 | continue 191 | prd_box = self.boxes[cam] + self.velocities_2d[cam] 192 | prd_pos = self.positions_3d[cam] + self.velocities_3d[cam] 193 | if prd_box[2] <= 0 or prd_box[3] <= 0: 194 | prd_box = self.boxes[cam] 195 | prd_pos = self.positions_3d[cam] 196 | self.boxes[cam] = prd_box 197 | self.positions_3d[cam] = prd_pos 198 | 199 | def merge(self, other): 200 | if other.state == TrackState.KILLED or self.state == TrackState.KILLED: 201 | raise ValueError("Cannot merge killed tracks.") 202 | if other.frame < self.frame: 203 | raise ValueError( 204 | f"Other track must not be older than self, but " 205 | f"self is at frame {self.frame} and other at frame {other.frame}." 
206 | ) 207 | self.update( 208 | other.frame, 209 | other.features, 210 | other.boxes, 211 | other.positions_2d, 212 | other.positions_3d, 213 | ) 214 | # other was merged into self, so it is killed 215 | other.kill() 216 | 217 | def split(self, where: torch.Tensor): 218 | # keep the cams where "where" is True 219 | other_features = self.features.clone() 220 | other_boxes = self.boxes.clone() 221 | other_positions_2d = self.positions_2d.clone() 222 | other_positions_3d = self.positions_3d.clone() 223 | for w in where: 224 | if not w: 225 | self.features[w] = torch.nan 226 | self.boxes[w] = torch.nan 227 | self.positions_2d[w] = torch.nan 228 | self.positions_3d[w] = torch.nan 229 | else: 230 | other_features[w] = torch.nan 231 | other_boxes[w] = torch.nan 232 | other_positions_2d[w] = torch.nan 233 | other_positions_3d[w] = torch.nan 234 | return SuperTrack( 235 | frame=self.frame, 236 | features=other_features, 237 | boxes=other_boxes, 238 | positions_2d=other_positions_2d, 239 | positions_3d=other_positions_3d, 240 | ) 241 | 242 | def __repr__(self): 243 | return f"Track {self.label}" 244 | 245 | def to_tensor(self): 246 | output = [] 247 | if self.state == TrackState.LOST: 248 | return torch.Tensor(output) 249 | for i, box in enumerate(self.boxes): 250 | if ~self.track_where[i]: 251 | continue 252 | row = [i, self.label, self.frame, *box, *self.mean_positions_3d] 253 | output.append(row) 254 | return torch.Tensor(output) 255 | -------------------------------------------------------------------------------- /src/utils/evaluate.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import os 3 | import pathlib 4 | from typing import Dict, List, Optional, Union 5 | 6 | import motmetrics as mm 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | from sklearn import metrics 11 | 12 | 13 | GT_COLUMNS = [ 14 | "frame", 15 | "id", 16 | "bb_left", 17 | "bb_top", 18 | "bb_width", 19 | "bb_height", 20 | "conf", 21 | "x", 22 | "y", 23 | "z", 24 | ] 25 | 26 | 27 | def get_hota_setup(): 28 | metrics = ["deta_alpha", "assa_alpha", "hota_alpha"] 29 | namemap = mm.io.motchallenge_metric_names 30 | namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) 31 | return metrics, namemap 32 | 33 | 34 | def evaluate_tracker(tracker_results, dataloader, hota_mode=False, bev_mode=False): 35 | gt_dfs = [pd.DataFrame(gt, columns=GT_COLUMNS) for gt in dataloader.dataset._ground_truths] 36 | ht_dfs = results_to_dfs(tracker_results) 37 | 38 | n_frames = [int(df["frame"].max()) for df in gt_dfs] 39 | 40 | gt_dfs = [mot_to_mm(df) for df in gt_dfs] 41 | ht_dfs = [mot_to_mm(df) for df in ht_dfs] 42 | 43 | gt_df = combine_dataframes(gt_dfs, n_frames) 44 | ht_df = combine_dataframes(ht_dfs, n_frames) 45 | 46 | # put column "x" to "X" 47 | if bev_mode: 48 | ht_df["X"] = ht_df["x"] 49 | ht_df["Y"] = ht_df["y"] 50 | gt_df["X"] = gt_df["x"] 51 | gt_df["Y"] = gt_df["y"] 52 | 53 | return evaluate_single_scene(ht_df, gt_df, hota_mode=hota_mode, bev_mode=bev_mode) 54 | 55 | 56 | def results_to_dfs(tracker_results: torch.Tensor) -> List[pd.DataFrame]: 57 | """Converts a tensor of results to a list of dataframes. Input tensor has format 58 | 59 | CAM_ID, OBJ_ID, FRAME_ID, X, Y, W, H, X_WORLD, Y_WORLD 60 | 61 | and resulting (n_cams) dataframes have columns 62 | 63 | frame, id, bb_left, bb_top, bb_width, bb_height, conf, x, y, z 64 | 65 | Args: 66 | tracker_results (torch.Tensor): Results tensor. 
67 | Returns: 68 | List[pd.DataFrame]: List of dataframes. 69 | """ 70 | results = tracker_results.clone() 71 | results[:, [1, 2]] = results[:, [2, 1]] 72 | results = torch.cat((results[:, :7], torch.ones(results.shape[0], 1), results[:, 7:]), dim=1) 73 | results = torch.cat((results, -torch.ones(results.shape[0], 1)), dim=1) 74 | cam_res = [results[results[:, 0] == c][:, 1:] for c in torch.unique(results[:, 0]).cpu().numpy()] 75 | return [pd.DataFrame(res, columns=GT_COLUMNS) for res in cam_res] 76 | 77 | 78 | def evaluate_multi_scene(prediction_dfs, ground_truth_dfs, names=None, hota_mode=False, bev_mode=False): 79 | """Takes prediction and ground truth dataframes and runs motmetrics evaluation 80 | on a multiple scenes. For evaluation of multi-camera scenes, first combine a 81 | list of single-camera predictions and ground truths using `combine_dataframes` 82 | Args: 83 | prediction_dfs (_type_): _description_ 84 | ground_truth_dfs (_type_): _description_ 85 | names (_type_, optional): _description_. Defaults to None. 86 | Returns: 87 | _type_: _description_ 88 | """ 89 | if names is None: 90 | names = ["Untitled %s" % (i + 1) for i in range(len(prediction_dfs))] 91 | ground_truths = dict(zip(names, ground_truth_dfs)) 92 | predictions = dict(zip(names, prediction_dfs)) 93 | accs = [] 94 | names = [] 95 | 96 | if bev_mode: 97 | distfields = ["X", "Y"] 98 | dist = "seuc" 99 | distth = 1.0 100 | else: 101 | distfields = ["X", "Y", "Width", "Height"] 102 | dist = "iou" 103 | distth = 0.5 104 | 105 | for name, prediction in predictions.items(): 106 | if hota_mode: 107 | raise NotImplementedError 108 | else: 109 | accs.append( 110 | mm.utils.compare_to_groundtruth( 111 | ground_truths[name], prediction, dist=dist, distfields=distfields, distth=distth 112 | ) 113 | ) 114 | metrics = mm.metrics.motchallenge_metrics 115 | namemap = mm.io.motchallenge_metric_names 116 | names.append(name) 117 | 118 | mh = mm.metrics.create() 119 | 120 | summary = mh.compute_many( 121 | accs, 122 | names=names, 123 | metrics=metrics, 124 | generate_overall=True, 125 | ) 126 | namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) 127 | print(mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap)) 128 | strsummary = mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap) 129 | return summary, strsummary 130 | 131 | 132 | def evaluate_single_scene(prediction_df, ground_truth_df, hota_mode=False, bev_mode=False, name=None) -> pd.DataFrame: 133 | """Takes a prediction and ground truth dataframe and runs motmetrics evaluation 134 | on a single scene. For evaluation of multi-camera scenes, first combine a list 135 | of single-camera predictions and ground truths using `combine_dataframes`. 136 | Args: 137 | prediction_df (_type_): Multi-camera predictions. 138 | ground_truth_df (_type_): Multi-camera ground truth. 139 | name (str): Scene name. Defaults to None. 140 | """ 141 | return evaluate_multi_scene([prediction_df], [ground_truth_df], [name], hota_mode, bev_mode) 142 | 143 | 144 | def mot_to_mm(df: pd.DataFrame) -> pd.DataFrame: 145 | """Takes a MOT-style dataframe (with named columns [frame, id, ...]) 146 | and converts it to a dataframe with column names required by motmetrics. 147 | Args: 148 | df (pd.DataFrame): Input MOT-style dataframe. 149 | Returns: 150 | pd.DataFrame: Output dataframe ready to use in motmetrics evaluation. 
151 | """ 152 | _df = df.rename( 153 | columns={ 154 | "frame": "FrameId", 155 | "id": "Id", 156 | "bb_left": "X", 157 | "bb_top": "Y", 158 | "bb_width": "Width", 159 | "bb_height": "Height", 160 | "conf": "Confidence", 161 | } 162 | ) 163 | columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] 164 | columns_to_float = ["Confidence"] 165 | _df[columns_to_int] = _df[columns_to_int].astype(int) 166 | _df[columns_to_float] = _df[columns_to_float].astype(float) 167 | return _df 168 | 169 | 170 | def read_txt(path: Union[str, pathlib.Path]) -> pd.DataFrame: 171 | _df = pd.read_csv(path, names=GT_COLUMNS) 172 | _df = _df.rename( 173 | columns={ 174 | "frame": "FrameId", 175 | "id": "Id", 176 | "bb_left": "X", 177 | "bb_top": "Y", 178 | "bb_width": "Width", 179 | "bb_height": "Height", 180 | "conf": "Confidence", 181 | } 182 | ) 183 | columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] 184 | columns_to_float = ["Confidence"] 185 | _df[columns_to_int] = _df[columns_to_int].astype(int) 186 | _df[columns_to_float] = _df[columns_to_float].astype(float) 187 | return _df 188 | 189 | 190 | def read_seqinfo(path: Union[str, pathlib.Path]) -> Dict: 191 | parser = configparser.ConfigParser() 192 | parser.read(path) 193 | return dict(parser["Sequence"]) 194 | 195 | 196 | def combine_dataframes(dataframes: List[pd.DataFrame], n_frames: Optional[List[int]] = None) -> pd.DataFrame: 197 | """Takes a list of single-camera dataframes and combines them for 198 | multi-camera evaluation. 199 | Args: 200 | dataframes (List[pd.DataFrame]): List of single-camera dataframes. 201 | n_frames (Optional[List[int]], optional): Defaults to None. 202 | Returns: 203 | pd.DataFrame: Multi-camera dataframe. 204 | """ 205 | if n_frames is None: 206 | n_frames = [int(df["FrameId"].max()) for df in dataframes] 207 | count_frames = 0 208 | dfs = [] 209 | for j, df in enumerate(dataframes): 210 | df["FrameId"] += count_frames 211 | count_frames += int(n_frames[j]) 212 | dfs.append(df) 213 | return pd.concat(dfs).set_index(["FrameId", "Id"]) 214 | 215 | 216 | def evaluate_mtmc( 217 | data_paths: List[Union[str, pathlib.Path]], 218 | prediction_path: Union[str, pathlib.Path], 219 | scene_name: str, 220 | hota_mode=False, 221 | bev_mode=False, 222 | ): 223 | seqinfos = [read_seqinfo(os.path.join(path, "seqinfo.ini")) for path in data_paths] 224 | ground_truths = [read_txt(os.path.join(path, "gt", "gt.txt")) for path in data_paths] 225 | prediction_paths = [os.path.join(prediction_path, seqinfo["name"] + ".txt") for seqinfo in seqinfos] 226 | predictions = [read_txt(path) for path in prediction_paths] 227 | ground_truth_df = combine_dataframes(ground_truths, [seqinfo["seqlength"] for seqinfo in seqinfos]) 228 | prediction_df = combine_dataframes(predictions, [seqinfo["seqlength"] for seqinfo in seqinfos]) 229 | 230 | ground_truths = {scene_name: ground_truth_df} 231 | predictions = {scene_name: prediction_df} 232 | 233 | 234 | def evaluate_synthehicle_json(prediction, ground_truth): 235 | preds_to_eval = [] 236 | truths_to_eval = [] 237 | names = [] 238 | for scene in ground_truth.keys(): 239 | if scene in prediction.keys(): 240 | gcams = ground_truth[scene] 241 | pcams = prediction[scene] 242 | preds_to_combine = [] 243 | truths_to_combine = [] 244 | for cam in gcams.keys(): 245 | if cam not in pcams.keys(): 246 | prediction[scene][cam] = [[1, 1, 0, 0, 0, 0, 1, -1, -1, -1]] 247 | preds_to_combine.append(mot_to_mm(pd.DataFrame(prediction[scene][cam], columns=GT_COLUMNS))) 248 | 
truths_to_combine.append(mot_to_mm(pd.DataFrame(ground_truth[scene][cam], columns=GT_COLUMNS))) 249 | names.append(scene) 250 | preds_to_eval.append(combine_dataframes(preds_to_combine, n_frames=[1800] * len(preds_to_combine))) 251 | truths_to_eval.append(combine_dataframes(truths_to_combine, n_frames=[1800] * len(truths_to_combine))) 252 | return evaluate_multi_scene(preds_to_eval, truths_to_eval, names) 253 | 254 | 255 | def clustering_performance(y_true, y_pred): 256 | y_t, y_p = y_true.cpu().numpy(), y_pred.cpu().numpy() 257 | return { 258 | "ARI": metrics.adjusted_rand_score(y_t, y_p), 259 | "AMI": metrics.adjusted_mutual_info_score(y_t, y_p), 260 | } 261 | -------------------------------------------------------------------------------- /src/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import pathlib 4 | import warnings 5 | from enum import IntEnum 6 | from typing import List, Optional 7 | 8 | import numpy as np 9 | import torch 10 | from loguru import logger 11 | from torch.utils.data import DataLoader 12 | from torchvision.io import ImageReadMode, read_image 13 | from torchvision.ops import nms 14 | 15 | from ..tracker.geometry import Projector 16 | from ..utils.utils import compute_centers, resize_transform, tlwh_to_tlbr 17 | 18 | 19 | class Annotation(IntEnum): 20 | CAM_ID = 0 21 | OBJ_ID = 1 22 | FRAME_ID = 2 23 | XMIN = 3 24 | YMIN = 4 25 | WIDTH = 5 26 | HEIGHT = 6 27 | CONF = 7 28 | XWORLD = 8 29 | YWORLD = 9 30 | 31 | 32 | class NMSTransform: 33 | def __init__(self, iou_threshold: float): 34 | """Initialize the NMSTransform which applied non-maximum suppression to the 35 | input annotations based on the specified IoU threshold. 36 | 37 | Args: 38 | iou_threshold (float): The Intersection over Union (IoU) threshold for NMS. 39 | Bounding boxes with IoU greater than this threshold will be suppressed. 40 | """ 41 | self.iou_threshold = iou_threshold 42 | 43 | def __call__(self, annotations: torch.Tensor) -> torch.Tensor: 44 | boxes = tlwh_to_tlbr(annotations[:, Annotation.XMIN : Annotation.HEIGHT + 1]) 45 | scores = annotations[:, Annotation.CONF] 46 | keep = nms(boxes, scores, self.iou_threshold) 47 | return keep 48 | 49 | 50 | class ROIFilter: 51 | def __init__(self, roi_path: str): 52 | """Initialize the ROIFilter. 53 | 54 | Args: 55 | roi_path (str): Path to the ROI image file. 56 | 57 | The ROI (Region of Interest) image is loaded as a binary mask, 58 | where 1 indicates areas of interest and 0 indicates areas to be filtered out. 
59 | """ 60 | self.roi = read_image(roi_path, ImageReadMode.GRAY).squeeze(0).bool() 61 | self.size = self.roi.size() 62 | 63 | def __call__(self, annotations: torch.Tensor) -> torch.Tensor: 64 | centers = compute_centers(annotations[:, Annotation.XMIN - 1 : Annotation.HEIGHT]).int() 65 | centers[:, 0] = torch.clamp(centers[:, 0], 0, self.size[1] - 1) 66 | centers[:, 1] = torch.clamp(centers[:, 1], 0, self.size[0] - 1) 67 | keep = self.roi[centers[:, 1], centers[:, 0]] == 1 68 | return keep 69 | 70 | 71 | class MultiCamDataset: 72 | def __init__( 73 | self, 74 | annotation_paths: List[str], 75 | image_paths: List[str], 76 | calibration_paths: List[str], 77 | camera_names: List[int], 78 | ground_truth_paths: Optional[List[str]] = None, 79 | precomputed: bool = False, 80 | nms_threshold: Optional[float] = 0.9, 81 | time_offsets: Optional[List[int]] = None, 82 | roi_paths: Optional[List[str]] = None, 83 | normalize_bev: bool = False, 84 | bottom: bool = True, 85 | box_projection_centers=None, 86 | ): 87 | """Initialize the MultiCamDataset for data loading. 88 | 89 | Args: 90 | annotation_paths (List[str]): Paths to annotation files for each camera. 91 | image_paths (List[str]): Paths to image directories for each camera. 92 | calibration_paths (List[str]): Paths to calibration files for each camera. 93 | camera_names (List[int]): Names or IDs of the cameras. 94 | ground_truth_paths (Optional[List[str]], optional): Paths to ground truth files. Defaults to None. 95 | precomputed (bool, optional): Whether to use precomputed features. Defaults to False. 96 | nms_threshold (Optional[float], optional): Non-maximum suppression threshold. Defaults to 0.9. 97 | time_offsets (Optional[List[int]], optional): Time offsets for each camera. Defaults to None. 98 | roi_paths (Optional[List[str]], optional): Paths to region of interest mask images. Defaults to None. 99 | normalize_bev (bool, optional): Whether to normalize bird's-eye view coordinates. Defaults to False. 100 | bottom (bool, optional): Whether to use bottom of bounding box for projection. Defaults to True. 101 | box_projection_centers (Optional[Tuple[float, float]], optional): Projection centers for bounding boxes. Defaults to None. 102 | """ 103 | if time_offsets is None: 104 | self.time_offsets = [0] * len(image_paths) 105 | else: 106 | self.time_offsets = time_offsets 107 | 108 | self.annotation_paths = annotation_paths 109 | self.image_paths = image_paths 110 | self.calibration_paths = calibration_paths 111 | self.camera_names = camera_names 112 | self.precomputed = precomputed 113 | self.nms_transform = NMSTransform(nms_threshold) if nms_threshold is not None else None 114 | self.box_projection_centers = box_projection_centers 115 | self.bottom = bottom 116 | 117 | self.normalize_bev = normalize_bev 118 | 119 | if roi_paths is not None: 120 | self.roi_filters = [ROIFilter(roi_path) for roi_path in roi_paths] 121 | else: 122 | self.roi_filters = None 123 | 124 | self._load_calibrations() 125 | self._load_annotations() 126 | 127 | if ground_truth_paths is not None: 128 | self._load_ground_truth(ground_truth_paths) 129 | else: 130 | self._ground_truths = None 131 | self.gts = None 132 | 133 | self.length = max([len(list(pathlib.Path(image_path).glob("*.jpg"))) for image_path in self.image_paths]) 134 | 135 | if self.length == 0: 136 | warnings.warn("No images found. 
Visualization tools will not be available.")
137 | 
138 |             self.length = 2110  # hard-coded fallback sequence length used when no frame images are found
139 | 
140 |         self._filtered_by_nms = 0
141 |         self._filtered_by_size = 0
142 |         self._filtered_by_roi = 0
143 | 
144 |     def _load_ground_truth(self, ground_truth_paths):
145 |         self._ground_truths = [
146 |             torch.from_numpy(np.loadtxt(ground_truth_path, delimiter=",", dtype=np.float32))
147 |             for ground_truth_path in ground_truth_paths
148 |         ]
149 | 
150 |         for i, gt in enumerate(self._ground_truths):
151 |             if gt.shape[1] == 9:
152 |                 # append a confidence column of ones (write back into the list; reassigning the loop variable alone has no effect)
153 |                 self._ground_truths[i] = torch.cat((gt, torch.ones(gt.shape[0], 1)), dim=1)
154 | 
155 |         _cat_gts = [g.clone() for g in self._ground_truths]
156 |         for i, gt in enumerate(_cat_gts):
157 |             col = torch.ones((gt.shape[0], 1)) * i
158 |             _cat_gts[i] = torch.cat((col, gt), dim=1)
159 |             _cat_gts[i][:, 1] += self.time_offsets[i]
160 | 
161 |         self.gts = torch.cat(_cat_gts, dim=0)
162 |         self.gts[:, [1, 2]] = self.gts[:, [2, 1]]
163 | 
164 |     def _load_calibrations(self):
165 |         self._projectors = [Projector(calibration_path) for calibration_path in self.calibration_paths]
166 | 
167 |     def _load_annotations(self):
168 |         anns = [
169 |             torch.from_numpy(np.loadtxt(annotation_path, delimiter=",", dtype=np.float32))
170 |             for annotation_path in self.annotation_paths
171 |         ]
172 | 
173 |         # todo: add to preprocess config
174 |         for i, ann in enumerate(anns):
175 |             keep = (ann[:, Annotation.WIDTH - 1] * ann[:, Annotation.HEIGHT - 1]) >= 1200  # cam id column not yet prepended, hence the -1 offsets
176 |             anns[i] = ann[keep]
177 | 
178 |             # filter roi images
179 |             if self.roi_filters is not None:
180 |                 keep = self.roi_filters[i](anns[i])
181 |                 anns[i] = anns[i][keep]
182 |                 logger.info(f"🔥 Filtered {keep.size(0) - keep.sum().item()} annotations by ROI.")
183 | 
184 |         for i, ann in enumerate(anns):
185 |             col = torch.ones((ann.shape[0], 1)) * i
186 |             anns[i] = torch.cat((col, ann), dim=1)
187 |             anns[i][:, 1] += self.time_offsets[i]
188 | 
189 |         positions_2d = []
190 |         for i, ann in enumerate(anns):
191 |             pos2d = compute_centers(
192 |                 ann[:, Annotation.XMIN : Annotation.HEIGHT + 1], self.bottom, self.box_projection_centers
193 |             )
194 |             positions_2d.append(pos2d)
195 | 
196 |         positions_3d = []
197 |         for i, pos2d in enumerate(positions_2d):
198 |             pos3d = self._projectors[i].image_to_world(pos2d)
199 |             positions_3d.append(pos3d)
200 | 
201 |         anns = torch.cat(anns, dim=0)
202 |         positions_2d = torch.cat(positions_2d, dim=0)
203 |         positions_3d = torch.cat(positions_3d, dim=0)
204 | 
205 |         if anns.shape[1] == 9:
206 |             # loaded from ground truth, append column of 1s as 7th column
207 |             anns = torch.cat(
208 |                 (
209 |                     anns[:, :6],
210 |                     torch.ones(anns.shape[0], 1),
211 |                     anns[:, 6:],
212 |                 ),
213 |                 dim=1,
214 |             )
215 |         # swap columns frame and obj_id
216 |         anns[:, [1, 2]] = anns[:, [2, 1]]
217 | 
218 |         self._annotations = anns
219 |         self._positions_2d = positions_2d
220 |         self._positions_3d = positions_3d
221 | 
222 |         if self.normalize_bev:
223 |             self.apply_bev_norm()
224 |         else:
225 |             self._norm_factors = None
226 | 
227 |         self._annotations.to("cuda")  # note: Tensor.to() is not in-place, so these three calls have no effect; samples are moved to the device later in the tracker
228 |         self._positions_2d.to("cuda")
229 |         self._positions_3d.to("cuda")
230 | 
231 |     def get_bev_ticks(self):
232 |         return [
233 |             float(torch.min(self._positions_3d[:, 0])),
234 |             float(torch.max(self._positions_3d[:, 0])),
235 |             float(torch.min(self._positions_3d[:, 1])),
236 |             float(torch.max(self._positions_3d[:, 1])),
237 |         ]
238 | 
239 |     def get_crops(self, frame_annotations, frame_images):
240 |         crops = []
241 |         for ann in frame_annotations:
242 |             cam_id = int(ann[Annotation.CAM_ID])
243 |             x, y, w, h = ann[Annotation.XMIN : Annotation.HEIGHT + 1].int()
244 |             # clamp to the image dimensions (images are C x H x W, so size(2) is the width and size(1) the height)
245 |             x = torch.clamp(x, 0, frame_images[cam_id].size(2) - 1)
246 |             y = torch.clamp(y, 0, frame_images[cam_id].size(1) - 1)
247 |             w = torch.clamp(w, 0, frame_images[cam_id].size(2) - x)
248 |             h = torch.clamp(h, 0, frame_images[cam_id].size(1) - y)
249 |             crops.append(resize_transform(frame_images[cam_id][:, y : y + h, x : x + w]))
250 |         if len(crops) == 0:
251 |             return torch.empty(0)
252 |         return torch.stack(crops)
253 | 
254 |     def apply_bev_norm(self):
255 |         # normalize BEV positions to [0, 1]
256 |         logger.info("📏 Normalizing BEV positions to [0, 1].")
257 |         min_x, min_y = torch.min(self._positions_3d, dim=0)[0]
258 |         max_x, max_y = torch.max(self._positions_3d, dim=0)[0]
259 |         self._norm_factors = torch.tensor([min_x, min_y, max_x, max_y])
260 |         self._positions_3d = (self._positions_3d - torch.tensor([min_x, min_y])) / torch.tensor(
261 |             [max_x - min_x, max_y - min_y]
262 |         )
263 | 
264 |     def __len__(self):
265 |         return self.length
266 | 
267 |     def __getitem__(self, idx):
268 |         frame = idx + 1
269 | 
270 |         annotations = self._annotations[self._annotations[:, Annotation.FRAME_ID] == frame]
271 |         positions_2d = self._positions_2d[self._annotations[:, Annotation.FRAME_ID] == frame]
272 |         positions_3d = self._positions_3d[self._annotations[:, Annotation.FRAME_ID] == frame]
273 | 
274 |         if self.gts is not None:
275 |             ground_truth = self.gts[self.gts[:, Annotation.FRAME_ID] == frame]
276 |         else:
277 |             ground_truth = torch.empty(0)
278 | 
279 |         if self.nms_transform is not None:
280 |             keep = self.nms_transform(annotations)
281 |         else:
282 |             keep = torch.arange(annotations.size(0))
283 | 
284 |         annotations = annotations[keep]
285 |         positions_2d = positions_2d[keep]
286 |         positions_3d = positions_3d[keep]
287 | 
288 |         frame_images = []
289 |         for img_path, offset in zip(self.image_paths, self.time_offsets):
290 |             try:
291 |                 frame_images.append(read_image(str(pathlib.Path(img_path) / f"{(frame - offset):06d}.jpg")))
292 |             except Exception:
293 |                 frame_images.append(torch.zeros(3, 1080, 1920).to(torch.uint8))  # blank fallback frame if the image is missing
294 | 
295 |         if not self.precomputed:
296 |             frame_crops = self.get_crops(annotations, frame_images)
297 |         else:
298 |             frame_crops = torch.empty(0)
299 | 
300 |         return {
301 |             "annotations": annotations,
302 |             "positions_2d": positions_2d,
303 |             "positions_3d": positions_3d,
304 |             "images": frame_images,
305 |             "crops": frame_crops,
306 |             "ground_truth": ground_truth,
307 |         }
308 | 
309 | 
310 | def create_dataloader(cfg):
311 |     scene_path = os.path.join(cfg.dataset_path, cfg.dataset.scene_path)
312 |     cameras = [
313 |         os.path.basename(f)
314 |         for f in sorted(glob.glob(os.path.join(scene_path, cfg.dataset.camera_pattern)))
315 |         if os.path.isdir(f)
316 |     ]
317 | 
318 |     img_paths = [
319 |         os.path.join(cfg.dataset_path, cfg.dataset.scene_path, camera, cfg.dataset.img_path) for camera in cameras
320 |     ]
321 |     calibration_paths = [
322 |         os.path.join(
323 |             cfg.dataset_path,
324 |             cfg.dataset.scene_path,
325 |             camera,
326 |             cfg.dataset.calibration_path,
327 |         )
328 |         for camera in cameras
329 |     ]
330 |     annotation_paths = []
331 |     for camera in cameras:
332 |         if cfg.resources.reid is not None:
333 |             scene_path = "-".join(pathlib.Path(cfg.dataset.scene_path).parts)
334 |             if scene_path[-1] == "-":
335 |                 scene_path = scene_path[:-1]
336 |             resource_name = (
337 |                 f"{cfg.dataset.name}_{scene_path}-{camera}_{cfg.resources.detector}_{cfg.resources.reid}.txt"
338 |             )
339 |         else:
340 |             resource_name = 
f"{cfg.dataset.name}-{camera}_{cfg.resources.detector}.txt" 341 | annotation_paths.append(os.path.join(cfg.resources.path, resource_name)) 342 | 343 | if cfg.preprocess.nms_thresh is not None: 344 | nms_threshold = cfg.preprocess.nms_thresh 345 | else: 346 | nms_threshold = None 347 | 348 | if cfg.preprocess.roi_filter is not None and "roi_path" in cfg.dataset: 349 | roi_paths = [os.path.join(cfg.dataset.roi_path, camera, "roi.jpg") for camera in cameras] 350 | else: 351 | roi_paths = None 352 | 353 | ground_truth_paths = None 354 | 355 | time_offsets = None 356 | if "offsets" in cfg.dataset: 357 | if cfg.dataset.offsets is not None: 358 | time_offsets = cfg.dataset.offsets 359 | 360 | box_projection_centers = [ 361 | cfg.preprocess.box_projection_centers.alpha_w, 362 | cfg.preprocess.box_projection_centers.alpha_h, 363 | ] 364 | 365 | if box_projection_centers[0] is None: 366 | box_projection_centers = None 367 | elif box_projection_centers[1] is None: 368 | box_projection_centers[1] = 1 - box_projection_centers[0] 369 | 370 | dataset = MultiCamDataset( 371 | annotation_paths=annotation_paths, 372 | image_paths=img_paths, 373 | calibration_paths=calibration_paths, 374 | camera_names=cameras, 375 | ground_truth_paths=ground_truth_paths, 376 | precomputed=cfg.encoder.name == "precomputed", 377 | nms_threshold=nms_threshold, 378 | time_offsets=time_offsets, 379 | roi_paths=roi_paths, 380 | bottom=cfg.preprocess.bottom, 381 | box_projection_centers=box_projection_centers, 382 | ) 383 | dataloader = DataLoader( 384 | dataset, 385 | batch_size=1, 386 | shuffle=False, 387 | num_workers=8, 388 | ) 389 | return dataloader 390 | -------------------------------------------------------------------------------- /src/tracker/tracker.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | import time 3 | from typing import Any, List, Optional, Tuple 4 | 5 | import motmetrics as mm 6 | import torch 7 | from omegaconf import DictConfig 8 | from scipy.optimize import linear_sum_assignment 9 | from torchvision.ops import box_iou 10 | 11 | from .similarities import batch_bev_distance, batch_cosine_similarity, batched_box_iou 12 | from .solver import multicut, scale_weights 13 | from .supertrack import SuperTrack, TrackState 14 | 15 | 16 | class Tracker: 17 | def __init__( 18 | self, 19 | solver_opts: Any, 20 | cfg: DictConfig, 21 | n_cams: int, 22 | feature_extractor: Optional[torch.nn.Module] = None, 23 | device: Optional[torch.device] = "cpu", 24 | ): 25 | """ 26 | Initialize the Tracker. 27 | 28 | Args: 29 | solver_opts: Options for the solver. 30 | cfg: Configuration dictionary. 31 | n_cams: Number of cameras. 32 | feature_extractor: Feature extractor module. 33 | device: Device to run the tracker on. 34 | """ 35 | self.feature_extractor = feature_extractor 36 | self.solver_opts = solver_opts 37 | self.device = device 38 | 39 | self.current_data = None 40 | 41 | self.feature_dim = cfg.tracker.fdim 42 | self.n_cams = n_cams 43 | self.cfg = cfg.tracker 44 | 45 | self.tracks: List[SuperTrack] = [] 46 | 47 | self.frame = 0 48 | self.free_id = 1 49 | 50 | self.latency = [] 51 | 52 | self.update_interval = 1 53 | self.stats = { 54 | "# Killed": 0, 55 | "Latency": 0, 56 | } 57 | 58 | self.cumulative_execution_time = 0 59 | 60 | def step(self, sample): 61 | """ 62 | Perform a single step of tracking. 63 | 64 | Args: 65 | sample: Input sample containing detections and features. 
 66 | 
 67 |         Returns:
 68 |             tuple: A tuple containing current results and predicted results.
 69 |         """
 70 |         # move sample to device and remove batch dimension
 71 |         t0 = time.time()
 72 |         for key in sample.keys():
 73 |             if key != "images":
 74 |                 sample[key] = sample[key].to(self.device).squeeze(0)
 75 |         self.frame += 1
 76 |         if self.frame % self.update_interval == 0:
 77 |             if sample["annotations"].size(0) > 0:
 78 |                 matched, unmatched = self.update(sample)
 79 |                 self._handle_unmatched(unmatched)
 80 | 
 81 |         t1 = time.time()
 82 |         self.cumulative_execution_time += t1 - t0
 83 |         self.latency.append(t1 - t0)
 84 | 
 85 |         self._sanitize()
 86 | 
 87 |         rresults = self.get_result()
 88 | 
 89 |         self.predict()
 90 | 
 91 |         presults = self.get_result()
 92 | 
 93 |         return rresults, presults
 94 | 
 95 |     def update(self, sample):
 96 |         """
 97 |         Update the tracker with new detections and features.
 98 | 
 99 |         Args:
100 |             sample: Input sample containing detections and features.
101 | 
102 |         Returns:
103 |             tuple: A tuple containing matched and unmatched tracks.
104 |         """
105 |         features = self.feature_extractor(sample)
106 |         all_superboxes = self._new_superboxes_from_data(sample, features)
107 |         superboxes = [s for s in all_superboxes if s.confidence >= self.cfg.confidence_thresh]
108 | 
109 |         relevant_tracks = self.tracks + superboxes
110 |         _track_indices = torch.arange(len(self.tracks)).to(self.device)
111 |         _superbox_indices = torch.arange(len(self.tracks), len(relevant_tracks)).to(self.device)
112 | 
113 |         low_conf_indices = None
114 | 
115 |         if self.cfg.low_confidence_thresh is not None:
116 |             c1 = self.cfg.low_confidence_thresh
117 |             c2 = self.cfg.confidence_thresh
118 |             low_conf_superboxes = [s for s in all_superboxes if c1 <= s.confidence < c2]  # taken from the unfiltered detections (the thresholded list above can never contain these)
119 | 
120 |             if len(low_conf_superboxes) > 0:
121 |                 n_relevant = len(relevant_tracks)
122 |                 relevant_tracks += low_conf_superboxes
123 |                 low_conf_indices = torch.arange(n_relevant, n_relevant + len(low_conf_superboxes))
124 | 
125 |         if len(relevant_tracks) == 0:
126 |             return [], []
127 | 
128 |         features = torch.stack([track.p_features for track in relevant_tracks])  # (n_tracks, n_cams, feature_dim)
129 |         positions = torch.stack([track.p_positions for track in relevant_tracks])  # (n_tracks, n_cams, 2)
130 |         boxes = torch.stack([track.tlbr for track in relevant_tracks])  # (n_tracks, n_cams, 4)
131 | 
132 |         # compute (n_tracks) x (n_tracks) similarity matrix
133 |         similarities = self._compute_similarities(features, positions, boxes)
134 | 
135 |         # compute weighted graph
136 |         rescale_thresh = self.cfg.matching.rescale_threshold
137 |         dist_thresh = self.cfg.matching.distance_threshold
138 |         iou_bias = self.cfg.prematching.iou_bias if self.cfg.prematching.enabled else 0
139 |         edge_index, edge_weights = self._build_weighted_graph(
140 |             relevant_tracks,
141 |             similarities,
142 |             rescale_thresh,
143 |             dist_thresh,
144 |             iou_bias,
145 |             reid_decay=self.cfg.matching.reid_decay,
146 |         )
147 |         labels = multicut(edge_index, edge_weights, self.solver_opts)
148 | 
149 |         matched_tracks, unmatched_tracks = self._match(relevant_tracks, labels, low_conf_indices=low_conf_indices)
150 | 
151 |         self.tracks = matched_tracks + unmatched_tracks
152 |         return matched_tracks, unmatched_tracks
153 | 
154 |     def _handle_unmatched(self, unmatched_tracks):
155 |         """
156 |         Handle unmatched tracks by updating their inactive status.
157 | 
158 |         Args:
159 |             unmatched_tracks: List of unmatched tracks.
160 | """ 161 | for track in unmatched_tracks: 162 | for cam in range(self.n_cams): 163 | if track.track_where[cam]: 164 | track.inactive_since[cam] += 1 165 | 166 | def predict(self): 167 | """ 168 | Project existing tracks into the future. 169 | """ 170 | for track in self.tracks: 171 | track.predict() 172 | 173 | def _new_superboxes_from_data(self, sample, sample_features): 174 | """ 175 | Create new superboxes from detections and features. 176 | 177 | Args: 178 | sample: Input sample containing detection information. 179 | sample_features: Extracted features from the sample. 180 | 181 | Returns: 182 | list: List of new SuperTrack objects. 183 | """ 184 | n_rows = sample_features.shape[0] 185 | 186 | features = torch.full((n_rows, self.n_cams, self.feature_dim), float("nan"), device=self.device) 187 | boxes = torch.full((n_rows, self.n_cams, 4), float("nan"), device=self.device) 188 | positions_2d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) 189 | positions_3d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) 190 | 191 | cam_ids = sample["annotations"][:, 0].int() 192 | features[torch.arange(n_rows), cam_ids] = sample_features 193 | boxes[torch.arange(n_rows), cam_ids] = sample["annotations"][:, 3:7] 194 | positions_2d[torch.arange(n_rows), cam_ids] = sample["positions_2d"] 195 | positions_3d[torch.arange(n_rows), cam_ids] = sample["positions_3d"] 196 | confidences = sample["annotations"][:, 7] 197 | 198 | superboxes = [ 199 | SuperTrack( 200 | frame=self.frame, 201 | features=features[row], 202 | boxes=boxes[row], 203 | positions_2d=positions_2d[row], 204 | positions_3d=positions_3d[row], 205 | confidence=confidences[row], 206 | ) 207 | for row in range(n_rows) 208 | ] 209 | 210 | return superboxes 211 | 212 | def _merge_tracks(self, tracks): 213 | """ 214 | Merge multiple tracks into a single track. 215 | 216 | Args: 217 | tracks: List of tracks to merge. 218 | 219 | Returns: 220 | SuperTrack: Merged track. 
221 | """ 222 | _frames = sorted({track.frame for track in tracks}) 223 | 224 | newest_frame = _frames[-1] 225 | if len(_frames) > 1: 226 | penult_frame = _frames[-2] 227 | 228 | assert tracks[-1].frame == newest_frame 229 | 230 | newest_evidence = [track for track in tracks if track.frame == newest_frame] 231 | 232 | features = (torch.ones(self.n_cams, self.feature_dim) * (torch.nan)).to(self.device) 233 | boxes = (torch.ones(self.n_cams, 4) * (torch.nan)).to(self.device) 234 | positions_2d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) 235 | positions_3d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) 236 | track_where = torch.zeros(self.n_cams, dtype=torch.bool).to(self.device) 237 | 238 | for cam_id in range(self.n_cams): 239 | for track in newest_evidence: 240 | if not torch.isnan(track.features[cam_id]).any(): 241 | features[cam_id] = track.features[cam_id] 242 | boxes[cam_id] = track.boxes[cam_id] 243 | positions_2d[cam_id] = track.positions_2d[cam_id] 244 | positions_3d[cam_id] = track.positions_3d[cam_id] 245 | track_where[cam_id] = True 246 | break 247 | 248 | merged_track = SuperTrack( 249 | frame=newest_frame, 250 | features=features, 251 | boxes=boxes, 252 | positions_2d=positions_2d, 253 | positions_3d=positions_3d, 254 | ) 255 | 256 | if len(_frames) > 1: 257 | penult_track = [track for track in tracks if track.frame == penult_frame][0] 258 | penult_track.update(merged_track) 259 | merged_track = penult_track 260 | 261 | return merged_track 262 | 263 | def _match(self, tracks, labels, low_conf_indices=None): 264 | """ 265 | Match superboxes with superboxes, and merged superboxes with existing supertracks. 266 | 267 | Args: 268 | tracks: List of tracks to match. 269 | labels: Labels for each track. 270 | low_conf_indices: Indices of low confidence detections. 271 | 272 | Returns: 273 | tuple: A tuple containing new tracks and unmatched tracks. 274 | """ 275 | new_tracks = [] 276 | unmatched_tracks = [] 277 | 278 | for label in torch.unique(labels): 279 | track_indices = torch.where(labels == label)[0].tolist() 280 | if len(track_indices) == 1: 281 | track = tracks[track_indices[0]] 282 | if low_conf_indices is not None and track_indices[0] in low_conf_indices: 283 | continue 284 | if track.state == TrackState.CREATED: 285 | new_tracks.append(track) 286 | else: 287 | unmatched_tracks.append(track) 288 | else: 289 | if low_conf_indices is None: 290 | relevant_tracks = sorted([tracks[i] for i in track_indices], key=lambda x: x.frame) 291 | else: 292 | relevant_tracks = sorted( 293 | [tracks[i] for i in track_indices if i not in low_conf_indices], key=lambda x: x.frame 294 | ) 295 | merged_track = self._merge_tracks(relevant_tracks) 296 | if low_conf_indices is not None and not merged_track.is_complete(): 297 | relevant_low_conf_tracks = [tracks[i] for i in track_indices if i in low_conf_indices] 298 | merged_track = self._merge_tracks([merged_track] + relevant_low_conf_tracks) 299 | new_tracks.append(merged_track) 300 | 301 | return new_tracks, unmatched_tracks 302 | 303 | @staticmethod 304 | def _compute_similarities(features, positions, boxes): 305 | """ 306 | Compute similarity matrices for features, positions, and boxes. 307 | 308 | Args: 309 | features: Tensor of track features. 310 | positions: Tensor of track positions. 311 | boxes: Tensor of track bounding boxes. 312 | 313 | Returns: 314 | tuple: A tuple containing similarity matrices for features, positions, and IoU. 
315 | """ 316 | # permute to (n_cams, n_tracks, feature_dim), (n_cams, n_tracks, 2), (n_cams, n_tracks, 4) 317 | features = features.permute(1, 0, 2) 318 | positions = positions.permute(1, 0, 2) 319 | boxes = boxes.permute(1, 0, 2) 320 | 321 | # compute pairwise similarities (n_cams, n_tracks, n_tracks) 322 | feature_sim = batch_cosine_similarity(features, features) 323 | position_dist = batch_bev_distance(positions) 324 | iou_sim = batched_box_iou(boxes) 325 | 326 | # average-pool similarities to (n_tracks, n_tracks) 327 | feature_sim = torch.nanmean(feature_sim, dim=0) 328 | position_dist = torch.nanmean(position_dist, dim=0) 329 | iou_sim = torch.nanmean(iou_sim, dim=0) 330 | 331 | return feature_sim, position_dist, iou_sim 332 | 333 | def _build_weighted_graph( 334 | self, 335 | tracks: List[SuperTrack], 336 | similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], 337 | rescale_thresh: float, 338 | dist_thresh: float, 339 | iou_bias: float, 340 | reid_decay: float = 1, 341 | penalty: float = -100, 342 | ) -> Tuple[torch.Tensor, torch.Tensor]: 343 | """ 344 | Build a weighted graph from tracks and similarity matrices. 345 | 346 | Args: 347 | tracks: List of tracks. 348 | similarities: Tuple of similarity matrices. 349 | rescale_thresh: Threshold for rescaling weights. 350 | dist_thresh: Distance threshold for feasibility. 351 | iou_bias: Bias to add for IoU-based matching. 352 | reid_decay: Decay factor for ReID scores. 353 | penalty: Penalty for infeasible edges. 354 | 355 | Returns: 356 | tuple: A tuple containing edge indices and edge weights of the graph. 357 | """ 358 | adj = self._initialize_adjacency_matrix(similarities, tracks, reid_decay, rescale_thresh, dist_thresh) 359 | 360 | if self.cfg.prematching.enabled: 361 | adj = self._apply_prematching(adj, tracks, iou_bias) 362 | 363 | adj = self._finalize_adjacency_matrix(adj, penalty, tracks) 364 | 365 | return self._get_edge_index_and_weights(adj) 366 | 367 | def _initialize_adjacency_matrix( 368 | self, 369 | similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], 370 | tracks: List[SuperTrack], 371 | reid_decay: float, 372 | rescale_thresh: float, 373 | dist_thresh: float, 374 | ) -> torch.Tensor: 375 | """ 376 | Initialize the adjacency matrix for the graph. 377 | 378 | Args: 379 | similarities: Tuple of similarity matrices. 380 | tracks: List of tracks. 381 | reid_decay: Decay factor for ReID scores. 382 | rescale_thresh: Threshold for rescaling weights. 383 | dist_thresh: Distance threshold for feasibility. 384 | 385 | Returns: 386 | torch.Tensor: Initialized adjacency matrix. 
387 | """ 388 | appearance_sim, position_dist, _ = similarities 389 | device = appearance_sim.device 390 | 391 | frame_support_pairs = [(track.frame, track.track_where) for track in tracks] 392 | frames, supports = zip(*frame_support_pairs) 393 | 394 | times = torch.tensor(frames, dtype=torch.int, device=device) 395 | lost = torch.tensor([track.state == TrackState.LOST for track in tracks], device=device) 396 | lost_since = torch.tensor([track.lost_since for track in tracks], device=device) 397 | 398 | appearance_sim = appearance_sim * reid_decay**lost_since 399 | appearance_sim = scale_weights(appearance_sim, rescale_thresh) 400 | 401 | combined_sim = self.cfg.matching.rescale_weight * appearance_sim + self.cfg.matching.distance_weight * ( 402 | 1 - position_dist / dist_thresh 403 | ) 404 | 405 | adj = torch.zeros_like(appearance_sim) 406 | lmask = lost[:, None] | lost[None, :] 407 | same_time = times[:, None] == times[None, :] 408 | feasible = (position_dist < dist_thresh) | lmask 409 | 410 | adj[same_time & feasible] = torch.clip(combined_sim[same_time & feasible], min=0, max=1) 411 | adj[~same_time] = combined_sim[~same_time] 412 | adj[lmask] = combined_sim[lmask] 413 | 414 | return adj 415 | 416 | def _apply_prematching(self, adj: torch.Tensor, tracks: List[SuperTrack], iou_bias: float) -> torch.Tensor: 417 | """ 418 | Apply prematching to the adjacency matrix. 419 | 420 | Args: 421 | adj: Adjacency matrix. 422 | tracks: List of tracks. 423 | iou_bias: Bias to add for IoU-based matching. 424 | 425 | Returns: 426 | torch.Tensor: Updated adjacency matrix after prematching. 427 | """ 428 | cur_frame = max(track.frame for track in tracks) 429 | pen_frame = cur_frame - 1 430 | cur_track_idx_by_cam = [[] for _ in range(self.n_cams)] 431 | pen_track_idx_by_cam = [[] for _ in range(self.n_cams)] 432 | 433 | for i, track in enumerate(tracks): 434 | if track.frame == cur_frame: 435 | for cam in range(self.n_cams): 436 | if not torch.isnan(track.boxes[cam]).any(): 437 | cur_track_idx_by_cam[cam].append(i) 438 | elif track.frame == pen_frame: 439 | for cam in range(self.n_cams): 440 | if not torch.isnan(track.boxes[cam]).any(): 441 | pen_track_idx_by_cam[cam].append(i) 442 | 443 | for cam in range(self.n_cams): 444 | cur_boxes_cam = [tracks[i].tlbr[cam] for i in cur_track_idx_by_cam[cam]] 445 | pen_boxes_cam = [tracks[i].tlbr[cam] for i in pen_track_idx_by_cam[cam]] 446 | 447 | if not cur_boxes_cam or not pen_boxes_cam: 448 | continue 449 | 450 | iou_dist = 1 - box_iou(torch.stack(cur_boxes_cam), torch.stack(pen_boxes_cam)) 451 | row_ind, col_ind = linear_sum_assignment(iou_dist.cpu().numpy()) 452 | 453 | for r, c in zip(row_ind, col_ind): 454 | if iou_dist[r, c] > self.cfg.prematching.iou_threshold: 455 | continue 456 | cur_idx = cur_track_idx_by_cam[cam][r] 457 | if self.cfg.prematching.prune_remaining: 458 | adj[cur_idx] = 0 459 | adj[:, cur_idx] = 0 460 | adj[cur_idx, pen_track_idx_by_cam[cam][c]] += iou_bias 461 | adj[pen_track_idx_by_cam[cam][c], cur_idx] += iou_bias 462 | 463 | return adj 464 | 465 | def _finalize_adjacency_matrix(self, adj: torch.Tensor, penalty: float, tracks: List[SuperTrack]) -> torch.Tensor: 466 | """ 467 | Finalize the adjacency matrix by applying penalties. 468 | 469 | Args: 470 | adj: Adjacency matrix. 471 | penalty: Penalty value for infeasible edges. 472 | tracks: List of tracks. 473 | 474 | Returns: 475 | torch.Tensor: Finalized adjacency matrix. 
476 | """ 477 | frame_support_pairs = [(track.frame, track.track_where) for track in tracks] 478 | frames, supports = zip(*frame_support_pairs) 479 | 480 | times = torch.tensor(frames, dtype=torch.int, device=adj.device) 481 | supps = torch.stack(supports).to(adj.device) 482 | 483 | same_time = times[:, None] == times[None, :] 484 | same_supp = (supps[:, None] & supps[None, :]).any(dim=2) 485 | 486 | adj[same_time & same_supp] = penalty 487 | adj = adj * torch.triu(torch.ones_like(adj), diagonal=1) 488 | 489 | return adj 490 | 491 | def _get_edge_index_and_weights(self, adj: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 492 | """ 493 | Extract edge indices and weights from the adjacency matrix. 494 | 495 | Args: 496 | adj: Adjacency matrix. 497 | 498 | Returns: 499 | tuple: A tuple containing edge indices and edge weights. 500 | """ 501 | edge_index = torch.nonzero(adj).t().long() 502 | edge_weights = adj[edge_index[0], edge_index[1]] 503 | return edge_index, edge_weights 504 | 505 | def _sanitize(self): 506 | """ 507 | Sanitize the tracker by updating track states and removing killed tracks. 508 | """ 509 | keep = [] 510 | for k, track in enumerate(self.tracks): 511 | if track.state is TrackState.CREATED: 512 | track.activate() 513 | if track.label is None: 514 | track.set_label(self.free_id) 515 | self.free_id += 1 516 | if torch.all(~track.track_where): 517 | if torch.all(track.inactive_since[track.inactive_since > 0] > self.cfg.patience): 518 | track.deactivate() 519 | if track.state is TrackState.LOST: 520 | if track.lost_since > self.cfg.memory: 521 | track.kill() 522 | else: 523 | track.lost_since += 1 524 | if track.state is not TrackState.KILLED: 525 | keep.append(track) 526 | for cam in range(self.n_cams): 527 | if track.inactive_since[cam] > self.cfg.patience: 528 | track.reset([cam]) 529 | killed = len(self.tracks) - len(keep) 530 | self.tracks = keep 531 | self.stats["# Tracks"] = len(self.tracks) 532 | self.stats["# Lost"] = len([track for track in self.tracks if track.state == TrackState.LOST]) 533 | self.stats["# Killed"] += killed 534 | 535 | latency = statistics.mean(self.latency) if len(self.latency) > 0 else 0 536 | self.stats["FPS"] = int(1 / latency) if latency > 0 else 0 537 | 538 | def _get_active_tracks(self): 539 | """ 540 | Get a list of active tracks. 541 | 542 | Returns: 543 | list: List of active tracks. 544 | """ 545 | return [track for track in self.tracks if track.state != TrackState.KILLED] 546 | 547 | def get_result(self, normalization=None, scale=1.0): 548 | """ 549 | Get the current online state of the tracker. 550 | 551 | Args: 552 | normalization: Optional normalization parameters. 553 | scale: Scale factor for the results. 554 | 555 | Returns: 556 | torch.Tensor: Tensor containing the current tracker state. 557 | """ 558 | to_stack = [track.to_tensor() for track in self.tracks if track.state == TrackState.ACTIVE] 559 | if len(to_stack) > 0: 560 | result = torch.cat(to_stack) 561 | else: 562 | result = torch.empty(0) 563 | if result.size(0) > 0: 564 | if normalization is not None: 565 | min_x, min_y, max_x, max_y = normalization 566 | result[:, 7:9] = result[:, 7:9] * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor( 567 | [min_x, min_y] 568 | ) 569 | result[:, 7:9] *= scale 570 | return result 571 | 572 | def _get_index_by_id(self, tid): 573 | """ 574 | Get the index of a track by its ID. 575 | 576 | Args: 577 | tid: Track ID to search for. 578 | 579 | Returns: 580 | int: Index of the track with the given ID, or None if not found. 
581 | """ 582 | for i, track in enumerate(self.tracks): 583 | if track.label == tid: 584 | return i 585 | return None 586 | 587 | 588 | def create_tracker(cfg, solver_cfg, feature_extractor, n_cams, device, writer=None): 589 | """ 590 | Create a new Tracker instance. 591 | 592 | Args: 593 | cfg: Configuration dictionary. 594 | solver_cfg: Solver configuration. 595 | feature_extractor: Feature extractor module. 596 | n_cams: Number of cameras. 597 | device: Device to run the tracker on. 598 | writer: Optional writer for logging. 599 | 600 | Returns: 601 | Tracker: A new Tracker instance. 602 | """ 603 | return Tracker( 604 | solver_opts=solver_cfg, 605 | cfg=cfg, 606 | feature_extractor=feature_extractor, 607 | n_cams=n_cams, 608 | device=device, 609 | ) 610 | --------------------------------------------------------------------------------
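Note on usage: the dump above defines the data loading (create_dataloader), the encoder factory (create_encoder), and the tracker factory (create_tracker), but the driver script tools/track.py is not reproduced here. The sketch below shows one plausible way to wire these pieces together. It is illustrative only: the import paths simply follow the src/ layout shown above, and the config wiring (cfg.encoder, a separate solver_cfg) and the loop itself are assumptions, not the repository's actual entry point.

# sketch of a minimal tracking loop (illustrative; see tools/track.py in the repository for the real driver)
import torch

from src.datasets.dataset import create_dataloader
from src.tracker.encoder import create_encoder
from src.tracker.tracker import create_tracker


def run(cfg, solver_cfg):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dataloader = create_dataloader(cfg)
    n_cams = len(dataloader.dataset.camera_names)

    # with the "precomputed" encoder, ReID features are read directly from the annotation files
    encoder = create_encoder(cfg.encoder, device)
    tracker = create_tracker(cfg, solver_cfg, encoder, n_cams, device)

    results = []
    for sample in dataloader:
        # step() moves the sample to the device, runs the multicut association,
        # and returns the current and the predicted tracker state
        online_result, _ = tracker.step(sample)
        if online_result.numel() > 0:
            results.append(online_result.cpu())

    return torch.cat(results) if results else torch.empty(0)

The returned tensor can then be split into per-camera dataframes with the helpers at the top of src/utils/evaluate.py and written to the per-sequence .txt files that evaluate_mtmc reads.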