├── .gitattributes ├── src ├── solver │ ├── __init__.py │ └── engine.py ├── nn │ ├── __init__.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── warmup.py │ │ └── ema.py │ └── backbone │ │ ├── __init__.py │ │ └── resnet.py ├── misc │ ├── __init__.py │ ├── profiler.py │ ├── keypoint_ops.py │ ├── mask_ops.py │ ├── metrics.py │ ├── keypoint_loss.py │ ├── box_ops.py │ ├── get_param_dicts.py │ ├── dist_utils.py │ └── logger.py ├── core │ ├── __init__.py │ ├── utils.py │ └── instantiate.py ├── data │ ├── __init__.py │ ├── container.py │ ├── coco.py │ ├── crowdpose.py │ └── dataloader.py └── models │ └── detrpose │ ├── __init__.py │ ├── detrpose.py │ ├── postprocesses.py │ ├── matcher.py │ ├── utils.py │ ├── ms_deform_attn.py │ └── dn_component.py ├── assets ├── metrics.png ├── lambda_logo1.png ├── lambda_logo2.png ├── TENSORRT_CONTAINER_LAMBDA.AI.md └── TENSORRT_DEB_LAMBDA.AI.md ├── examples ├── example1.jpg └── example2.jpg ├── tools ├── benchmark │ ├── requirements.txt │ ├── utils.py │ ├── get_info.py │ ├── dataset.py │ ├── torch_benchmark.py │ └── trt_benchmark.py ├── deployment │ ├── export_tensorrt.py │ ├── export_yolo_w_nms.py │ └── export_onnx.py ├── visualization │ ├── backbone_encoder.py │ └── line_attention.py └── inference │ ├── onnx_inf.py │ ├── annotator_crowdpose.py │ ├── torch_inf.py │ └── trt_inf.py ├── requirements.txt ├── .gitignore ├── configs └── detrpose │ ├── detrpose_hgnetv2_l.py │ ├── detrpose_hgnetv2_l_crowdpose.py │ ├── detrpose_hgnetv2_x.py │ ├── detrpose_hgnetv2_x_crowdpose.py │ ├── detrpose_hgnetv2_m.py │ ├── detrpose_hgnetv2_s.py │ ├── detrpose_hgnetv2_m_crowdpose.py │ ├── detrpose_hgnetv2_s_crowdpose.py │ ├── detrpose_hgnetv2_n.py │ ├── detrpose_hgnetv2_n_crowdpose.py │ └── include │ ├── detrpose_hgnetv2.py │ ├── dataset.py │ └── dataset_crowdpose.py └── train.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-documentation 2 | -------------------------------------------------------------------------------- /src/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer -------------------------------------------------------------------------------- /src/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import * 2 | from .optimizer import * 3 | -------------------------------------------------------------------------------- /src/misc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved -------------------------------------------------------------------------------- /src/nn/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .warmup import LinearWarmup 2 | from .ema import ModelEMA -------------------------------------------------------------------------------- /assets/metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/metrics.png -------------------------------------------------------------------------------- /examples/example1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/examples/example1.jpg -------------------------------------------------------------------------------- /examples/example2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/examples/example2.jpg -------------------------------------------------------------------------------- /src/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .instantiate import instantiate 2 | from .lazy import LazyConfig, LazyCall -------------------------------------------------------------------------------- /assets/lambda_logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/lambda_logo1.png -------------------------------------------------------------------------------- /assets/lambda_logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/lambda_logo2.png -------------------------------------------------------------------------------- /tools/benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | onnxscript 3 | onnxsim 4 | tensorrt 5 | pycuda 6 | tqdm 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | transformers 3 | cloudpickle 4 | pycocotools 5 | xtcocotools 6 | omegaconf 7 | calflops 8 | iopath 9 | scipy 10 | numpy==1.23.5 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | weight/ 2 | data/COCO2017 3 | data/crowdpose 4 | output/ 5 | official_weights/ 6 | onnx_engines/ 7 | trt_engines/ 8 | clean_pth_files.py 9 | **/__pycache__/** 10 | examples/output/ 11 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco import CocoDetection 2 | # from .coco_eval import CocoEvaluator 3 | 4 | from .crowdpose import CrowdPoseDetection 5 | # from .crowdpose_eval import CrowdPoseEvaluator 6 | 7 | # from .dataloader import DataLoader, BatchImageCollateFunction 8 | -------------------------------------------------------------------------------- /src/models/detrpose/__init__.py: -------------------------------------------------------------------------------- 1 | from .matcher 
import HungarianMatcher 2 | from .detrpose import DETRPose 3 | from .criterion import Criterion 4 | from .transformer import Transformer 5 | from .postprocesses import PostProcess 6 | from .hybrid_encoder import HybridEncoder 7 | -------------------------------------------------------------------------------- /tools/deployment/export_tensorrt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | input_folder = 'onnx_engines' 4 | input_files = [f for f in os.listdir(input_folder)] 5 | 6 | output_folder = 'trt_engines' 7 | output_files = [f.replace('onnx', 'engine') for f in input_files] 8 | 9 | os.makedirs(output_folder, exist_ok=True) 10 | 11 | trtexec="/usr/src/tensorrt/bin/trtexec" 12 | 13 | for f_in, f_out in zip(input_files, output_files): 14 | cmd = f'{trtexec} --onnx="{input_folder}/{f_in}" --saveEngine="{output_folder}/{f_out}" --fp16' 15 | print(f'running:\t{cmd}') 16 | os.system(cmd) -------------------------------------------------------------------------------- /src/nn/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Conditional DETR 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copied from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | # ------------------------------------------------------------------------ 9 | 10 | from .resnet import ResNet 11 | from .hgnetv2 import HGNetv2 12 | -------------------------------------------------------------------------------- /src/misc/profiler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from calflops import calculate_flops 3 | from typing import Tuple 4 | 5 | def stats( 6 | model, 7 | input_shape: Tuple=(1, 3, 640, 640), ) -> Tuple[int, dict]: 8 | 9 | model_for_info = copy.deepcopy(model).deploy() 10 | 11 | flops, macs, _ = calculate_flops(model=model_for_info, 12 | input_shape=input_shape, 13 | output_as_string=True, 14 | output_precision=4, 15 | print_detailed=False) 16 | params = sum(p.numel() for p in model_for_info.parameters()) 17 | del model_for_info 18 | return {'flops': flops, 'macs': macs, 'params': params} 19 | -------------------------------------------------------------------------------- /src/misc/keypoint_ops.py: -------------------------------------------------------------------------------- 1 | import torch, os 2 | 3 | def keypoint_xyxyzz_to_xyzxyz(keypoints: torch.Tensor): 4 | """_summary_ 5 | 6 | Args: 7 | keypoints (torch.Tensor): ..., 51 8 | """ 9 | res = torch.zeros_like(keypoints) 10 | num_points = keypoints.shape[-1] // 3 11 | Z = keypoints[..., :2*num_points] 12 | V = keypoints[..., 2*num_points:] 13 | res[...,0::3] = Z[..., 0::2] 14 | res[...,1::3] = Z[..., 1::2] 15 | res[...,2::3] = V[...] 
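    # res is now interleaved as (x1, y1, v1, x2, y2, v2, ...): the x/y pairs come from the
    # first 2*num_points entries of the input, the visibility flags from the trailing block.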
16 | return res 17 | 18 | def keypoint_xyzxyz_to_xyxyzz(keypoints: torch.Tensor): 19 | """_summary_ 20 | 21 | Args: 22 | keypoints (torch.Tensor): ..., 51 23 | """ 24 | res = torch.zeros_like(keypoints) 25 | num_points = keypoints.shape[-1] // 3 26 | res[...,0:2*num_points:2] = keypoints[..., 0::3] 27 | res[...,1:2*num_points:2] = keypoints[..., 1::3] 28 | res[...,2*num_points:] = keypoints[..., 2::3] 29 | return res -------------------------------------------------------------------------------- /tools/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import contextlib 3 | import numpy as np 4 | from PIL import Image 5 | from collections import OrderedDict 6 | 7 | import onnx 8 | import torch 9 | 10 | 11 | def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): 12 | '''--loadInputs='image:input_tensor.bin' 13 | ''' 14 | im = Image.open(path).resize(size) 15 | data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255. 16 | data.tofile(output_name) 17 | 18 | 19 | class TimeProfiler(contextlib.ContextDecorator): 20 | def __init__(self, ): 21 | self.total = 0 22 | 23 | def __enter__(self, ): 24 | self.start = self.time() 25 | return self 26 | 27 | def __exit__(self, type, value, traceback): 28 | self.total += self.time() - self.start 29 | 30 | def reset(self, ): 31 | self.total = 0 32 | 33 | def time(self, ): 34 | if torch.cuda.is_available(): 35 | torch.cuda.synchronize() 36 | return time.time() 37 | -------------------------------------------------------------------------------- /tools/deployment/export_yolo_w_nms.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ultralytics import YOLO 3 | 4 | def main(args): 5 | output_folder = 'trt_engines' 6 | os.makedirs(output_folder, exist_ok=True) 7 | 8 | model = YOLO(f"{args.name}.pt") 9 | model.export(format="engine", nms=True, iou=args.iou_threshold, conf=args.score_threshold, half=True, dynamic=False) 10 | 11 | with open(f"{args.name}.engine", "rb") as f: 12 | meta_len = int.from_bytes(f.read(4), byteorder="little") 13 | f.seek(meta_len + 4) 14 | engine = f.read() 15 | 16 | new_name = f"{args.name}_" + str(args.iou_threshold).split('.')[1] + '_' + str(args.score_threshold).split('.')[1] 17 | with open(f"{output_folder}/{new_name}.engine", "wb") as f: 18 | f.write(engine) 19 | 20 | if __name__ == "__main__": 21 | import argparse 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--name", type=str, default="yolo11n_tuned") 25 | parser.add_argument("--score_threshold", type=float, default=0.01) 26 | parser.add_argument("--iou_threshold", type=float, default=0.7) 27 | args = parser.parse_args() 28 | 29 | main(args) -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_l.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_l" 11 | training_params.epochs = 52 # 48 + 4 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 
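# Illustrative sketch (not part of the original config): `L(...)` only records a target and
# its arguments; objects are built later via `instantiate`, e.g. as done in
# tools/benchmark/get_info.py:
#
#   from src.core import LazyConfig, instantiate
#   cfg = LazyConfig.load("configs/detrpose/detrpose_hgnetv2_l.py")
#   model = instantiate(cfg.model)   # builds DETRPose from the lazy spec
#
# How the Trainer wires `ema` and `optimizer` to the model is assumed here, not shown.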
18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | -------------------------------------------------------------------------------- /src/misc/mask_ops.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | 3 | # needed due to empty tensor bug in pytorch and torchvision 0.5 4 | import torchvision 5 | __torchvision_need_compat_flag = float(torchvision.__version__.split('.')[1]) < 7 6 | if __torchvision_need_compat_flag: 7 | from torchvision.ops import _new_empty_tensor 8 | from torchvision.ops.misc import _output_size 9 | 10 | def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): 11 | # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor 12 | """ 13 | Equivalent to nn.functional.interpolate, but with support for empty batch sizes. 14 | This will eventually be supported natively by PyTorch, and this 15 | class can go away. 16 | """ 17 | if __torchvision_need_compat_flag < 0.7: 18 | if input.numel() > 0: 19 | return torch.nn.functional.interpolate( 20 | input, size, scale_factor, mode, align_corners 21 | ) 22 | 23 | output_shape = _output_size(2, input, size, scale_factor) 24 | output_shape = list(input.shape[:-2]) + list(output_shape) 25 | return _new_empty_tensor(input, output_shape) 26 | else: 27 | return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_l_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_l_crowdpose" 11 | training_params.epochs = 64 # 48 + 16 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_x.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import 
dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_x" 11 | training_params.epochs = 52 # 48 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.00005 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[1000], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B5' 43 | model.encoder.hidden_dim = 384 44 | model.encoder.dim_feedforward = 2048 45 | model.transformer.hidden_dim = 384 46 | # model.transformer.feat_channels = [384, 384, 384] 47 | model.transformer.reg_scale = 8 48 | -------------------------------------------------------------------------------- /tools/benchmark/get_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 3 | """ 4 | 5 | import os, sys 6 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 7 | from src.core import LazyConfig, instantiate 8 | 9 | import argparse 10 | from calflops import calculate_flops 11 | 12 | import torch 13 | import torch.nn as nn 14 | 15 | def custom_repr(self): 16 | return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' 17 | original_repr = torch.Tensor.__repr__ 18 | torch.Tensor.__repr__ = custom_repr 19 | 20 | def main(args, ): 21 | """main 22 | """ 23 | cfg = LazyConfig.load(args.config_file) 24 | 25 | if hasattr(cfg.model.backbone, 'pretrained'): 26 | cfg.model.backbone.pretrained = False 27 | 28 | model = instantiate(cfg.model) 29 | 30 | model = model.deploy() 31 | model.eval() 32 | 33 | flops, macs, _ = calculate_flops(model=model, 34 | input_shape=(1, 3, 640, 640), 35 | output_as_string=True, 36 | output_precision=4) 37 | params = sum(p.numel() for p in model.parameters()) 38 | print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params)) 39 | 40 | 41 | if __name__ == '__main__': 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--config_file', '-c', default= "configs/linea/linea_hgnetv2_lpy", type=str) 45 | args = parser.parse_args() 46 | 47 | main(args) 48 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_x_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_x_crowdpose" 11 | training_params.epochs = 52 # 48 + 4 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = 
L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B5' 47 | model.encoder.hidden_dim = 384 48 | model.encoder.dim_feedforward = 2048 49 | model.transformer.hidden_dim = 384 50 | # model.transformer.feat_channels = [384, 384, 384] 51 | model.transformer.reg_scale = 8 52 | -------------------------------------------------------------------------------- /src/models/detrpose/detrpose.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 
13 | """ 14 | 15 | from torch import nn 16 | 17 | class DETRPose(nn.Module): 18 | def __init__( 19 | self, 20 | backbone, 21 | encoder, 22 | transformer 23 | ): 24 | super().__init__() 25 | self.backbone = backbone 26 | self.encoder = encoder 27 | self.transformer = transformer 28 | 29 | def deploy(self): 30 | self.eval() 31 | for m in self.modules(): 32 | if hasattr(m, "convert_to_deploy"): 33 | m.convert_to_deploy() 34 | return self 35 | 36 | def forward(self, samples, targets=None): 37 | feats = self.backbone(samples) 38 | feats = self.encoder(feats) 39 | out = self.transformer(feats, targets, samples if self.training else None) 40 | return out 41 | 42 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_m.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_m" 11 | training_params.epochs = 64 # 60 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.00001 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[15], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B2' 43 | model.backbone.use_lab = True 44 | model.encoder.in_channels = [384, 768, 1536] 45 | model.encoder.depth_mult = 0.67 46 | model.transformer.num_decoder_layers = 4 47 | 48 | dataset_train.dataset.transforms.policy = { 49 | 'name': 'stop_epoch', 50 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 51 | 'epoch': [5, 35, 60] # 60 / 2 + 5 = 35 52 | } 53 | dataset_train.collate_fn.base_size_repeat = 6 54 | dataset_train.collate_fn.stop_epoch = 60 55 | -------------------------------------------------------------------------------- /src/nn/optimizer/warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | from torch.optim.lr_scheduler import LRScheduler 7 | 8 | 9 | class Warmup(object): 10 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None: 11 | self.lr_scheduler = lr_scheduler 12 | self.warmup_end_values = [pg['lr'] for pg in lr_scheduler.optimizer.param_groups] 13 | self.last_step = last_step 14 | self.warmup_duration = warmup_duration 15 | self.step() 16 | 17 | def state_dict(self): 18 | return {k: v for k, v in self.__dict__.items() if k != 'lr_scheduler'} 19 | 20 | def load_state_dict(self, state_dict): 21 | self.__dict__.update(state_dict) 22 | 23 | def get_warmup_factor(self, step, **kwargs): 24 | raise NotImplementedError 25 | 26 | def step(self, ): 27 | self.last_step += 1 28 | if self.last_step >= self.warmup_duration: 29 | return 30 | factor = self.get_warmup_factor(self.last_step) 31 | for i, pg in enumerate(self.lr_scheduler.optimizer.param_groups): 32 | pg['lr'] = factor * self.warmup_end_values[i] 33 | 34 | def finished(self, ): 35 | if self.last_step >= self.warmup_duration: 36 | return True 37 | return False 38 | 39 | 40 | class LinearWarmup(Warmup): 41 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None: 42 | super().__init__(lr_scheduler, warmup_duration, last_step) 43 | 44 | def get_warmup_factor(self, step): 45 | return min(1.0, (step + 1) / self.warmup_duration) -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_s.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_s" 11 | training_params.epochs = 100 # 96 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.0001 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[1000], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B0' 43 | model.backbone.use_lab = True 44 | model.encoder.in_channels = [256, 512, 1024] 45 | model.encoder.depth_mult=0.34 46 | model.encoder.expansion=0.5 47 | model.transformer.num_decoder_layers = 3 48 | 49 | dataset_train.dataset.transforms.policy = { 50 | 'name': 'stop_epoch', 51 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 52 | 'epoch': [5, 53, 96] # 96 / 2 + 5 = 53 53 | } 54 | dataset_train.collate_fn.base_size_repeat = 20 55 | dataset_train.collate_fn.stop_epoch = 96 56 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_m_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, 
dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_m_crowdpose" 11 | training_params.epochs = 76 # 72 + 4 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B2' 47 | model.backbone.use_lab = True 48 | model.encoder.in_channels = [384, 768, 1536] 49 | model.encoder.depth_mult = 0.67 50 | model.transformer.num_decoder_layers = 4 51 | 52 | dataset_train.dataset.transforms.policy = { 53 | 'name': 'stop_epoch', 54 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 55 | 'epoch': [5, 41, 72] # 72 / 2 + 5 = 35 56 | } 57 | dataset_train.collate_fn.base_size_repeat = 6 58 | dataset_train.collate_fn.stop_epoch = 72 59 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_s_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_s_crowdpose" 11 | training_params.epochs = 176 # 156 + 20 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B0' 47 | model.backbone.use_lab = True 48 | model.encoder.in_channels = [256, 512, 1024] 49 | model.encoder.depth_mult=0.34 50 | model.encoder.expansion=0.5 51 | model.transformer.num_decoder_layers = 3 52 | 53 | dataset_train.dataset.transforms.policy = { 54 | 'name': 'stop_epoch', 55 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 56 | 'epoch': [5, 83, 156] # 156 / 2 + 5 = 83 57 | } 58 | dataset_train.collate_fn.base_size_repeat = 20 59 | dataset_train.collate_fn.stop_epoch = 156 60 | -------------------------------------------------------------------------------- /src/core/utils.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from typing import Any 4 | from iopath.common.file_io import PathManager as PathManagerBase 5 | 6 | PathManager = PathManagerBase() 7 | 8 | def _convert_target_to_string(t: Any) -> str: 9 | """ 10 | Inverse of ``locate()``. 11 | 12 | Args: 13 | t: any object with ``__module__`` and ``__qualname__`` 14 | """ 15 | module, qualname = t.__module__, t.__qualname__ 16 | 17 | # Compress the path to this object, e.g. ``module.submodule._impl.class`` 18 | # may become ``module.submodule.class``, if the later also resolves to the same 19 | # object. This simplifies the string, and also is less affected by moving the 20 | # class implementation. 21 | module_parts = module.split(".") 22 | for k in range(1, len(module_parts)): 23 | prefix = ".".join(module_parts[:k]) 24 | candidate = f"{prefix}.{qualname}" 25 | try: 26 | if locate(candidate) is t: 27 | return candidate 28 | except ImportError: 29 | pass 30 | return f"{module}.{qualname}" 31 | 32 | 33 | def locate(name: str) -> Any: 34 | """ 35 | Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``, 36 | such as "module.submodule.class_name". 37 | 38 | Raise Exception if it cannot be found. 39 | """ 40 | obj = pydoc.locate(name) 41 | 42 | # Some cases (e.g. torch.optim.sgd.SGD) not handled correctly 43 | # by pydoc.locate. Try a private function from hydra. 44 | if obj is None: 45 | try: 46 | # from hydra.utils import get_method - will print many errors 47 | from hydra.utils import _locate 48 | except ImportError as e: 49 | raise ImportError(f"Cannot dynamically locate object {name}!") from e 50 | else: 51 | obj = _locate(name) # it raises if fails 52 | 53 | return obj 54 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_n.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_n" 11 | training_params.epochs = 160 # 96 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.0001 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[1000], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B0' 43 | model.backbone.use_lab = True 44 | model.backbone.return_idx = [2, 3] 45 | model.encoder.in_channels = [512, 1024] 46 | model.encoder.feat_strides = [16, 32] 47 | model.encoder.n_levels = 2 48 | model.encoder.use_encoder_idx = [1] 49 | model.encoder.depth_mult = 0.5 50 | model.encoder.expansion = 0.34 51 | model.encoder.hidden_dim = 128 52 | model.encoder.dim_feedforward = 512 53 | model.transformer.num_decoder_layers = 3 54 | 
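# The nano variant keeps only the last two backbone stages (return_idx=[2, 3]), so the
# transformer settings below mirror the two-level encoder configuration above
# (feat_strides=[16, 32], two feature levels).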
model.transformer.num_feature_levels = 2 55 | model.transformer.dim_feedforward = 512 56 | model.transformer.feat_strides = [16, 32] 57 | model.transformer.hidden_dim = 128 58 | model.transformer.dec_n_points= 6 59 | 60 | dataset_train.dataset.transforms.policy = { 61 | 'name': 'stop_epoch', 62 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 63 | 'epoch': [5, 83, 156] # 156 / 2 + 5 = 83 64 | } 65 | dataset_train.collate_fn.base_size_repeat = None 66 | dataset_train.collate_fn.stop_epoch = 156 67 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_n_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_n_crowdpose" 11 | training_params.epochs = 284 # 264 + 20 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B0' 47 | model.backbone.use_lab = True 48 | model.backbone.return_idx = [2, 3] 49 | model.encoder.in_channels = [512, 1024] 50 | model.encoder.feat_strides = [16, 32] 51 | model.encoder.n_levels = 2 52 | model.encoder.use_encoder_idx = [1] 53 | model.encoder.depth_mult = 0.5 54 | model.encoder.expansion = 0.34 55 | model.encoder.hidden_dim = 128 56 | model.encoder.dim_feedforward = 512 57 | model.transformer.num_decoder_layers = 3 58 | model.transformer.num_feature_levels = 2 59 | model.transformer.dim_feedforward = 512 60 | model.transformer.feat_strides = [16, 32] 61 | model.transformer.hidden_dim = 128 62 | model.transformer.dec_n_points= 6 63 | 64 | dataset_train.dataset.transforms.policy = { 65 | 'name': 'stop_epoch', 66 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 67 | 'epoch': [5, 137, 264] # 264 / 2 + 5 = 137 68 | } 69 | dataset_train.collate_fn.base_size_repeat = None 70 | dataset_train.collate_fn.stop_epoch = 264 71 | -------------------------------------------------------------------------------- /src/models/detrpose/postprocesses.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 6 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 
7 | --------------------------------------------------------------------------------- 8 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 9 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 12 | Copyright (c) 2023 IDEA. All Rights Reserved. 13 | """ 14 | 15 | import torch 16 | from torch import nn 17 | from torchvision.ops.boxes import nms 18 | 19 | 20 | class PostProcess(nn.Module): 21 | """ This module converts the model's output into the format expected by the coco api""" 22 | def __init__(self, num_select=60, num_body_points=17) -> None: 23 | super().__init__() 24 | self.num_select = num_select 25 | self.num_body_points = num_body_points 26 | self.deploy_mode = False 27 | 28 | @torch.no_grad() 29 | def forward(self, outputs, target_sizes): 30 | num_select = self.num_select 31 | out_logits, out_keypoints= outputs['pred_logits'], outputs['pred_keypoints'] 32 | 33 | prob = out_logits.sigmoid() 34 | topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), num_select, dim=1) 35 | scores = topk_values 36 | 37 | # keypoints 38 | topk_keypoints = (topk_indexes.float() // out_logits.shape[2]).long() 39 | labels = topk_indexes % out_logits.shape[2] 40 | 41 | if self.deploy_mode: 42 | keypoints = torch.gather(out_keypoints, 1, topk_keypoints[..., None, None].expand(1, num_select, self.num_body_points, 2)) 43 | keypoints = keypoints * target_sizes[:, None, None, :] 44 | return scores, labels, keypoints 45 | 46 | keypoints = torch.gather(out_keypoints, 1, topk_keypoints.unsqueeze(-1).repeat(1, 1, self.num_body_points*2)) 47 | keypoints = keypoints * target_sizes.repeat(1, self.num_body_points)[:, None, :] 48 | keypoints_res = keypoints.unflatten(-1, (-1, 2)) 49 | keypoints_res = torch.cat( 50 | [keypoints_res, torch.ones_like(keypoints_res[..., 0:1])], 51 | dim=-1).flatten(-2) 52 | 53 | results = [{'scores': s, 'labels': l, 'keypoints': k} for s, l, k in zip(scores, labels, keypoints_res)] 54 | return results 55 | 56 | def deploy(self, ): 57 | self.eval() 58 | self.deploy_mode = True 59 | return self 60 | -------------------------------------------------------------------------------- /src/misc/metrics.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import json 3 | import torch 4 | 5 | def inverse_sigmoid(x, eps=1e-5): 6 | x = x.clamp(min=0, max=1) 7 | x1 = x.clamp(min=eps) 8 | x2 = (1 - x).clamp(min=eps) 9 | return torch.log(x1/x2) 10 | 11 | 12 | class BestMetricSingle(): 13 | def __init__(self, init_res=0.0, better='large') -> None: 14 | self.init_res = init_res 15 | self.best_res = init_res 16 | self.best_ep = -1 17 | 18 | self.better = better 19 | assert better in ['large', 'small'] 20 | 21 | def isbetter(self, new_res, old_res): 22 | if self.better == 'large': 23 | return new_res > old_res 24 | if self.better == 'small': 25 | return new_res < old_res 26 | 27 | def update(self, new_res, ep): 28 | if self.isbetter(new_res, self.best_res): 29 | self.best_res = new_res 30 | self.best_ep = ep 31 | return True 32 | return False 33 | 34 | def __str__(self) -> str: 35 | return "best_res: {}\t best_ep: {}".format(self.best_res, self.best_ep) 36 | 37 | def __repr__(self) -> str: 38 | return self.__str__() 39 | 40 | def summary(self) -> dict: 41 | return { 42 | 'best_res': self.best_res, 43 | 'best_ep': self.best_ep, 44 | 
} 45 | 46 | 47 | class BestMetricHolder(): 48 | def __init__(self, init_res=0.0, better='large', use_ema=False) -> None: 49 | self.best_all = BestMetricSingle(init_res, better) 50 | self.use_ema = use_ema 51 | if use_ema: 52 | self.best_ema = BestMetricSingle(init_res, better) 53 | self.best_regular = BestMetricSingle(init_res, better) 54 | 55 | 56 | def update(self, new_res, epoch, is_ema=False): 57 | """ 58 | return if the results is the best. 59 | """ 60 | if not self.use_ema: 61 | return self.best_all.update(new_res, epoch) 62 | else: 63 | if is_ema: 64 | self.best_ema.update(new_res, epoch) 65 | return self.best_all.update(new_res, epoch) 66 | else: 67 | self.best_regular.update(new_res, epoch) 68 | return self.best_all.update(new_res, epoch) 69 | 70 | def summary(self): 71 | if not self.use_ema: 72 | return self.best_all.summary() 73 | 74 | res = {} 75 | res.update({f'all_{k}':v for k,v in self.best_all.summary().items()}) 76 | res.update({f'regular_{k}':v for k,v in self.best_regular.summary().items()}) 77 | res.update({f'ema_{k}':v for k,v in self.best_ema.summary().items()}) 78 | return res 79 | 80 | def __repr__(self) -> str: 81 | return json.dumps(self.summary(), indent=2) 82 | 83 | def __str__(self) -> str: 84 | return self.__repr__() -------------------------------------------------------------------------------- /configs/detrpose/include/detrpose_hgnetv2.py: -------------------------------------------------------------------------------- 1 | from src.core import LazyCall as L 2 | from src.models.detrpose import ( 3 | DETRPose, 4 | HybridEncoder, 5 | Transformer, 6 | PostProcess, 7 | Criterion, 8 | HungarianMatcher, 9 | ) 10 | 11 | from src.nn import HGNetv2 12 | 13 | training_params = { 14 | "clip_max_norm": 0.1, 15 | "save_checkpoint_interval": 1, 16 | "grad_accum_steps": 2, 17 | "print_freq": 100, 18 | 'sync_bn': True, 19 | 'use_ema': False, 20 | 'dist_url': 'env://', 21 | } 22 | 23 | eval_spatial_size = (640, 640) 24 | hidden_dim = 256 25 | n_levels = 3 26 | feat_strides = [8, 16, 32] 27 | num_classes = 2 28 | 29 | model = L(DETRPose)( 30 | backbone=L(HGNetv2)( 31 | name='B4', 32 | use_lab=False, 33 | return_idx=[1, 2, 3], 34 | freeze_stem_only=True, 35 | freeze_at=-1, 36 | freeze_norm=True, 37 | pretrained=True, 38 | ), 39 | encoder=L(HybridEncoder)( 40 | in_channels=[512, 1024, 2048], 41 | feat_strides=feat_strides, 42 | n_levels=n_levels, 43 | hidden_dim=hidden_dim, 44 | nhead=8, 45 | dim_feedforward=1024, 46 | dropout=0.0, 47 | enc_act='gelu', 48 | expansion=1.0, 49 | depth_mult=1.0, 50 | act='silu', 51 | temperatureH=20, 52 | temperatureW=20, 53 | eval_spatial_size= eval_spatial_size 54 | ), 55 | transformer=L(Transformer)( 56 | hidden_dim=hidden_dim, 57 | dropout=0.0, 58 | nhead=8, 59 | num_queries=60, 60 | dim_feedforward=1024, 61 | num_decoder_layers=6, 62 | normalize_before=False, 63 | return_intermediate_dec=True, 64 | activation='relu', 65 | num_feature_levels=3, 66 | dec_n_points=4, 67 | learnable_tgt_init=True, 68 | two_stage_type='standard', 69 | num_body_points=17, 70 | aux_loss=True, 71 | num_classes=num_classes, 72 | dec_pred_class_embed_share = False, 73 | dec_pred_pose_embed_share = False, 74 | two_stage_class_embed_share=False, 75 | two_stage_bbox_embed_share=False, 76 | cls_no_bias = False, 77 | # new parameters 78 | feat_strides=[8, 16, 32], 79 | eval_spatial_size=eval_spatial_size, 80 | reg_max=32, 81 | reg_scale=4 82 | ), 83 | ) 84 | 85 | criterion = L(Criterion)( 86 | num_classes=num_classes, 87 | weight_dict={'loss_vfl': 2.0, 'loss_keypoints': 
10.0, 'loss_oks': 4.0}, 88 | focal_alpha=0.25, 89 | losses=['vfl', 'keypoints'], 90 | matcher=L(HungarianMatcher)( 91 | cost_class=2.0, 92 | cost_keypoints=10.0, 93 | cost_oks=4.0, 94 | focal_alpha=0.25 95 | ), 96 | num_body_points=17 97 | ) 98 | 99 | postprocessor = L(PostProcess)(num_select=60, num_body_points=17) 100 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from omegaconf import OmegaConf 3 | 4 | from src.solver import Trainer 5 | from src.misc import dist_utils 6 | from src.core import LazyConfig, instantiate 7 | 8 | def get_args_parser(): 9 | parser = argparse.ArgumentParser('Set transformer detector', add_help=False) 10 | parser.add_argument('--config_file', '-c', type=str, required=True) 11 | parser.add_argument('--options', 12 | nargs='+', 13 | help='override some settings in the used config, the key-value pair ' 14 | 'in xxx=yyy format will be merged into config file.') 15 | parser.add_argument('--device', default='cuda', 16 | help='device to use for training / testing') 17 | parser.add_argument('--seed', default=42, type=int) 18 | parser.add_argument('--resume', default=None, help='resume from checkpoint') 19 | parser.add_argument('--pretrain', default=None, help='apply transfer learning to the backbone and encoder using DFINE weights') 20 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N', 21 | help='start epoch') 22 | parser.add_argument('--eval', action='store_true') 23 | parser.add_argument('--test', action='store_true') 24 | parser.add_argument('--find_unused_params', action='store_true') 25 | 26 | # distributed training parameters 27 | parser.add_argument('--world_size', default=1, type=int, 28 | help='number of distributed processes') 29 | parser.add_argument('--rank', default=0, type=int, 30 | help='number of distributed processes') 31 | parser.add_argument("--local_rank", type=int, help='local rank for DistributedDataParallel') 32 | parser.add_argument('--amp', action='store_true', 33 | help="Train with mixed precision") 34 | 35 | return parser 36 | 37 | def main(args): 38 | cfg = LazyConfig.load(args.config_file) 39 | 40 | updates = OmegaConf.create() 41 | for k, v in args.__dict__.items(): 42 | if k not in ["options"] and v is not None: 43 | updates[k] = v 44 | cfg.training_params = OmegaConf.merge(cfg.training_params, updates) 45 | 46 | if args.options: 47 | cfg = LazyConfig.apply_overrides(cfg, args.options) 48 | print(cfg) 49 | 50 | solver = Trainer(cfg) 51 | 52 | assert not(args.eval and args.test), "you can't do evaluation and test at the same time" 53 | 54 | if args.eval: 55 | if hasattr(cfg.model.backbone, 'pretrained'): 56 | cfg.model.backbone.pretrained = False 57 | solver.eval() 58 | elif args.test: 59 | if hasattr(cfg.model.backbone, 'pretrained'): 60 | cfg.model.backbone.pretrained = False 61 | solver.test() 62 | else: 63 | solver.fit() 64 | dist_utils.cleanup() 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser('RT-GroupPose training and evaluation script', parents=[get_args_parser()]) 68 | args = parser.parse_args() 69 | main(args) 70 | -------------------------------------------------------------------------------- /assets/TENSORRT_CONTAINER_LAMBDA.AI.md: -------------------------------------------------------------------------------- 1 |

2 | # Manual for installing TensorRT containers on Lambda.ai instances 3 |

4 | 5 | ## Quick Start 6 | ### Lambda.ai 7 | 1. Go to [Lambda.ai](https://lambda.ai) and create an account. 8 | 2. Log in to your Lambda.ai account. 9 | 3. Click on the `Launch instance' button. It is located on the top right side of the website. 10 | 4. Select an instance. To replicate our results from the appendix, select `8x Tesla V100 (16 GB)` 11 | 12 | ### TensorRT Container Installation 13 | 1. Docker setup 14 | ```shell 15 | sudo usermod -aG docker $USER 16 | newgrp docker 17 | ``` 18 | 3. Installing Nvidia DeepLearning container 19 | ```shell 20 | curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ 21 | && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ 22 | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ 23 | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list 24 | ``` 25 | 2. Installing TensorRT docker container 26 | ```shell 27 | docker pull nvcr.io/nvidia/tensorrt:24.04-py3 28 | docker run --gpus all -it --rm nvcr.io/nvidia/tensorrt:24.04-py3 29 | ``` 30 | 31 | 3. Install the CUDA toolkit with the correct version (in our case 12.8) 32 | ```shell 33 | # cuda installation 34 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 35 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 36 | sudo apt-get update 37 | sudo apt-get -y install cuda-toolkit-12-8 38 | ``` 39 | 40 | The complete installation takes approximately 5 minutes. 41 | 42 | ## Installing DETRPose 43 | ### Quick Start 44 | ```shell 45 | git clone https://github.com/SebastianJanampa/DETRPose.git 46 | cd DETRPose 47 | pip install -r requirements.txt 48 | apt-get update && apt-get install libgl1 49 | ``` 50 | 51 | ### Data Preparation 52 | ``` 53 | pip install gdown # to download files from google drive 54 | gdown 1VprytECcLtU4tKP32SYi_7oDRbw7yUTL # images 55 | unzip images.zip 56 | ``` 57 | 58 | ### Usage 59 | ```shell 60 | pip install onnx onnxsim 61 | pip install -r tools/benchmark/requirements.txt 62 | 63 | export model=l #n, s, m, l, x 64 | mkdir trt_engines 65 | ``` 66 | 1. Download official weights 67 | ```shell 68 | wget https://github.com/SebastianJanampa/DETRPose/releases/download/model_weights/detrpose_hgnetv2_${model}.pth 69 | ``` 70 | 2. Export onnx 71 | ```shell 72 | python tools/deployment/export_onnx.py --check -c configs/detrpose/detrpose_hgnetv2_${model}.py -r detrpose_hgnetv2_${model}.pth 73 | ``` 74 | 3. Export tensorrt 75 | ```shell 76 | trtexec --onnx="onnx_engines/detrpose_hgnetv2_${model}.onnx" --saveEngine="trt_engines/detrpose_hgnetv2_${model}.engine" --fp16 77 | ``` 78 | 4. 
Benchmark 79 | ```shell 80 | python tools/benchmark/trt_benchmark.py --infer_dir ./images --engine_dir trt_engines 81 | ``` -------------------------------------------------------------------------------- /configs/detrpose/include/dataset.py: -------------------------------------------------------------------------------- 1 | from src.core import LazyCall as L 2 | from src.data import CocoDetection 3 | from src.data.dataloader import ( 4 | BatchImageCollateFunction, 5 | DataLoader 6 | ) 7 | from src.data.coco_eval import CocoEvaluator 8 | from src.data.container import Compose 9 | import src.data.transforms as T 10 | 11 | from .detrpose_hgnetv2 import eval_spatial_size 12 | 13 | from omegaconf import OmegaConf 14 | 15 | scales = [(640, 640)] 16 | max_size = 1333 17 | scales2_resize = [400, 500, 600] 18 | 19 | __all__ = ["dataset_train", "dataset_val", "dataset_test", "evaluator"] 20 | 21 | dataset_train = L(DataLoader)( 22 | dataset=L(CocoDetection)( 23 | img_folder="./data/COCO2017/train2017", 24 | ann_file="./data/COCO2017/annotations/person_keypoints_train2017.json", 25 | transforms=L(Compose)( 26 | policy={ 27 | 'name': 'stop_epoch', 28 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 29 | 'epoch': [5, 29, 48] 30 | }, 31 | mosaic_prob=0.5, 32 | transforms1=L(T.Mosaic)(output_size=320, probability=1.0), 33 | transforms2=L(T.RandomZoomOut)(p=0.5), 34 | transforms3=L(T.RandomHorizontalFlip)(), 35 | transforms4=L(T.ColorJitter)(), 36 | transforms5=L(T.RandomResize)(sizes=scales, max_size=max_size), 37 | transforms6=L(T.ToTensor)(), 38 | transforms7=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 39 | ), 40 | 41 | ), 42 | total_batch_size=16, 43 | collate_fn=L(BatchImageCollateFunction)( 44 | base_size=eval_spatial_size[0], 45 | base_size_repeat=4, 46 | stop_epoch=48, 47 | ), 48 | num_workers=4, 49 | shuffle=True, 50 | drop_last=True, 51 | pin_memory=True 52 | ) 53 | 54 | dataset_val = L(DataLoader)( 55 | dataset=L(CocoDetection)( 56 | img_folder="./data/COCO2017/val2017", 57 | ann_file="./data/COCO2017/annotations/person_keypoints_val2017.json", 58 | transforms=L(Compose)( 59 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 60 | transforms2=L(T.ToTensor)(), 61 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 62 | ), 63 | ), 64 | total_batch_size=32, 65 | collate_fn=L(BatchImageCollateFunction)( 66 | base_size=eval_spatial_size[0], 67 | ), 68 | num_workers=4, 69 | shuffle=False, 70 | drop_last=False, 71 | pin_memory=True 72 | ) 73 | 74 | dataset_test = L(DataLoader)( 75 | dataset=L(CocoDetection)( 76 | img_folder="./data/COCO2017/test2017", 77 | ann_file="./data/COCO2017/annotations/image_info_test-dev2017.json", 78 | transforms=L(Compose)( 79 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 80 | transforms2=L(T.ToTensor)(), 81 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 82 | ), 83 | ), 84 | total_batch_size=32, 85 | collate_fn=L(BatchImageCollateFunction)( 86 | base_size=eval_spatial_size[0], 87 | ), 88 | num_workers=4, 89 | shuffle=False, 90 | drop_last=False, 91 | pin_memory=True 92 | ) 93 | 94 | evaluator = L(CocoEvaluator)( 95 | ann_file="./data/COCO2017/annotations/person_keypoints_val2017.json", 96 | iou_types=['keypoints'], 97 | useCats=True 98 | ) 99 | 100 | -------------------------------------------------------------------------------- /configs/detrpose/include/dataset_crowdpose.py: -------------------------------------------------------------------------------- 1 | from src.core import 
LazyCall as L 2 | from src.data import CrowdPoseDetection 3 | from src.data.dataloader import ( 4 | BatchImageCollateFunction, 5 | DataLoader 6 | ) 7 | from src.data.crowdpose_eval import CrowdPoseEvaluator 8 | from src.data.container import Compose 9 | import src.data.transforms as T 10 | import src.data.transforms_crowdpose as CrowdT 11 | 12 | from .detrpose_hgnetv2 import eval_spatial_size 13 | 14 | from omegaconf import OmegaConf 15 | 16 | scales = [(640, 640)] 17 | max_size = 1333 18 | scales2_resize = [400, 500, 600] 19 | 20 | __all__ = ["dataset_train", "dataset_val", "dataset_test", "evaluator"] 21 | 22 | dataset_train = L(DataLoader)( 23 | dataset=L(CrowdPoseDetection)( 24 | img_folder="./data/crowdpose/images", 25 | ann_file="./data/crowdpose/annotations/crowdpose_trainval.json", 26 | transforms=L(Compose)( 27 | policy={ 28 | 'name': 'stop_epoch', 29 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 30 | 'epoch': [5, 29, 60] 31 | }, 32 | mosaic_prob=0.5, 33 | transforms1=L(T.Mosaic)(output_size=320, probability=1.0), 34 | transforms2=L(T.RandomZoomOut)(p=0.5), 35 | transforms3=L(CrowdT.RandomHorizontalFlip)(p=0.5), 36 | transforms4=L(T.ColorJitter)(), 37 | transforms5=L(T.RandomResize)(sizes=scales, max_size=max_size), 38 | transforms6=L(T.ToTensor)(), 39 | transforms7=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 40 | ), 41 | 42 | ), 43 | total_batch_size=16, 44 | collate_fn=L(BatchImageCollateFunction)( 45 | base_size=eval_spatial_size[0], 46 | base_size_repeat=4, 47 | stop_epoch=60, 48 | ), 49 | num_workers=4, 50 | shuffle=True, 51 | drop_last=True, 52 | pin_memory=True 53 | ) 54 | 55 | dataset_val = L(DataLoader)( 56 | dataset=L(CrowdPoseDetection)( 57 | img_folder="./data/crowdpose/images", 58 | ann_file="./data/crowdpose/annotations/crowdpose_test.json", 59 | transforms=L(Compose)( 60 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 61 | transforms2=L(T.ToTensor)(), 62 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 63 | ), 64 | ), 65 | total_batch_size=32, 66 | collate_fn=L(BatchImageCollateFunction)( 67 | base_size=eval_spatial_size[0], 68 | ), 69 | num_workers=4, 70 | shuffle=False, 71 | drop_last=False, 72 | pin_memory=True 73 | ) 74 | 75 | dataset_test = L(DataLoader)( 76 | dataset=L(CrowdPoseDetection)( 77 | img_folder="./data/crowdpose/images", 78 | ann_file="./data/crowdpose/annotations/crowdpose_test.json", 79 | transforms=L(Compose)( 80 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 81 | transforms2=L(T.ToTensor)(), 82 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 83 | ), 84 | ), 85 | total_batch_size=32, 86 | collate_fn=L(BatchImageCollateFunction)( 87 | base_size=eval_spatial_size[0], 88 | ), 89 | num_workers=4, 90 | shuffle=False, 91 | drop_last=False, 92 | pin_memory=True 93 | ) 94 | 95 | evaluator = L(CrowdPoseEvaluator)( 96 | ann_file="./data/crowdpose/annotations/crowdpose_test.json", 97 | iou_types=['keypoints_crowd'], 98 | useCats=True 99 | ) 100 | 101 | -------------------------------------------------------------------------------- /src/core/instantiate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
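# `_convert_target_to_string` and `locate` (defined in src/core/utils.py above) provide the
# object <-> "module.qualname" mapping that `instantiate` relies on; this module appears to
# be adapted from detectron2's lazy-config utilities.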
2 | 3 | import collections.abc as abc 4 | import dataclasses 5 | import logging 6 | from typing import Any 7 | 8 | from .utils import _convert_target_to_string, locate 9 | 10 | __all__ = ["dump_dataclass", "instantiate"] 11 | 12 | 13 | def dump_dataclass(obj: Any): 14 | """ 15 | Dump a dataclass recursively into a dict that can be later instantiated. 16 | 17 | Args: 18 | obj: a dataclass object 19 | 20 | Returns: 21 | dict 22 | """ 23 | assert dataclasses.is_dataclass(obj) and not isinstance( 24 | obj, type 25 | ), "dump_dataclass() requires an instance of a dataclass." 26 | ret = {"_target_": _convert_target_to_string(type(obj))} 27 | for f in dataclasses.fields(obj): 28 | v = getattr(obj, f.name) 29 | if dataclasses.is_dataclass(v): 30 | v = dump_dataclass(v) 31 | if isinstance(v, (list, tuple)): 32 | v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v] 33 | ret[f.name] = v 34 | return ret 35 | 36 | 37 | def instantiate(cfg): 38 | """ 39 | Recursively instantiate objects defined in dictionaries by 40 | "_target_" and arguments. 41 | 42 | Args: 43 | cfg: a dict-like object with "_target_" that defines the caller, and 44 | other keys that define the arguments 45 | 46 | Returns: 47 | object instantiated by cfg 48 | """ 49 | from omegaconf import ListConfig, DictConfig, OmegaConf 50 | 51 | if isinstance(cfg, ListConfig): 52 | lst = [instantiate(x) for x in cfg] 53 | return ListConfig(lst, flags={"allow_objects": True}) 54 | if isinstance(cfg, list): 55 | # Specialize for list, because many classes take 56 | # list[objects] as arguments, such as ResNet, DatasetMapper 57 | return [instantiate(x) for x in cfg] 58 | 59 | # If input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config), 60 | # instantiate it to the actual dataclass. 61 | if isinstance(cfg, DictConfig) and dataclasses.is_dataclass(cfg._metadata.object_type): 62 | return OmegaConf.to_object(cfg) 63 | 64 | if isinstance(cfg, abc.Mapping) and "_target_" in cfg: 65 | # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all, 66 | # but faster: https://github.com/facebookresearch/hydra/issues/1200 67 | cfg = {k: instantiate(v) for k, v in cfg.items()} 68 | cls = cfg.pop("_target_") 69 | cls = instantiate(cls) 70 | 71 | if isinstance(cls, str): 72 | cls_name = cls 73 | cls = locate(cls_name) 74 | assert cls is not None, cls_name 75 | else: 76 | try: 77 | cls_name = cls.__module__ + "." + cls.__qualname__ 78 | except Exception: 79 | # target could be anything, so the above could fail 80 | cls_name = str(cls) 81 | assert callable(cls), f"_target_ {cls} does not define a callable object" 82 | try: 83 | return cls(**cfg) 84 | except TypeError: 85 | logger = logging.getLogger(__name__) 86 | logger.error(f"Error when instantiating {cls_name}!") 87 | raise 88 | return cfg # return as-is if don't know what to do -------------------------------------------------------------------------------- /tools/benchmark/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
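Image-folder dataset used by the benchmark scripts: images are resized and padded to 640x640 and returned together with their original sizes.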
4 | """ 5 | 6 | import os 7 | import glob 8 | from PIL import Image 9 | 10 | import torch 11 | import torch.utils.data as data 12 | import torchvision 13 | import torchvision.transforms as T 14 | import torchvision.transforms.functional as F 15 | 16 | Image.MAX_IMAGE_PIXELS = None 17 | 18 | class ToTensor(T.ToTensor): 19 | def __init__(self) -> None: 20 | super().__init__() 21 | 22 | def __call__(self, pic): 23 | if isinstance(pic, torch.Tensor): 24 | return pic 25 | return super().__call__(pic) 26 | 27 | class PadToSize(T.Pad): 28 | def __init__(self, size, fill=0, padding_mode='constant'): 29 | super().__init__(0, fill, padding_mode) 30 | self.size = size 31 | self.fill = fill 32 | 33 | def __call__(self, img): 34 | """ 35 | Args: 36 | img (PIL Image or Tensor): Image to be padded. 37 | 38 | Returns: 39 | PIL Image or Tensor: Padded image. 40 | """ 41 | w, h = F.get_image_size(img) 42 | padding = (0, 0, self.size[0] - w, self.size[1] - h) 43 | return F.pad(img, padding, self.fill, self.padding_mode) 44 | 45 | 46 | class Dataset(data.Dataset): 47 | def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: 48 | super().__init__() 49 | 50 | self.device = device 51 | self.size = 640 52 | 53 | self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) 54 | 55 | if preprocess is None: 56 | self.preprocess = T.Compose([ 57 | T.Resize(size=639, max_size=640), 58 | PadToSize(size=(640, 640), fill=114), 59 | ToTensor(), 60 | T.ConvertImageDtype(torch.float), 61 | ]) 62 | else: 63 | self.preprocess = preprocess 64 | 65 | def __len__(self, ): 66 | return len(self.im_path_list) 67 | 68 | def __getitem__(self, index): 69 | # im = Image.open(self.img_path_list[index]).convert('RGB') 70 | im = torchvision.io.read_file(self.im_path_list[index]) 71 | im = torchvision.io.decode_image(im, mode=torchvision.io.ImageReadMode.RGB).to(self.device) 72 | _, h, w = im.shape # c,h,w 73 | 74 | im = self.preprocess(im) 75 | 76 | blob = { 77 | 'images': im, 78 | # 'im_shape': torch.tensor([self.size, self.size]).to(im.device), 79 | # 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), 80 | 'orig_target_sizes': torch.tensor([w, h]).to(im.device), 81 | } 82 | 83 | return blob 84 | 85 | @staticmethod 86 | def post_process(): 87 | pass 88 | 89 | @staticmethod 90 | def collate_fn(): 91 | pass 92 | 93 | 94 | def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): 95 | '''show result 96 | Keys: 97 | 'num_dets', 'det_boxes', 'det_scores', 'det_classes' 98 | ''' 99 | for i in range(blob['image'].shape[0]): 100 | det_scores = outputs['det_scores'][i] 101 | det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] 102 | 103 | im = (blob['image'][i] * 255).to(torch.uint8) 104 | im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) 105 | Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') 106 | -------------------------------------------------------------------------------- /src/misc/keypoint_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | 5 | def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas): 6 | sigmas = kpt_preds.new_tensor(sigmas) 7 | variances = (sigmas * 2)**2 8 | 9 | assert kpt_preds.size(0) == kpt_gts.size(0) 10 | kpt_preds = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2) 11 | kpt_gts = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2) 12 | 13 | 
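    # Object Keypoint Similarity per prediction/GT pair: exp(-d^2 / (2 * area * (2*sigma)^2))
    # for each keypoint, averaged over the visible keypoints only (kpt_valids mask).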
squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \ 14 | (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2 15 | squared_distance0 = squared_distance / (kpt_areas[:, None] * variances[None, :] * 2) 16 | squared_distance1 = torch.exp(-squared_distance0) 17 | squared_distance1 = squared_distance1 * kpt_valids 18 | oks = squared_distance1.sum(dim=1) / (kpt_valids.sum(dim=1)+1e-6) 19 | 20 | return oks 21 | 22 | def oks_loss(pred, 23 | target, 24 | valid=None, 25 | area=None, 26 | linear=False, 27 | sigmas=None, 28 | eps=1e-6): 29 | oks = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps) 30 | if linear: 31 | loss = oks 32 | else: 33 | loss = -oks.log() 34 | return loss 35 | 36 | 37 | class OKSLoss(nn.Module): 38 | def __init__(self, 39 | linear=False, 40 | num_keypoints=17, 41 | eps=1e-6, 42 | reduction='mean', 43 | loss_weight=1.0): 44 | super(OKSLoss, self).__init__() 45 | self.linear = linear 46 | self.eps = eps 47 | self.reduction = reduction 48 | self.loss_weight = loss_weight 49 | if num_keypoints == 17: 50 | self.sigmas = np.array([ 51 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 52 | 1.07, .87, .87, .89, .89 53 | ], dtype=np.float32) / 10.0 54 | elif num_keypoints == 14: 55 | self.sigmas = np.array([ 56 | .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, 57 | .79, .79 58 | ]) / 10.0 59 | elif num_keypoints == 3: 60 | self.sigmas = np.array([ 61 | 1.07, 1.07, 0.67 62 | ]) / 10.0 63 | else: 64 | raise ValueError(f'Unsupported keypoints number {num_keypoints}') 65 | 66 | def forward(self, 67 | pred, 68 | target, 69 | valid, 70 | area, 71 | weight=None, 72 | avg_factor=None, 73 | reduction_override=None): 74 | assert reduction_override in (None, 'none', 'mean', 'sum') 75 | reduction = ( 76 | reduction_override if reduction_override else self.reduction) 77 | if (weight is not None) and (not torch.any(weight > 0)) and ( 78 | reduction != 'none'): 79 | if pred.dim() == weight.dim() + 1: 80 | weight = weight.unsqueeze(1) 81 | return (pred * weight).sum() # 0 82 | if weight is not None and weight.dim() > 1: 83 | # TODO: remove this in the future 84 | # reduce the weight of shape (n, 4) to (n,) to match the 85 | # iou_loss of shape (n,) 86 | assert weight.shape == pred.shape 87 | weight = weight.mean(-1) 88 | loss = self.loss_weight * oks_loss( 89 | pred, 90 | target, 91 | valid=valid, 92 | area=area, 93 | linear=self.linear, 94 | sigmas=self.sigmas, 95 | eps=self.eps) 96 | return loss -------------------------------------------------------------------------------- /tools/benchmark/torch_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
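Measures end-to-end PyTorch latency (model + postprocessor) of DETRPose on a folder of images.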
3 | """ 4 | import time 5 | import torch 6 | from torch import nn 7 | import torch.backends.cudnn as cudnn 8 | cudnn.benchmark = True 9 | 10 | import argparse 11 | from dataset import Dataset 12 | from tqdm import tqdm 13 | 14 | import os, sys 15 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 16 | from src.core import LazyConfig, instantiate 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description='Argument Parser Example') 20 | parser.add_argument('--config_file', '-c', default='./configs/detrpose/detrpose_hgnetv2_l.py', type=str, ) 21 | parser.add_argument('--resume', '-r', type=str, ) 22 | parser.add_argument('--infer_dir', 23 | type=str, 24 | default='./data/COCO2017/val2017', 25 | help="Directory for images to perform inference on.") 26 | args = parser.parse_args() 27 | return args 28 | 29 | @torch.no_grad() 30 | def warmup(model, data, img_size, n): 31 | for _ in range(n): 32 | _ = model(data, img_size) 33 | 34 | @torch.no_grad() 35 | def speed(model, data, n): 36 | times = [] 37 | for i in tqdm(range(n), desc="Running Inference", unit="iteration"): 38 | blob = data[i] 39 | samples, target_sizes = blob['images'].unsqueeze(0), blob['orig_target_sizes'] 40 | torch.cuda.synchronize() 41 | t_ = time.perf_counter() 42 | _ = model(samples, target_sizes) 43 | torch.cuda.synchronize() 44 | t = time.perf_counter() - t_ 45 | times.append(t) 46 | 47 | # end-to-end model only 48 | times = sorted(times) 49 | if len(times) > 100: 50 | times = times[:100] 51 | return sum(times) / len(times) 52 | 53 | def main(): 54 | FLAGS = parse_args() 55 | dataset = Dataset(FLAGS.infer_dir) 56 | blob = torch.ones(1, 3, 640, 640).cuda() 57 | 58 | img_size = torch.tensor([[640, 640]], device='cuda') 59 | 60 | cfg = LazyConfig.load(FLAGS.config_file) 61 | 62 | if hasattr(cfg.model.backbone, 'pretrained'): 63 | cfg.model.backbone.pretrained = False 64 | 65 | model = instantiate(cfg.model) 66 | postprocessor = instantiate(cfg.postprocessor) 67 | 68 | if FLAGS.resume: 69 | checkpoint = torch.load(FLAGS.resume, map_location='cpu') 70 | if 'ema' in checkpoint: 71 | state = checkpoint['ema']['module'] 72 | else: 73 | state = checkpoint['model'] 74 | 75 | # NOTE load train mode state -> convert to deploy mode 76 | linea.load_state_dict(state) 77 | 78 | else: 79 | # raise AttributeError('Only support resume to load model.state_dict by now.') 80 | print('not load model.state_dict, use default init state dict...') 81 | 82 | class Model(nn.Module): 83 | def __init__(self, ) -> None: 84 | super().__init__() 85 | self.model = model.deploy() 86 | self.postprocessor = postprocessor.deploy() 87 | 88 | def forward(self, images, orig_target_sizes): 89 | outputs = self.model(images) 90 | outputs = self.postprocessor(outputs, orig_target_sizes) 91 | return outputs 92 | 93 | model = Model().cuda() 94 | 95 | warmup(model, blob, img_size, 400) 96 | t = [] 97 | for _ in range(1): 98 | t.append(speed(model, dataset, 1000)) 99 | avg_latency = 1000 * torch.tensor(t).mean() 100 | print(f"model: {FLAGS.config_file}, Latency: {avg_latency:.2f} ms") 101 | 102 | del model 103 | torch.cuda.empty_cache() 104 | time.sleep(1) 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /src/nn/optimizer/ema.py: -------------------------------------------------------------------------------- 1 | """ 2 | D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement 3 | Copyright (c) 2024 
The D-FINE Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 6 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 7 | """ 8 | 9 | import math 10 | from copy import deepcopy 11 | 12 | import torch 13 | import torch.nn as nn 14 | 15 | from ...misc import dist_utils 16 | 17 | __all__ = ["ModelEMA"] 18 | 19 | 20 | class ModelEMA(object): 21 | """ 22 | Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models 23 | Keep a moving average of everything in the model state_dict (parameters and buffers). 24 | This is intended to allow functionality like 25 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 26 | A smoothed version of the weights is necessary for some training schemes to perform well. 27 | This class is sensitive where it is initialized in the sequence of model init, 28 | GPU assignment and distributed training wrappers. 29 | """ 30 | 31 | def __init__( 32 | self, model: nn.Module, decay: float = 0.9999, warmups: int = 1000, start: int = 0 33 | ): 34 | super().__init__() 35 | 36 | self.module = deepcopy(dist_utils.de_parallel(model)).eval() 37 | # if next(model.parameters()).device.type != 'cpu': 38 | # self.module.half() # FP16 EMA 39 | 40 | self.decay = decay 41 | self.warmups = warmups 42 | self.before_start = 0 43 | self.start = start 44 | self.updates = 0 # number of EMA updates 45 | if warmups == 0: 46 | self.decay_fn = lambda x: decay 47 | else: 48 | self.decay_fn = lambda x: decay * ( 49 | 1 - math.exp(-x / warmups) 50 | ) # decay exponential ramp (to help early epochs) 51 | 52 | for p in self.module.parameters(): 53 | p.requires_grad_(False) 54 | 55 | def update(self, model: nn.Module): 56 | if self.before_start < self.start: 57 | self.before_start += 1 58 | return 59 | # Update EMA parameters 60 | with torch.no_grad(): 61 | self.updates += 1 62 | d = self.decay_fn(self.updates) 63 | msd = dist_utils.de_parallel(model).state_dict() 64 | for k, v in self.module.state_dict().items(): 65 | if v.dtype.is_floating_point: 66 | v *= d 67 | v += (1 - d) * msd[k].detach() 68 | 69 | def to(self, *args, **kwargs): 70 | self.module = self.module.to(*args, **kwargs) 71 | return self 72 | 73 | def state_dict( 74 | self, 75 | ): 76 | return dict(module=self.module.state_dict(), updates=self.updates) 77 | 78 | def load_state_dict(self, state, strict=True): 79 | self.module.load_state_dict(state["module"], strict=strict) 80 | if "updates" in state: 81 | self.updates = state["updates"] 82 | 83 | def forwad( 84 | self, 85 | ): 86 | raise RuntimeError("ema...") 87 | 88 | def extra_repr(self) -> str: 89 | return f"decay={self.decay}, warmups={self.warmups}, name=ema" 90 | 91 | 92 | class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): 93 | """Maintains moving averages of model parameters using an exponential decay. 94 | ``ema_avg = decay * avg_model_param + (1 - decay) * model_param`` 95 | `torch.optim.swa_utils.AveragedModel `_ 96 | is used to compute the EMA. 
97 | """ 98 | 99 | def __init__(self, model, decay, device="cpu", use_buffers=True): 100 | self.decay_fn = lambda x: decay * (1 - math.exp(-x / 2000)) 101 | 102 | def ema_avg(avg_model_param, model_param, num_averaged): 103 | decay = self.decay_fn(num_averaged) 104 | return decay * avg_model_param + (1 - decay) * model_param 105 | 106 | super().__init__(model, device, ema_avg, use_buffers=use_buffers) 107 | -------------------------------------------------------------------------------- /tools/deployment/export_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | --------------------------------------------------------------------------------- 3 | Modified from D-FINE 4 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 5 | --------------------------------------------------------------------------------- 6 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 7 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 13 | from src.core import LazyConfig, instantiate 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | def main(args, ): 19 | """main 20 | """ 21 | cfg = LazyConfig.load(args.config_file) 22 | 23 | if hasattr(cfg.model.backbone, 'pretrained'): 24 | cfg.model.backbone.pretrained = False 25 | 26 | model = instantiate(cfg.model) 27 | postprocessor = instantiate(cfg.postprocessor) 28 | 29 | if args.resume: 30 | checkpoint = torch.load(args.resume, map_location='cpu', weights_only=False) 31 | if 'ema' in checkpoint: 32 | state = checkpoint['ema']['module'] 33 | else: 34 | state = checkpoint['model'] 35 | 36 | # NOTE load train mode state -> convert to deploy mode 37 | model.load_state_dict(state) 38 | 39 | else: 40 | # raise AttributeError('Only support resume to load model.state_dict by now.') 41 | print('not load model.state_dict, use default init state dict...') 42 | 43 | model = model.deploy() 44 | model.eval() 45 | 46 | class Model(nn.Module): 47 | def __init__(self, ) -> None: 48 | super().__init__() 49 | self.model = model 50 | self.postprocessor = postprocessor.deploy() 51 | 52 | def forward(self, images, orig_target_sizes): 53 | outputs = self.model(images) 54 | outputs = self.postprocessor(outputs, orig_target_sizes) 55 | return outputs 56 | 57 | model = Model() 58 | 59 | data = torch.rand(1, 3, 640, 640) 60 | size = torch.tensor([[640, 640]]) 61 | _ = model(data, size) 62 | 63 | dynamic_axes = { 64 | 'images': {0: 'N', }, 65 | 'orig_target_sizes': {0: 'N'} 66 | } 67 | 68 | outout_folder = 'onnx_engines' 69 | os.makedirs(outout_folder , exist_ok=True) 70 | output_file = args.config_file.split('/')[-1].replace('py', 'onnx') 71 | output_file = f'{outout_folder}/{output_file}' 72 | # args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx' 73 | 74 | torch.onnx.export( 75 | model, 76 | (data, size), 77 | output_file, 78 | input_names=['images', 'orig_target_sizes'], 79 | output_names=['scores', 'labels', 'keypoints'], 80 | dynamic_axes=dynamic_axes, 81 | opset_version=16, 82 | # dynamo=True, 83 | # external_data=False, 84 | # verify=True, 85 | # report=True, 86 | verbose=False, 87 | do_constant_folding=True, 88 | ) 89 | 90 | if args.check: 91 | import onnx 92 | onnx_model = onnx.load(output_file) 93 | onnx.checker.check_model(onnx_model) 94 | print('Check export onnx model done...') 95 | 96 | if args.simplify: 97 | import onnx 98 | import onnxsim 99 | dynamic = 
True 100 | # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None 101 | input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None 102 | onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes) 103 | onnx.save(onnx_model_simplify, output_file) 104 | print(f'Simplify onnx model {check}...') 105 | 106 | 107 | if __name__ == '__main__': 108 | 109 | import argparse 110 | parser = argparse.ArgumentParser() 111 | parser.add_argument('--config_file', '-c', default='configs/linea/linea_l.py', type=str, ) 112 | parser.add_argument('--resume', '-r', type=str, ) 113 | parser.add_argument('--check', action='store_true', default=True,) 114 | parser.add_argument('--simplify', action='store_true', default=True,) 115 | args = parser.parse_args() 116 | main(args) 117 | -------------------------------------------------------------------------------- /src/misc/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch, os 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 29 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 30 | 31 | wh = (rb - lt).clamp(min=0) # [N,M,2] 32 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 33 | 34 | union = area1[:, None] + area2 - inter 35 | 36 | iou = inter / (union + 1e-6) 37 | return iou, union 38 | 39 | 40 | def generalized_box_iou(boxes1, boxes2): 41 | """ 42 | Generalized IoU from https://giou.stanford.edu/ 43 | 44 | The boxes should be in [x0, y0, x1, y1] format 45 | 46 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 47 | and M = len(boxes2) 48 | """ 49 | # degenerate boxes gives inf / nan results 50 | # so do an early check 51 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 52 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 53 | iou, union = box_iou(boxes1, boxes2) 54 | 55 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 56 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 57 | 58 | wh = (rb - lt).clamp(min=0) # [N,M,2] 59 | area = wh[:, :, 0] * wh[:, :, 1] 60 | 61 | return iou - (area - union) / (area + 1e-6) 62 | 63 | 64 | 65 | # modified from torchvision to also return the union 66 | def box_iou_pairwise(boxes1, boxes2): 67 | area1 = box_area(boxes1) 68 | area2 = box_area(boxes2) 69 | 70 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 71 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 72 | 73 | wh = (rb - lt).clamp(min=0) # [N,2] 74 | inter = wh[:, 0] * wh[:, 1] # [N] 75 | 76 | union = area1 + area2 - inter 77 | 78 | iou = inter / union 79 | return iou, union 80 | 81 | 82 | def generalized_box_iou_pairwise(boxes1, boxes2): 83 | """ 84 | Generalized IoU from https://giou.stanford.edu/ 85 | 86 | Input: 87 | - boxes1, boxes2: N,4 88 | Output: 
89 | - giou: N, 4 90 | """ 91 | # degenerate boxes gives inf / nan results 92 | # so do an early check 93 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 94 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 95 | assert boxes1.shape == boxes2.shape 96 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 97 | 98 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 99 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 100 | 101 | wh = (rb - lt).clamp(min=0) # [N,2] 102 | area = wh[:, 0] * wh[:, 1] 103 | 104 | return iou - (area - union) / area 105 | 106 | def masks_to_boxes(masks): 107 | """Compute the bounding boxes around the provided masks 108 | 109 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 110 | 111 | Returns a [N, 4] tensors, with the boxes in xyxy format 112 | """ 113 | if masks.numel() == 0: 114 | return torch.zeros((0, 4), device=masks.device) 115 | 116 | h, w = masks.shape[-2:] 117 | 118 | y = torch.arange(0, h, dtype=torch.float) 119 | x = torch.arange(0, w, dtype=torch.float) 120 | y, x = torch.meshgrid(y, x) 121 | 122 | x_mask = (masks * x.unsqueeze(0)) 123 | x_max = x_mask.flatten(1).max(-1)[0] 124 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 125 | 126 | y_mask = (masks * y.unsqueeze(0)) 127 | y_max = y_mask.flatten(1).max(-1)[0] 128 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 129 | 130 | return torch.stack([x_min, y_min, x_max, y_max], 1) 131 | 132 | if __name__ == '__main__': 133 | x = torch.rand(5, 4) 134 | y = torch.rand(3, 4) 135 | iou, union = box_iou(x, y) -------------------------------------------------------------------------------- /src/data/container.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from D-DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 
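Compose container that applies the wrapped transforms according to an epoch-based policy, e.g. Mosaic / RandomCrop / RandomZoomOut are only enabled within a configured range of epochs.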
13 | """ 14 | 15 | from omegaconf import ListConfig 16 | import random 17 | 18 | class Compose(object): 19 | def __init__(self, policy=None, mosaic_prob=0.0, **transforms): 20 | self.transforms = [] 21 | for transform in transforms.values(): 22 | self.transforms.append(transform) 23 | 24 | self.mosaic_prob = mosaic_prob 25 | 26 | if policy is None: 27 | self.policy = {'name': 'default'} 28 | else: 29 | self.policy = policy 30 | if self.mosaic_prob > 0: 31 | print(" ### Mosaic with Prob.@{} and RandomZoomOut/RandomCrop existed ### ".format(self.mosaic_prob)) 32 | print(" ### ImgTransforms Epochs: {} ### ".format(policy['epoch'])) 33 | print(' ### Policy_ops@{} ###'.format(policy['ops'])) 34 | 35 | ### warnings ## 36 | self.warning_mosaic_start = True 37 | 38 | def __call__(self, image, target, dataset=None): 39 | return self.get_forward(self.policy['name'])(image, target, dataset) 40 | 41 | def get_forward(self, name): 42 | forwards = { 43 | 'default': self.default_forward, 44 | 'stop_epoch': self.stop_epoch_forward, 45 | } 46 | return forwards[name] 47 | 48 | def default_forward(self, image, target, dataset=None): 49 | for transform in self.transforms: 50 | image, target = transform(image, target) 51 | return image, target 52 | 53 | def stop_epoch_forward(self, image, target, dataset=None): 54 | cur_epoch = dataset.epoch 55 | policy_ops = self.policy['ops'] 56 | policy_epoch = self.policy['epoch'] 57 | 58 | if isinstance(policy_epoch, (list, ListConfig)) and len(policy_epoch) == 3: 59 | if policy_epoch[0] <= cur_epoch < policy_epoch[1]: 60 | with_mosaic = random.random() <= self.mosaic_prob # Probility for Mosaic 61 | else: 62 | with_mosaic = False 63 | 64 | for transform in self.transforms: 65 | if (type(transform).__name__ in policy_ops and cur_epoch < policy_epoch[0]): # first stage: NoAug 66 | pass 67 | elif (type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch[-1]): # last stage: NoAug 68 | pass 69 | else: 70 | # Using Mosaic for [policy_epoch[0], policy_epoch[1]] with probability 71 | if (type(transform).__name__ == 'Mosaic' and not with_mosaic): 72 | pass 73 | # Mosaic and Zoomout/IoUCrop can not be co-existed in the same sample 74 | elif (type(transform).__name__ == 'RandomZoomOut' or type(transform).__name__ == 'RandomCrop') and with_mosaic: 75 | pass 76 | else: 77 | if type(transform).__name__ == 'Mosaic': 78 | if self.warning_mosaic_start: 79 | # It shows in which epochs mosaic is being used 80 | print(f' ### Mosaic is being used @ epoch {cur_epoch}...') 81 | self.warning_mosaic_start = False 82 | image, target = transform(image, target, dataset) 83 | else: 84 | image, target = transform(image, target) 85 | else: 86 | for transform in self.transforms: 87 | image, target = transform(image, target) 88 | 89 | return image, target 90 | 91 | def __repr__(self): 92 | format_string = self.__class__.__name__ + "(" 93 | for t in self.transforms: 94 | format_string += "\n" 95 | format_string += " {0}".format(t) 96 | format_string += "\n)" 97 | return format_string -------------------------------------------------------------------------------- /src/nn/backbone/resnet.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Conditional DETR 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copied from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Backbone modules. 12 | """ 13 | import os 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torchvision 18 | from torch import nn 19 | from torchvision.models._utils import IntermediateLayerGetter 20 | from typing import Dict, List 21 | 22 | class FrozenBatchNorm2d(torch.nn.Module): 23 | """ 24 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 25 | 26 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 27 | without which any other models than torchvision.models.resnet[18,34,50,101] 28 | produce nans. 29 | """ 30 | 31 | def __init__(self, n): 32 | super(FrozenBatchNorm2d, self).__init__() 33 | self.register_buffer("weight", torch.ones(n)) 34 | self.register_buffer("bias", torch.zeros(n)) 35 | self.register_buffer("running_mean", torch.zeros(n)) 36 | self.register_buffer("running_var", torch.ones(n)) 37 | 38 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 39 | missing_keys, unexpected_keys, error_msgs): 40 | num_batches_tracked_key = prefix + 'num_batches_tracked' 41 | if num_batches_tracked_key in state_dict: 42 | del state_dict[num_batches_tracked_key] 43 | 44 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 45 | state_dict, prefix, local_metadata, strict, 46 | missing_keys, unexpected_keys, error_msgs) 47 | 48 | def forward(self, x): 49 | # move reshapes to the beginning 50 | # to make it fuser-friendly 51 | w = self.weight.reshape(1, -1, 1, 1) 52 | b = self.bias.reshape(1, -1, 1, 1) 53 | rv = self.running_var.reshape(1, -1, 1, 1) 54 | rm = self.running_mean.reshape(1, -1, 1, 1) 55 | eps = 1e-5 56 | scale = w * (rv + eps).rsqrt() 57 | bias = b - rm * scale 58 | return x * scale + bias 59 | 60 | 61 | class BackboneBase(nn.Module): 62 | def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_indices: list): 63 | super().__init__() 64 | for name, parameter in backbone.named_parameters(): 65 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 66 | parameter.requires_grad_(False) 67 | 68 | return_layers = {} 69 | for idx, layer_index in enumerate(return_interm_indices): 70 | return_layers.update({"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)}) 71 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 72 | self.num_channels = num_channels 73 | 74 | def forward(self, input): 75 | xs = self.body(input) 76 | return xs.values() 77 | 78 | 79 | class ResNet(BackboneBase): 80 | """ResNet backbone with frozen BatchNorm.""" 81 | def __init__(self, name: str, 82 | train_backbone: bool, 83 | dilation: bool, 84 | return_interm_indices:list, 85 | batch_norm=FrozenBatchNorm2d, 86 | pretrained=False, 87 | ): 88 | if name in ['resnet18', 'resnet34', 'resnet50', 'resnet101']: 89 | backbone = getattr(torchvision.models, name)( 90 | replace_stride_with_dilation=[False, False, dilation], 91 | pretrained=pretrained, norm_layer=batch_norm) 92 | else: 93 | raise NotImplementedError("Why you can get here with name {}".format(name)) 94 | # num_channels = 512 if name in ('resnet18', 
'resnet34') else 2048 95 | assert name not in ('resnet18', 'resnet34'), "Only resnet50 and resnet101 are available." 96 | assert return_interm_indices in [[0,1,2,3], [1,2,3], [2, 3], [3]] 97 | num_channels_all = [256, 512, 1024, 2048] 98 | num_channels = num_channels_all[4-len(return_interm_indices):] 99 | super().__init__(backbone, train_backbone, num_channels, return_interm_indices) 100 | 101 | -------------------------------------------------------------------------------- /src/misc/get_param_dicts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import torch.nn as nn 4 | 5 | import re 6 | 7 | 8 | def get_optim_params(cfg: list, model: nn.Module): 9 | """ 10 | E.g.: 11 | ^(?=.*a)(?=.*b).*$ means including a and b 12 | ^(?=.*(?:a|b)).*$ means including a or b 13 | ^(?=.*a)(?!.*b).*$ means including a, but not b 14 | """ 15 | 16 | param_groups = [] 17 | visited = [] 18 | 19 | cfg_ = [] 20 | for pg in cfg: 21 | cfg_.append(dict(pg)) 22 | 23 | for pg in cfg_: 24 | pattern = pg['params'] 25 | params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0} 26 | pg['params'] = params.values() 27 | param_groups.append(pg) 28 | visited.extend(list(params.keys())) 29 | 30 | names = [k for k, v in model.named_parameters() if v.requires_grad] 31 | 32 | if len(visited) < len(names): 33 | unseen = set(names) - set(visited) 34 | params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen} 35 | param_groups.append({'params': params.values()}) 36 | visited.extend(list(params.keys())) 37 | 38 | assert len(visited) == len(names), '' 39 | 40 | return param_groups 41 | 42 | def match_name_keywords(n: str, name_keywords: list): 43 | out = False 44 | for b in name_keywords: 45 | if b in n: 46 | out = True 47 | break 48 | return out 49 | 50 | 51 | def get_param_dict(args, model_without_ddp: nn.Module): 52 | try: 53 | param_dict_type = args.param_dict_type 54 | except: 55 | param_dict_type = 'default' 56 | assert param_dict_type in ['default', 'ddetr_in_mmdet', 'large_wd'] 57 | 58 | # by default 59 | if param_dict_type == 'default': 60 | param_dicts = [ 61 | {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, 62 | { 63 | "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], 64 | "lr": args.lr_backbone, 65 | } 66 | ] 67 | return param_dicts 68 | 69 | if param_dict_type == 'ddetr_in_mmdet': 70 | param_dicts = [ 71 | { 72 | "params": 73 | [p for n, p in model_without_ddp.named_parameters() 74 | if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad], 75 | "lr": args.lr, 76 | }, 77 | { 78 | "params": [p for n, p in model_without_ddp.named_parameters() 79 | if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad], 80 | "lr": args.lr_backbone, 81 | }, 82 | { 83 | "params": [p for n, p in model_without_ddp.named_parameters() 84 | if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad], 85 | "lr": args.lr * args.lr_linear_proj_mult, 86 | } 87 | ] 88 | return param_dicts 89 | 90 | if param_dict_type == 'large_wd': 91 | param_dicts = [ 92 | { 93 | "params": 94 | [p for n, p in model_without_ddp.named_parameters() 95 | if not match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 96 | }, 97 | { 98 | "params": [p for 
n, p in model_without_ddp.named_parameters() 99 | if match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 100 | "lr": args.lr_backbone, 101 | "weight_decay": 0.0, 102 | }, 103 | { 104 | "params": [p for n, p in model_without_ddp.named_parameters() 105 | if match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 106 | "lr": args.lr_backbone, 107 | "weight_decay": args.weight_decay, 108 | }, 109 | { 110 | "params": 111 | [p for n, p in model_without_ddp.named_parameters() 112 | if not match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 113 | "lr": args.lr, 114 | "weight_decay": 0.0, 115 | } 116 | ] 117 | 118 | # print("param_dicts: {}".format(param_dicts)) 119 | 120 | return param_dicts 121 | -------------------------------------------------------------------------------- /assets/TENSORRT_DEB_LAMBDA.AI.md: -------------------------------------------------------------------------------- 1 |
2 | Manual to install TensorRT in Lambda.ai instances 3 | 
4 | 5 | ## Quick Start 6 | ### Lambda.ai 7 | 1. Go to [Lambda.ai](https://lambda.ai) and create an account. 8 | 2. Log in to your Lambda.ai account. 9 | 3. Click on the `Launch instance' button. It is located on the top right side of the website. 10 | 4. Select an instance. To replicate our results from the appendix, select `1x A10 (24 GB PCle)` 11 | 12 | ### CUDA Installation 13 | The Lambda Stack installs a pre-packaged version of CUDA with only whats needed for typical deep learning workflows. 14 | But the `.deb` TensorRT installation expects the full CUDA Toolkit to already be installed in the system in the standard way via NVIDIAs `.deb` repo. 15 | Thats why your TensorRT installation only succeeded after installing CUDA. 16 | This ensured all the expected binaries, libraries, and metadata were in place for TensorRT to install cleanly. 17 | 18 | 1. Check which CUDA version your Lambda.ai instance is using 19 | ```shell 20 | nvidia-smi 21 | ``` 22 | We got the following output 23 | ```shell 24 | +-----------------------------------------------------------------------------------------+ 25 | | NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | 26 | |-----------------------------------------+------------------------+----------------------+ 27 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 28 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 29 | | | | MIG M. | 30 | |=========================================+========================+======================| 31 | | 0 NVIDIA A10 On | 00000000:06:00.0 Off | 0 | 32 | | 0% 28C P8 9W / 150W | 1MiB / 23028MiB | 0% Default | 33 | | | | N/A | 34 | +-----------------------------------------+------------------------+----------------------+ 35 | 36 | +-----------------------------------------------------------------------------------------+ 37 | | Processes: | 38 | | GPU GI CI PID Type Process name GPU Memory | 39 | | ID ID Usage | 40 | |=========================================================================================| 41 | | No running processes found | 42 | +-----------------------------------------------------------------------------------------+ 43 | ``` 44 | 45 | 2. Install the CUDA toolkit with the correct version (in our case 12.8) 46 | ```shell 47 | # cuda installation 48 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 49 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 50 | sudo apt-get update 51 | sudo apt-get -y install cuda-toolkit-12-8 52 | ``` 53 | 54 | 3. Install TensorRT 55 | 56 | When you use the `.deb` installation, you will install the latest TensorRT. 57 | ```shell 58 | #tensorrt installation 59 | wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/local_repo/nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8_1.0-1_amd64.deb 60 | sudo dpkg -i nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8_1.0-1_amd64.deb 61 | sudo cp /var/nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8/nv-tensorrt-local-AD7406A2-keyring.gpg /usr/share/keyrings/ 62 | sudo apt-get update 63 | sudo apt-get install tensorrt 64 | ``` 65 | 66 | The complete installation takes approximately 10-15 minutes. 
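Optionally, sanity-check the installation before building any engines. A minimal check from Python, assuming the Python bindings (`python3-libnvinfer`) were pulled in by the `tensorrt` meta-package:
```python
# Minimal sketch: confirm the TensorRT Python bindings load and report the expected version.
import tensorrt as trt

print(trt.__version__)  # expected to start with 10.9 for this guide
builder = trt.Builder(trt.Logger(trt.Logger.WARNING))
print("Builder created:", builder is not None)
```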
67 | 68 | ## Installing DETRPose 69 | ### Quick Start 70 | ```shell 71 | git clone https://github.com/SebastianJanampa/DETRPose.git 72 | cd DETRPose 73 | pip install -r requirements.txt 74 | ``` 75 | 76 | ### Data Preparation 77 | ``` 78 | pip install gdown # to download files from google drive 79 | gdown 1VprytECcLtU4tKP32SYi_7oDRbw7yUTL # images 80 | unzip images.zip 81 | ``` 82 | 83 | ### Usage 84 | ```shell 85 | pip install onnx onnxsim 86 | pip install -r tools/benchmark/requirements.txt 87 | 88 | export model=l #n, s, m, l, x 89 | mkdir trt_engines 90 | ``` 91 | 1. Download official weights 92 | ```shell 93 | wget https://github.com/SebastianJanampa/DETRPose/releases/download/model_weights/detrpose_hgnetv2_${model}.pth 94 | ``` 95 | 2. Export onnx 96 | ```shell 97 | python tools/deployment/export_onnx.py --check -c configs/detrpose/detrpose_hgnetv2_${model}.py -r detrpose_hgnetv2_${model}.pth 98 | ``` 99 | 3. Export tensorrt 100 | ```shell 101 | alias trtexec="/usr/src/tensorrt/bin/trtexec" 102 | trtexec --onnx="onnx_engines/detrpose_hgnetv2_${model}.onnx" --saveEngine="trt_engines/detrpose_hgnetv2_${model}.engine" --fp16 103 | ``` 104 | 4. Benchmark 105 | ```shell 106 | python tools/benchmark/trt_benchmark.py --infer_dir ./images --engine_dir trt_engines 107 | ``` -------------------------------------------------------------------------------- /tools/visualization/backbone_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 IDEA. All Rights Reserved. 2 | # ------------------------------------------------------------------------ 3 | import os, sys 4 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 5 | 6 | import argparse 7 | 8 | import matplotlib as mpl 9 | import matplotlib.pyplot as plt 10 | from matplotlib.backends.backend_agg import FigureCanvasAgg 11 | 12 | import torch 13 | from torch.utils.data import DataLoader 14 | 15 | from util.slconfig import SLConfig 16 | import util.misc as utils 17 | 18 | import datasets 19 | from datasets import build_dataset, BatchImageCollateFunction 20 | 21 | 22 | def create(args, classname): 23 | # we use register to maintain models from catdet6 on. 
24 | from models.registry import MODULE_BUILD_FUNCS 25 | class_module = getattr(args, classname) 26 | assert class_module in MODULE_BUILD_FUNCS._module_dict 27 | build_func = MODULE_BUILD_FUNCS.get(class_module) 28 | return build_func(args) 29 | 30 | def main(args): 31 | cfg = SLConfig.fromfile(args.config) 32 | device = args.device 33 | 34 | setattr(cfg, 'coco_path', args.data_path) 35 | setattr(cfg, 'batch_size_train', 1) 36 | setattr(cfg, 'batch_size_val', 1) 37 | 38 | if 'HGNetv2' in cfg.backbone: 39 | cfg.pretrained = False 40 | 41 | # build model 42 | model, _ = create(cfg, 'modelname') 43 | model.to(device) 44 | 45 | dataset_val = build_dataset(image_set='val', args=cfg) 46 | 47 | sampler_val = torch.utils.data.SequentialSampler(dataset_val) 48 | 49 | data_loader_val = DataLoader(dataset_val, 1, sampler=sampler_val, drop_last=False, collate_fn=BatchImageCollateFunction(), num_workers=4) 50 | 51 | if args.resume: 52 | checkpoint = torch.load(args.resume, map_location='cpu') 53 | if 'ema' in checkpoint: 54 | state = checkpoint['ema']['module'] 55 | else: 56 | state = checkpoint['model'] 57 | 58 | # NOTE load train mode state -> convert to deploy mode 59 | model.load_state_dict(state) 60 | 61 | # folder path 62 | main_folder = cfg.output_dir 63 | if 'data/wireframe_processed' in args.data_path: 64 | backbone_dir = f'{main_folder}/visualization/backbone_wireframe' 65 | encoder_dir = f'{main_folder}/visualization/encoder_wireframe' 66 | 67 | elif 'data/york_processed' in args.data_path: 68 | backbone_dir = f'{main_folder}/visualization/backbone_york' 69 | encoder_dir = f'{main_folder}/visualization/encoder_york' 70 | else: 71 | raise 'Dataset does not exist. We support only wireframe and york datasets' 72 | 73 | os.makedirs(backbone_dir , exist_ok=True) 74 | os.makedirs(encoder_dir, exist_ok=True) 75 | 76 | with torch.no_grad(): 77 | 78 | for i, (samples, targets) in enumerate(data_loader_val): 79 | samples = samples.to(device) 80 | 81 | enc_feature_maps = [] 82 | backbone_feature_maps = [] 83 | hooks = [ 84 | model.backbone.register_forward_hook( 85 | lambda self, input, output: backbone_feature_maps.append(output) 86 | ), 87 | model.encoder.register_forward_hook( 88 | lambda self, input, output: enc_feature_maps.append(output) 89 | ), 90 | ] 91 | model(samples) 92 | 93 | for hook in hooks: 94 | hook.remove() 95 | 96 | back_feats = backbone_feature_maps[0] 97 | enc_feats = enc_feature_maps[0] 98 | 99 | curr_img_id = targets[0]['image_id'].tolist()[0] 100 | 101 | for j, back_feat in enumerate(back_feats): 102 | down = j + 1 103 | 104 | back_feat = back_feat[0].mean(0).cpu() 105 | 106 | fig = plt.figure(figsize=(16, 16)) 107 | plt.axis('off') 108 | plt.imshow(back_feat) 109 | plt.savefig( 110 | f"{backbone_dir}/{curr_img_id}_ds_{down}.png", 111 | bbox_inches='tight', 112 | pad_inches=0, 113 | dpi=200 114 | ) 115 | plt.close() 116 | 117 | for j, enc_feat in enumerate(enc_feats): 118 | down = j + 1 119 | 120 | enc_feat = enc_feat[0].mean(0).cpu() 121 | 122 | fig = plt.figure(figsize=(16, 16)) 123 | plt.axis('off') 124 | plt.imshow(enc_feat) 125 | plt.savefig( 126 | f"{encoder_dir}/{curr_img_id}_ds_{down}.png", 127 | bbox_inches='tight', 128 | pad_inches=0, 129 | dpi=200 130 | ) 131 | plt.close() 132 | 133 | # check condition to stop program 134 | if args.num_images is not None and i + 1 >= args.num_images: 135 | break 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser('Visualization of Deformable Line Attention') 140 | parser.add_argument('-c', '--config', 
type=str, required=True) 141 | parser.add_argument('-r', '--resume', default='', help='resume from checkpoint') 142 | parser.add_argument('-p', '--data-path', type=str, default='data/wireframe_processed', help='data path') 143 | parser.add_argument('-d', '--device', type=str, default='cpu') 144 | parser.add_argument('-n', '--num_images', type=int, help='total number of images to plot') 145 | args = parser.parse_args() 146 | main(args) 147 | -------------------------------------------------------------------------------- /src/data/coco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | COCO dataset which returns image_id for evaluation. 4 | 5 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 6 | """ 7 | 8 | from pathlib import Path 9 | import cv2 10 | import numpy as np 11 | import torch 12 | import torch.utils.data 13 | from PIL import Image 14 | from pycocotools import mask as coco_mask 15 | from pycocotools.coco import COCO 16 | # import datasets.transforms as T 17 | 18 | __all__ = ['build'] 19 | 20 | 21 | class CocoDetection(torch.utils.data.Dataset): 22 | def __init__(self, img_folder, ann_file, transforms, return_masks=False): 23 | super(CocoDetection, self).__init__() 24 | self._transforms = transforms 25 | self.prepare = ConvertCocoPolysToMask(return_masks) 26 | 27 | self.img_folder = Path(img_folder) 28 | self.coco = COCO(ann_file) 29 | imgIds = sorted(self.coco.getImgIds()) 30 | 31 | if "train" in ann_file: 32 | self.all_imgIds = [] 33 | for image_id in imgIds: 34 | if self.coco.getAnnIds(imgIds=image_id) == []: 35 | continue 36 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 37 | target = self.coco.loadAnns(ann_ids) 38 | num_keypoints = [obj["num_keypoints"] for obj in target] 39 | if sum(num_keypoints) == 0: 40 | continue 41 | self.all_imgIds.append(image_id) 42 | else: 43 | self.all_imgIds = [] 44 | for image_id in imgIds: 45 | self.all_imgIds.append(image_id) 46 | 47 | def set_epoch(self, epoch): 48 | self._epoch = epoch 49 | 50 | @property 51 | def epoch(self): 52 | return self._epoch if hasattr(self, '_epoch') else -1 53 | 54 | def __len__(self): 55 | return len(self.all_imgIds) 56 | 57 | def load_item(self, idx): 58 | image_id = self.all_imgIds[idx] 59 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 60 | target = self.coco.loadAnns(ann_ids) 61 | 62 | target = {'image_id': image_id, 'annotations': target} 63 | img = Image.open(self.img_folder / self.coco.loadImgs(image_id)[0]['file_name']) 64 | img, target = self.prepare(img, target) 65 | return img, target 66 | 67 | def __getitem__(self, idx): 68 | img, target = self.load_item(idx) 69 | if self._transforms is not None: 70 | img, target = self._transforms(img, target, self) 71 | return img, target 72 | 73 | 74 | def convert_coco_poly_to_mask(segmentations, height, width): 75 | masks = [] 76 | for polygons in segmentations: 77 | rles = coco_mask.frPyObjects(polygons, height, width) 78 | mask = coco_mask.decode(rles) 79 | if len(mask.shape) < 3: 80 | mask = mask[..., None] 81 | mask = torch.as_tensor(mask, dtype=torch.uint8) 82 | mask = mask.any(dim=2) 83 | masks.append(mask) 84 | if masks: 85 | masks = torch.stack(masks, dim=0) 86 | else: 87 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 88 | return masks 89 | 90 | 91 | class ConvertCocoPolysToMask(object): 92 | def __init__(self, return_masks=False): 93 | self.return_masks = 
return_masks 94 | 95 | def __call__(self, image, target): 96 | w, h = image.size 97 | 98 | img_array = np.array(image) 99 | if len(img_array.shape) == 2: 100 | img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB) 101 | image = Image.fromarray(img_array) 102 | image_id = target["image_id"] 103 | image_id = torch.tensor([image_id]) 104 | anno = target["annotations"] 105 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 106 | anno = [obj for obj in anno if obj['num_keypoints'] != 0] 107 | keypoints = [obj["keypoints"] for obj in anno] 108 | boxes = [obj["bbox"] for obj in anno] 109 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32).reshape(-1, 17, 3) 110 | # guard against no boxes via resizing 111 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 112 | boxes[:, 2:] += boxes[:, :2] 113 | boxes[:, 0::2].clamp_(min=0, max=w) 114 | boxes[:, 1::2].clamp_(min=0, max=h) 115 | classes = [obj["category_id"] for obj in anno] 116 | classes = torch.tensor(classes, dtype=torch.int64) 117 | if self.return_masks: 118 | segmentations = [obj["segmentation"] for obj in anno] 119 | masks = convert_coco_poly_to_mask(segmentations, h, w) 120 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 121 | boxes = boxes[keep] 122 | classes = classes[keep] 123 | keypoints = keypoints[keep] 124 | if self.return_masks: 125 | masks = masks[keep] 126 | target = {} 127 | target["boxes"] = boxes 128 | target["labels"] = classes 129 | if self.return_masks: 130 | target["masks"] = masks 131 | target["image_id"] = image_id 132 | if keypoints is not None: 133 | target["keypoints"] = keypoints 134 | # for conversion to coco api 135 | area = torch.tensor([obj["area"] for obj in anno]) 136 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 137 | target["area"] = area[keep] 138 | target["iscrowd"] = iscrowd[keep] 139 | target["orig_size"] = torch.as_tensor([int(w), int(h)]) 140 | target["size"] = torch.as_tensor([int(h), int(w)]) 141 | return image, target 142 | 143 | 144 | -------------------------------------------------------------------------------- /tools/inference/onnx_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
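ONNX Runtime inference for DETRPose: draws the predicted keypoints on an image, a video, or every file in a folder.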
3 | """ 4 | import os 5 | import cv2 6 | import glob 7 | import numpy as np 8 | import onnxruntime as ort 9 | import torch 10 | import torchvision.transforms as T 11 | 12 | from PIL import Image, ImageDraw 13 | from copy import deepcopy 14 | from annotator import Annotator 15 | from annotator_crowdpose import AnnotatorCrowdpose 16 | 17 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose} 18 | 19 | def process_image(sess, im_pil): 20 | w, h = im_pil.size 21 | orig_size = torch.tensor([w, h])[None] 22 | 23 | transforms = T.Compose( 24 | [ 25 | T.Resize((640, 640)), 26 | T.ToTensor(), 27 | ] 28 | ) 29 | im_data = transforms(im_pil).unsqueeze(0) 30 | annotator = annotators[annotator_type](deepcopy(im_pil)) 31 | 32 | 33 | output = sess.run( 34 | output_names=None, 35 | input_feed={"images": im_data.numpy(), "orig_target_sizes": orig_size.numpy()}, 36 | ) 37 | 38 | scores, labels, keypoints = output 39 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 40 | for kpt, score in zip(keypoints, scores): 41 | if score > thrh: 42 | annotator.kpts( 43 | kpt, 44 | [h, w] 45 | ) 46 | annotator.save(f"{OUTPUT_NAME}.jpg") 47 | 48 | 49 | def process_video(sess, video_path): 50 | cap = cv2.VideoCapture(video_path) 51 | 52 | # Get video properties 53 | fps = cap.get(cv2.CAP_PROP_FPS) 54 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 55 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 56 | 57 | # Define the codec and create VideoWriter object 58 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 59 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h)) 60 | 61 | transforms = T.Compose( 62 | [ 63 | T.Resize((640, 640)), 64 | T.ToTensor(), 65 | ] 66 | ) 67 | 68 | frame_count = 0 69 | print("Processing video frames...") 70 | while cap.isOpened(): 71 | ret, frame = cap.read() 72 | if not ret: 73 | break 74 | 75 | # Convert frame to PIL image 76 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 77 | 78 | w, h = frame_pil.size 79 | orig_size = torch.tensor([w, h])[None] 80 | annotator = annotators[annotator_type](deepcopy(frame_pil)) 81 | 82 | im_data = transforms(frame_pil).unsqueeze(0) 83 | 84 | output = sess.run( 85 | output_names=None, 86 | input_feed={"images": im_data.numpy(), "orig_target_sizes": orig_size.numpy()}, 87 | ) 88 | 89 | scores, labels, keypoints = output 90 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 91 | for kpt, score in zip(keypoints, scores): 92 | if score > thrh: 93 | annotator.kpts( 94 | kpt, 95 | [h, w] 96 | ) 97 | 98 | # Convert back to OpenCV image 99 | frame = annotator.result() 100 | 101 | # Write the frame 102 | out.write(frame) 103 | frame_count += 1 104 | 105 | if frame_count % 10 == 0: 106 | print(f"Processed {frame_count} frames...") 107 | 108 | cap.release() 109 | out.release() 110 | print(f"Video processing complete. 
Result saved as '{OUTPUT_NAME}.mp4'.") 111 | 112 | def process_file(sess, file_path): 113 | # Check if the input file is an image or a video 114 | try: 115 | # Try to open the input as an image 116 | im_pil = Image.open(file_path).convert("RGB") 117 | process_image(sess, im_pil) 118 | except IOError: 119 | # Not an image, process as video 120 | process_video(sess, file_path) 121 | 122 | def main(args): 123 | assert args.annotator.lower() in ['coco', 'crowdpose'] 124 | # Global variable 125 | global OUTPUT_NAME, thrh, annotator_type 126 | 127 | """Main function.""" 128 | # Load the ONNX model 129 | sess = ort.InferenceSession(args.onnx) 130 | print(f"Using device: {ort.get_device()}") 131 | 132 | input_path = args.input 133 | thrh = 0.5 if args.thrh is None else args.thrh 134 | 135 | annotator_name = args.annotator.lower() 136 | if annotator_name == 'coco': 137 | annotator_type = 'COCO' 138 | elif annotator_name == 'crowdpose': 139 | annotator_type = 'CrowdPose' 140 | 141 | # Check if the input argumnet is a file or a folder 142 | file_path = args.input 143 | if os.path.isdir(file_path): 144 | # Process a folder 145 | folder_dir = args.input 146 | output_dir = f"{folder_dir}/output" 147 | os.makedirs(output_dir, exist_ok=True) 148 | paths = list(glob.iglob(f"{folder_dir}/*.*")) 149 | for file_path in paths: 150 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0] 151 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}" 152 | process_file(sess, file_path) 153 | else: 154 | # Process a file 155 | OUTPUT_NAME = f'onxx_results_{annotator_type}' 156 | process_file(sess, file_path) 157 | 158 | if __name__ == "__main__": 159 | import argparse 160 | 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument("--onnx", type=str, required=True, help="Path to the ONNX model file.") 163 | parser.add_argument("--annotator", type=str, required=True, help="Annotator type: COCO or CrowdPose.") 164 | parser.add_argument("-i", "--input", type=str, required=True, help="Path to the input image or video file.") 165 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None) 166 | args = parser.parse_args() 167 | main(args) 168 | -------------------------------------------------------------------------------- /src/data/crowdpose.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 6 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 9 | Copyright (c) 2023 IDEA. All Rights Reserved. 
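CrowdPose dataset wrapper that returns (image, target) pairs with 14 keypoints per annotated person.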
10 | """ 11 | 12 | import json 13 | from pathlib import Path 14 | import cv2 15 | import numpy as np 16 | import torch 17 | import torch.utils.data 18 | from PIL import Image 19 | from xtcocotools.coco import COCO 20 | 21 | class CrowdPoseDetection(torch.utils.data.Dataset): 22 | def __init__(self, img_folder, ann_file, transforms, return_masks=False): 23 | super(CrowdPoseDetection, self).__init__() 24 | self._transforms = transforms 25 | self.prepare = ConvertCocoPolysToMask(return_masks) 26 | 27 | self.img_folder = Path(img_folder) 28 | self.coco = COCO(ann_file) 29 | imgIds = sorted(self.coco.getImgIds()) 30 | 31 | if "train" in ann_file: 32 | self.all_imgIds = [] 33 | for image_id in imgIds: 34 | if self.coco.getAnnIds(imgIds=image_id) == []: 35 | continue 36 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 37 | target = self.coco.loadAnns(ann_ids) 38 | num_keypoints = [obj["num_keypoints"] for obj in target] 39 | if sum(num_keypoints) == 0: 40 | continue 41 | self.all_imgIds.append(image_id) 42 | else: 43 | self.all_imgIds = [] 44 | for image_id in imgIds: 45 | self.all_imgIds.append(image_id) 46 | 47 | def set_epoch(self, epoch): 48 | self._epoch = epoch 49 | 50 | @property 51 | def epoch(self): 52 | return self._epoch if hasattr(self, '_epoch') else -1 53 | 54 | def __len__(self): 55 | return len(self.all_imgIds) 56 | 57 | def load_item(self, idx): 58 | image_id = self.all_imgIds[idx] 59 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 60 | target = self.coco.loadAnns(ann_ids) 61 | 62 | target = {'image_id': image_id, 'annotations': target} 63 | img = Image.open(self.img_folder / self.coco.loadImgs(image_id)[0]['file_name']) 64 | img, target = self.prepare(img, target) 65 | return img, target 66 | 67 | def __getitem__(self, idx): 68 | img, target = self.load_item(idx) 69 | if self._transforms is not None: 70 | img, target = self._transforms(img, target, self) 71 | return img, target 72 | 73 | 74 | def convert_coco_poly_to_mask(segmentations, height, width): 75 | masks = [] 76 | for polygons in segmentations: 77 | rles = coco_mask.frPyObjects(polygons, height, width) 78 | mask = coco_mask.decode(rles) 79 | if len(mask.shape) < 3: 80 | mask = mask[..., None] 81 | mask = torch.as_tensor(mask, dtype=torch.uint8) 82 | mask = mask.any(dim=2) 83 | masks.append(mask) 84 | if masks: 85 | masks = torch.stack(masks, dim=0) 86 | else: 87 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 88 | return masks 89 | 90 | 91 | class ConvertCocoPolysToMask(object): 92 | def __init__(self, return_masks=False): 93 | self.return_masks = return_masks 94 | 95 | def __call__(self, image, target): 96 | w, h = image.size 97 | 98 | img_array = np.array(image) 99 | if len(img_array.shape) == 2: 100 | img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB) 101 | image = Image.fromarray(img_array) 102 | image_id = target["image_id"] 103 | image_id = torch.tensor([image_id]) 104 | anno = target["annotations"] 105 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 106 | anno = [obj for obj in anno if obj['num_keypoints'] != 0] 107 | keypoints = [obj["keypoints"] for obj in anno] 108 | boxes = [obj["bbox"] for obj in anno] 109 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32).reshape(-1, 14, 3) 110 | # guard against no boxes via resizing 111 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 112 | boxes[:, 2:] += boxes[:, :2] 113 | boxes[:, 0::2].clamp_(min=0, max=w) 114 | boxes[:, 1::2].clamp_(min=0, max=h) 115 | classes = [obj["category_id"] for obj 
in anno] 116 | classes = torch.tensor(classes, dtype=torch.int64) 117 | if self.return_masks: 118 | segmentations = [obj["segmentation"] for obj in anno] 119 | masks = convert_coco_poly_to_mask(segmentations, h, w) 120 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 121 | boxes = boxes[keep] 122 | classes = classes[keep] 123 | keypoints = keypoints[keep] 124 | if self.return_masks: 125 | masks = masks[keep] 126 | target = {} 127 | target["boxes"] = boxes 128 | target["labels"] = classes 129 | if self.return_masks: 130 | target["masks"] = masks 131 | target["image_id"] = image_id 132 | if keypoints is not None: 133 | target["keypoints"] = keypoints 134 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 135 | target["iscrowd"] = iscrowd[keep] 136 | target["orig_size"] = torch.as_tensor([int(w), int(h)]) 137 | target["size"] = torch.as_tensor([int(h), int(w)]) 138 | return image, target 139 | 140 | -------------------------------------------------------------------------------- /src/models/detrpose/matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 6 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 9 | Copyright (c) 2023 IDEA. All Rights Reserved. 10 | """ 11 | 12 | import torch 13 | from scipy.optimize import linear_sum_assignment 14 | from torch import nn 15 | import numpy as np 16 | 17 | 18 | class HungarianMatcher(nn.Module): 19 | def __init__(self, cost_class: float = 1, focal_alpha=0.25, 20 | cost_keypoints=1.0, cost_oks=0.01, num_body_points=17): 21 | super().__init__() 22 | self.cost_class = cost_class 23 | 24 | self.cost_keypoints = cost_keypoints 25 | self.cost_oks = cost_oks 26 | self.focal_alpha = focal_alpha 27 | self.num_body_points = num_body_points 28 | 29 | if num_body_points==17: 30 | self.sigmas = np.array([ 31 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 32 | 1.07, .87, .87, .89, .89 33 | ], dtype=np.float32) / 10.0 34 | 35 | elif num_body_points==14: 36 | self.sigmas = np.array([ 37 | .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, 38 | .79, .79 39 | ]) / 10.0 40 | else: 41 | raise NotImplementedError 42 | 43 | @torch.no_grad() 44 | def forward(self, outputs, targets): 45 | bs, num_queries = outputs["pred_logits"].shape[:2] 46 | 47 | # We flatten to compute the cost matrices in a batch 48 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] 49 | out_keypoints = outputs["pred_keypoints"].flatten(0, 1) # [batch_size * num_queries, 51] 50 | 51 | # Also concat the target labels and boxes 52 | tgt_ids = torch.cat([v["labels"] for v in targets]) 53 | tgt_keypoints = torch.cat([v["keypoints"] for v in targets]) # nkp, 51 54 | tgt_area = torch.cat([v["area"] for v in targets]) # nkp, 51 55 | 56 | # Compute the classification cost. 
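        # Focal-style classification cost (descriptive comment): for every (query, target)
        # pair we take the positive focal term minus the negative focal term at the target's
        # class, so queries that are confidently wrong pay a larger cost than uncertain ones.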
57 | alpha = self.focal_alpha 58 | gamma = 2.0 59 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 60 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 61 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 62 | 63 | # compute the keypoint costs 64 | Z_pred = out_keypoints[:, 0:(self.num_body_points * 2)] 65 | Z_gt = tgt_keypoints[:, 0:(self.num_body_points * 2)] 66 | V_gt: torch.Tensor = tgt_keypoints[:, (self.num_body_points * 2):] 67 | if Z_pred.sum() > 0: 68 | sigmas = Z_pred.new_tensor(self.sigmas) 69 | variances = (sigmas * 2) ** 2 70 | kpt_preds = Z_pred.reshape(-1, Z_pred.size(-1) // 2, 2) 71 | kpt_gts = Z_gt.reshape(-1, Z_gt.size(-1) // 2, 2) 72 | squared_distance = (kpt_preds[:, None, :, 0] - kpt_gts[None, :, :, 0]) ** 2 + \ 73 | (kpt_preds[:, None, :, 1] - kpt_gts[None, :, :, 1]) ** 2 74 | squared_distance0 = squared_distance / (tgt_area[:, None] * variances[None, :] * 2) 75 | squared_distance1 = torch.exp(-squared_distance0) 76 | squared_distance1 = squared_distance1 * V_gt 77 | oks = squared_distance1.sum(dim=-1) / (V_gt.sum(dim=-1) + 1e-6) 78 | oks = oks.clamp(min=1e-6) 79 | cost_oks = 1 - oks 80 | 81 | cost_keypoints = torch.abs(Z_pred[:, None, :] - Z_gt[None]) # npred, ngt, 34 82 | cost_keypoints = cost_keypoints * V_gt.repeat_interleave(2, dim=1)[None] 83 | cost_keypoints = cost_keypoints.sum(-1) 84 | C = self.cost_class * cost_class + self.cost_keypoints * cost_keypoints + self.cost_oks * cost_oks 85 | C = C.view(bs, num_queries, -1).cpu() 86 | 87 | else: 88 | cost_keypoints = cost_oks = 0 89 | C = self.cost_class * cost_class + self.cost_keypoints * cost_keypoints + self.cost_oks * cost_oks 90 | C = C.view(bs, num_queries, -1).cpu() 91 | 92 | # Final cost matrix 93 | sizes = [len(v["boxes"]) for v in targets] 94 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 95 | 96 | if tgt_ids.shape[0] > 0: 97 | cost_mean_dict = { 98 | 'class': cost_class.mean(), 99 | "keypoints": cost_keypoints.mean() 100 | } 101 | else: 102 | # for the cases when no grounding truth boxes 103 | cost_mean_dict = { 104 | 'class': torch.zeros_like(cost_class.mean()), 105 | 'keypoints': torch.zeros_like(cost_keypoints.mean()), 106 | } 107 | 108 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in 109 | indices]#, cost_mean_dict 110 | 111 | def build_matcher(args): 112 | assert args.matcher_type in ['HungarianMatcher'], "Unknown args.matcher_type: {}".format( 113 | args.matcher_type) 114 | if args.matcher_type == 'HungarianMatcher': 115 | return HungarianMatcher( 116 | cost_class=args.set_cost_class, focal_alpha=args.focal_alpha, cost_keypoints=args.set_cost_keypoints, cost_oks=args.set_cost_oks, num_body_points=args.num_body_points) 117 | else: 118 | raise NotImplementedError("Unknown args.matcher_type: {}".format(args.matcher_type)) -------------------------------------------------------------------------------- /src/misc/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import atexit 3 | import json 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | 8 | from torch.utils.data import DistributedSampler 9 | from torch.nn.parallel import DataParallel as DP 10 | from torch.nn.parallel import DistributedDataParallel as DDP 11 | 12 | from ..data.dataloader import DataLoader 13 | 14 | def is_dist_avail_and_initialized(): 15 | if not 
dist.is_available(): 16 | return False 17 | if not dist.is_initialized(): 18 | return False 19 | return True 20 | 21 | 22 | def get_world_size(): 23 | if not is_dist_avail_and_initialized(): 24 | return 1 25 | return dist.get_world_size() 26 | 27 | 28 | def get_rank(): 29 | if not is_dist_avail_and_initialized(): 30 | return 0 31 | return dist.get_rank() 32 | 33 | 34 | def is_main_process(): 35 | return get_rank() == 0 36 | 37 | 38 | def save_on_master(*args, **kwargs): 39 | if is_main_process(): 40 | torch.save(*args, **kwargs) 41 | 42 | 43 | def init_distributed_mode(args): 44 | if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and 45 | args.rank = int(os.environ["RANK"]) 46 | args.world_size = int(os.environ['WORLD_SIZE']) 47 | args.gpu = args.local_rank = int(os.environ['LOCAL_RANK']) 48 | # local_world_size = int(os.environ['WORLD_SIZE']) 49 | # args.world_size = args.world_size * local_world_size 50 | # args.gpu = args.local_rank = int(os.environ['LOCAL_RANK']) 51 | # args.rank = args.rank * local_world_size + args.local_rank 52 | # print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank)) 53 | # print(json.dumps(dict(os.environ), indent=2)) 54 | elif 'SLURM_PROCID' in os.environ: 55 | args.rank = int(os.environ['SLURM_PROCID']) 56 | args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID']) 57 | args.world_size = int(os.environ['SLURM_NPROCS']) 58 | 59 | # print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count())) 60 | # print("os.environ['SLURM_JOB_NODELIST']:", os.environ['SLURM_JOB_NODELIST']) 61 | # print(json.dumps(dict(os.environ), indent=2)) 62 | # print('args:') 63 | # print(json.dumps(vars(args), indent=2)) 64 | else: 65 | print('Not using distributed mode') 66 | args.distributed = False 67 | args.world_size = 1 68 | args.rank = 0 69 | args.local_rank = 0 70 | return 71 | 72 | print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank)) 73 | args.distributed = True 74 | torch.cuda.set_device(args.local_rank) 75 | args.dist_backend = 'nccl' 76 | print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True) 77 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 78 | world_size=args.world_size, rank=args.rank) 79 | print("Before torch.distributed.barrier()") 80 | torch.distributed.barrier() 81 | print("End torch.distributed.barrier()") 82 | setup_for_distributed(args.rank == 0) 83 | 84 | def setup_for_distributed(is_master): 85 | """ 86 | This function disables printing when not in master process 87 | """ 88 | import builtins as __builtin__ 89 | builtin_print = __builtin__.print 90 | 91 | def print(*args, **kwargs): 92 | force = kwargs.pop('force', False) 93 | if is_master or force: 94 | builtin_print(*args, **kwargs) 95 | 96 | __builtin__.print = print 97 | 98 | def warp_loader(loader, shuffle=False): 99 | if is_dist_avail_and_initialized(): 100 | sampler = DistributedSampler(loader.dataset, shuffle=shuffle) 101 | loader = DataLoader(loader.dataset, 102 | loader.batch_size, 103 | sampler=sampler, 104 | drop_last=loader.drop_last, 105 | collate_fn=loader.collate_fn, 106 | pin_memory=loader.pin_memory, 107 | num_workers=loader.num_workers) 108 | return loader 109 | 110 | 111 | def warp_model( 112 | model: torch.nn.Module, 113 | sync_bn: bool=False, 114 | dist_mode: str='ddp', 115 | 
find_unused_parameters: bool=False, 116 | compile: bool=False, 117 | compile_mode: str='reduce-overhead', 118 | **kwargs 119 | ): 120 | if is_dist_avail_and_initialized(): 121 | rank = get_rank() 122 | model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model 123 | if dist_mode == 'dp': 124 | model = DP(model, device_ids=[rank], output_device=rank) 125 | elif dist_mode == 'ddp': 126 | model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=find_unused_parameters) 127 | else: 128 | raise AttributeError('') 129 | 130 | if compile: 131 | model = torch.compile(model, mode=compile_mode) 132 | 133 | return model 134 | 135 | @atexit.register 136 | def cleanup(): 137 | """cleanup distributed environment""" 138 | if is_dist_avail_and_initialized(): 139 | torch.distributed.barrier() 140 | torch.distributed.destroy_process_group() 141 | 142 | 143 | def is_parallel(model) -> bool: 144 | # Returns True if model is of type DP or DDP 145 | return type(model) in ( 146 | torch.nn.parallel.DataParallel, 147 | torch.nn.parallel.DistributedDataParallel, 148 | ) 149 | 150 | 151 | def de_parallel(model) -> nn.Module: 152 | # De-parallelize a model: returns single-GPU model if model is of type DP or DDP 153 | return model.module if is_parallel(model) else model 154 | -------------------------------------------------------------------------------- /src/models/detrpose/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 6 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 9 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 12 | Copyright (c) 2023 IDEA. All Rights Reserved. 
13 | """ 14 | 15 | import torch 16 | import random 17 | from torch import nn, Tensor 18 | import os 19 | import numpy as np 20 | import math 21 | import torch.nn.functional as F 22 | from torch import nn 23 | 24 | 25 | def gen_encoder_output_proposals(memory:Tensor, spatial_shapes:Tensor): 26 | """ 27 | Input: 28 | - memory: bs, \sum{hw}, d_model 29 | - spatial_shapes: nlevel, 2 30 | - learnedwh: 2 31 | Output: 32 | - output_memory: bs, \sum{hw}, d_model 33 | - output_proposals: bs, \sum{hw}, 4 34 | """ 35 | N_, S_, C_ = memory.shape 36 | base_scale = 4.0 37 | proposals = [] 38 | _cur = 0 39 | for lvl, (H_, W_) in enumerate(spatial_shapes): 40 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 41 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device), 42 | indexing='ij') 43 | grid = torch.stack([grid_x, grid_y], -1) # H_, W_, 2 44 | 45 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / torch.tensor([W_, H_], dtype=torch.float32, device=memory.device) 46 | 47 | proposal = grid.view(N_, -1, 2) 48 | proposals.append(proposal) 49 | _cur += (H_ * W_) 50 | output_proposals = torch.cat(proposals, 1) 51 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 52 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid 53 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 54 | 55 | output_memory = memory 56 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 57 | 58 | return output_memory, output_proposals 59 | 60 | 61 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 62 | """ 63 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 64 | Args: 65 | inputs: A float tensor of arbitrary shape. 66 | The predictions for each example. 67 | targets: A float tensor with the same shape as inputs. Stores the binary 68 | classification label for each element in inputs 69 | (0 for the negative class and 1 for the positive class). 70 | alpha: (optional) Weighting factor in range (0,1) to balance 71 | positive vs negative examples. Default = -1 (no weighting). 72 | gamma: Exponent of the modulating factor (1 - p_t) to 73 | balance easy vs hard examples. 
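    Note:
        The per-element loss is FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t);
        the implementation below averages over dim 1 and normalizes the sum by num_boxes.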
74 | Returns: 75 | Loss tensor 76 | """ 77 | prob = inputs.sigmoid() 78 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 79 | p_t = prob * targets + (1 - prob) * (1 - targets) 80 | loss = ce_loss * ((1 - p_t) ** gamma) 81 | 82 | if alpha >= 0: 83 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 84 | loss = alpha_t * loss 85 | 86 | 87 | return loss.mean(1).sum() / num_boxes 88 | 89 | class MLP(nn.Module): 90 | """ Very simple multi-layer perceptron (also called FFN)""" 91 | 92 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 93 | super().__init__() 94 | self.num_layers = num_layers 95 | h = [hidden_dim] * (num_layers - 1) 96 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 97 | 98 | def forward(self, x): 99 | for i, layer in enumerate(self.layers): 100 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 101 | return x 102 | 103 | def _get_activation_fn(activation, d_model=256, batch_dim=0): 104 | """Return an activation function given a string""" 105 | if activation == "relu": 106 | return F.relu 107 | if activation == "gelu": 108 | return F.gelu 109 | if activation == "glu": 110 | return F.glu 111 | if activation == "prelu": 112 | return nn.PReLU() 113 | if activation == "selu": 114 | return F.selu 115 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 116 | 117 | 118 | def gen_sineembed_for_position(pos_tensor): 119 | # n_query, bs, _ = pos_tensor.size() 120 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 121 | scale = 2 * math.pi 122 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) 123 | dim_t = 10000 ** (2 * (dim_t // 2) / 128) 124 | x_embed = pos_tensor[:, :, 0] * scale 125 | y_embed = pos_tensor[:, :, 1] * scale 126 | pos_x = x_embed[:, :, None] / dim_t 127 | pos_y = y_embed[:, :, None] / dim_t 128 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 129 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 130 | if pos_tensor.size(-1) == 2: 131 | pos = torch.cat((pos_y, pos_x), dim=2) 132 | elif pos_tensor.size(-1) == 4: 133 | w_embed = pos_tensor[:, :, 2] * scale 134 | pos_w = w_embed[:, :, None] / dim_t 135 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 136 | 137 | h_embed = pos_tensor[:, :, 3] * scale 138 | pos_h = h_embed[:, :, None] / dim_t 139 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 140 | 141 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 142 | else: 143 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 144 | return pos 145 | 146 | 147 | def inverse_sigmoid(x, eps=1e-3): 148 | x = x.clamp(min=0, max=1) 149 | x1 = x.clamp(min=eps) 150 | x2 = (1 - x).clamp(min=eps) 151 | return torch.log(x1/x2) -------------------------------------------------------------------------------- /tools/inference/annotator_crowdpose.py: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Modified from: 3 | # Ultralytics 4 | # https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/plotting.py 5 | ######################################################################################### 6 | 7 | import math 8 | import warnings 9 | from pathlib import Path 10 | from typing import 
Callable, Dict, List, Optional, Union 11 | 12 | import cv2 13 | import numpy as np 14 | import torch 15 | from PIL import Image, ImageDraw, ImageFont 16 | from PIL import __version__ as pil_version 17 | 18 | from annotator import Annotator, Colors 19 | 20 | 21 | colors = Colors() # create instance for 'from utils.plots import colors' 22 | 23 | class AnnotatorCrowdpose(Annotator): 24 | """ 25 | Ultralytics Annotator for train/val mosaics and JPGs and predictions annotations. 26 | 27 | Attributes: 28 | im (Image.Image | np.ndarray): The image to annotate. 29 | pil (bool): Whether to use PIL or cv2 for drawing annotations. 30 | font (ImageFont.truetype | ImageFont.load_default): Font used for text annotations. 31 | lw (float): Line width for drawing. 32 | skeleton (List[List[int]]): Skeleton structure for keypoints. 33 | limb_color (List[int]): Color palette for limbs. 34 | kpt_color (List[int]): Color palette for keypoints. 35 | dark_colors (set): Set of colors considered dark for text contrast. 36 | light_colors (set): Set of colors considered light for text contrast. 37 | 38 | Examples: 39 | >>> from ultralytics.utils.plotting import Annotator 40 | >>> im0 = cv2.imread("test.png") 41 | >>> annotator = Annotator(im0, line_width=10) 42 | >>> annotator.box_label([10, 10, 100, 100], "person", (255, 0, 0)) 43 | """ 44 | 45 | def __init__( 46 | self, 47 | im, 48 | line_width: Optional[int] = None, 49 | font_size: Optional[int] = None, 50 | font: str = "Arial.ttf", 51 | pil: bool = False, 52 | example: str = "abc", 53 | ): 54 | """Initialize the Annotator class with image and line width along with color palette for keypoints and limbs.""" 55 | super().__init__(im, line_width, font_size, font, pil, example) 56 | 57 | # Pose Crowdpose 58 | self.skeleton = [ 59 | # limbs 60 | [12, 10], 61 | [10, 8], 62 | [11, 9], 63 | [9, 7], 64 | # torso 65 | [8, 7], 66 | [8, 2], 67 | [7, 1], 68 | # arms 69 | [14, 1], 70 | [14, 2], 71 | [1, 3], 72 | [3, 5], 73 | [2, 4], 74 | [4, 6], 75 | # head 76 | [14, 13], 77 | ] 78 | 79 | self.limb_color = colors.pose_palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 0, 16]] 80 | self.kpt_color = colors.pose_palette[[0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 16, 0]] 81 | # 9, 9, 9, 9, 9, 9, 9, 0, 16, 16, 0, 0, 0, 0, 0, 0]] 82 | self.dark_colors = { 83 | (235, 219, 11), 84 | (243, 243, 243), 85 | (183, 223, 0), 86 | (221, 111, 255), 87 | (0, 237, 204), 88 | (68, 243, 0), 89 | (255, 255, 0), 90 | (179, 255, 1), 91 | (11, 255, 162), 92 | } 93 | self.light_colors = { 94 | (255, 42, 4), 95 | (79, 68, 255), 96 | (255, 0, 189), 97 | (255, 180, 0), 98 | (186, 0, 221), 99 | (0, 192, 38), 100 | (255, 36, 125), 101 | (104, 0, 123), 102 | (108, 27, 255), 103 | (47, 109, 252), 104 | (104, 31, 17), 105 | } 106 | 107 | # def kpts( 108 | # self, 109 | # kpts, 110 | # shape: tuple = (640, 640), 111 | # radius: Optional[int] = None, 112 | # kpt_line: bool = True, 113 | # conf_thres: float = 0.25, 114 | # kpt_color: Optional[tuple] = None, 115 | # ): 116 | # """ 117 | # Plot keypoints on the image. 118 | 119 | # Args: 120 | # kpts (torch.Tensor): Keypoints, shape [17, 3] (x, y, confidence). 121 | # shape (tuple, optional): Image shape (h, w). 122 | # radius (int, optional): Keypoint radius. 123 | # kpt_line (bool, optional): Draw lines between keypoints. 124 | # conf_thres (float, optional): Confidence threshold. 125 | # kpt_color (tuple, optional): Keypoint color (B, G, R). 126 | 127 | # Note: 128 | # - `kpt_line=True` currently only supports human pose plotting. 129 | # - Modifies self.im in-place. 
130 | # - If self.pil is True, converts image to numpy array and back to PIL. 131 | # """ 132 | # radius = radius if radius is not None else self.lw 133 | # if self.pil: 134 | # # Convert to numpy first 135 | # self.im = np.asarray(self.im).copy() 136 | # nkpt, ndim = kpts.shape 137 | # is_pose = nkpt == 17 and ndim in {2, 3} 138 | # kpt_line &= is_pose # `kpt_line=True` for now only supports human pose plotting 139 | # for i, k in enumerate(kpts): 140 | # color_k = kpt_color or (self.kpt_color[i].tolist() if is_pose else colors(i)) 141 | # x_coord, y_coord = k[0], k[1] 142 | # if x_coord % shape[1] != 0 and y_coord % shape[0] != 0: 143 | # if len(k) == 3: 144 | # conf = k[2] 145 | # if conf < conf_thres: 146 | # continue 147 | # cv2.circle(self.im, (int(x_coord), int(y_coord)), radius, color_k, -1, lineType=cv2.LINE_AA) 148 | 149 | # if kpt_line: 150 | # ndim = kpts.shape[-1] 151 | # for i, sk in enumerate(self.skeleton): 152 | # pos1 = (int(kpts[(sk[0] - 1), 0]), int(kpts[(sk[0] - 1), 1])) 153 | # pos2 = (int(kpts[(sk[1] - 1), 0]), int(kpts[(sk[1] - 1), 1])) 154 | # if ndim == 3: 155 | # conf1 = kpts[(sk[0] - 1), 2] 156 | # conf2 = kpts[(sk[1] - 1), 2] 157 | # if conf1 < conf_thres or conf2 < conf_thres: 158 | # continue 159 | # if pos1[0] % shape[1] == 0 or pos1[1] % shape[0] == 0 or pos1[0] < 0 or pos1[1] < 0: 160 | # continue 161 | # if pos2[0] % shape[1] == 0 or pos2[1] % shape[0] == 0 or pos2[0] < 0 or pos2[1] < 0: 162 | # continue 163 | # cv2.line( 164 | # self.im, 165 | # pos1, 166 | # pos2, 167 | # kpt_color or self.limb_color[i].tolist(), 168 | # thickness=int(np.ceil(self.lw / 2)), 169 | # lineType=cv2.LINE_AA, 170 | # ) 171 | # if self.pil: 172 | # # Convert im back to PIL and update draw 173 | # self.fromarray(self.im) 174 | -------------------------------------------------------------------------------- /tools/visualization/line_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 IDEA. All Rights Reserved. 2 | # ------------------------------------------------------------------------ 3 | import os, sys 4 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 5 | 6 | import argparse 7 | 8 | import matplotlib as mpl 9 | import matplotlib.pyplot as plt 10 | from matplotlib.backends.backend_agg import FigureCanvasAgg 11 | 12 | import torch 13 | from torch.utils.data import DataLoader 14 | 15 | from util.slconfig import SLConfig 16 | 17 | import datasets 18 | from datasets import build_dataset, BatchImageCollateFunction 19 | 20 | 21 | def create(args, classname): 22 | # we use register to maintain models from catdet6 on. 
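    # Descriptive comment: the registry maps the name stored on the config (e.g.
    # cfg.modelname or cfg.criterionname) to a build function; we look it up and
    # call it with the full config to construct the requested module.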
23 | from models.registry import MODULE_BUILD_FUNCS 24 | class_module = getattr(args, classname) 25 | assert class_module in MODULE_BUILD_FUNCS._module_dict 26 | build_func = MODULE_BUILD_FUNCS.get(class_module) 27 | return build_func(args) 28 | 29 | def main(args): 30 | cfg = SLConfig.fromfile(args.config) 31 | device = args.device 32 | 33 | setattr(cfg, 'coco_path', args.data_path) 34 | setattr(cfg, 'batch_size_train', 1) 35 | setattr(cfg, 'batch_size_val', 1) 36 | 37 | if 'HGNetv2' in cfg.backbone: 38 | cfg.pretrained = False 39 | 40 | # build model 41 | model, _ = create(cfg, 'modelname') 42 | model.to(device) 43 | 44 | criterion = create(cfg, 'criterionname') 45 | 46 | dataset_val = build_dataset(image_set='val', args=cfg) 47 | 48 | sampler_val = torch.utils.data.SequentialSampler(dataset_val) 49 | 50 | data_loader_val = DataLoader(dataset_val, 1, sampler=sampler_val, drop_last=False, collate_fn=BatchImageCollateFunction(), num_workers=4) 51 | 52 | if args.resume: 53 | checkpoint = torch.load(args.resume, map_location='cpu') 54 | if 'ema' in checkpoint: 55 | state = checkpoint['ema']['module'] 56 | else: 57 | state = checkpoint['model'] 58 | 59 | # NOTE load train mode state -> convert to deploy mode 60 | model.load_state_dict(state) 61 | 62 | # change to device 63 | model.to(device) 64 | 65 | # transformer parameters 66 | len_q = cfg.num_queries 67 | nheads = cfg.nheads 68 | num_sampling_points = cfg.dec_n_points 69 | num_points_scale = torch.tensor([1/n for n in num_sampling_points for _ in range(n)], dtype=torch.float32).reshape(-1, 1) 70 | 71 | # folder path 72 | main_folder = cfg.output_dir 73 | if 'data/wireframe_processed' in args.data_path: 74 | append_path = f'{main_folder}/visualization/line_attention_wireframe' 75 | 76 | elif 'data/york_processed' in args.data_path: 77 | append_path = f'{main_folder}/visualization/line_attention_york' 78 | os.makedirs(append_path , exist_ok=True) 79 | 80 | with torch.no_grad(): 81 | 82 | for i, (samples, targets) in enumerate(data_loader_val): 83 | samples = samples.to(device) 84 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 85 | 86 | sampling_ratios = [] 87 | reference_points = [] 88 | attention_weights = [] 89 | hooks = [ 90 | model.decoder.decoder.layers[-1].cross_attn.sampling_ratios.register_forward_hook( 91 | lambda self, input, output: sampling_ratios.append(output[0]) 92 | ), 93 | model.decoder.decoder.layers[-1].cross_attn.attention_weights.register_forward_hook( 94 | lambda self, input, output: attention_weights.append(output[0]) 95 | ), 96 | model.decoder.decoder.register_forward_hook( 97 | lambda self, input, output: reference_points.append(output[0]) 98 | ), 99 | ] 100 | 101 | output = model(samples, None) 102 | 103 | [(src_idx, tgt_idx)] = criterion(output, targets, return_indices=True) 104 | 105 | for hook in hooks: 106 | hook.remove() 107 | 108 | sampling_ratios = sampling_ratios[0].cpu().view(1, len_q, nheads, sum(num_sampling_points), 1) 109 | attention_weights = attention_weights[0].cpu().view(1, len_q, nheads, sum(num_sampling_points)) 110 | attention_weights = torch.nn.functional.softmax(attention_weights, dim=-1) 111 | 112 | reference_points = reference_points[0][-2:-1].cpu().transpose(1, 2) 113 | 114 | vector = reference_points[:, :, None, :, :2] - reference_points[:, :, None, :, 2:] 115 | center = 0.5 * (reference_points[:, :, None, :, :2] + reference_points[:, :, None, :, 2:]) 116 | 117 | sampling_locations = center + sampling_ratios * num_points_scale * vector * 0.5 118 | 119 | # Plot image 
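            # Min-max normalize the CHW tensor to [0, 1] and show it in HWC order.
            # extent=[0, 1, 1, 0] draws the image in normalized coordinates (y pointing
            # down), so reference lines and sampling locations can be plotted directly.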
120 | img = samples[0].permute(1, 2, 0).cpu() 121 | img = (img - img.min()) / (img.max() - img.min()) 122 | fig, ax = plt.subplots() 123 | ax.imshow(img, extent=[0, 1, 1, 0]) 124 | 125 | reference_points = reference_points.transpose(1, 2)[0, 0] 126 | sampling_locations = sampling_locations[0] 127 | attention_weights = attention_weights[0] 128 | 129 | # choose the query idx 130 | line_idx = src_idx[tgt_idx == 0][0] 131 | reference_points = reference_points[line_idx] 132 | sampling_locations = sampling_locations[line_idx] 133 | attention_weights = attention_weights[line_idx] 134 | 135 | # sampling points 136 | for j in range(nheads): 137 | x1, y1 = sampling_locations[j].split(1, dim=-1) 138 | pos = ax.scatter(x1, y1, marker='*', c=attention_weights[j], cmap='jet', zorder=2) 139 | cbar = fig.colorbar(pos, ax=ax) 140 | cbar.ax.tick_params(size=0) 141 | cbar.set_ticks([]) 142 | 143 | # reference lines 144 | x1, y1, x2, y2 = reference_points.split(1, dim=-1) 145 | ax.plot((x1[0], x2[0]), (y1[0], y2[0]), c='k', marker='o', zorder=3) 146 | 147 | plt.axis([0, 1, 1, 0]) 148 | plt.axis(False) 149 | 150 | 151 | curr_img_id = targets[0]['image_id'].tolist()[0] 152 | plt.savefig(f'{append_path}/{curr_img_id}.png', bbox_inches="tight", pad_inches=0.0, dpi=100) 153 | plt.close() 154 | 155 | # check condition to stop program 156 | if args.num_images is not None and i + 1 >= args.num_images: 157 | break 158 | 159 | 160 | if __name__ == '__main__': 161 | parser = argparse.ArgumentParser('Visualization of Deformable Line Attention') 162 | parser.add_argument('-c', '--config', type=str, required=True) 163 | parser.add_argument('-r', '--resume', default='', help='resume from checkpoint') 164 | parser.add_argument('-p', '--data-path', type=str, default='data/wireframe_processed', help='data path') 165 | parser.add_argument('-d', '--device', type=str, default='cpu') 166 | parser.add_argument('-n', '--num_images', type=int, help='total number of images to plot') 167 | args = parser.parse_args() 168 | main(args) 169 | -------------------------------------------------------------------------------- /tools/inference/torch_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
3 | """ 4 | import os 5 | import sys 6 | import glob 7 | 8 | import cv2 # Added for video processing 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torchvision.transforms as T 13 | 14 | from PIL import Image, ImageDraw 15 | from copy import deepcopy 16 | from annotator import Annotator 17 | from annotator_crowdpose import AnnotatorCrowdpose 18 | 19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) 20 | from src.core import LazyConfig, instantiate 21 | 22 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose} 23 | 24 | def process_image(model, device, file_path): 25 | im_pil = Image.open(file_path).convert("RGB") 26 | w, h = im_pil.size 27 | orig_size = torch.tensor([[w, h]]).to(device) 28 | annotator = annotators[annotator_type](deepcopy(im_pil)) 29 | 30 | transforms = T.Compose( 31 | [ 32 | T.Resize((640, 640)), 33 | T.ToTensor(), 34 | ] 35 | ) 36 | im_data = transforms(im_pil).unsqueeze(0).to(device) 37 | 38 | output = model(im_data, orig_size) 39 | 40 | scores, labels, keypoints = output 41 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 42 | for kpt, score in zip(keypoints, scores): 43 | if score > thrh: 44 | annotator.kpts( 45 | kpt, 46 | [h, w] 47 | ) 48 | annotator.save(f"{OUTPUT_NAME}.jpg") 49 | 50 | 51 | def process_video(model, device, file_path): 52 | cap = cv2.VideoCapture(file_path) 53 | 54 | # Get video properties 55 | fps = cap.get(cv2.CAP_PROP_FPS) 56 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 57 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 58 | 59 | # Define the codec and create VideoWriter object 60 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 61 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h)) 62 | 63 | transforms = T.Compose( 64 | [ 65 | T.Resize((640, 640)), 66 | T.ToTensor(), 67 | ] 68 | ) 69 | 70 | frame_count = 0 71 | print("Processing video frames...") 72 | while cap.isOpened(): 73 | ret, frame = cap.read() 74 | if not ret: 75 | break 76 | 77 | # Convert frame to PIL image 78 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 79 | 80 | w, h = frame_pil.size 81 | orig_size = torch.tensor([[w, h]]).to(device) 82 | 83 | annotator = annotators[annotator_type](deepcopy(frame_pil)) 84 | 85 | im_data = transforms(frame_pil).unsqueeze(0).to(device) 86 | 87 | output = model(im_data, orig_size) 88 | 89 | scores, labels, keypoints = output 90 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 91 | for kpt, score in zip(keypoints, scores): 92 | if score > thrh: 93 | annotator.kpts( 94 | kpt, 95 | [h, w] 96 | ) 97 | 98 | # Convert back to OpenCV image 99 | frame = annotator.result() 100 | 101 | # Write the frame 102 | out.write(frame) 103 | frame_count += 1 104 | 105 | if frame_count % 10 == 0: 106 | print(f"Processed {frame_count} frames...") 107 | 108 | cap.release() 109 | out.release() 110 | print("Video processing complete. Result saved as 'results_video.mp4'.") 111 | 112 | def process_file(model, device, file_path): 113 | # Check if the input file is an image or a vide 114 | if os.path.splitext(file_path)[-1].lower() in [".jpg", ".jpeg", ".png", ".bmp"]: 115 | # Process as image 116 | process_image(model, device, file_path) 117 | print("Image processing complete.") 118 | else: 119 | # Process as video 120 | process_video(model, device, file_path) 121 | print("Video processing complete.") 122 | 123 | def create(args, classname): 124 | # we use register to maintain models from catdet6 on. 
125 | from models.registry import MODULE_BUILD_FUNCS 126 | class_module = getattr(args, classname) 127 | assert class_module in MODULE_BUILD_FUNCS._module_dict 128 | build_func = MODULE_BUILD_FUNCS.get(class_module) 129 | return build_func(args) 130 | 131 | def main(args): 132 | # Global variable 133 | global OUTPUT_NAME, thrh, annotator_type 134 | 135 | """Main function""" 136 | cfg = LazyConfig.load(args.config) 137 | 138 | if hasattr(cfg.model.backbone, 'pretrained'): 139 | cfg.model.backbone.pretrained = False 140 | 141 | model = instantiate(cfg.model) 142 | postprocessor = instantiate(cfg.postprocessor) 143 | 144 | num_body_points = model.transformer.num_body_points 145 | if num_body_points == 17: 146 | annotator_type = 'COCO' 147 | elif num_body_points == 14: 148 | annotator_type = 'CrowdPose' 149 | else: 150 | raise Exception(f'Not implemented annotator for model with {num_body_points} keypoints') 151 | 152 | if args.resume: 153 | checkpoint = torch.load(args.resume, map_location='cpu', weights_only=False) 154 | if 'ema' in checkpoint: 155 | state = checkpoint['ema']['module'] 156 | else: 157 | state = checkpoint['model'] 158 | 159 | # NOTE load train mode state -> convert to deploy mode 160 | model.load_state_dict(state) 161 | 162 | else: 163 | # raise AttributeError('Only support resume to load model.state_dict by now.') 164 | print('not load model.state_dict, use default init state dict...') 165 | 166 | class Model(nn.Module): 167 | def __init__(self): 168 | super().__init__() 169 | self.model = model.deploy() 170 | self.postprocessor = postprocessor.deploy() 171 | 172 | def forward(self, images, orig_target_sizes): 173 | outputs = self.model(images) 174 | outputs = self.postprocessor(outputs, orig_target_sizes) 175 | return outputs 176 | 177 | device = args.device 178 | model = Model().to(device) 179 | thrh = 0.5 if args.thrh is None else args.thrh 180 | 181 | # Check if the input argumnet is a file or a folder 182 | file_path = args.input 183 | if os.path.isdir(file_path): 184 | # Process a folder 185 | folder_dir = args.input 186 | output_dir = f"{folder_dir}/output" 187 | os.makedirs(output_dir, exist_ok=True) 188 | paths = list(glob.iglob(f"{folder_dir}/*.*")) 189 | for file_path in paths: 190 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0] 191 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}" 192 | process_file(model, device, file_path) 193 | else: 194 | # Process a file 195 | OUTPUT_NAME = f'torch_results_{annotator_type}' 196 | process_file(model, device, file_path) 197 | 198 | 199 | if __name__ == "__main__": 200 | import argparse 201 | 202 | parser = argparse.ArgumentParser() 203 | parser.add_argument("-c", "--config", type=str, required=True) 204 | parser.add_argument("-r", "--resume", type=str, required=True) 205 | parser.add_argument("-d", "--device", type=str, default="cpu") 206 | parser.add_argument("-i", "--input", type=str, required=True) 207 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None) 208 | args = parser.parse_args() 209 | main(args) 210 | -------------------------------------------------------------------------------- /src/models/detrpose/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import math 10 | 11 | import torch 12 | from torch import nn 13 | import torch.nn.functional as F 14 | from torch.nn.init import xavier_uniform_, constant_ 15 | 16 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 17 | # N_, S_, M_, D_ = value.shape 18 | _, D_ , _= value[0].shape 19 | N_, Lq_, M_, L_, P_, _ = sampling_locations.shape 20 | # value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 21 | 22 | sampling_grids = 2 * sampling_locations - 1 23 | sampling_grids = sampling_grids.transpose(1, 2).flatten(0, 1) 24 | 25 | sampling_value_list = [] 26 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 27 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 28 | value_l_ = value[lid_].unflatten(2, (H_, W_)) 29 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 30 | sampling_grid_l_ = sampling_grids[:, :, lid_] 31 | # N_*M_, D_, Lq_, P_ 32 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 33 | mode='bilinear', padding_mode='zeros', align_corners=False) 34 | sampling_value_list.append(sampling_value_l_) 35 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 36 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 37 | output = (torch.concat(sampling_value_list, dim=-1) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 38 | return output.transpose(1, 2)#.contiguous() 39 | 40 | 41 | class MSDeformAttn(nn.Module): 42 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False): 43 | """ 44 | Multi-Scale Deformable Attention Module 45 | :param d_model hidden dimension 46 | :param n_levels number of feature levels 47 | :param n_heads number of attention heads 48 | :param n_points number of sampling points per attention head per feature level 49 | """ 50 | super().__init__() 51 | if d_model % n_heads != 0: 52 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 53 | _d_per_head = d_model // n_heads 54 | 55 | self.d_model = d_model 56 | self.n_levels = n_levels 57 | self.n_heads = n_heads 58 | self.n_points = n_points 59 | 60 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 61 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 62 | # self.value_proj = nn.Linear(d_model, d_model) 63 | # self.output_proj = nn.Linear(d_model, d_model) 64 | 65 | self.use_4D_normalizer = use_4D_normalizer 66 | 67 | self._reset_parameters() 68 | 69 | def _reset_parameters(self): 70 | constant_(self.sampling_offsets.weight.data, 0.) 
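        # Bias init (descriptive comment): give each head a distinct direction on the unit
        # circle and scale the offset magnitude with the point index (cycling 1..4 via
        # `i % 4 + 1`), so heads start by sampling a spread of locations around the
        # reference point; if n_points is not a multiple of 4, the bias is reset to zero below.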
71 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 72 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 73 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 74 | for i in range(self.n_points): 75 | grid_init[:, :, i, :] *= i % 4 + 1 76 | with torch.no_grad(): 77 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 78 | if self.n_points % 4 != 0: 79 | constant_(self.sampling_offsets.bias, 0.) 80 | constant_(self.attention_weights.weight.data, 0.) 81 | constant_(self.attention_weights.bias.data, 0.) 82 | # xavier_uniform_(self.value_proj.weight.data) 83 | # constant_(self.value_proj.bias.data, 0.) 84 | # xavier_uniform_(self.output_proj.weight.data) 85 | # constant_(self.output_proj.bias.data, 0.) 86 | 87 | def forward(self, query, reference_points, value, input_spatial_shapes): 88 | """ 89 | :param query (N, Length_{query}, C) 90 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 91 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 92 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 93 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 94 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 95 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 96 | 97 | :return output (N, Length_{query}, C) 98 | """ 99 | N, Len_q, _ = query.shape 100 | # assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 101 | 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | 106 | # N, Len_q, n_heads, n_levels, n_points, 2 107 | reference_points = torch.transpose(reference_points, 2, 3).flatten(1, 2) 108 | 109 | if reference_points.shape[-1] == 2: 110 | offset_normalizer = torch.tensor(input_spatial_shapes, device=query.device) 111 | offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.n_levels, 1, 2) 112 | sampling_locations = reference_points[:, :, None, :, None, :] \ 113 | + sampling_offsets / offset_normalizer 114 | elif reference_points.shape[-1] == 4: 115 | if self.use_4D_normalizer: 116 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 117 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 118 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5 119 | else: 120 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 121 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 122 | else: 123 | raise ValueError( 124 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 125 | 126 | output = ms_deform_attn_core_pytorch( 127 | value, input_spatial_shapes, sampling_locations, attention_weights) 128 | return output 129 | 
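# Illustrative usage sketch (not part of the original module). It assumes the calling
# convention used in this file: `value` is a list with one tensor per feature level,
# shaped (batch * n_heads, d_model // n_heads, H_l * W_l), and `reference_points` is
# (batch, Len_q, n_levels, 1, 2) in normalized [0, 1] coordinates. The helper name
# `_msdeformattn_shape_check` and the concrete sizes are assumptions for demonstration.
def _msdeformattn_shape_check():
    torch.manual_seed(0)
    bs, len_q, d_model, n_levels, n_heads, n_points = 2, 8, 256, 3, 8, 4
    spatial_shapes = [(32, 32), (16, 16), (8, 8)]
    attn = MSDeformAttn(d_model=d_model, n_levels=n_levels, n_heads=n_heads, n_points=n_points)

    query = torch.rand(bs, len_q, d_model)
    reference_points = torch.rand(bs, len_q, n_levels, 1, 2)
    value = [torch.rand(bs * n_heads, d_model // n_heads, h * w) for h, w in spatial_shapes]

    out = attn(query, reference_points, value, spatial_shapes)
    assert out.shape == (bs, len_q, d_model)
    return out.shape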
-------------------------------------------------------------------------------- /src/solver/engine.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from DETR (https://github.com/facebookresearch/detr/blob/main/engine.py) 9 | Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 10 | """ 11 | 12 | import math 13 | import sys 14 | from typing import Iterable 15 | 16 | import torch 17 | from ..misc import logger as utils 18 | from ..misc import dist_utils 19 | 20 | GIGABYTE = 1024 ** 3 21 | 22 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 23 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 24 | batch_size:int, grad_accum_steps:int, 25 | device: torch.device, epoch: int, max_norm: float = 0, writer=None, 26 | lr_scheduler=None, warmup_scheduler=None, ema=None, args=None): 27 | scaler = torch.amp.GradScaler(str(device), enabled=args.amp) 28 | model.train() 29 | criterion.train() 30 | metric_logger = utils.MetricLogger(delimiter=" ") 31 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 32 | header = 'Epoch: [{}]'.format(epoch) 33 | print_freq = args.print_freq 34 | 35 | sub_batch_size = batch_size // args.grad_accum_steps 36 | 37 | print("Grad accum steps: ", args.grad_accum_steps) 38 | print("Batch size/GPU: ", batch_size) 39 | print("Total batch size: ", batch_size * dist_utils.get_world_size()) 40 | 41 | optimizer.zero_grad() 42 | 43 | 44 | for i, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 45 | samples = samples.to(device) 46 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 47 | 48 | global_step = epoch * len(data_loader) + i 49 | 50 | for j in range(args.grad_accum_steps): 51 | start_idx = j * sub_batch_size 52 | final_idx = start_idx + sub_batch_size 53 | new_samples = samples[start_idx:final_idx] 54 | new_samples = new_samples.to(device) 55 | new_targets = [{k: v.to(device) for k, v in t.items()} for t in targets[start_idx:final_idx]] 56 | 57 | with torch.amp.autocast(str(device), enabled=args.amp): 58 | outputs = model(new_samples, new_targets) 59 | 60 | with torch.amp.autocast(str(device), enabled=False): 61 | loss_dict = criterion(outputs, new_targets) 62 | losses = sum(loss_dict.values()) 63 | 64 | if args.amp: 65 | scaler.scale(losses).backward() 66 | else: 67 | losses.backward() 68 | 69 | # reduce losses over all GPUs for logging purposes 70 | loss_dict_reduced = utils.reduce_dict(loss_dict) 71 | losses_reduced_scaled = sum(loss_dict_reduced.values()) 72 | 73 | loss_value = losses_reduced_scaled.item() 74 | 75 | if not math.isfinite(loss_value): 76 | print("Loss is {}, stopping training".format(loss_value)) 77 | print(loss_dict_reduced) 78 | sys.exit(1) 79 | 80 | 81 | # amp backward function 82 | if args.amp: 83 | if max_norm > 0: 84 | scaler.unscale_(optimizer) 85 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 86 | scaler.step(optimizer) 87 | scaler.update() 88 | else: 89 | # original backward function 90 
| if max_norm > 0: 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 92 | optimizer.step() 93 | 94 | # ema 95 | if ema is not None: 96 | ema.update(model) 97 | 98 | if warmup_scheduler is not None: 99 | warmup_scheduler.step() 100 | 101 | 102 | metric_logger.update(loss=loss_value, **loss_dict_reduced) 103 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 104 | 105 | 106 | if writer and dist_utils.is_main_process() and global_step % 10 == 0: 107 | writer.add_scalar('Loss/total', loss_value, global_step) 108 | for j, pg in enumerate(optimizer.param_groups): 109 | writer.add_scalar(f'Lr/pg_{j}', pg['lr'], global_step) 110 | for k, v in loss_dict_reduced.items(): 111 | writer.add_scalar(f'Loss/{k}', v.item(), global_step) 112 | free, total = torch.cuda.mem_get_info(device) 113 | mem_used_MB = (total - free) / GIGABYTE 114 | writer.add_scalar('Info/memory', mem_used_MB, global_step) 115 | 116 | optimizer.zero_grad() 117 | 118 | # gather the stats from all processes 119 | metric_logger.synchronize_between_processes() 120 | print("Averaged stats:", metric_logger) 121 | return {k: meter.global_avg for k, meter in metric_logger.meters.items() if meter.count > 0} 122 | 123 | 124 | 125 | 126 | @torch.no_grad() 127 | def evaluate(model, postprocessors, coco_evaluator, data_loader, device, writer=None, save_results=False): 128 | model.eval() 129 | if coco_evaluator is not None: 130 | coco_evaluator.cleanup() 131 | 132 | metric_logger = utils.MetricLogger(delimiter=" ") 133 | header = 'Test:' 134 | res_json = [] 135 | 136 | for samples, targets in metric_logger.log_every(data_loader, 10, header): 137 | samples = samples.to(device) 138 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 139 | 140 | outputs = model(samples, targets) 141 | 142 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) 143 | results = postprocessors(outputs, orig_target_sizes) 144 | 145 | res = {target['image_id'].item(): output for target, output in zip(targets, results)} 146 | if coco_evaluator is not None: 147 | coco_evaluator.update(res) 148 | 149 | if save_results: 150 | for k, v in res.items(): 151 | scores = v['scores'] 152 | labels = v['labels'] 153 | keypoints = v['keypoints'] 154 | 155 | for s, l, kpt in zip(scores, labels, keypoints): 156 | res_json.append( 157 | { 158 | "image_id": k, 159 | "category_id": l.item(), 160 | "keypoints": kpt.round(decimals=4).tolist(), 161 | "score": s.item() 162 | } 163 | ) 164 | 165 | # gather the stats from all processes 166 | metric_logger.synchronize_between_processes() 167 | print("Averaged stats:", metric_logger) 168 | if coco_evaluator is not None: 169 | coco_evaluator.synchronize_between_processes() 170 | 171 | if save_results: 172 | return res_json 173 | 174 | # accumulate predictions from all images 175 | if coco_evaluator is not None: 176 | coco_evaluator.accumulate() 177 | coco_evaluator.summarize() 178 | 179 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items() if meter.count > 0} 180 | if coco_evaluator is not None: 181 | stats['coco_eval_keypoints'] = coco_evaluator.coco_eval['keypoints'].stats.tolist() 182 | return stats 183 | -------------------------------------------------------------------------------- /src/data/dataloader.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 
4 | --------------------------------------------------------------------------------- 5 | Modified from D-DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 13 | """ 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torch.utils.data as data 18 | from torchvision.transforms.functional import resize 19 | import random 20 | 21 | from PIL import Image, ImageDraw 22 | import os 23 | 24 | from copy import deepcopy 25 | 26 | # This only for printing 27 | RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" 28 | 29 | 30 | class DataLoader(data.DataLoader): 31 | def __repr__(self) -> str: 32 | format_string = self.__class__.__name__ + "(" 33 | for n in ['dataset', 'batch_size', 'num_workers', 'drop_last', 'collate_fn']: 34 | format_string += "\n" 35 | format_string += " {0}: {1}".format(n, getattr(self, n)) 36 | format_string += "\n)" 37 | return format_string 38 | 39 | def set_epoch(self, epoch): 40 | self._epoch = epoch 41 | self.dataset.set_epoch(epoch) 42 | self.collate_fn.set_epoch(epoch) 43 | 44 | @property 45 | def epoch(self): 46 | return self._epoch if hasattr(self, '_epoch') else -1 47 | 48 | @property 49 | def shuffle(self): 50 | return self._shuffle 51 | 52 | @shuffle.setter 53 | def shuffle(self, shuffle): 54 | assert isinstance(shuffle, bool), 'shuffle must be a boolean' 55 | self._shuffle = shuffle 56 | 57 | 58 | class BaseCollateFunction(object): 59 | def set_epoch(self, epoch): 60 | self._epoch = epoch 61 | 62 | @property 63 | def epoch(self): 64 | return self._epoch if hasattr(self, '_epoch') else -1 65 | 66 | def __call__(self, items): 67 | raise NotImplementedError('') 68 | 69 | 70 | def generate_scales(base_size, base_size_repeat): 71 | scale_repeat = (base_size - int(base_size * 0.75 / 32) * 32) // 32 72 | scales = [int(base_size * 0.75 / 32) * 32 + i * 32 for i in range(scale_repeat)] 73 | scales += [base_size] * base_size_repeat 74 | scales += [int(base_size * 1.25 / 32) * 32 - i * 32 for i in range(scale_repeat)] 75 | return scales 76 | 77 | 78 | class BatchImageCollateFunction(BaseCollateFunction): 79 | def __init__( 80 | self, 81 | stop_epoch=None, 82 | ema_restart_decay=0.9999, 83 | base_size=640, 84 | base_size_repeat=None, 85 | mixup_prob=0.0, 86 | mixup_epochs=[0, 0], 87 | vis_folder='./vis_dataset/', 88 | vis_save=False 89 | ) -> None: 90 | super().__init__() 91 | self.base_size = base_size 92 | self.scales = generate_scales(base_size, base_size_repeat) if base_size_repeat is not None else None 93 | if self.scales is not None: 94 | self.scales.sort() 95 | print(GREEN + "Multi-scaling uses the following size: " + RESET, self.scales) 96 | self.stop_epoch = stop_epoch if stop_epoch is not None else 100000000 97 | self.ema_restart_decay = ema_restart_decay 98 | 99 | self.mixup_prob = mixup_prob 100 | self.mixup_epochs = mixup_epochs 101 | self.print_info_flag = True 102 | 103 | self.vis_save = vis_save 104 | self.vis_folder = vis_folder 105 | self.vis_image_number = 0 106 | self.max_vis_image_number = 10 107 | 108 | if vis_save: 109 | os.makedirs(self.vis_folder, 
exist_ok=True) 110 | 111 | def __call__(self, items): 112 | images = torch.cat([x[0][None] for x in items], dim=0) 113 | targets = [x[1] for x in items] 114 | 115 | images, targets = self.apply_mixup(images, targets) 116 | 117 | if self.scales is not None and self.epoch < self.stop_epoch: 118 | sz = random.choice(self.scales) 119 | images = resize(images, [sz, sz]) 120 | return images, targets 121 | 122 | def apply_mixup(self, images, targets): 123 | """ 124 | Applies Mixup augmentation to the batch if conditions are met. 125 | 126 | Args: 127 | images (torch.Tensor): Batch of images. 128 | targets (list[dict]): List of target dictionaries corresponding to images. 129 | 130 | Returns: 131 | tuple: Updated images and targets 132 | """ 133 | # Log when Mixup is permanently disabled 134 | if self.epoch == self.mixup_epochs[-1] and self.print_info_flag: 135 | print(f" ### Attention --- Mixup is closed after epoch@ {self.epoch} ###") 136 | self.print_info_flag = False 137 | 138 | # Apply Mixup if within specified epoch range and probability threshold 139 | if random.random() < self.mixup_prob and self.mixup_epochs[0] <= self.epoch < self.mixup_epochs[1]: 140 | # Generate mixup ratio 141 | beta = round(random.uniform(0.45, 0.55), 6) 142 | 143 | # Mix images 144 | images = images.roll(shifts=1, dims=0).mul_(1.0 - beta).add_(images.mul(beta)) 145 | 146 | # Prepare targets for Mixup 147 | shifted_targets = targets[-1:] + targets[:-1] 148 | updated_targets = deepcopy(targets) 149 | 150 | for i in range(len(targets)): 151 | # Combine boxes, labels, and areas from original and shifted targets 152 | updated_targets[i]['boxes'] = torch.cat([targets[i]['boxes'], shifted_targets[i]['boxes']], dim=0) 153 | updated_targets[i]['keypoints'] = torch.cat([targets[i]['keypoints'], shifted_targets[i]['keypoints']], dim=0) 154 | updated_targets[i]['labels'] = torch.cat([targets[i]['labels'], shifted_targets[i]['labels']], dim=0) 155 | updated_targets[i]['area'] = torch.cat([targets[i]['area'], shifted_targets[i]['area']], dim=0) 156 | 157 | # Add mixup ratio to targets 158 | updated_targets[i]['mixup'] = torch.tensor( 159 | [beta] * len(targets[i]['labels']) + [1.0 - beta] * len(shifted_targets[i]['labels']), 160 | dtype=torch.float32 161 | ) 162 | targets = updated_targets 163 | 164 | if self.vis_save and self.vis_image_number < self.max_vis_image_number: 165 | for i in range(len(updated_targets)): 166 | image_tensor = images[i] 167 | image_tensor_uint8 = ((image_tensor - image_tensor.min()) / (image_tensor.max() - image_tensor.min()) * 255).type(torch.uint8) 168 | image_numpy = image_tensor_uint8.numpy().transpose((1, 2, 0)) 169 | pilImage = Image.fromarray(image_numpy) 170 | draw = ImageDraw.Draw(pilImage) 171 | print('mix_vis:', i, 'boxes.len=', len(updated_targets[i]['boxes'])) 172 | for box in updated_targets[i]['boxes']: 173 | draw.rectangle([int(box[0]*640 - (box[2]*640)/2), int(box[1]*640 - (box[3]*640)/2), 174 | int(box[0]*640 + (box[2]*640)/2), int(box[1]*640 + (box[3]*640)/2)], outline=(255,255,0)) 175 | for pose in updated_targets[i]['keypoints']: 176 | num_pose_point = pose.shape[0] // 3 177 | pose_ = pose[:-num_pose_point].reshape(-1, 2) 178 | for p in pose_: 179 | if sum(p) != 0: 180 | draw.circle((p[0]*640, p[1]*640), 4, fill='blue') 181 | 182 | 183 | pilImage.save(self.vis_folder + f"example_{self.vis_image_number}_" + str(i) + "_"+ str(len(updated_targets[i]['boxes'])) +'_out.jpg') 184 | self.vis_image_number += 1 185 | 186 | return images, targets 187 | 
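# ------------------------------------------------------------------------------
# Editor's usage sketch (not part of the original file): a minimal smoke test for
# BatchImageCollateFunction. The epoch setting, tensor shapes and target layout
# below are illustrative assumptions, not the repo's training configuration.
# With base_size=640 and base_size_repeat=3, generate_scales() yields
# [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] after sorting,
# so each batch is resized to a randomly chosen square size from that list.
if __name__ == "__main__":
    collate = BatchImageCollateFunction(base_size=640, base_size_repeat=3, mixup_prob=0.0)
    collate.set_epoch(0)
    items = [
        (
            torch.rand(3, 640, 640),  # image tensor
            {   # 2 person instances; 17 keypoints stored as 2K coords followed by K visibilities
                "boxes": torch.rand(2, 4),
                "keypoints": torch.rand(2, 17 * 3),
                "labels": torch.zeros(2, dtype=torch.long),
                "area": torch.rand(2),
            },
        )
        for _ in range(4)
    ]
    images, targets = collate(items)
    print(images.shape, len(targets))  # e.g. torch.Size([4, 3, 608, 608]) 4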
-------------------------------------------------------------------------------- /src/misc/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import datetime 4 | from collections import defaultdict, deque 5 | from typing import Dict 6 | 7 | import torch 8 | import torch.distributed as dist 9 | 10 | from .dist_utils import is_dist_avail_and_initialized, get_world_size 11 | 12 | 13 | class SmoothedValue(object): 14 | """Track a series of values and provide access to smoothed values over a 15 | window or the global series average. 16 | """ 17 | 18 | def __init__(self, window_size=20, fmt=None): 19 | if fmt is None: 20 | fmt = "{median:.4f} ({global_avg:.4f})" 21 | self.deque = deque(maxlen=window_size) 22 | self.total = 0.0 23 | self.count = 0 24 | self.fmt = fmt 25 | 26 | def update(self, value, n=1): 27 | self.deque.append(value) 28 | self.count += n 29 | self.total += value * n 30 | 31 | def synchronize_between_processes(self): 32 | """ 33 | Warning: does not synchronize the deque! 34 | """ 35 | if not is_dist_avail_and_initialized(): 36 | return 37 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 38 | dist.barrier() 39 | dist.all_reduce(t) 40 | t = t.tolist() 41 | self.count = int(t[0]) 42 | self.total = t[1] 43 | 44 | @property 45 | def median(self): 46 | d = torch.tensor(list(self.deque)) 47 | if d.shape[0] == 0: 48 | return 0 49 | return d.median().item() 50 | 51 | @property 52 | def avg(self): 53 | d = torch.tensor(list(self.deque), dtype=torch.float32) 54 | return d.mean().item() 55 | 56 | @property 57 | def global_avg(self): 58 | return self.total / self.count 59 | 60 | @property 61 | def max(self): 62 | return max(self.deque) 63 | 64 | @property 65 | def value(self): 66 | return self.deque[-1] 67 | 68 | def __str__(self): 69 | return self.fmt.format( 70 | median=self.median, 71 | avg=self.avg, 72 | global_avg=self.global_avg, 73 | max=self.max, 74 | value=self.value) 75 | 76 | 77 | def all_gather(data): 78 | """ 79 | Run all_gather on arbitrary picklable data (not necessarily tensors) 80 | Args: 81 | data: any picklable object 82 | Returns: 83 | list[data]: list of data gathered from each rank 84 | """ 85 | world_size = get_world_size() 86 | if world_size == 1: 87 | return [data] 88 | 89 | # serialized to a Tensor 90 | buffer = pickle.dumps(data) 91 | storage = torch.ByteStorage.from_buffer(buffer) 92 | tensor = torch.ByteTensor(storage).to("cuda") 93 | 94 | # obtain Tensor size of each rank 95 | local_size = torch.tensor([tensor.numel()], device="cuda") 96 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 97 | dist.all_gather(size_list, local_size) 98 | size_list = [int(size.item()) for size in size_list] 99 | max_size = max(size_list) 100 | 101 | # receiving Tensor from all ranks 102 | # we pad the tensor because torch all_gather does not support 103 | # gathering tensors of different shapes 104 | tensor_list = [] 105 | for _ in size_list: 106 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 107 | if local_size != max_size: 108 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 109 | tensor = torch.cat((tensor, padding), dim=0) 110 | dist.all_gather(tensor_list, tensor) 111 | 112 | data_list = [] 113 | for size, tensor in zip(size_list, tensor_list): 114 | buffer = tensor.cpu().numpy().tobytes()[:size] 115 | data_list.append(pickle.loads(buffer)) 116 | 117 | return data_list 
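# Editor's note (usage sketch, not part of the original file): `all_gather` accepts
# arbitrary picklable objects, e.g. per-rank lists of COCO-style result dicts. It
# pickles them into byte tensors, pads every tensor to the largest rank's size
# (torch's all_gather requires equal shapes) and unpickles on every rank; with a
# single process it simply returns [data]. A hypothetical merge step:
#
#     merged = []
#     for per_rank in all_gather(local_results):
#         merged.extend(per_rank)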
118 | 119 | 120 | def reduce_dict(input_dict, average=True): 121 | """ 122 | Args: 123 | input_dict (dict): all the values will be reduced 124 | average (bool): whether to do average or sum 125 | Reduce the values in the dictionary from all processes so that all processes 126 | have the averaged results. Returns a dict with the same fields as 127 | input_dict, after reduction. 128 | """ 129 | world_size = get_world_size() 130 | if world_size < 2: 131 | return input_dict 132 | with torch.no_grad(): 133 | names = [] 134 | values = [] 135 | # sort the keys so that they are consistent across processes 136 | for k in sorted(input_dict.keys()): 137 | names.append(k) 138 | values.append(input_dict[k]) 139 | values = torch.stack(values, dim=0) 140 | dist.all_reduce(values) 141 | if average: 142 | values /= world_size 143 | reduced_dict = {k: v for k, v in zip(names, values)} 144 | return reduced_dict 145 | 146 | 147 | class MetricLogger(object): 148 | def __init__(self, delimiter="\t"): 149 | self.meters = defaultdict(SmoothedValue) 150 | self.delimiter = delimiter 151 | 152 | def update(self, **kwargs): 153 | for k, v in kwargs.items(): 154 | if isinstance(v, torch.Tensor): 155 | v = v.item() 156 | assert isinstance(v, (float, int)) 157 | self.meters[k].update(v) 158 | 159 | def __getattr__(self, attr): 160 | if attr in self.meters: 161 | return self.meters[attr] 162 | if attr in self.__dict__: 163 | return self.__dict__[attr] 164 | raise AttributeError("'{}' object has no attribute '{}'".format( 165 | type(self).__name__, attr)) 166 | 167 | def __str__(self): 168 | loss_str = [] 169 | for name, meter in self.meters.items(): 170 | if meter.count > 0: 171 | loss_str.append( 172 | "{}: {}".format(name, str(meter)) 173 | ) 174 | return self.delimiter.join(loss_str) 175 | 176 | def synchronize_between_processes(self): 177 | for meter in self.meters.values(): 178 | meter.synchronize_between_processes() 179 | 180 | def add_meter(self, name, meter): 181 | self.meters[name] = meter 182 | 183 | def log_every(self, iterable, print_freq, header=None, logger=None): 184 | if logger is None: 185 | print_func = print 186 | else: 187 | print_func = logger.info 188 | 189 | i = 0 190 | if not header: 191 | header = '' 192 | start_time = time.time() 193 | end = time.time() 194 | iter_time = SmoothedValue(fmt='{avg:.4f}') 195 | data_time = SmoothedValue(fmt='{avg:.4f}') 196 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 197 | if torch.cuda.is_available(): 198 | log_msg = self.delimiter.join([ 199 | header, 200 | '[{0' + space_fmt + '}/{1}]', 201 | 'eta: {eta}', 202 | '{meters}', 203 | 'time: {time}', 204 | 'data: {data}', 205 | 'max mem: {memory:.0f}' 206 | ]) 207 | else: 208 | log_msg = self.delimiter.join([ 209 | header, 210 | '[{0' + space_fmt + '}/{1}]', 211 | 'eta: {eta}', 212 | '{meters}', 213 | 'time: {time}', 214 | 'data: {data}' 215 | ]) 216 | MB = 1024.0 * 1024.0 217 | for obj in iterable: 218 | data_time.update(time.time() - end) 219 | yield obj 220 | iter_time.update(time.time() - end) 221 | if i % print_freq == 0 or i == len(iterable) - 1: 222 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 223 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 224 | if torch.cuda.is_available(): 225 | print_func(log_msg.format( 226 | i, len(iterable), eta=eta_string, 227 | meters=str(self), 228 | time=str(iter_time), data=str(data_time), 229 | memory=torch.cuda.max_memory_allocated() / MB)) 230 | else: 231 | print_func(log_msg.format( 232 | i, len(iterable), eta=eta_string, 233 | 
meters=str(self), 234 | time=str(iter_time), data=str(data_time))) 235 | i += 1 236 | end = time.time() 237 | total_time = time.time() - start_time 238 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 239 | print_func('{} Total time: {} ({:.4f} s / it)'.format( 240 | header, total_time_str, total_time / len(iterable))) 241 | -------------------------------------------------------------------------------- /tools/benchmark/trt_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 3 | """ 4 | 5 | import tensorrt as trt 6 | import pycuda.driver as cuda 7 | from utils import TimeProfiler 8 | import numpy as np 9 | import os 10 | import time 11 | import torch 12 | 13 | from collections import namedtuple, OrderedDict 14 | import glob 15 | import argparse 16 | from dataset import Dataset 17 | from tqdm import tqdm 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser(description='Argument Parser Example') 22 | parser.add_argument('--infer_dir', 23 | type=str, 24 | default='./data/COCO2017/val2017', 25 | help="Directory for images to perform inference on.") 26 | parser.add_argument("--engine_dir", 27 | type=str, 28 | default='trt_engines', 29 | help="Directory containing model engine files.") 30 | parser.add_argument('--busy', 31 | action='store_true', 32 | help="Flag to indicate that other processes may be running.") 33 | args = parser.parse_args() 34 | return args 35 | 36 | class TRTInference(object): 37 | def __init__(self, engine_path, device='cuda', backend='torch', max_batch_size=32, verbose=False): 38 | self.engine_path = engine_path 39 | self.device = device 40 | self.backend = backend 41 | self.max_batch_size = max_batch_size 42 | 43 | self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) 44 | self.engine = self.load_engine(engine_path) 45 | self.context = self.engine.create_execution_context() 46 | self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) 47 | self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) 48 | self.input_names = self.get_input_names() 49 | self.output_names = self.get_output_names() 50 | 51 | if self.backend == 'cuda': 52 | self.stream = cuda.Stream() 53 | self.time_profile = TimeProfiler() 54 | self.time_profile_dataset = TimeProfiler() 55 | self.yolo = 'yolo' in engine_path 56 | 57 | def init(self): 58 | self.dynamic = False 59 | 60 | def load_engine(self, path): 61 | trt.init_libnvinfer_plugins(self.logger, '') 62 | with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: 63 | return runtime.deserialize_cuda_engine(f.read()) 64 | 65 | def get_input_names(self): 66 | names = [] 67 | for _, name in enumerate(self.engine): 68 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 69 | names.append(name) 70 | return names 71 | 72 | def get_output_names(self): 73 | names = [] 74 | for _, name in enumerate(self.engine): 75 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: 76 | names.append(name) 77 | return names 78 | 79 | def get_bindings(self, engine, context, max_batch_size=32, device=None): 80 | Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) 81 | bindings = OrderedDict() 82 | for i, name in enumerate(engine): 83 | shape = engine.get_tensor_shape(name) 84 | dtype = trt.nptype(engine.get_tensor_dtype(name)) 85 | 86 | if shape[0] == -1: 87 | dynamic = True 88 | 
shape[0] = max_batch_size 89 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 90 | context.set_input_shape(name, shape) 91 | 92 | if self.backend == 'cuda': 93 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 94 | data = np.random.randn(*shape).astype(dtype) 95 | ptr = cuda.mem_alloc(data.nbytes) 96 | bindings[name] = Binding(name, dtype, shape, data, ptr) 97 | else: 98 | data = cuda.pagelocked_empty(trt.volume(shape), dtype) 99 | ptr = cuda.mem_alloc(data.nbytes) 100 | bindings[name] = Binding(name, dtype, shape, data, ptr) 101 | else: 102 | data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) 103 | bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) 104 | return bindings 105 | 106 | def run_torch(self, blob): 107 | for n in self.input_names: 108 | if self.bindings[n].shape != blob[n].shape: 109 | self.context.set_input_shape(n, blob[n].shape) 110 | self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) 111 | 112 | self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) 113 | self.context.execute_v2(list(self.bindings_addr.values())) 114 | outputs = {n: self.bindings[n].data for n in self.output_names} 115 | return outputs 116 | 117 | def async_run_cuda(self, blob): 118 | for n in self.input_names: 119 | cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream) 120 | 121 | bindings_addr = [int(v) for _, v in self.bindings_addr.items()] 122 | self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle) 123 | 124 | outputs = {} 125 | for n in self.output_names: 126 | cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream) 127 | outputs[n] = self.bindings[n].data 128 | 129 | self.stream.synchronize() 130 | 131 | return outputs 132 | 133 | def __call__(self, blob): 134 | if self.backend == 'torch': 135 | return self.run_torch(blob) 136 | elif self.backend == 'cuda': 137 | return self.async_run_cuda(blob) 138 | 139 | def synchronize(self): 140 | if self.backend == 'torch' and torch.cuda.is_available(): 141 | torch.cuda.synchronize() 142 | elif self.backend == 'cuda': 143 | self.stream.synchronize() 144 | 145 | def warmup(self, blob, n): 146 | for _ in range(n): 147 | _ = self(blob) 148 | 149 | def speed(self, blob, n, nonempty_process=False): 150 | times = [] 151 | self.time_profile_dataset.reset() 152 | for i in tqdm(range(n), desc="Running Inference", unit="iteration"): 153 | self.time_profile.reset() 154 | with self.time_profile_dataset: 155 | img = blob[i] 156 | if img['images'] is not None: 157 | img['image'] = img['input'] = img['images'].unsqueeze(0) 158 | else: 159 | img['images'] = img['input'] = img['image'].unsqueeze(0) 160 | with self.time_profile: 161 | _ = self(img) 162 | times.append(self.time_profile.total) 163 | 164 | # end-to-end model only 165 | if not self.yolo: 166 | print('end-to-end') 167 | times = sorted(times) 168 | if len(times) > 100 and nonempty_process: 169 | times = times[:100] 170 | 171 | avg_time = sum(times) / len(times) # Calculate the average of the remaining times 172 | return avg_time 173 | 174 | def main(): 175 | FLAGS = parse_args() 176 | dataset = Dataset(FLAGS.infer_dir) 177 | im = torch.ones(1, 3, 640, 640).cuda() 178 | blob = { 179 | 'image': im, 180 | 'images': im, 181 | 'input': im, 182 | 'im_shape': torch.tensor([640, 640]).to(im.device), 183 | 'scale_factor': torch.tensor([1, 1]).to(im.device), 184 | 'orig_target_sizes': torch.tensor([[640, 640]]).to(im.device), 185 | } 186 | 187 | engine_files = 
glob.glob(os.path.join(FLAGS.engine_dir, "*.engine")) 188 | results = [] 189 | 190 | for engine_file in engine_files: 191 | print(f"Testing engine: {engine_file}") 192 | model = TRTInference(engine_file, max_batch_size=1, verbose=False) 193 | model.init() 194 | model.warmup(blob, 400) 195 | t = [] 196 | for _ in range(1): 197 | t.append(model.speed(dataset, 1000, FLAGS.busy)) 198 | avg_latency = 1000 * torch.tensor(t).mean() 199 | results.append((engine_file, avg_latency)) 200 | print(f"Engine: {engine_file}, Latency: {avg_latency:.2f} ms") 201 | 202 | del model 203 | torch.cuda.empty_cache() 204 | time.sleep(1) 205 | 206 | sorted_results = sorted(results, key=lambda x: x[1]) 207 | for engine_file, latency in sorted_results: 208 | print(f"Engine: {engine_file}, Latency: {latency:.2f} ms") 209 | 210 | if __name__ == '__main__': 211 | main() 212 | -------------------------------------------------------------------------------- /tools/inference/trt_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 3 | """ 4 | 5 | import os 6 | import time 7 | import glob 8 | import collections 9 | import contextlib 10 | from collections import OrderedDict 11 | 12 | import cv2 # Added for video processing 13 | import numpy as np 14 | import tensorrt as trt 15 | import torch 16 | import torchvision.transforms as T 17 | 18 | from PIL import Image, ImageDraw 19 | from copy import deepcopy 20 | from annotator import Annotator 21 | from annotator_crowdpose import AnnotatorCrowdpose 22 | 23 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose} 24 | 25 | 26 | class TimeProfiler(contextlib.ContextDecorator): 27 | def __init__(self): 28 | self.total = 0 29 | 30 | def __enter__(self): 31 | self.start = self.time() 32 | return self 33 | 34 | def __exit__(self, type, value, traceback): 35 | self.total += self.time() - self.start 36 | 37 | def reset(self): 38 | self.total = 0 39 | 40 | def time(self): 41 | if torch.cuda.is_available(): 42 | torch.cuda.synchronize() 43 | return time.time() 44 | 45 | 46 | class TRTInference(object): 47 | def __init__( 48 | self, engine_path, device="cuda:0", backend="torch", max_batch_size=32, verbose=False 49 | ): 50 | self.engine_path = engine_path 51 | self.device = device 52 | self.backend = backend 53 | self.max_batch_size = max_batch_size 54 | 55 | self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) 56 | 57 | self.engine = self.load_engine(engine_path) 58 | self.context = self.engine.create_execution_context() 59 | self.bindings = self.get_bindings( 60 | self.engine, self.context, self.max_batch_size, self.device 61 | ) 62 | self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) 63 | self.input_names = self.get_input_names() 64 | self.output_names = self.get_output_names() 65 | self.time_profile = TimeProfiler() 66 | 67 | def load_engine(self, path): 68 | trt.init_libnvinfer_plugins(self.logger, "") 69 | with open(path, "rb") as f, trt.Runtime(self.logger) as runtime: 70 | return runtime.deserialize_cuda_engine(f.read()) 71 | 72 | def get_input_names(self): 73 | names = [] 74 | for _, name in enumerate(self.engine): 75 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 76 | names.append(name) 77 | return names 78 | 79 | def get_output_names(self): 80 | names = [] 81 | for _, name in enumerate(self.engine): 82 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: 83 | 
names.append(name) 84 | return names 85 | 86 | def get_bindings(self, engine, context, max_batch_size=32, device=None) -> OrderedDict: 87 | Binding = collections.namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) 88 | bindings = OrderedDict() 89 | 90 | for i, name in enumerate(engine): 91 | shape = engine.get_tensor_shape(name) 92 | dtype = trt.nptype(engine.get_tensor_dtype(name)) 93 | 94 | if shape[0] == -1: 95 | shape[0] = max_batch_size 96 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 97 | context.set_input_shape(name, shape) 98 | 99 | data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) 100 | bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) 101 | 102 | return bindings 103 | 104 | def run_torch(self, blob): 105 | for n in self.input_names: 106 | if blob[n].dtype is not self.bindings[n].data.dtype: 107 | blob[n] = blob[n].to(dtype=self.bindings[n].data.dtype) 108 | if self.bindings[n].shape != blob[n].shape: 109 | self.context.set_input_shape(n, blob[n].shape) 110 | self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) 111 | 112 | assert self.bindings[n].data.dtype == blob[n].dtype, "{} dtype mismatch".format(n) 113 | 114 | self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) 115 | self.context.execute_v2(list(self.bindings_addr.values())) 116 | outputs = {n: self.bindings[n].data for n in self.output_names} 117 | 118 | return outputs 119 | 120 | def __call__(self, blob): 121 | if self.backend == "torch": 122 | return self.run_torch(blob) 123 | else: 124 | raise NotImplementedError("Only 'torch' backend is implemented.") 125 | 126 | def synchronize(self): 127 | if self.backend == "torch" and torch.cuda.is_available(): 128 | torch.cuda.synchronize() 129 | 130 | def process_image(m, file_path, device): 131 | im_pil = Image.open(file_path).convert("RGB") 132 | w, h = im_pil.size 133 | orig_size = torch.tensor([w, h])[None].to(device) 134 | 135 | transforms = T.Compose( 136 | [ 137 | T.Resize((640, 640)), 138 | T.ToTensor(), 139 | ] 140 | ) 141 | im_data = transforms(im_pil)[None] 142 | annotator = annotators[annotator_type](deepcopy(im_pil)) 143 | 144 | blob = { 145 | "images": im_data.to(device), 146 | "orig_target_sizes": orig_size.to(device), 147 | } 148 | 149 | output = m(blob) 150 | 151 | scores, labels, keypoints = output.values() 152 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 153 | for kpt, score in zip(keypoints, scores): 154 | if score > thrh: 155 | annotator.kpts( 156 | kpt, 157 | [h, w] 158 | ) 159 | annotator.save(f"{OUTPUT_NAME}.jpg") 160 | 161 | def process_video(m, file_path, device): 162 | cap = cv2.VideoCapture(file_path) 163 | 164 | # Get video properties 165 | fps = cap.get(cv2.CAP_PROP_FPS) 166 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 167 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 168 | 169 | # Define the codec and create VideoWriter object 170 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 171 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h)) 172 | 173 | transforms = T.Compose( 174 | [ 175 | T.Resize((640, 640)), 176 | T.ToTensor(), 177 | ] 178 | ) 179 | 180 | frame_count = 0 181 | print("Processing video frames...") 182 | while cap.isOpened(): 183 | ret, frame = cap.read() 184 | if not ret: 185 | break 186 | 187 | # Convert frame to PIL image 188 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 189 | 190 | w, h = frame_pil.size 191 | orig_size = torch.tensor([w, h], device=device)[None] 192 | 
annotator = annotators[annotator_type](deepcopy(frame_pil)) 193 | 194 | im_data = transforms(frame_pil)[None] 195 | 196 | blob = { 197 | "images": im_data.to(device), 198 | "orig_target_sizes": orig_size, 199 | } 200 | 201 | output = m(blob) 202 | 203 | scores, labels, keypoints = output.values() 204 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 205 | for kpt, score in zip(keypoints, scores): 206 | if score > thrh: 207 | annotator.kpts( 208 | kpt, 209 | [h, w] 210 | ) 211 | 212 | # Convert back to OpenCV image 213 | frame = annotator.result() 214 | 215 | # Write the frame 216 | out.write(frame) 217 | frame_count += 1 218 | 219 | if frame_count % 100 == 0: 220 | print(f"Processed {frame_count} frames...") 221 | 222 | cap.release() 223 | out.release() 224 | print(f"Video processing complete. Result saved as '{OUTPUT_NAME}.mp4'.") 225 | 226 | def process_file(m, file_path, device): 227 | # Check if the input file is an image or a vide 228 | if os.path.splitext(file_path)[-1].lower() in [".jpg", ".jpeg", ".png", ".bmp"]: 229 | # Process as image 230 | process_image(m, file_path, device) 231 | else: 232 | # Process as video 233 | process_video(m, file_path, device) 234 | 235 | if __name__ == "__main__": 236 | import argparse 237 | 238 | parser = argparse.ArgumentParser() 239 | parser.add_argument("-trt", "--trt", type=str, required=True) 240 | parser.add_argument("--annotator", type=str, required=True, help="Annotator type: COCO or CrowdPose.") 241 | parser.add_argument("-i", "--input", type=str, required=True) 242 | parser.add_argument("-d", "--device", type=str, default="cuda:0") 243 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None) 244 | 245 | args = parser.parse_args() 246 | 247 | assert args.annotator.lower() in ['coco', 'crowdpose'] 248 | 249 | # Global variable 250 | global OUTPUT_NAME, thrh, annotator_type 251 | thrh = 0.5 if args.thrh is None else args.thrh 252 | 253 | annotator_name = args.annotator.lower() 254 | if annotator_name == 'coco': 255 | annotator_type = 'COCO' 256 | elif annotator_name == 'crowdpose': 257 | annotator_type = 'CrowdPose' 258 | 259 | m = TRTInference(args.trt, device=args.device) 260 | 261 | # Check if the input argumnet is a file or a folder 262 | file_path = args.input 263 | if os.path.isdir(file_path): 264 | # Process a folder 265 | folder_dir = args.input 266 | if folder_dir[-1] == '/': 267 | folder_dir = folder_dir[:-1] 268 | output_dir = f"{folder_dir}/output" 269 | os.makedirs(output_dir, exist_ok=True) 270 | paths = list(glob.iglob(f"{folder_dir}/*.*")) 271 | for file_path in paths: 272 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0] 273 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}" 274 | process_file(m, file_path, args.device) 275 | else: 276 | # Process a file 277 | OUTPUT_NAME = f'trt_results_{annotator_type}' 278 | process_file(m, file_path, args.device) -------------------------------------------------------------------------------- /src/models/detrpose/dn_component.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from DINO (https://github.com/IDEA-Research/DINO/) 6 | Copyright (c) 2022 IDEA. All Rights Reserved. 
7 | ---------------------------------------------------------------------------------
8 | Modified from DN-DETR (https://github.com/IDEA-Research/DN-DETR/)
9 | Copyright (c) 2022 IDEA. All Rights Reserved.
10 | """
11 | 
12 | 
13 | import torch
14 | from .utils import inverse_sigmoid
15 | import torch.nn.functional as F
16 | import numpy as np
17 | 
18 | def get_sigmas(num_keypoints, device):
19 |     if num_keypoints == 17:
20 |         sigmas = np.array([
21 |             .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
22 |             1.07, .87, .87, .89, .89
23 |         ], dtype=np.float32) / 10.0
24 |     elif num_keypoints == 14:
25 |         sigmas = np.array([
26 |             .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89,
27 |             .79, .79
28 |         ]) / 10.0
29 |     elif num_keypoints == 3:
30 |         sigmas = np.array([
31 |             1.07, 1.07, 0.67
32 |         ]) / 10.0
33 |     else:
34 |         raise ValueError(f'Unsupported keypoints number {num_keypoints}')
35 |     sigmas = np.concatenate([[0.1], sigmas])  # for the center of the human
36 |     sigmas = torch.tensor(sigmas, device=device, dtype=torch.float32)
37 |     return sigmas[None, :, None]
38 | 
39 | 
40 | def prepare_for_cdn(dn_args, training, num_queries, num_classes, num_keypoints, hidden_dim, label_enc, pose_enc, img_dim, device):
41 |     """
42 |     A major difference of DINO from DN-DETR is that DINO processes the pattern embedding in its detector's
43 |     forward function and uses a learnable tgt embedding, so this function is changed slightly.
44 |     :param dn_args: tuple of (targets, dn_number, label_noise_ratio)
45 |     :param training: whether the model is in training mode (denoising queries are only built during training)
46 |     :param num_queries: number of matching queries
47 |     :param num_classes: number of classes
48 |     :param num_keypoints: number of keypoints per instance
49 |     :param hidden_dim: transformer hidden dim
50 |     :param label_enc: embedding used to encode (noised) class labels in dn
51 |     :param pose_enc: embedding used for the per-keypoint query content
52 |     :param img_dim: input image size, used to scale normalized areas and coordinates
53 |     :param device: device on which the denoising tensors are created
54 |     :return: input_query_label, input_query_pose, attn_mask, dn_meta
55 |     """
56 |     if training:
57 |         targets, dn_number, label_noise_ratio = dn_args
58 |         # positive and negative dn queries
59 |         dn_number = dn_number * 2
60 |         known = [(torch.ones_like(t['labels'])) for t in targets]
61 |         batch_size = len(known)
62 |         known_num = [sum(k) for k in known]
63 | 
64 |         if int(max(known_num)) == 0:
65 |             return None, None, None, None
66 | 
67 |         dn_number = dn_number // (int(max(known_num) * 2))
68 |         dn_number = 1 if dn_number == 0 else dn_number
69 | 
70 |         unmask_bbox = unmask_label = torch.cat(known)
71 | 
72 |         # instance label denoise
73 |         labels = torch.cat([t['labels'] for t in targets])
74 |         batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])
75 | 
76 |         known_indice = torch.nonzero(unmask_label + unmask_bbox)
77 |         known_indice = known_indice.view(-1)
78 |         known_indice = known_indice.repeat(2 * dn_number, 1).view(-1)
79 | 
80 |         known_labels = labels.repeat(2 * dn_number, 1).view(-1)
81 |         known_labels_expaned = known_labels.clone()
82 | 
83 |         known_labels_poses_expaned = torch.arange(num_keypoints, dtype=torch.long, device=device)
84 |         known_labels_poses_expaned = known_labels_poses_expaned[None].repeat(len(known_labels), 1)
85 | 
86 |         known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1)
87 | 
88 |         if label_noise_ratio > 0:
89 |             p = torch.rand_like(known_labels_expaned.float())
90 |             chosen_indice = torch.nonzero(p < (label_noise_ratio * 0.5)).view(-1)  # half of bbox prob
91 |             new_label = torch.randint_like(chosen_indice, 0, num_classes)  # randomly put a new one here
92 |             known_labels_expaned.scatter_(0, chosen_indice, new_label)
93 | 
94 |             # weights = torch.ones((len(chosen_indice), num_keypoints), device=p.device)
95 |             # new_label_pose = torch.multinomial(weights, num_samples=num_keypoints, replacement=False)
96 |             # known_labels_poses_expaned.scatter_(0, chosen_indice.unsqueeze(-1).repeat(1, num_keypoints), new_label_pose)
97 | 
98 |         # keypoint noise
99 |         boxes = torch.cat([t['boxes'] for t in targets])
100 |         xy = (boxes[:, :2] + boxes[:, 2:]) / 2.
101 |         keypoints = torch.cat([t['keypoints'] for t in targets])
102 |         if 'area' in targets[0]:
103 |             areas = torch.cat([t['area'] for t in targets])
104 |         else:
105 |             areas = boxes[:, 2] * boxes[:, 3] * 0.53
106 |         poses = keypoints[:, 0:(num_keypoints * 2)]
107 |         poses = torch.cat([xy, poses], dim=1)
108 |         non_viz = keypoints[:, (num_keypoints * 2):] == 0
109 |         non_viz = torch.cat((torch.ones_like(non_viz[:, 0:1]).bool(), non_viz), dim=1)
110 |         vars = (2 * get_sigmas(num_keypoints, device)) ** 2
111 | 
112 | 
113 |         known_poses = poses.repeat(2 * dn_number, 1).reshape(-1, num_keypoints+1, 2)
114 |         known_areas = areas.repeat(2 * dn_number)[..., None, None]  # normalized [0, 1]
115 |         known_areas = known_areas * img_dim[0] * img_dim[1]  # scaled [0, h*w]
116 |         known_non_viz = non_viz.repeat(2 * dn_number, 1)
117 | 
118 |         single_pad = int(max(known_num))
119 |         pad_size = int(single_pad * 2 * dn_number)
120 |         positive_idx = torch.tensor(range(len(poses))).long().cuda().unsqueeze(0).repeat(dn_number, 1)
121 |         positive_idx += (torch.tensor(range(dn_number)) * len(poses) * 2).long().cuda().unsqueeze(1)
122 |         positive_idx = positive_idx.flatten()
123 |         negative_idx = positive_idx + len(poses)
124 | 
125 |         eps = np.finfo('float32').eps
126 |         rand_vector = torch.rand_like(known_poses)
127 |         rand_vector = F.normalize(rand_vector, dim=-1)  # ||rand_vector|| = 1 per keypoint; note F.normalize's second positional arg is p, so dim must be passed by keyword
128 |         rand_alpha = torch.zeros_like(known_poses[..., :1]).uniform_(-np.log(1), -np.log(0.5))
129 |         rand_alpha[negative_idx] = rand_alpha[negative_idx].uniform_(-np.log(0.5), -np.log(0.1))
130 |         rand_alpha *= 2 * (known_areas + eps) * vars  ## This is distance **2
131 |         rand_alpha = torch.sqrt(rand_alpha) / max(img_dim)
132 |         # rand_alpha = rand_alpha ** 1.25  ## This is distance
133 |         rand_alpha[known_non_viz] = 0.
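        # Editor's note: the block above appears to sample the denoising offset so that
        # its per-keypoint OKS term exp(-d^2 / (2 * area * (2*sigma)^2)) stays in
        # [0.5, 1.0] for positive queries and in [0.1, 0.5] for negative ones; the sqrt
        # converts the squared pixel distance back to a radius, which is divided by
        # max(img_dim) to live in normalized coordinates, and non-visible keypoints get
        # zero noise. Rough numbers (illustrative): for a person of area 100x100 px in a
        # 640x640 image, the nose (sigma = 0.026) moves by at most ~6 px for a positive
        # query and by up to ~11 px for a negative one.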
130 | 131 | known_poses_expand = known_poses + rand_alpha * rand_vector 132 | 133 | m = known_labels_expaned.long().to(device) 134 | input_label_embed = label_enc(m) 135 | # input_label_pose_embed = pose_enc(known_labels_poses_expaned) 136 | input_label_pose_embed = pose_enc.weight[None].repeat(known_poses_expand.size(0), 1, 1) 137 | input_label_embed = torch.cat([input_label_embed.unsqueeze(1), input_label_pose_embed], dim=1) 138 | input_label_embed = input_label_embed.flatten(1) 139 | 140 | input_pose_embed = inverse_sigmoid(known_poses_expand) 141 | 142 | padding_label = torch.zeros(pad_size, hidden_dim * (num_keypoints + 1)).cuda() 143 | padding_pose = torch.zeros(pad_size, num_keypoints+1).cuda() 144 | 145 | input_query_label = padding_label.repeat(batch_size, 1, 1) 146 | input_query_pose = padding_pose[...,None].repeat(batch_size, 1, 1, 2) 147 | 148 | map_known_indice = torch.tensor([], device=device) 149 | if len(known_num): 150 | map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] 151 | map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long() 152 | if len(known_bid): 153 | input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed 154 | input_query_pose[(known_bid.long(), map_known_indice)] = input_pose_embed 155 | 156 | tgt_size = pad_size + num_queries 157 | attn_mask = torch.ones(tgt_size, tgt_size, device=device) < 0 158 | # match query cannot see the reconstruct 159 | attn_mask[pad_size:, :pad_size] = True 160 | # reconstruct cannot see each other 161 | for i in range(dn_number): 162 | if i == 0: 163 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True 164 | if i == dn_number - 1: 165 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True 166 | else: 167 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True 168 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True 169 | # import matplotlib.pyplot as plt 170 | # plt.imshow(~attn_mask.detach().cpu().numpy(), cmap='gray') 171 | # plt.show() 172 | 173 | dn_meta = { 174 | 'pad_size': pad_size, 175 | 'num_dn_group': dn_number, 176 | } 177 | else: 178 | 179 | input_query_label = None 180 | input_query_bbox = None 181 | attn_mask = None 182 | dn_meta = None 183 | 184 | return input_query_label.unflatten(-1, (-1, hidden_dim)), input_query_pose, attn_mask, dn_meta 185 | 186 | 187 | def dn_post_process(outputs_class, outputs_keypoints, dn_meta, aux_loss, _set_aux_loss): 188 | """ 189 | post process of dn after output from the transformer 190 | put the dn part in the dn_meta 191 | """ 192 | if dn_meta and dn_meta['pad_size'] > 0: 193 | output_known_class = outputs_class[:, :, :dn_meta['pad_size'], :] 194 | output_known_keypoints = outputs_keypoints[:, :, :dn_meta['pad_size'], :] 195 | outputs_class = outputs_class[:, :, dn_meta['pad_size']:, :] 196 | outputs_keypoints = outputs_keypoints[:, :, dn_meta['pad_size']:, :] 197 | out = {'pred_logits': output_known_class[-1], 'pred_keypoints': output_known_keypoints[-1]} 198 | if aux_loss: 199 | out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_keypoints) 200 | dn_meta['output_known_lbs_keypoints'] = out 201 | return outputs_class, outputs_keypoints 202 | --------------------------------------------------------------------------------
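Editor's usage sketch (not part of the repository): how `dn_post_process` separates the denoising ("known") queries from the matching queries and stashes the denoising predictions in `dn_meta`. All shapes, the import path, and the `_set_aux_loss` helper below are illustrative assumptions, not the model's actual configuration.

    import torch
    from src.models.detrpose.dn_component import dn_post_process  # assumed import path; adjust to your package layout

    def _set_aux_loss(cls_layers, kpt_layers):
        # stand-in for the model's aux-loss formatter: one dict per intermediate decoder layer
        return [{"pred_logits": c, "pred_keypoints": k} for c, k in zip(cls_layers[:-1], kpt_layers[:-1])]

    L, B, pad, Q, K = 6, 2, 60, 100, 17              # decoder layers, batch, dn pad size, queries, keypoints
    outputs_class = torch.rand(L, B, pad + Q, 2)     # illustrative class-logit dimension
    outputs_kpts = torch.rand(L, B, pad + Q, 2 * (K + 1))
    dn_meta = {"pad_size": pad, "num_dn_group": 3}

    outputs_class, outputs_kpts = dn_post_process(outputs_class, outputs_kpts, dn_meta, True, _set_aux_loss)
    print(outputs_class.shape)                                          # torch.Size([6, 2, 100, 2])
    print(dn_meta["output_known_lbs_keypoints"]["pred_logits"].shape)   # torch.Size([2, 60, 2])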