├── .gitattributes
├── src
│   ├── solver
│   │   ├── __init__.py
│   │   └── engine.py
│   ├── nn
│   │   ├── __init__.py
│   │   ├── optimizer
│   │   │   ├── __init__.py
│   │   │   ├── warmup.py
│   │   │   └── ema.py
│   │   └── backbone
│   │       ├── __init__.py
│   │       └── resnet.py
│   ├── misc
│   │   ├── __init__.py
│   │   ├── profiler.py
│   │   ├── keypoint_ops.py
│   │   ├── mask_ops.py
│   │   ├── metrics.py
│   │   ├── keypoint_loss.py
│   │   ├── box_ops.py
│   │   ├── get_param_dicts.py
│   │   ├── dist_utils.py
│   │   └── logger.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   └── instantiate.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── container.py
│   │   ├── coco.py
│   │   ├── crowdpose.py
│   │   └── dataloader.py
│   └── models
│       └── detrpose
│           ├── __init__.py
│           ├── detrpose.py
│           ├── postprocesses.py
│           ├── matcher.py
│           ├── utils.py
│           ├── ms_deform_attn.py
│           └── dn_component.py
├── assets
│   ├── metrics.png
│   ├── lambda_logo1.png
│   ├── lambda_logo2.png
│   ├── TENSORRT_CONTAINER_LAMBDA.AI.md
│   └── TENSORRT_DEB_LAMBDA.AI.md
├── examples
│   ├── example1.jpg
│   └── example2.jpg
├── tools
│   ├── benchmark
│   │   ├── requirements.txt
│   │   ├── utils.py
│   │   ├── get_info.py
│   │   ├── dataset.py
│   │   ├── torch_benchmark.py
│   │   └── trt_benchmark.py
│   ├── deployment
│   │   ├── export_tensorrt.py
│   │   ├── export_yolo_w_nms.py
│   │   └── export_onnx.py
│   ├── visualization
│   │   ├── backbone_encoder.py
│   │   └── line_attention.py
│   └── inference
│       ├── onnx_inf.py
│       ├── annotator_crowdpose.py
│       ├── torch_inf.py
│       └── trt_inf.py
├── requirements.txt
├── .gitignore
├── configs
│   └── detrpose
│       ├── detrpose_hgnetv2_l.py
│       ├── detrpose_hgnetv2_l_crowdpose.py
│       ├── detrpose_hgnetv2_x.py
│       ├── detrpose_hgnetv2_x_crowdpose.py
│       ├── detrpose_hgnetv2_m.py
│       ├── detrpose_hgnetv2_s.py
│       ├── detrpose_hgnetv2_m_crowdpose.py
│       ├── detrpose_hgnetv2_s_crowdpose.py
│       ├── detrpose_hgnetv2_n.py
│       ├── detrpose_hgnetv2_n_crowdpose.py
│       └── include
│           ├── detrpose_hgnetv2.py
│           ├── dataset.py
│           └── dataset_crowdpose.py
└── train.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-documentation
2 |
--------------------------------------------------------------------------------
/src/solver/__init__.py:
--------------------------------------------------------------------------------
1 | from .trainer import Trainer
--------------------------------------------------------------------------------
/src/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .backbone import *
2 | from .optimizer import *
3 |
--------------------------------------------------------------------------------
/src/misc/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
--------------------------------------------------------------------------------
/src/nn/optimizer/__init__.py:
--------------------------------------------------------------------------------
1 | from .warmup import LinearWarmup
2 | from .ema import ModelEMA
--------------------------------------------------------------------------------
/assets/metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/metrics.png
--------------------------------------------------------------------------------
/examples/example1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/examples/example1.jpg
--------------------------------------------------------------------------------
/examples/example2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/examples/example2.jpg
--------------------------------------------------------------------------------
/src/core/__init__.py:
--------------------------------------------------------------------------------
1 | from .instantiate import instantiate
2 | from .lazy import LazyConfig, LazyCall
--------------------------------------------------------------------------------
/assets/lambda_logo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/lambda_logo1.png
--------------------------------------------------------------------------------
/assets/lambda_logo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/lambda_logo2.png
--------------------------------------------------------------------------------
/tools/benchmark/requirements.txt:
--------------------------------------------------------------------------------
1 | onnxruntime
2 | onnxscript
3 | onnxsim
4 | tensorrt
5 | pycuda
6 | tqdm
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | opencv-python
2 | transformers
3 | cloudpickle
4 | pycocotools
5 | xtcocotools
6 | omegaconf
7 | calflops
8 | iopath
9 | scipy
10 | numpy==1.23.5
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | weight/
2 | data/COCO2017
3 | data/crowdpose
4 | output/
5 | official_weights/
6 | onnx_engines/
7 | trt_engines/
8 | clean_pth_files.py
9 | **/__pycache__/**
10 | examples/output/
11 |
--------------------------------------------------------------------------------
/src/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .coco import CocoDetection
2 | # from .coco_eval import CocoEvaluator
3 |
4 | from .crowdpose import CrowdPoseDetection
5 | # from .crowdpose_eval import CrowdPoseEvaluator
6 |
7 | # from .dataloader import DataLoader, BatchImageCollateFunction
8 |
--------------------------------------------------------------------------------
/src/models/detrpose/__init__.py:
--------------------------------------------------------------------------------
1 | from .matcher import HungarianMatcher
2 | from .detrpose import DETRPose
3 | from .criterion import Criterion
4 | from .transformer import Transformer
5 | from .postprocesses import PostProcess
6 | from .hybrid_encoder import HybridEncoder
7 |
--------------------------------------------------------------------------------
/tools/deployment/export_tensorrt.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | input_folder = 'onnx_engines'
4 | input_files = [f for f in os.listdir(input_folder) if f.endswith('.onnx')]
5 |
6 | output_folder = 'trt_engines'
7 | output_files = [f.replace('.onnx', '.engine') for f in input_files]
8 |
9 | os.makedirs(output_folder, exist_ok=True)
10 |
11 | trtexec="/usr/src/tensorrt/bin/trtexec"
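# NOTE: this path matches NVIDIA's TensorRT containers; adjust it if trtexec is installed elsewhere on your system.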
12 |
13 | for f_in, f_out in zip(input_files, output_files):
14 | cmd = f'{trtexec} --onnx="{input_folder}/{f_in}" --saveEngine="{output_folder}/{f_out}" --fp16'
15 | print(f'running:\t{cmd}')
16 | os.system(cmd)
--------------------------------------------------------------------------------
/src/nn/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Modified from Conditional DETR
3 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Copied from DETR (https://github.com/facebookresearch/detr)
7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
8 | # ------------------------------------------------------------------------
9 |
10 | from .resnet import ResNet
11 | from .hgnetv2 import HGNetv2
12 |
--------------------------------------------------------------------------------
/src/misc/profiler.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from calflops import calculate_flops
3 | from typing import Tuple
4 |
5 | def stats(
6 | model,
7 |     input_shape: Tuple=(1, 3, 640, 640), ) -> dict:
8 |
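    # Profile a deployed deep copy so the original training model is left untouched.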
9 | model_for_info = copy.deepcopy(model).deploy()
10 |
11 | flops, macs, _ = calculate_flops(model=model_for_info,
12 | input_shape=input_shape,
13 | output_as_string=True,
14 | output_precision=4,
15 | print_detailed=False)
16 | params = sum(p.numel() for p in model_for_info.parameters())
17 | del model_for_info
18 | return {'flops': flops, 'macs': macs, 'params': params}
19 |
--------------------------------------------------------------------------------
/src/misc/keypoint_ops.py:
--------------------------------------------------------------------------------
1 | import torch, os
2 |
3 | def keypoint_xyxyzz_to_xyzxyz(keypoints: torch.Tensor):
4 | """_summary_
5 |
6 | Args:
7 | keypoints (torch.Tensor): ..., 51
8 | """
9 | res = torch.zeros_like(keypoints)
10 | num_points = keypoints.shape[-1] // 3
11 | Z = keypoints[..., :2*num_points]
12 | V = keypoints[..., 2*num_points:]
13 | res[...,0::3] = Z[..., 0::2]
14 | res[...,1::3] = Z[..., 1::2]
15 | res[...,2::3] = V[...]
16 | return res
17 |
18 | def keypoint_xyzxyz_to_xyxyzz(keypoints: torch.Tensor):
19 | """_summary_
20 |
21 | Args:
22 | keypoints (torch.Tensor): ..., 51
23 | """
24 | res = torch.zeros_like(keypoints)
25 | num_points = keypoints.shape[-1] // 3
26 | res[...,0:2*num_points:2] = keypoints[..., 0::3]
27 | res[...,1:2*num_points:2] = keypoints[..., 1::3]
28 | res[...,2*num_points:] = keypoints[..., 2::3]
29 | return res
--------------------------------------------------------------------------------
/tools/benchmark/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import contextlib
3 | import numpy as np
4 | from PIL import Image
5 | from collections import OrderedDict
6 |
7 | import onnx
8 | import torch
9 |
10 |
11 | def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'):
12 |     '''Write an image as a raw float32 NCHW tensor for trtexec, e.g. --loadInputs='image:input_tensor.bin'
13 | '''
14 | im = Image.open(path).resize(size)
15 | data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255.
16 | data.tofile(output_name)
17 |
18 |
19 | class TimeProfiler(contextlib.ContextDecorator):
20 | def __init__(self, ):
21 | self.total = 0
22 |
23 | def __enter__(self, ):
24 | self.start = self.time()
25 | return self
26 |
27 | def __exit__(self, type, value, traceback):
28 | self.total += self.time() - self.start
29 |
30 | def reset(self, ):
31 | self.total = 0
32 |
33 | def time(self, ):
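        # Synchronize outstanding CUDA work before reading the clock so GPU time is fully counted.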
34 | if torch.cuda.is_available():
35 | torch.cuda.synchronize()
36 | return time.time()
37 |
--------------------------------------------------------------------------------
/tools/deployment/export_yolo_w_nms.py:
--------------------------------------------------------------------------------
1 | import os
2 | from ultralytics import YOLO
3 |
4 | def main(args):
5 | output_folder = 'trt_engines'
6 | os.makedirs(output_folder, exist_ok=True)
7 |
8 | model = YOLO(f"{args.name}.pt")
9 | model.export(format="engine", nms=True, iou=args.iou_threshold, conf=args.score_threshold, half=True, dynamic=False)
10 |
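    # Ultralytics prefixes the exported .engine file with a 4-byte little-endian length plus a JSON
    # metadata block; skip past it so only the raw serialized TensorRT engine is written back out.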
11 | with open(f"{args.name}.engine", "rb") as f:
12 | meta_len = int.from_bytes(f.read(4), byteorder="little")
13 | f.seek(meta_len + 4)
14 | engine = f.read()
15 |
16 | new_name = f"{args.name}_" + str(args.iou_threshold).split('.')[1] + '_' + str(args.score_threshold).split('.')[1]
17 | with open(f"{output_folder}/{new_name}.engine", "wb") as f:
18 | f.write(engine)
19 |
20 | if __name__ == "__main__":
21 | import argparse
22 |
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--name", type=str, default="yolo11n_tuned")
25 | parser.add_argument("--score_threshold", type=float, default=0.01)
26 | parser.add_argument("--iou_threshold", type=float, default=0.7)
27 | args = parser.parse_args()
28 |
29 | main(args)
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_l.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_l"
11 | training_params.epochs = 52 # 48 + 4
12 | training_params.use_ema = True
13 |
14 | ema = L(ModelEMA)(
15 | decay=0.9999,
16 | warmups=2000
17 | )
18 |
19 | # optimizer params
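# The 'params' entry below is a regex over parameter names: anything containing "backbone" gets the
# lower backbone learning rate, while all remaining parameters use the top-level lr.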
20 | optimizer = L(optim.AdamW)(
21 | params=L(get_optim_params)(
22 | cfg=[
23 | {
24 | 'params': '^(?=.*backbone).*$',
25 | 'lr': 0.00001
26 | },
27 | ],
28 | # model=model
29 | ),
30 | lr=0.0001,
31 | betas=[0.9, 0.999],
32 | weight_decay=0.0001
33 | )
34 |
35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
36 | # optimizer=optimizer,
37 | milestones=[1000],
38 | gamma=0.1
39 | )
40 |
41 |
--------------------------------------------------------------------------------
/src/misc/mask_ops.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List
2 | import torch
3 | # needed due to empty tensor bug in pytorch and torchvision 0.5
4 | import torchvision
5 | __torchvision_need_compat_flag = float(torchvision.__version__.split('.')[1]) < 7
6 | if __torchvision_need_compat_flag:
7 | from torchvision.ops import _new_empty_tensor
8 | from torchvision.ops.misc import _output_size
9 |
10 | def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
11 | # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
12 | """
13 | Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
14 | This will eventually be supported natively by PyTorch, and this
15 | class can go away.
16 | """
17 |     if __torchvision_need_compat_flag:  # old torchvision (< 0.7): handle empty batches manually
18 | if input.numel() > 0:
19 | return torch.nn.functional.interpolate(
20 | input, size, scale_factor, mode, align_corners
21 | )
22 |
23 | output_shape = _output_size(2, input, size, scale_factor)
24 | output_shape = list(input.shape[:-2]) + list(output_shape)
25 | return _new_empty_tensor(input, output_shape)
26 | else:
27 | return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
28 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_l_crowdpose.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_l_crowdpose"
11 | training_params.epochs = 64 # 48 + 16
12 | training_params.use_ema = True
13 |
14 | ema = L(ModelEMA)(
15 | decay=0.9999,
16 | warmups=2000
17 | )
18 |
19 | # optimizer params
20 | optimizer = L(optim.AdamW)(
21 | params=L(get_optim_params)(
22 | cfg=[
23 | {
24 | 'params': '^(?=.*backbone).*$',
25 | 'lr': 0.00001
26 | },
27 | ],
28 | # model=model
29 | ),
30 | lr=0.0001,
31 | betas=[0.9, 0.999],
32 | weight_decay=0.0001
33 | )
34 |
35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
36 | # optimizer=optimizer,
37 | milestones=[1000],
38 | gamma=0.1
39 | )
40 |
41 | model.transformer.num_body_points=14
42 | criterion.matcher.num_body_points=14
43 | criterion.num_body_points=14
44 | postprocessor.num_body_points=14
45 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_x.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_x"
11 | training_params.epochs = 52 # 48 + 4
12 | training_params.use_ema = True
13 | training_params.grad_accum_steps = 1
14 |
15 | ema = L(ModelEMA)(
16 | decay=0.9999,
17 | warmups=2000
18 | )
19 |
20 | # optimizer params
21 | optimizer = L(optim.AdamW)(
22 | params=L(get_optim_params)(
23 | cfg=[
24 | {
25 | 'params': '^(?=.*backbone).*$',
26 | 'lr': 0.00005
27 | },
28 | ],
29 | # model=model
30 | ),
31 | lr=0.0001,
32 | betas=[0.9, 0.999],
33 | weight_decay=0.0001
34 | )
35 |
36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
37 | # optimizer=optimizer,
38 | milestones=[1000],
39 | gamma=0.1
40 | )
41 |
42 | model.backbone.name = 'B5'
43 | model.encoder.hidden_dim = 384
44 | model.encoder.dim_feedforward = 2048
45 | model.transformer.hidden_dim = 384
46 | # model.transformer.feat_channels = [384, 384, 384]
47 | model.transformer.reg_scale = 8
48 |
--------------------------------------------------------------------------------
/tools/benchmark/get_info.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """
4 |
5 | import os, sys
6 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..'))
7 | from src.core import LazyConfig, instantiate
8 |
9 | import argparse
10 | from calflops import calculate_flops
11 |
12 | import torch
13 | import torch.nn as nn
14 |
15 | def custom_repr(self):
16 | return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}'
17 | original_repr = torch.Tensor.__repr__
18 | torch.Tensor.__repr__ = custom_repr
19 |
20 | def main(args, ):
21 | """main
22 | """
23 | cfg = LazyConfig.load(args.config_file)
24 |
25 | if hasattr(cfg.model.backbone, 'pretrained'):
26 | cfg.model.backbone.pretrained = False
27 |
28 | model = instantiate(cfg.model)
29 |
30 | model = model.deploy()
31 | model.eval()
32 |
33 | flops, macs, _ = calculate_flops(model=model,
34 | input_shape=(1, 3, 640, 640),
35 | output_as_string=True,
36 | output_precision=4)
37 | params = sum(p.numel() for p in model.parameters())
38 | print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params))
39 |
40 |
41 | if __name__ == '__main__':
42 |
43 | parser = argparse.ArgumentParser()
44 |     parser.add_argument('--config_file', '-c', default="configs/detrpose/detrpose_hgnetv2_l.py", type=str)
45 | args = parser.parse_args()
46 |
47 | main(args)
48 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_x_crowdpose.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_x_crowdpose"
11 | training_params.epochs = 52 # 48 + 4
12 | training_params.use_ema = True
13 |
14 | ema = L(ModelEMA)(
15 | decay=0.9999,
16 | warmups=2000
17 | )
18 |
19 | # optimizer params
20 | optimizer = L(optim.AdamW)(
21 | params=L(get_optim_params)(
22 | cfg=[
23 | {
24 | 'params': '^(?=.*backbone).*$',
25 | 'lr': 0.00001
26 | },
27 | ],
28 | # model=model
29 | ),
30 | lr=0.0001,
31 | betas=[0.9, 0.999],
32 | weight_decay=0.0001
33 | )
34 |
35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
36 | # optimizer=optimizer,
37 | milestones=[1000],
38 | gamma=0.1
39 | )
40 |
41 | model.transformer.num_body_points=14
42 | criterion.matcher.num_body_points=14
43 | criterion.num_body_points=14
44 | postprocessor.num_body_points=14
45 |
46 | model.backbone.name = 'B5'
47 | model.encoder.hidden_dim = 384
48 | model.encoder.dim_feedforward = 2048
49 | model.transformer.hidden_dim = 384
50 | # model.transformer.feat_channels = [384, 384, 384]
51 | model.transformer.reg_scale = 8
52 |
--------------------------------------------------------------------------------
/src/models/detrpose/detrpose.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/)
6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/)
9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved.
10 | ---------------------------------------------------------------------------------
11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/)
12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved.
13 | """
14 |
15 | from torch import nn
16 |
17 | class DETRPose(nn.Module):
18 | def __init__(
19 | self,
20 | backbone,
21 | encoder,
22 | transformer
23 | ):
24 | super().__init__()
25 | self.backbone = backbone
26 | self.encoder = encoder
27 | self.transformer = transformer
28 |
29 | def deploy(self):
30 | self.eval()
31 | for m in self.modules():
32 | if hasattr(m, "convert_to_deploy"):
33 | m.convert_to_deploy()
34 | return self
35 |
36 | def forward(self, samples, targets=None):
37 | feats = self.backbone(samples)
38 | feats = self.encoder(feats)
39 | out = self.transformer(feats, targets, samples if self.training else None)
40 | return out
41 |
42 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_m.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_m"
11 | training_params.epochs = 64 # 60 + 4
12 | training_params.use_ema = True
13 | training_params.grad_accum_steps = 1
14 |
15 | ema = L(ModelEMA)(
16 | decay=0.9999,
17 | warmups=2000
18 | )
19 |
20 | # optimizer params
21 | optimizer = L(optim.AdamW)(
22 | params=L(get_optim_params)(
23 | cfg=[
24 | {
25 | 'params': '^(?=.*backbone).*$',
26 | 'lr': 0.00001
27 | },
28 | ],
29 | # model=model
30 | ),
31 | lr=0.0001,
32 | betas=[0.9, 0.999],
33 | weight_decay=0.0001
34 | )
35 |
36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
37 | # optimizer=optimizer,
38 | milestones=[15],
39 | gamma=0.1
40 | )
41 |
42 | model.backbone.name = 'B2'
43 | model.backbone.use_lab = True
44 | model.encoder.in_channels = [384, 768, 1536]
45 | model.encoder.depth_mult = 0.67
46 | model.transformer.num_decoder_layers = 4
47 |
48 | dataset_train.dataset.transforms.policy = {
49 | 'name': 'stop_epoch',
50 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
51 | 'epoch': [5, 35, 60] # 60 / 2 + 5 = 35
52 | }
53 | dataset_train.collate_fn.base_size_repeat = 6
54 | dataset_train.collate_fn.stop_epoch = 60
55 |
--------------------------------------------------------------------------------
/src/nn/optimizer/warmup.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """
5 |
6 | from torch.optim.lr_scheduler import LRScheduler
7 |
8 |
9 | class Warmup(object):
10 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None:
11 | self.lr_scheduler = lr_scheduler
12 | self.warmup_end_values = [pg['lr'] for pg in lr_scheduler.optimizer.param_groups]
13 | self.last_step = last_step
14 | self.warmup_duration = warmup_duration
15 | self.step()
16 |
17 | def state_dict(self):
18 | return {k: v for k, v in self.__dict__.items() if k != 'lr_scheduler'}
19 |
20 | def load_state_dict(self, state_dict):
21 | self.__dict__.update(state_dict)
22 |
23 | def get_warmup_factor(self, step, **kwargs):
24 | raise NotImplementedError
25 |
26 | def step(self, ):
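        # Scale every param group's LR by the warmup factor until warmup_duration steps have elapsed.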
27 | self.last_step += 1
28 | if self.last_step >= self.warmup_duration:
29 | return
30 | factor = self.get_warmup_factor(self.last_step)
31 | for i, pg in enumerate(self.lr_scheduler.optimizer.param_groups):
32 | pg['lr'] = factor * self.warmup_end_values[i]
33 |
34 | def finished(self, ):
35 | if self.last_step >= self.warmup_duration:
36 | return True
37 | return False
38 |
39 |
40 | class LinearWarmup(Warmup):
41 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None:
42 | super().__init__(lr_scheduler, warmup_duration, last_step)
43 |
44 | def get_warmup_factor(self, step):
45 | return min(1.0, (step + 1) / self.warmup_duration)
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_s.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_s"
11 | training_params.epochs = 100 # 96 + 4
12 | training_params.use_ema = True
13 | training_params.grad_accum_steps = 1
14 |
15 | ema = L(ModelEMA)(
16 | decay=0.9999,
17 | warmups=2000
18 | )
19 |
20 | # optimizer params
21 | optimizer = L(optim.AdamW)(
22 | params=L(get_optim_params)(
23 | cfg=[
24 | {
25 | 'params': '^(?=.*backbone).*$',
26 | 'lr': 0.0001
27 | },
28 | ],
29 | # model=model
30 | ),
31 | lr=0.0001,
32 | betas=[0.9, 0.999],
33 | weight_decay=0.0001
34 | )
35 |
36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
37 | # optimizer=optimizer,
38 | milestones=[1000],
39 | gamma=0.1
40 | )
41 |
42 | model.backbone.name = 'B0'
43 | model.backbone.use_lab = True
44 | model.encoder.in_channels = [256, 512, 1024]
45 | model.encoder.depth_mult=0.34
46 | model.encoder.expansion=0.5
47 | model.transformer.num_decoder_layers = 3
48 |
49 | dataset_train.dataset.transforms.policy = {
50 | 'name': 'stop_epoch',
51 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
52 | 'epoch': [5, 53, 96] # 96 / 2 + 5 = 53
53 | }
54 | dataset_train.collate_fn.base_size_repeat = 20
55 | dataset_train.collate_fn.stop_epoch = 96
56 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_m_crowdpose.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_m_crowdpose"
11 | training_params.epochs = 76 # 72 + 4
12 | training_params.use_ema = True
13 |
14 | ema = L(ModelEMA)(
15 | decay=0.9999,
16 | warmups=2000
17 | )
18 |
19 | # optimizer params
20 | optimizer = L(optim.AdamW)(
21 | params=L(get_optim_params)(
22 | cfg=[
23 | {
24 | 'params': '^(?=.*backbone).*$',
25 | 'lr': 0.00001
26 | },
27 | ],
28 | # model=model
29 | ),
30 | lr=0.0001,
31 | betas=[0.9, 0.999],
32 | weight_decay=0.0001
33 | )
34 |
35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
36 | # optimizer=optimizer,
37 | milestones=[1000],
38 | gamma=0.1
39 | )
40 |
41 | model.transformer.num_body_points=14
42 | criterion.matcher.num_body_points=14
43 | criterion.num_body_points=14
44 | postprocessor.num_body_points=14
45 |
46 | model.backbone.name = 'B2'
47 | model.backbone.use_lab = True
48 | model.encoder.in_channels = [384, 768, 1536]
49 | model.encoder.depth_mult = 0.67
50 | model.transformer.num_decoder_layers = 4
51 |
52 | dataset_train.dataset.transforms.policy = {
53 | 'name': 'stop_epoch',
54 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
55 |     'epoch': [5, 41, 72] # 72 / 2 + 5 = 41
56 | }
57 | dataset_train.collate_fn.base_size_repeat = 6
58 | dataset_train.collate_fn.stop_epoch = 72
59 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_s_crowdpose.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_s_crowdpose"
11 | training_params.epochs = 176 # 156 + 20
12 | training_params.use_ema = True
13 |
14 | ema = L(ModelEMA)(
15 | decay=0.9999,
16 | warmups=2000
17 | )
18 |
19 | # optimizer params
20 | optimizer = L(optim.AdamW)(
21 | params=L(get_optim_params)(
22 | cfg=[
23 | {
24 | 'params': '^(?=.*backbone).*$',
25 | 'lr': 0.00001
26 | },
27 | ],
28 | # model=model
29 | ),
30 | lr=0.0001,
31 | betas=[0.9, 0.999],
32 | weight_decay=0.0001
33 | )
34 |
35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
36 | # optimizer=optimizer,
37 | milestones=[1000],
38 | gamma=0.1
39 | )
40 |
41 | model.transformer.num_body_points=14
42 | criterion.matcher.num_body_points=14
43 | criterion.num_body_points=14
44 | postprocessor.num_body_points=14
45 |
46 | model.backbone.name = 'B0'
47 | model.backbone.use_lab = True
48 | model.encoder.in_channels = [256, 512, 1024]
49 | model.encoder.depth_mult=0.34
50 | model.encoder.expansion=0.5
51 | model.transformer.num_decoder_layers = 3
52 |
53 | dataset_train.dataset.transforms.policy = {
54 | 'name': 'stop_epoch',
55 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
56 | 'epoch': [5, 83, 156] # 156 / 2 + 5 = 83
57 | }
58 | dataset_train.collate_fn.base_size_repeat = 20
59 | dataset_train.collate_fn.stop_epoch = 156
60 |
--------------------------------------------------------------------------------
/src/core/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import pydoc
3 | from typing import Any
4 | from iopath.common.file_io import PathManager as PathManagerBase
5 |
6 | PathManager = PathManagerBase()
7 |
8 | def _convert_target_to_string(t: Any) -> str:
9 | """
10 | Inverse of ``locate()``.
11 |
12 | Args:
13 | t: any object with ``__module__`` and ``__qualname__``
14 | """
15 | module, qualname = t.__module__, t.__qualname__
16 |
17 | # Compress the path to this object, e.g. ``module.submodule._impl.class``
18 | # may become ``module.submodule.class``, if the later also resolves to the same
19 | # object. This simplifies the string, and also is less affected by moving the
20 | # class implementation.
21 | module_parts = module.split(".")
22 | for k in range(1, len(module_parts)):
23 | prefix = ".".join(module_parts[:k])
24 | candidate = f"{prefix}.{qualname}"
25 | try:
26 | if locate(candidate) is t:
27 | return candidate
28 | except ImportError:
29 | pass
30 | return f"{module}.{qualname}"
31 |
32 |
33 | def locate(name: str) -> Any:
34 | """
35 | Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``,
36 | such as "module.submodule.class_name".
37 |
38 | Raise Exception if it cannot be found.
39 | """
40 | obj = pydoc.locate(name)
41 |
42 | # Some cases (e.g. torch.optim.sgd.SGD) not handled correctly
43 | # by pydoc.locate. Try a private function from hydra.
44 | if obj is None:
45 | try:
46 | # from hydra.utils import get_method - will print many errors
47 | from hydra.utils import _locate
48 | except ImportError as e:
49 | raise ImportError(f"Cannot dynamically locate object {name}!") from e
50 | else:
51 | obj = _locate(name) # it raises if fails
52 |
53 | return obj
54 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_n.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_n"
11 | training_params.epochs = 160 # 156 + 4
12 | training_params.use_ema = True
13 | training_params.grad_accum_steps = 1
14 |
15 | ema = L(ModelEMA)(
16 | decay=0.9999,
17 | warmups=2000
18 | )
19 |
20 | # optimizer params
21 | optimizer = L(optim.AdamW)(
22 | params=L(get_optim_params)(
23 | cfg=[
24 | {
25 | 'params': '^(?=.*backbone).*$',
26 | 'lr': 0.0001
27 | },
28 | ],
29 | # model=model
30 | ),
31 | lr=0.0001,
32 | betas=[0.9, 0.999],
33 | weight_decay=0.0001
34 | )
35 |
36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
37 | # optimizer=optimizer,
38 | milestones=[1000],
39 | gamma=0.1
40 | )
41 |
42 | model.backbone.name = 'B0'
43 | model.backbone.use_lab = True
44 | model.backbone.return_idx = [2, 3]
45 | model.encoder.in_channels = [512, 1024]
46 | model.encoder.feat_strides = [16, 32]
47 | model.encoder.n_levels = 2
48 | model.encoder.use_encoder_idx = [1]
49 | model.encoder.depth_mult = 0.5
50 | model.encoder.expansion = 0.34
51 | model.encoder.hidden_dim = 128
52 | model.encoder.dim_feedforward = 512
53 | model.transformer.num_decoder_layers = 3
54 | model.transformer.num_feature_levels = 2
55 | model.transformer.dim_feedforward = 512
56 | model.transformer.feat_strides = [16, 32]
57 | model.transformer.hidden_dim = 128
58 | model.transformer.dec_n_points= 6
59 |
60 | dataset_train.dataset.transforms.policy = {
61 | 'name': 'stop_epoch',
62 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
63 | 'epoch': [5, 83, 156] # 156 / 2 + 5 = 83
64 | }
65 | dataset_train.collate_fn.base_size_repeat = None
66 | dataset_train.collate_fn.stop_epoch = 156
67 |
--------------------------------------------------------------------------------
/configs/detrpose/detrpose_hgnetv2_n_crowdpose.py:
--------------------------------------------------------------------------------
1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor
2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator
3 |
4 | from src.core import LazyCall as L
5 | from src.nn.optimizer import ModelEMA
6 | from src.misc.get_param_dicts import get_optim_params
7 |
8 | from torch import optim
9 |
10 | training_params.output_dir = "output/detrpose_hgnetv2_n_crowdpose"
11 | training_params.epochs = 284 # 264 + 20
12 | training_params.use_ema = True
13 |
14 | ema = L(ModelEMA)(
15 | decay=0.9999,
16 | warmups=2000
17 | )
18 |
19 | # optimizer params
20 | optimizer = L(optim.AdamW)(
21 | params=L(get_optim_params)(
22 | cfg=[
23 | {
24 | 'params': '^(?=.*backbone).*$',
25 | 'lr': 0.00001
26 | },
27 | ],
28 | # model=model
29 | ),
30 | lr=0.0001,
31 | betas=[0.9, 0.999],
32 | weight_decay=0.0001
33 | )
34 |
35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)(
36 | # optimizer=optimizer,
37 | milestones=[1000],
38 | gamma=0.1
39 | )
40 |
41 | model.transformer.num_body_points=14
42 | criterion.matcher.num_body_points=14
43 | criterion.num_body_points=14
44 | postprocessor.num_body_points=14
45 |
46 | model.backbone.name = 'B0'
47 | model.backbone.use_lab = True
48 | model.backbone.return_idx = [2, 3]
49 | model.encoder.in_channels = [512, 1024]
50 | model.encoder.feat_strides = [16, 32]
51 | model.encoder.n_levels = 2
52 | model.encoder.use_encoder_idx = [1]
53 | model.encoder.depth_mult = 0.5
54 | model.encoder.expansion = 0.34
55 | model.encoder.hidden_dim = 128
56 | model.encoder.dim_feedforward = 512
57 | model.transformer.num_decoder_layers = 3
58 | model.transformer.num_feature_levels = 2
59 | model.transformer.dim_feedforward = 512
60 | model.transformer.feat_strides = [16, 32]
61 | model.transformer.hidden_dim = 128
62 | model.transformer.dec_n_points= 6
63 |
64 | dataset_train.dataset.transforms.policy = {
65 | 'name': 'stop_epoch',
66 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
67 | 'epoch': [5, 137, 264] # 264 / 2 + 5 = 137
68 | }
69 | dataset_train.collate_fn.base_size_repeat = None
70 | dataset_train.collate_fn.stop_epoch = 264
71 |
--------------------------------------------------------------------------------
/src/models/detrpose/postprocesses.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/)
6 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/)
9 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved.
10 | ---------------------------------------------------------------------------------
11 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/)
12 | Copyright (c) 2023 IDEA. All Rights Reserved.
13 | """
14 |
15 | import torch
16 | from torch import nn
17 | from torchvision.ops.boxes import nms
18 |
19 |
20 | class PostProcess(nn.Module):
21 | """ This module converts the model's output into the format expected by the coco api"""
22 | def __init__(self, num_select=60, num_body_points=17) -> None:
23 | super().__init__()
24 | self.num_select = num_select
25 | self.num_body_points = num_body_points
26 | self.deploy_mode = False
27 |
28 | @torch.no_grad()
29 | def forward(self, outputs, target_sizes):
30 | num_select = self.num_select
31 | out_logits, out_keypoints= outputs['pred_logits'], outputs['pred_keypoints']
32 |
33 | prob = out_logits.sigmoid()
34 | topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), num_select, dim=1)
35 | scores = topk_values
36 |
37 | # keypoints
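        # top-k ran over the flattened (num_queries * num_classes) axis, so split it back into query index and class label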
38 | topk_keypoints = (topk_indexes.float() // out_logits.shape[2]).long()
39 | labels = topk_indexes % out_logits.shape[2]
40 |
41 | if self.deploy_mode:
42 | keypoints = torch.gather(out_keypoints, 1, topk_keypoints[..., None, None].expand(1, num_select, self.num_body_points, 2))
43 | keypoints = keypoints * target_sizes[:, None, None, :]
44 | return scores, labels, keypoints
45 |
46 | keypoints = torch.gather(out_keypoints, 1, topk_keypoints.unsqueeze(-1).repeat(1, 1, self.num_body_points*2))
47 | keypoints = keypoints * target_sizes.repeat(1, self.num_body_points)[:, None, :]
48 | keypoints_res = keypoints.unflatten(-1, (-1, 2))
49 | keypoints_res = torch.cat(
50 | [keypoints_res, torch.ones_like(keypoints_res[..., 0:1])],
51 | dim=-1).flatten(-2)
52 |
53 | results = [{'scores': s, 'labels': l, 'keypoints': k} for s, l, k in zip(scores, labels, keypoints_res)]
54 | return results
55 |
56 | def deploy(self, ):
57 | self.eval()
58 | self.deploy_mode = True
59 | return self
60 |
--------------------------------------------------------------------------------
/src/misc/metrics.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import json
3 | import torch
4 |
5 | def inverse_sigmoid(x, eps=1e-5):
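    # Numerically stable logit: clamp inputs away from 0 and 1 before taking the log-odds.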
6 | x = x.clamp(min=0, max=1)
7 | x1 = x.clamp(min=eps)
8 | x2 = (1 - x).clamp(min=eps)
9 | return torch.log(x1/x2)
10 |
11 |
12 | class BestMetricSingle():
13 | def __init__(self, init_res=0.0, better='large') -> None:
14 | self.init_res = init_res
15 | self.best_res = init_res
16 | self.best_ep = -1
17 |
18 | self.better = better
19 | assert better in ['large', 'small']
20 |
21 | def isbetter(self, new_res, old_res):
22 | if self.better == 'large':
23 | return new_res > old_res
24 | if self.better == 'small':
25 | return new_res < old_res
26 |
27 | def update(self, new_res, ep):
28 | if self.isbetter(new_res, self.best_res):
29 | self.best_res = new_res
30 | self.best_ep = ep
31 | return True
32 | return False
33 |
34 | def __str__(self) -> str:
35 | return "best_res: {}\t best_ep: {}".format(self.best_res, self.best_ep)
36 |
37 | def __repr__(self) -> str:
38 | return self.__str__()
39 |
40 | def summary(self) -> dict:
41 | return {
42 | 'best_res': self.best_res,
43 | 'best_ep': self.best_ep,
44 | }
45 |
46 |
47 | class BestMetricHolder():
48 | def __init__(self, init_res=0.0, better='large', use_ema=False) -> None:
49 | self.best_all = BestMetricSingle(init_res, better)
50 | self.use_ema = use_ema
51 | if use_ema:
52 | self.best_ema = BestMetricSingle(init_res, better)
53 | self.best_regular = BestMetricSingle(init_res, better)
54 |
55 |
56 | def update(self, new_res, epoch, is_ema=False):
57 | """
58 |         Return True if the new result is the best so far.
59 | """
60 | if not self.use_ema:
61 | return self.best_all.update(new_res, epoch)
62 | else:
63 | if is_ema:
64 | self.best_ema.update(new_res, epoch)
65 | return self.best_all.update(new_res, epoch)
66 | else:
67 | self.best_regular.update(new_res, epoch)
68 | return self.best_all.update(new_res, epoch)
69 |
70 | def summary(self):
71 | if not self.use_ema:
72 | return self.best_all.summary()
73 |
74 | res = {}
75 | res.update({f'all_{k}':v for k,v in self.best_all.summary().items()})
76 | res.update({f'regular_{k}':v for k,v in self.best_regular.summary().items()})
77 | res.update({f'ema_{k}':v for k,v in self.best_ema.summary().items()})
78 | return res
79 |
80 | def __repr__(self) -> str:
81 | return json.dumps(self.summary(), indent=2)
82 |
83 | def __str__(self) -> str:
84 | return self.__repr__()
--------------------------------------------------------------------------------
/configs/detrpose/include/detrpose_hgnetv2.py:
--------------------------------------------------------------------------------
1 | from src.core import LazyCall as L
2 | from src.models.detrpose import (
3 | DETRPose,
4 | HybridEncoder,
5 | Transformer,
6 | PostProcess,
7 | Criterion,
8 | HungarianMatcher,
9 | )
10 |
11 | from src.nn import HGNetv2
12 |
13 | training_params = {
14 | "clip_max_norm": 0.1,
15 | "save_checkpoint_interval": 1,
16 | "grad_accum_steps": 2,
17 | "print_freq": 100,
18 | 'sync_bn': True,
19 | 'use_ema': False,
20 | 'dist_url': 'env://',
21 | }
22 |
23 | eval_spatial_size = (640, 640)
24 | hidden_dim = 256
25 | n_levels = 3
26 | feat_strides = [8, 16, 32]
27 | num_classes = 2
28 |
29 | model = L(DETRPose)(
30 | backbone=L(HGNetv2)(
31 | name='B4',
32 | use_lab=False,
33 | return_idx=[1, 2, 3],
34 | freeze_stem_only=True,
35 | freeze_at=-1,
36 | freeze_norm=True,
37 | pretrained=True,
38 | ),
39 | encoder=L(HybridEncoder)(
40 | in_channels=[512, 1024, 2048],
41 | feat_strides=feat_strides,
42 | n_levels=n_levels,
43 | hidden_dim=hidden_dim,
44 | nhead=8,
45 | dim_feedforward=1024,
46 | dropout=0.0,
47 | enc_act='gelu',
48 | expansion=1.0,
49 | depth_mult=1.0,
50 | act='silu',
51 | temperatureH=20,
52 | temperatureW=20,
53 | eval_spatial_size= eval_spatial_size
54 | ),
55 | transformer=L(Transformer)(
56 | hidden_dim=hidden_dim,
57 | dropout=0.0,
58 | nhead=8,
59 | num_queries=60,
60 | dim_feedforward=1024,
61 | num_decoder_layers=6,
62 | normalize_before=False,
63 | return_intermediate_dec=True,
64 | activation='relu',
65 | num_feature_levels=3,
66 | dec_n_points=4,
67 | learnable_tgt_init=True,
68 | two_stage_type='standard',
69 | num_body_points=17,
70 | aux_loss=True,
71 | num_classes=num_classes,
72 | dec_pred_class_embed_share = False,
73 | dec_pred_pose_embed_share = False,
74 | two_stage_class_embed_share=False,
75 | two_stage_bbox_embed_share=False,
76 | cls_no_bias = False,
77 | # new parameters
78 | feat_strides=[8, 16, 32],
79 | eval_spatial_size=eval_spatial_size,
80 | reg_max=32,
81 | reg_scale=4
82 | ),
83 | )
84 |
85 | criterion = L(Criterion)(
86 | num_classes=num_classes,
87 | weight_dict={'loss_vfl': 2.0, 'loss_keypoints': 10.0, 'loss_oks': 4.0},
88 | focal_alpha=0.25,
89 | losses=['vfl', 'keypoints'],
90 | matcher=L(HungarianMatcher)(
91 | cost_class=2.0,
92 | cost_keypoints=10.0,
93 | cost_oks=4.0,
94 | focal_alpha=0.25
95 | ),
96 | num_body_points=17
97 | )
98 |
99 | postprocessor = L(PostProcess)(num_select=60, num_body_points=17)
100 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from omegaconf import OmegaConf
3 |
4 | from src.solver import Trainer
5 | from src.misc import dist_utils
6 | from src.core import LazyConfig, instantiate
7 |
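# Typical single-process launch (assuming a config under configs/detrpose):
#   python train.py -c configs/detrpose/detrpose_hgnetv2_l.py --amp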
8 | def get_args_parser():
9 | parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
10 | parser.add_argument('--config_file', '-c', type=str, required=True)
11 | parser.add_argument('--options',
12 | nargs='+',
13 | help='override some settings in the used config, the key-value pair '
14 | 'in xxx=yyy format will be merged into config file.')
15 | parser.add_argument('--device', default='cuda',
16 | help='device to use for training / testing')
17 | parser.add_argument('--seed', default=42, type=int)
18 | parser.add_argument('--resume', default=None, help='resume from checkpoint')
19 | parser.add_argument('--pretrain', default=None, help='apply transfer learning to the backbone and encoder using DFINE weights')
20 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
21 | help='start epoch')
22 | parser.add_argument('--eval', action='store_true')
23 | parser.add_argument('--test', action='store_true')
24 | parser.add_argument('--find_unused_params', action='store_true')
25 |
26 | # distributed training parameters
27 | parser.add_argument('--world_size', default=1, type=int,
28 | help='number of distributed processes')
29 | parser.add_argument('--rank', default=0, type=int,
30 |                         help='rank of the current process')
31 | parser.add_argument("--local_rank", type=int, help='local rank for DistributedDataParallel')
32 | parser.add_argument('--amp', action='store_true',
33 | help="Train with mixed precision")
34 |
35 | return parser
36 |
37 | def main(args):
38 | cfg = LazyConfig.load(args.config_file)
39 |
40 | updates = OmegaConf.create()
41 | for k, v in args.__dict__.items():
42 | if k not in ["options"] and v is not None:
43 | updates[k] = v
44 | cfg.training_params = OmegaConf.merge(cfg.training_params, updates)
45 |
46 | if args.options:
47 | cfg = LazyConfig.apply_overrides(cfg, args.options)
48 | print(cfg)
49 |
50 | solver = Trainer(cfg)
51 |
52 | assert not(args.eval and args.test), "you can't do evaluation and test at the same time"
53 |
54 | if args.eval:
55 | if hasattr(cfg.model.backbone, 'pretrained'):
56 | cfg.model.backbone.pretrained = False
57 | solver.eval()
58 | elif args.test:
59 | if hasattr(cfg.model.backbone, 'pretrained'):
60 | cfg.model.backbone.pretrained = False
61 | solver.test()
62 | else:
63 | solver.fit()
64 | dist_utils.cleanup()
65 |
66 | if __name__ == '__main__':
67 | parser = argparse.ArgumentParser('RT-GroupPose training and evaluation script', parents=[get_args_parser()])
68 | args = parser.parse_args()
69 | main(args)
70 |
--------------------------------------------------------------------------------
/assets/TENSORRT_CONTAINER_LAMBDA.AI.md:
--------------------------------------------------------------------------------
1 |
2 | # Installing TensorRT containers on Lambda.ai instances
3 |
4 |
5 | ## Quick Start
6 | ### Lambda.ai
7 | 1. Go to [Lambda.ai](https://lambda.ai) and create an account.
8 | 2. Log in to your Lambda.ai account.
9 | 3. Click the `Launch instance` button at the top right of the website.
10 | 4. Select an instance. To replicate our results from the appendix, select `8x Tesla V100 (16 GB)`
11 |
12 | ### TensorRT Container Installation
13 | 1. Docker setup
14 | ```shell
15 | sudo usermod -aG docker $USER
16 | newgrp docker
17 | ```
18 | 2. Install the NVIDIA container toolkit repository
19 | ```shell
20 | curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
21 | && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
22 | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
23 | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
24 | ```
25 | 3. Pull and run the TensorRT Docker container
26 | ```shell
27 | docker pull nvcr.io/nvidia/tensorrt:24.04-py3
28 | docker run --gpus all -it --rm nvcr.io/nvidia/tensorrt:24.04-py3
29 | ```
30 |
31 | 4. Install the CUDA toolkit with the correct version (in our case, 12.8)
32 | ```shell
33 | # cuda installation
34 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
35 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
36 | sudo apt-get update
37 | sudo apt-get -y install cuda-toolkit-12-8
38 | ```
39 |
40 | The complete installation takes approximately 5 minutes.
41 |
42 | ## Installing DETRPose
43 | ### Quick Start
44 | ```shell
45 | git clone https://github.com/SebastianJanampa/DETRPose.git
46 | cd DETRPose
47 | pip install -r requirements.txt
48 | apt-get update && apt-get install libgl1
49 | ```
50 |
51 | ### Data Preparation
52 | ```
53 | pip install gdown # to download files from google drive
54 | gdown 1VprytECcLtU4tKP32SYi_7oDRbw7yUTL # images
55 | unzip images.zip
56 | ```
57 |
58 | ### Usage
59 | ```shell
60 | pip install onnx onnxsim
61 | pip install -r tools/benchmark/requirements.txt
62 |
63 | export model=l #n, s, m, l, x
64 | mkdir trt_engines
65 | ```
66 | 1. Download official weights
67 | ```shell
68 | wget https://github.com/SebastianJanampa/DETRPose/releases/download/model_weights/detrpose_hgnetv2_${model}.pth
69 | ```
70 | 2. Export onnx
71 | ```shell
72 | python tools/deployment/export_onnx.py --check -c configs/detrpose/detrpose_hgnetv2_${model}.py -r detrpose_hgnetv2_${model}.pth
73 | ```
74 | 3. Export tensorrt
75 | ```shell
76 | trtexec --onnx="onnx_engines/detrpose_hgnetv2_${model}.onnx" --saveEngine="trt_engines/detrpose_hgnetv2_${model}.engine" --fp16
77 | ```
78 | 4. Benchmark
79 | ```shell
80 | python tools/benchmark/trt_benchmark.py --infer_dir ./images --engine_dir trt_engines
81 | ```
--------------------------------------------------------------------------------
/configs/detrpose/include/dataset.py:
--------------------------------------------------------------------------------
1 | from src.core import LazyCall as L
2 | from src.data import CocoDetection
3 | from src.data.dataloader import (
4 | BatchImageCollateFunction,
5 | DataLoader
6 | )
7 | from src.data.coco_eval import CocoEvaluator
8 | from src.data.container import Compose
9 | import src.data.transforms as T
10 |
11 | from .detrpose_hgnetv2 import eval_spatial_size
12 |
13 | from omegaconf import OmegaConf
14 |
15 | scales = [(640, 640)]
16 | max_size = 1333
17 | scales2_resize = [400, 500, 600]
18 |
19 | __all__ = ["dataset_train", "dataset_val", "dataset_test", "evaluator"]
20 |
21 | dataset_train = L(DataLoader)(
22 | dataset=L(CocoDetection)(
23 | img_folder="./data/COCO2017/train2017",
24 | ann_file="./data/COCO2017/annotations/person_keypoints_train2017.json",
25 | transforms=L(Compose)(
26 | policy={
27 | 'name': 'stop_epoch',
28 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
29 | 'epoch': [5, 29, 48]
30 | },
31 | mosaic_prob=0.5,
32 | transforms1=L(T.Mosaic)(output_size=320, probability=1.0),
33 | transforms2=L(T.RandomZoomOut)(p=0.5),
34 | transforms3=L(T.RandomHorizontalFlip)(),
35 | transforms4=L(T.ColorJitter)(),
36 | transforms5=L(T.RandomResize)(sizes=scales, max_size=max_size),
37 | transforms6=L(T.ToTensor)(),
38 | transforms7=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1])
39 | ),
40 |
41 | ),
42 | total_batch_size=16,
43 | collate_fn=L(BatchImageCollateFunction)(
44 | base_size=eval_spatial_size[0],
45 | base_size_repeat=4,
46 | stop_epoch=48,
47 | ),
48 | num_workers=4,
49 | shuffle=True,
50 | drop_last=True,
51 | pin_memory=True
52 | )
53 |
54 | dataset_val = L(DataLoader)(
55 | dataset=L(CocoDetection)(
56 | img_folder="./data/COCO2017/val2017",
57 | ann_file="./data/COCO2017/annotations/person_keypoints_val2017.json",
58 | transforms=L(Compose)(
59 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size),
60 | transforms2=L(T.ToTensor)(),
61 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1])
62 | ),
63 | ),
64 | total_batch_size=32,
65 | collate_fn=L(BatchImageCollateFunction)(
66 | base_size=eval_spatial_size[0],
67 | ),
68 | num_workers=4,
69 | shuffle=False,
70 | drop_last=False,
71 | pin_memory=True
72 | )
73 |
74 | dataset_test = L(DataLoader)(
75 | dataset=L(CocoDetection)(
76 | img_folder="./data/COCO2017/test2017",
77 | ann_file="./data/COCO2017/annotations/image_info_test-dev2017.json",
78 | transforms=L(Compose)(
79 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size),
80 | transforms2=L(T.ToTensor)(),
81 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1])
82 | ),
83 | ),
84 | total_batch_size=32,
85 | collate_fn=L(BatchImageCollateFunction)(
86 | base_size=eval_spatial_size[0],
87 | ),
88 | num_workers=4,
89 | shuffle=False,
90 | drop_last=False,
91 | pin_memory=True
92 | )
93 |
94 | evaluator = L(CocoEvaluator)(
95 | ann_file="./data/COCO2017/annotations/person_keypoints_val2017.json",
96 | iou_types=['keypoints'],
97 | useCats=True
98 | )
99 |
100 |
--------------------------------------------------------------------------------
/configs/detrpose/include/dataset_crowdpose.py:
--------------------------------------------------------------------------------
1 | from src.core import LazyCall as L
2 | from src.data import CrowdPoseDetection
3 | from src.data.dataloader import (
4 | BatchImageCollateFunction,
5 | DataLoader
6 | )
7 | from src.data.crowdpose_eval import CrowdPoseEvaluator
8 | from src.data.container import Compose
9 | import src.data.transforms as T
10 | import src.data.transforms_crowdpose as CrowdT
11 |
12 | from .detrpose_hgnetv2 import eval_spatial_size
13 |
14 | from omegaconf import OmegaConf
15 |
16 | scales = [(640, 640)]
17 | max_size = 1333
18 | scales2_resize = [400, 500, 600]
19 |
20 | __all__ = ["dataset_train", "dataset_val", "dataset_test", "evaluator"]
21 |
22 | dataset_train = L(DataLoader)(
23 | dataset=L(CrowdPoseDetection)(
24 | img_folder="./data/crowdpose/images",
25 | ann_file="./data/crowdpose/annotations/crowdpose_trainval.json",
26 | transforms=L(Compose)(
27 | policy={
28 | 'name': 'stop_epoch',
29 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
30 | 'epoch': [5, 29, 60]
31 | },
32 | mosaic_prob=0.5,
33 | transforms1=L(T.Mosaic)(output_size=320, probability=1.0),
34 | transforms2=L(T.RandomZoomOut)(p=0.5),
35 | transforms3=L(CrowdT.RandomHorizontalFlip)(p=0.5),
36 | transforms4=L(T.ColorJitter)(),
37 | transforms5=L(T.RandomResize)(sizes=scales, max_size=max_size),
38 | transforms6=L(T.ToTensor)(),
39 | transforms7=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1])
40 | ),
41 |
42 | ),
43 | total_batch_size=16,
44 | collate_fn=L(BatchImageCollateFunction)(
45 | base_size=eval_spatial_size[0],
46 | base_size_repeat=4,
47 | stop_epoch=60,
48 | ),
49 | num_workers=4,
50 | shuffle=True,
51 | drop_last=True,
52 | pin_memory=True
53 | )
54 |
55 | dataset_val = L(DataLoader)(
56 | dataset=L(CrowdPoseDetection)(
57 | img_folder="./data/crowdpose/images",
58 | ann_file="./data/crowdpose/annotations/crowdpose_test.json",
59 | transforms=L(Compose)(
60 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size),
61 | transforms2=L(T.ToTensor)(),
62 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1])
63 | ),
64 | ),
65 | total_batch_size=32,
66 | collate_fn=L(BatchImageCollateFunction)(
67 | base_size=eval_spatial_size[0],
68 | ),
69 | num_workers=4,
70 | shuffle=False,
71 | drop_last=False,
72 | pin_memory=True
73 | )
74 |
75 | dataset_test = L(DataLoader)(
76 | dataset=L(CrowdPoseDetection)(
77 | img_folder="./data/crowdpose/images",
78 | ann_file="./data/crowdpose/annotations/crowdpose_test.json",
79 | transforms=L(Compose)(
80 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size),
81 | transforms2=L(T.ToTensor)(),
82 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1])
83 | ),
84 | ),
85 | total_batch_size=32,
86 | collate_fn=L(BatchImageCollateFunction)(
87 | base_size=eval_spatial_size[0],
88 | ),
89 | num_workers=4,
90 | shuffle=False,
91 | drop_last=False,
92 | pin_memory=True
93 | )
94 |
95 | evaluator = L(CrowdPoseEvaluator)(
96 | ann_file="./data/crowdpose/annotations/crowdpose_test.json",
97 | iou_types=['keypoints_crowd'],
98 | useCats=True
99 | )
100 |
101 |
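For orientation, a minimal sketch of how a lazily-defined dataloader like the one above gets materialized. This assumes the detectron2-style `LazyConfig` semantics used in `tools/benchmark/torch_benchmark.py`, and that the top-level config re-exports `dataset_train` and `evaluator` from this include:

```python
from src.core import LazyConfig, instantiate  # same imports as tools/benchmark/torch_benchmark.py

# build the CrowdPose training dataloader and evaluator from the lazy config
cfg = LazyConfig.load("configs/detrpose/detrpose_hgnetv2_l_crowdpose.py")
train_loader = instantiate(cfg.dataset_train)  # DataLoader with the Mosaic/ZoomOut stop_epoch policy
evaluator = instantiate(cfg.evaluator)         # CrowdPoseEvaluator over crowdpose_test.json
```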
--------------------------------------------------------------------------------
/src/core/instantiate.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import collections.abc as abc
4 | import dataclasses
5 | import logging
6 | from typing import Any
7 |
8 | from .utils import _convert_target_to_string, locate
9 |
10 | __all__ = ["dump_dataclass", "instantiate"]
11 |
12 |
13 | def dump_dataclass(obj: Any):
14 | """
15 | Dump a dataclass recursively into a dict that can be later instantiated.
16 |
17 | Args:
18 | obj: a dataclass object
19 |
20 | Returns:
21 | dict
22 | """
23 | assert dataclasses.is_dataclass(obj) and not isinstance(
24 | obj, type
25 | ), "dump_dataclass() requires an instance of a dataclass."
26 | ret = {"_target_": _convert_target_to_string(type(obj))}
27 | for f in dataclasses.fields(obj):
28 | v = getattr(obj, f.name)
29 | if dataclasses.is_dataclass(v):
30 | v = dump_dataclass(v)
31 | if isinstance(v, (list, tuple)):
32 | v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v]
33 | ret[f.name] = v
34 | return ret
35 |
36 |
37 | def instantiate(cfg):
38 | """
39 | Recursively instantiate objects defined in dictionaries by
40 | "_target_" and arguments.
41 |
42 | Args:
43 | cfg: a dict-like object with "_target_" that defines the caller, and
44 | other keys that define the arguments
45 |
46 | Returns:
47 | object instantiated by cfg
48 | """
49 | from omegaconf import ListConfig, DictConfig, OmegaConf
50 |
51 | if isinstance(cfg, ListConfig):
52 | lst = [instantiate(x) for x in cfg]
53 | return ListConfig(lst, flags={"allow_objects": True})
54 | if isinstance(cfg, list):
55 | # Specialize for list, because many classes take
56 | # list[objects] as arguments, such as ResNet, DatasetMapper
57 | return [instantiate(x) for x in cfg]
58 |
59 | # If input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config),
60 | # instantiate it to the actual dataclass.
61 | if isinstance(cfg, DictConfig) and dataclasses.is_dataclass(cfg._metadata.object_type):
62 | return OmegaConf.to_object(cfg)
63 |
64 | if isinstance(cfg, abc.Mapping) and "_target_" in cfg:
65 | # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all,
66 | # but faster: https://github.com/facebookresearch/hydra/issues/1200
67 | cfg = {k: instantiate(v) for k, v in cfg.items()}
68 | cls = cfg.pop("_target_")
69 | cls = instantiate(cls)
70 |
71 | if isinstance(cls, str):
72 | cls_name = cls
73 | cls = locate(cls_name)
74 | assert cls is not None, cls_name
75 | else:
76 | try:
77 | cls_name = cls.__module__ + "." + cls.__qualname__
78 | except Exception:
79 | # target could be anything, so the above could fail
80 | cls_name = str(cls)
81 | assert callable(cls), f"_target_ {cls} does not define a callable object"
82 | try:
83 | return cls(**cfg)
84 | except TypeError:
85 | logger = logging.getLogger(__name__)
86 | logger.error(f"Error when instantiating {cls_name}!")
87 | raise
88 |     return cfg  # return as-is if we don't know what to do
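A small, hypothetical usage sketch of `instantiate` (not part of the original file): any mapping that carries a `_target_` key is resolved to a callable and called with the remaining keys as keyword arguments.

```python
import torch

from src.core import instantiate  # re-exported alongside LazyConfig in this repo

cfg = {"_target_": "torch.nn.Linear", "in_features": 256, "out_features": 17}
layer = instantiate(cfg)  # equivalent to torch.nn.Linear(in_features=256, out_features=17)
assert isinstance(layer, torch.nn.Linear)
```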
--------------------------------------------------------------------------------
/tools/benchmark/dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """
5 |
6 | import os
7 | import glob
8 | from PIL import Image
9 |
10 | import torch
11 | import torch.utils.data as data
12 | import torchvision
13 | import torchvision.transforms as T
14 | import torchvision.transforms.functional as F
15 |
16 | Image.MAX_IMAGE_PIXELS = None
17 |
18 | class ToTensor(T.ToTensor):
19 | def __init__(self) -> None:
20 | super().__init__()
21 |
22 | def __call__(self, pic):
23 | if isinstance(pic, torch.Tensor):
24 | return pic
25 | return super().__call__(pic)
26 |
27 | class PadToSize(T.Pad):
28 | def __init__(self, size, fill=0, padding_mode='constant'):
29 | super().__init__(0, fill, padding_mode)
30 | self.size = size
31 | self.fill = fill
32 |
33 | def __call__(self, img):
34 | """
35 | Args:
36 | img (PIL Image or Tensor): Image to be padded.
37 |
38 | Returns:
39 | PIL Image or Tensor: Padded image.
40 | """
41 | w, h = F.get_image_size(img)
42 | padding = (0, 0, self.size[0] - w, self.size[1] - h)
43 | return F.pad(img, padding, self.fill, self.padding_mode)
44 |
45 |
46 | class Dataset(data.Dataset):
47 | def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None:
48 | super().__init__()
49 |
50 | self.device = device
51 | self.size = 640
52 |
53 | self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg')))
54 |
55 | if preprocess is None:
56 | self.preprocess = T.Compose([
57 | T.Resize(size=639, max_size=640),
58 | PadToSize(size=(640, 640), fill=114),
59 | ToTensor(),
60 | T.ConvertImageDtype(torch.float),
61 | ])
62 | else:
63 | self.preprocess = preprocess
64 |
65 | def __len__(self, ):
66 | return len(self.im_path_list)
67 |
68 | def __getitem__(self, index):
69 | # im = Image.open(self.img_path_list[index]).convert('RGB')
70 | im = torchvision.io.read_file(self.im_path_list[index])
71 | im = torchvision.io.decode_image(im, mode=torchvision.io.ImageReadMode.RGB).to(self.device)
72 | _, h, w = im.shape # c,h,w
73 |
74 | im = self.preprocess(im)
75 |
76 | blob = {
77 | 'images': im,
78 | # 'im_shape': torch.tensor([self.size, self.size]).to(im.device),
79 | # 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device),
80 | 'orig_target_sizes': torch.tensor([w, h]).to(im.device),
81 | }
82 |
83 | return blob
84 |
85 | @staticmethod
86 | def post_process():
87 | pass
88 |
89 | @staticmethod
90 | def collate_fn():
91 | pass
92 |
93 |
94 | def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''):
95 | '''show result
96 | Keys:
97 | 'num_dets', 'det_boxes', 'det_scores', 'det_classes'
98 | '''
99 | for i in range(blob['image'].shape[0]):
100 | det_scores = outputs['det_scores'][i]
101 | det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold]
102 |
103 | im = (blob['image'][i] * 255).to(torch.uint8)
104 | im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2)
105 | Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg')
106 |
--------------------------------------------------------------------------------
/src/misc/keypoint_loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import torch.nn as nn
4 |
5 | def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):
6 | sigmas = kpt_preds.new_tensor(sigmas)
7 | variances = (sigmas * 2)**2
8 |
9 | assert kpt_preds.size(0) == kpt_gts.size(0)
10 | kpt_preds = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2)
11 | kpt_gts = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2)
12 |
13 | squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \
14 | (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2
15 | squared_distance0 = squared_distance / (kpt_areas[:, None] * variances[None, :] * 2)
16 | squared_distance1 = torch.exp(-squared_distance0)
17 | squared_distance1 = squared_distance1 * kpt_valids
18 | oks = squared_distance1.sum(dim=1) / (kpt_valids.sum(dim=1)+1e-6)
19 |
20 | return oks
21 |
22 | def oks_loss(pred,
23 | target,
24 | valid=None,
25 | area=None,
26 | linear=False,
27 | sigmas=None,
28 | eps=1e-6):
29 | oks = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps)
30 | if linear:
31 | loss = oks
32 | else:
33 | loss = -oks.log()
34 | return loss
35 |
36 |
37 | class OKSLoss(nn.Module):
38 | def __init__(self,
39 | linear=False,
40 | num_keypoints=17,
41 | eps=1e-6,
42 | reduction='mean',
43 | loss_weight=1.0):
44 | super(OKSLoss, self).__init__()
45 | self.linear = linear
46 | self.eps = eps
47 | self.reduction = reduction
48 | self.loss_weight = loss_weight
49 | if num_keypoints == 17:
50 | self.sigmas = np.array([
51 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
52 | 1.07, .87, .87, .89, .89
53 | ], dtype=np.float32) / 10.0
54 | elif num_keypoints == 14:
55 | self.sigmas = np.array([
56 | .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89,
57 | .79, .79
58 | ]) / 10.0
59 | elif num_keypoints == 3:
60 | self.sigmas = np.array([
61 | 1.07, 1.07, 0.67
62 | ]) / 10.0
63 | else:
64 | raise ValueError(f'Unsupported keypoints number {num_keypoints}')
65 |
66 | def forward(self,
67 | pred,
68 | target,
69 | valid,
70 | area,
71 | weight=None,
72 | avg_factor=None,
73 | reduction_override=None):
74 | assert reduction_override in (None, 'none', 'mean', 'sum')
75 | reduction = (
76 | reduction_override if reduction_override else self.reduction)
77 | if (weight is not None) and (not torch.any(weight > 0)) and (
78 | reduction != 'none'):
79 | if pred.dim() == weight.dim() + 1:
80 | weight = weight.unsqueeze(1)
81 | return (pred * weight).sum() # 0
82 | if weight is not None and weight.dim() > 1:
83 | # TODO: remove this in the future
84 | # reduce the weight of shape (n, 4) to (n,) to match the
85 | # iou_loss of shape (n,)
86 | assert weight.shape == pred.shape
87 | weight = weight.mean(-1)
88 | loss = self.loss_weight * oks_loss(
89 | pred,
90 | target,
91 | valid=valid,
92 | area=area,
93 | linear=self.linear,
94 | sigmas=self.sigmas,
95 | eps=self.eps)
96 | return loss
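A minimal usage sketch for `OKSLoss` (illustrative values only): predictions and targets are flattened `(x, y)` pairs per instance, with per-keypoint visibility and per-instance area, and the result is one `-log(OKS)` term per instance.

```python
import torch

from src.misc.keypoint_loss import OKSLoss

loss_fn = OKSLoss(num_keypoints=17, linear=False)
pred = torch.rand(2, 34)       # 2 instances x 17 keypoints x (x, y), flattened
target = torch.rand(2, 34)
valid = torch.ones(2, 17)      # 1 = keypoint visible
area = torch.full((2,), 0.5)   # per-instance (normalized) areas
loss = loss_fn(pred, target, valid, area)  # shape (2,), one -log(OKS) per instance
```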
--------------------------------------------------------------------------------
/tools/benchmark/torch_benchmark.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """
4 | import time
5 | import torch
6 | from torch import nn
7 | import torch.backends.cudnn as cudnn
8 | cudnn.benchmark = True
9 |
10 | import argparse
11 | from dataset import Dataset
12 | from tqdm import tqdm
13 |
14 | import os, sys
15 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..'))
16 | from src.core import LazyConfig, instantiate
17 |
18 | def parse_args():
19 | parser = argparse.ArgumentParser(description='Argument Parser Example')
20 | parser.add_argument('--config_file', '-c', default='./configs/detrpose/detrpose_hgnetv2_l.py', type=str, )
21 | parser.add_argument('--resume', '-r', type=str, )
22 | parser.add_argument('--infer_dir',
23 | type=str,
24 | default='./data/COCO2017/val2017',
25 | help="Directory for images to perform inference on.")
26 | args = parser.parse_args()
27 | return args
28 |
29 | @torch.no_grad()
30 | def warmup(model, data, img_size, n):
31 | for _ in range(n):
32 | _ = model(data, img_size)
33 |
34 | @torch.no_grad()
35 | def speed(model, data, n):
36 | times = []
37 | for i in tqdm(range(n), desc="Running Inference", unit="iteration"):
38 | blob = data[i]
39 | samples, target_sizes = blob['images'].unsqueeze(0), blob['orig_target_sizes']
40 | torch.cuda.synchronize()
41 | t_ = time.perf_counter()
42 | _ = model(samples, target_sizes)
43 | torch.cuda.synchronize()
44 | t = time.perf_counter() - t_
45 | times.append(t)
46 |
47 | # end-to-end model only
48 | times = sorted(times)
49 | if len(times) > 100:
50 | times = times[:100]
51 | return sum(times) / len(times)
52 |
53 | def main():
54 | FLAGS = parse_args()
55 | dataset = Dataset(FLAGS.infer_dir)
56 | blob = torch.ones(1, 3, 640, 640).cuda()
57 |
58 | img_size = torch.tensor([[640, 640]], device='cuda')
59 |
60 | cfg = LazyConfig.load(FLAGS.config_file)
61 |
62 | if hasattr(cfg.model.backbone, 'pretrained'):
63 | cfg.model.backbone.pretrained = False
64 |
65 | model = instantiate(cfg.model)
66 | postprocessor = instantiate(cfg.postprocessor)
67 |
68 | if FLAGS.resume:
69 | checkpoint = torch.load(FLAGS.resume, map_location='cpu')
70 | if 'ema' in checkpoint:
71 | state = checkpoint['ema']['module']
72 | else:
73 | state = checkpoint['model']
74 |
75 | # NOTE load train mode state -> convert to deploy mode
76 |         model.load_state_dict(state)
77 |
78 | else:
79 | # raise AttributeError('Only support resume to load model.state_dict by now.')
80 | print('not load model.state_dict, use default init state dict...')
81 |
82 | class Model(nn.Module):
83 | def __init__(self, ) -> None:
84 | super().__init__()
85 | self.model = model.deploy()
86 | self.postprocessor = postprocessor.deploy()
87 |
88 | def forward(self, images, orig_target_sizes):
89 | outputs = self.model(images)
90 | outputs = self.postprocessor(outputs, orig_target_sizes)
91 | return outputs
92 |
93 | model = Model().cuda()
94 |
95 | warmup(model, blob, img_size, 400)
96 | t = []
97 | for _ in range(1):
98 | t.append(speed(model, dataset, 1000))
99 | avg_latency = 1000 * torch.tensor(t).mean()
100 | print(f"model: {FLAGS.config_file}, Latency: {avg_latency:.2f} ms")
101 |
102 | del model
103 | torch.cuda.empty_cache()
104 | time.sleep(1)
105 |
106 |
107 | if __name__ == '__main__':
108 | main()
109 |
--------------------------------------------------------------------------------
/src/nn/optimizer/ema.py:
--------------------------------------------------------------------------------
1 | """
2 | D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement
3 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
6 | Copyright (c) 2023 lyuwenyu. All Rights Reserved.
7 | """
8 |
9 | import math
10 | from copy import deepcopy
11 |
12 | import torch
13 | import torch.nn as nn
14 |
15 | from ...misc import dist_utils
16 |
17 | __all__ = ["ModelEMA"]
18 |
19 |
20 | class ModelEMA(object):
21 | """
22 | Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
23 | Keep a moving average of everything in the model state_dict (parameters and buffers).
24 | This is intended to allow functionality like
25 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
26 | A smoothed version of the weights is necessary for some training schemes to perform well.
27 | This class is sensitive where it is initialized in the sequence of model init,
28 | GPU assignment and distributed training wrappers.
29 | """
30 |
31 | def __init__(
32 | self, model: nn.Module, decay: float = 0.9999, warmups: int = 1000, start: int = 0
33 | ):
34 | super().__init__()
35 |
36 | self.module = deepcopy(dist_utils.de_parallel(model)).eval()
37 | # if next(model.parameters()).device.type != 'cpu':
38 | # self.module.half() # FP16 EMA
39 |
40 | self.decay = decay
41 | self.warmups = warmups
42 | self.before_start = 0
43 | self.start = start
44 | self.updates = 0 # number of EMA updates
45 | if warmups == 0:
46 | self.decay_fn = lambda x: decay
47 | else:
48 | self.decay_fn = lambda x: decay * (
49 | 1 - math.exp(-x / warmups)
50 | ) # decay exponential ramp (to help early epochs)
51 |
52 | for p in self.module.parameters():
53 | p.requires_grad_(False)
54 |
55 | def update(self, model: nn.Module):
56 | if self.before_start < self.start:
57 | self.before_start += 1
58 | return
59 | # Update EMA parameters
60 | with torch.no_grad():
61 | self.updates += 1
62 | d = self.decay_fn(self.updates)
63 | msd = dist_utils.de_parallel(model).state_dict()
64 | for k, v in self.module.state_dict().items():
65 | if v.dtype.is_floating_point:
66 | v *= d
67 | v += (1 - d) * msd[k].detach()
68 |
69 | def to(self, *args, **kwargs):
70 | self.module = self.module.to(*args, **kwargs)
71 | return self
72 |
73 | def state_dict(
74 | self,
75 | ):
76 | return dict(module=self.module.state_dict(), updates=self.updates)
77 |
78 | def load_state_dict(self, state, strict=True):
79 | self.module.load_state_dict(state["module"], strict=strict)
80 | if "updates" in state:
81 | self.updates = state["updates"]
82 |
83 |     def forward(
84 | self,
85 | ):
86 | raise RuntimeError("ema...")
87 |
88 | def extra_repr(self) -> str:
89 | return f"decay={self.decay}, warmups={self.warmups}, name=ema"
90 |
91 |
92 | class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
93 | """Maintains moving averages of model parameters using an exponential decay.
94 | ``ema_avg = decay * avg_model_param + (1 - decay) * model_param``
95 |     `torch.optim.swa_utils.AveragedModel`
96 | is used to compute the EMA.
97 | """
98 |
99 | def __init__(self, model, decay, device="cpu", use_buffers=True):
100 | self.decay_fn = lambda x: decay * (1 - math.exp(-x / 2000))
101 |
102 | def ema_avg(avg_model_param, model_param, num_averaged):
103 | decay = self.decay_fn(num_averaged)
104 | return decay * avg_model_param + (1 - decay) * model_param
105 |
106 | super().__init__(model, device, ema_avg, use_buffers=use_buffers)
107 |
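A hedged sketch of how `ModelEMA` is typically wired into a training loop; the tiny model and data below are placeholders, not objects from this repository.

```python
import torch
import torch.nn as nn

from src.nn.optimizer.ema import ModelEMA

model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
ema = ModelEMA(model, decay=0.9999, warmups=1000)

for step in range(10):
    x, y = torch.randn(4, 8), torch.randn(4, 2)
    loss = nn.functional.mse_loss(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema.update(model)   # EMA weights track the live model after each optimizer step

eval_model = ema.module  # evaluate or export with the smoothed weights
```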
--------------------------------------------------------------------------------
/tools/deployment/export_onnx.py:
--------------------------------------------------------------------------------
1 | """
2 | ---------------------------------------------------------------------------------
3 | Modified from D-FINE
4 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
5 | ---------------------------------------------------------------------------------
6 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
7 | Copyright (c) 2023 lyuwenyu. All Rights Reserved.
8 | """
9 |
10 | import os
11 | import sys
12 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..'))
13 | from src.core import LazyConfig, instantiate
14 |
15 | import torch
16 | import torch.nn as nn
17 |
18 | def main(args, ):
19 | """main
20 | """
21 | cfg = LazyConfig.load(args.config_file)
22 |
23 | if hasattr(cfg.model.backbone, 'pretrained'):
24 | cfg.model.backbone.pretrained = False
25 |
26 | model = instantiate(cfg.model)
27 | postprocessor = instantiate(cfg.postprocessor)
28 |
29 | if args.resume:
30 | checkpoint = torch.load(args.resume, map_location='cpu', weights_only=False)
31 | if 'ema' in checkpoint:
32 | state = checkpoint['ema']['module']
33 | else:
34 | state = checkpoint['model']
35 |
36 | # NOTE load train mode state -> convert to deploy mode
37 | model.load_state_dict(state)
38 |
39 | else:
40 | # raise AttributeError('Only support resume to load model.state_dict by now.')
41 | print('not load model.state_dict, use default init state dict...')
42 |
43 | model = model.deploy()
44 | model.eval()
45 |
46 | class Model(nn.Module):
47 | def __init__(self, ) -> None:
48 | super().__init__()
49 | self.model = model
50 | self.postprocessor = postprocessor.deploy()
51 |
52 | def forward(self, images, orig_target_sizes):
53 | outputs = self.model(images)
54 | outputs = self.postprocessor(outputs, orig_target_sizes)
55 | return outputs
56 |
57 | model = Model()
58 |
59 | data = torch.rand(1, 3, 640, 640)
60 | size = torch.tensor([[640, 640]])
61 | _ = model(data, size)
62 |
63 | dynamic_axes = {
64 | 'images': {0: 'N', },
65 | 'orig_target_sizes': {0: 'N'}
66 | }
67 |
68 |     output_folder = 'onnx_engines'
69 |     os.makedirs(output_folder, exist_ok=True)
70 |     output_file = args.config_file.split('/')[-1].replace('.py', '.onnx')
71 |     output_file = f'{output_folder}/{output_file}'
72 | # args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx'
73 |
74 | torch.onnx.export(
75 | model,
76 | (data, size),
77 | output_file,
78 | input_names=['images', 'orig_target_sizes'],
79 | output_names=['scores', 'labels', 'keypoints'],
80 | dynamic_axes=dynamic_axes,
81 | opset_version=16,
82 | # dynamo=True,
83 | # external_data=False,
84 | # verify=True,
85 | # report=True,
86 | verbose=False,
87 | do_constant_folding=True,
88 | )
89 |
90 | if args.check:
91 | import onnx
92 | onnx_model = onnx.load(output_file)
93 | onnx.checker.check_model(onnx_model)
94 | print('Check export onnx model done...')
95 |
96 | if args.simplify:
97 | import onnx
98 | import onnxsim
99 | dynamic = True
100 | # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None
101 | input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None
102 | onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes)
103 | onnx.save(onnx_model_simplify, output_file)
104 | print(f'Simplify onnx model {check}...')
105 |
106 |
107 | if __name__ == '__main__':
108 |
109 | import argparse
110 | parser = argparse.ArgumentParser()
111 |     parser.add_argument('--config_file', '-c', default='configs/detrpose/detrpose_hgnetv2_l.py', type=str, )
112 | parser.add_argument('--resume', '-r', type=str, )
113 | parser.add_argument('--check', action='store_true', default=True,)
114 | parser.add_argument('--simplify', action='store_true', default=True,)
115 | args = parser.parse_args()
116 | main(args)
117 |
--------------------------------------------------------------------------------
/src/misc/box_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | """
3 | Utilities for bounding box manipulation and GIoU.
4 | """
5 | import torch, os
6 | from torchvision.ops.boxes import box_area
7 |
8 |
9 | def box_cxcywh_to_xyxy(x):
10 | x_c, y_c, w, h = x.unbind(-1)
11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
12 | (x_c + 0.5 * w), (y_c + 0.5 * h)]
13 | return torch.stack(b, dim=-1)
14 |
15 |
16 | def box_xyxy_to_cxcywh(x):
17 | x0, y0, x1, y1 = x.unbind(-1)
18 | b = [(x0 + x1) / 2, (y0 + y1) / 2,
19 | (x1 - x0), (y1 - y0)]
20 | return torch.stack(b, dim=-1)
21 |
22 |
23 | # modified from torchvision to also return the union
24 | def box_iou(boxes1, boxes2):
25 | area1 = box_area(boxes1)
26 | area2 = box_area(boxes2)
27 |
28 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
29 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
30 |
31 | wh = (rb - lt).clamp(min=0) # [N,M,2]
32 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
33 |
34 | union = area1[:, None] + area2 - inter
35 |
36 | iou = inter / (union + 1e-6)
37 | return iou, union
38 |
39 |
40 | def generalized_box_iou(boxes1, boxes2):
41 | """
42 | Generalized IoU from https://giou.stanford.edu/
43 |
44 | The boxes should be in [x0, y0, x1, y1] format
45 |
46 | Returns a [N, M] pairwise matrix, where N = len(boxes1)
47 | and M = len(boxes2)
48 | """
49 |     # degenerate boxes give inf / nan results
50 | # so do an early check
51 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
52 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
53 | iou, union = box_iou(boxes1, boxes2)
54 |
55 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
56 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
57 |
58 | wh = (rb - lt).clamp(min=0) # [N,M,2]
59 | area = wh[:, :, 0] * wh[:, :, 1]
60 |
61 | return iou - (area - union) / (area + 1e-6)
62 |
63 |
64 |
65 | # modified from torchvision to also return the union
66 | def box_iou_pairwise(boxes1, boxes2):
67 | area1 = box_area(boxes1)
68 | area2 = box_area(boxes2)
69 |
70 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2]
71 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2]
72 |
73 | wh = (rb - lt).clamp(min=0) # [N,2]
74 | inter = wh[:, 0] * wh[:, 1] # [N]
75 |
76 | union = area1 + area2 - inter
77 |
78 | iou = inter / union
79 | return iou, union
80 |
81 |
82 | def generalized_box_iou_pairwise(boxes1, boxes2):
83 | """
84 | Generalized IoU from https://giou.stanford.edu/
85 |
86 | Input:
87 | - boxes1, boxes2: N,4
88 | Output:
89 |     - giou: N,
90 | """
91 |     # degenerate boxes give inf / nan results
92 | # so do an early check
93 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
94 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
95 | assert boxes1.shape == boxes2.shape
96 |     iou, union = box_iou_pairwise(boxes1, boxes2)  # N,
97 |
98 | lt = torch.min(boxes1[:, :2], boxes2[:, :2])
99 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])
100 |
101 | wh = (rb - lt).clamp(min=0) # [N,2]
102 | area = wh[:, 0] * wh[:, 1]
103 |
104 | return iou - (area - union) / area
105 |
106 | def masks_to_boxes(masks):
107 | """Compute the bounding boxes around the provided masks
108 |
109 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
110 |
111 | Returns a [N, 4] tensors, with the boxes in xyxy format
112 | """
113 | if masks.numel() == 0:
114 | return torch.zeros((0, 4), device=masks.device)
115 |
116 | h, w = masks.shape[-2:]
117 |
118 | y = torch.arange(0, h, dtype=torch.float)
119 | x = torch.arange(0, w, dtype=torch.float)
120 | y, x = torch.meshgrid(y, x)
121 |
122 | x_mask = (masks * x.unsqueeze(0))
123 | x_max = x_mask.flatten(1).max(-1)[0]
124 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
125 |
126 | y_mask = (masks * y.unsqueeze(0))
127 | y_max = y_mask.flatten(1).max(-1)[0]
128 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
129 |
130 | return torch.stack([x_min, y_min, x_max, y_max], 1)
131 |
132 | if __name__ == '__main__':
133 | x = torch.rand(5, 4)
134 | y = torch.rand(3, 4)
135 | iou, union = box_iou(x, y)
--------------------------------------------------------------------------------
/src/data/container.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/)
6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/)
9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved.
10 | ---------------------------------------------------------------------------------
11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/)
12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved.
13 | """
14 |
15 | from omegaconf import ListConfig
16 | import random
17 |
18 | class Compose(object):
19 | def __init__(self, policy=None, mosaic_prob=0.0, **transforms):
20 | self.transforms = []
21 | for transform in transforms.values():
22 | self.transforms.append(transform)
23 |
24 | self.mosaic_prob = mosaic_prob
25 |
26 | if policy is None:
27 | self.policy = {'name': 'default'}
28 | else:
29 | self.policy = policy
30 | if self.mosaic_prob > 0:
31 |                 print(" ### Mosaic with Prob.@{} and RandomZoomOut/RandomCrop enabled ### ".format(self.mosaic_prob))
32 | print(" ### ImgTransforms Epochs: {} ### ".format(policy['epoch']))
33 | print(' ### Policy_ops@{} ###'.format(policy['ops']))
34 |
35 | ### warnings ##
36 | self.warning_mosaic_start = True
37 |
38 | def __call__(self, image, target, dataset=None):
39 | return self.get_forward(self.policy['name'])(image, target, dataset)
40 |
41 | def get_forward(self, name):
42 | forwards = {
43 | 'default': self.default_forward,
44 | 'stop_epoch': self.stop_epoch_forward,
45 | }
46 | return forwards[name]
47 |
48 | def default_forward(self, image, target, dataset=None):
49 | for transform in self.transforms:
50 | image, target = transform(image, target)
51 | return image, target
52 |
53 | def stop_epoch_forward(self, image, target, dataset=None):
54 | cur_epoch = dataset.epoch
55 | policy_ops = self.policy['ops']
56 | policy_epoch = self.policy['epoch']
57 |
58 | if isinstance(policy_epoch, (list, ListConfig)) and len(policy_epoch) == 3:
59 | if policy_epoch[0] <= cur_epoch < policy_epoch[1]:
60 |                 with_mosaic = random.random() <= self.mosaic_prob  # Probability for Mosaic
61 | else:
62 | with_mosaic = False
63 |
64 | for transform in self.transforms:
65 | if (type(transform).__name__ in policy_ops and cur_epoch < policy_epoch[0]): # first stage: NoAug
66 | pass
67 | elif (type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch[-1]): # last stage: NoAug
68 | pass
69 | else:
70 | # Using Mosaic for [policy_epoch[0], policy_epoch[1]] with probability
71 | if (type(transform).__name__ == 'Mosaic' and not with_mosaic):
72 | pass
73 |                     # Mosaic and ZoomOut/IoUCrop cannot be applied to the same sample
74 | elif (type(transform).__name__ == 'RandomZoomOut' or type(transform).__name__ == 'RandomCrop') and with_mosaic:
75 | pass
76 | else:
77 | if type(transform).__name__ == 'Mosaic':
78 | if self.warning_mosaic_start:
79 | # It shows in which epochs mosaic is being used
80 | print(f' ### Mosaic is being used @ epoch {cur_epoch}...')
81 | self.warning_mosaic_start = False
82 | image, target = transform(image, target, dataset)
83 | else:
84 | image, target = transform(image, target)
85 | else:
86 | for transform in self.transforms:
87 | image, target = transform(image, target)
88 |
89 | return image, target
90 |
91 | def __repr__(self):
92 | format_string = self.__class__.__name__ + "("
93 | for t in self.transforms:
94 | format_string += "\n"
95 | format_string += " {0}".format(t)
96 | format_string += "\n)"
97 | return format_string
--------------------------------------------------------------------------------
/src/nn/backbone/resnet.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Conditional DETR
3 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Copied from DETR (https://github.com/facebookresearch/detr)
7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
8 | # ------------------------------------------------------------------------
9 |
10 | """
11 | Backbone modules.
12 | """
13 | import os
14 |
15 | import torch
16 | import torch.nn.functional as F
17 | import torchvision
18 | from torch import nn
19 | from torchvision.models._utils import IntermediateLayerGetter
20 | from typing import Dict, List
21 |
22 | class FrozenBatchNorm2d(torch.nn.Module):
23 | """
24 | BatchNorm2d where the batch statistics and the affine parameters are fixed.
25 |
26 |     Copy-paste from torchvision.misc.ops with added eps before rsqrt,
27 | without which any other models than torchvision.models.resnet[18,34,50,101]
28 | produce nans.
29 | """
30 |
31 | def __init__(self, n):
32 | super(FrozenBatchNorm2d, self).__init__()
33 | self.register_buffer("weight", torch.ones(n))
34 | self.register_buffer("bias", torch.zeros(n))
35 | self.register_buffer("running_mean", torch.zeros(n))
36 | self.register_buffer("running_var", torch.ones(n))
37 |
38 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
39 | missing_keys, unexpected_keys, error_msgs):
40 | num_batches_tracked_key = prefix + 'num_batches_tracked'
41 | if num_batches_tracked_key in state_dict:
42 | del state_dict[num_batches_tracked_key]
43 |
44 | super(FrozenBatchNorm2d, self)._load_from_state_dict(
45 | state_dict, prefix, local_metadata, strict,
46 | missing_keys, unexpected_keys, error_msgs)
47 |
48 | def forward(self, x):
49 | # move reshapes to the beginning
50 | # to make it fuser-friendly
51 | w = self.weight.reshape(1, -1, 1, 1)
52 | b = self.bias.reshape(1, -1, 1, 1)
53 | rv = self.running_var.reshape(1, -1, 1, 1)
54 | rm = self.running_mean.reshape(1, -1, 1, 1)
55 | eps = 1e-5
56 | scale = w * (rv + eps).rsqrt()
57 | bias = b - rm * scale
58 | return x * scale + bias
59 |
60 |
61 | class BackboneBase(nn.Module):
62 | def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_indices: list):
63 | super().__init__()
64 | for name, parameter in backbone.named_parameters():
65 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
66 | parameter.requires_grad_(False)
67 |
68 | return_layers = {}
69 | for idx, layer_index in enumerate(return_interm_indices):
70 | return_layers.update({"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)})
71 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
72 | self.num_channels = num_channels
73 |
74 | def forward(self, input):
75 | xs = self.body(input)
76 | return xs.values()
77 |
78 |
79 | class ResNet(BackboneBase):
80 | """ResNet backbone with frozen BatchNorm."""
81 | def __init__(self, name: str,
82 | train_backbone: bool,
83 | dilation: bool,
84 | return_interm_indices:list,
85 | batch_norm=FrozenBatchNorm2d,
86 | pretrained=False,
87 | ):
88 | if name in ['resnet18', 'resnet34', 'resnet50', 'resnet101']:
89 | backbone = getattr(torchvision.models, name)(
90 | replace_stride_with_dilation=[False, False, dilation],
91 | pretrained=pretrained, norm_layer=batch_norm)
92 | else:
93 |             raise NotImplementedError("Unsupported backbone name: {}".format(name))
94 | # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
95 | assert name not in ('resnet18', 'resnet34'), "Only resnet50 and resnet101 are available."
96 | assert return_interm_indices in [[0,1,2,3], [1,2,3], [2, 3], [3]]
97 | num_channels_all = [256, 512, 1024, 2048]
98 | num_channels = num_channels_all[4-len(return_interm_indices):]
99 | super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
100 |
101 |
--------------------------------------------------------------------------------
/src/misc/get_param_dicts.py:
--------------------------------------------------------------------------------
1 | import json
2 | import torch
3 | import torch.nn as nn
4 |
5 | import re
6 |
7 |
8 | def get_optim_params(cfg: list, model: nn.Module):
9 | """
10 | E.g.:
11 | ^(?=.*a)(?=.*b).*$ means including a and b
12 | ^(?=.*(?:a|b)).*$ means including a or b
13 | ^(?=.*a)(?!.*b).*$ means including a, but not b
14 | """
15 |
16 | param_groups = []
17 | visited = []
18 |
19 | cfg_ = []
20 | for pg in cfg:
21 | cfg_.append(dict(pg))
22 |
23 | for pg in cfg_:
24 | pattern = pg['params']
25 | params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0}
26 | pg['params'] = params.values()
27 | param_groups.append(pg)
28 | visited.extend(list(params.keys()))
29 |
30 | names = [k for k, v in model.named_parameters() if v.requires_grad]
31 |
32 | if len(visited) < len(names):
33 | unseen = set(names) - set(visited)
34 | params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen}
35 | param_groups.append({'params': params.values()})
36 | visited.extend(list(params.keys()))
37 |
38 | assert len(visited) == len(names), ''
39 |
40 | return param_groups
41 |
42 | def match_name_keywords(n: str, name_keywords: list):
43 | out = False
44 | for b in name_keywords:
45 | if b in n:
46 | out = True
47 | break
48 | return out
49 |
50 |
51 | def get_param_dict(args, model_without_ddp: nn.Module):
52 | try:
53 | param_dict_type = args.param_dict_type
54 | except:
55 | param_dict_type = 'default'
56 | assert param_dict_type in ['default', 'ddetr_in_mmdet', 'large_wd']
57 |
58 | # by default
59 | if param_dict_type == 'default':
60 | param_dicts = [
61 | {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
62 | {
63 | "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
64 | "lr": args.lr_backbone,
65 | }
66 | ]
67 | return param_dicts
68 |
69 | if param_dict_type == 'ddetr_in_mmdet':
70 | param_dicts = [
71 | {
72 | "params":
73 | [p for n, p in model_without_ddp.named_parameters()
74 | if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
75 | "lr": args.lr,
76 | },
77 | {
78 | "params": [p for n, p in model_without_ddp.named_parameters()
79 | if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
80 | "lr": args.lr_backbone,
81 | },
82 | {
83 | "params": [p for n, p in model_without_ddp.named_parameters()
84 | if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
85 | "lr": args.lr * args.lr_linear_proj_mult,
86 | }
87 | ]
88 | return param_dicts
89 |
90 | if param_dict_type == 'large_wd':
91 | param_dicts = [
92 | {
93 | "params":
94 | [p for n, p in model_without_ddp.named_parameters()
95 | if not match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
96 | },
97 | {
98 | "params": [p for n, p in model_without_ddp.named_parameters()
99 | if match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
100 | "lr": args.lr_backbone,
101 | "weight_decay": 0.0,
102 | },
103 | {
104 | "params": [p for n, p in model_without_ddp.named_parameters()
105 | if match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
106 | "lr": args.lr_backbone,
107 | "weight_decay": args.weight_decay,
108 | },
109 | {
110 | "params":
111 | [p for n, p in model_without_ddp.named_parameters()
112 | if not match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
113 | "lr": args.lr,
114 | "weight_decay": 0.0,
115 | }
116 | ]
117 |
118 | # print("param_dicts: {}".format(param_dicts))
119 |
120 | return param_dicts
121 |
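To make the regex-based grouping concrete, here is an illustrative sketch with a toy model (the pattern and learning rates are made up for the example); parameters whose names match no pattern fall into a default group.

```python
import torch
import torch.nn as nn

from src.misc.get_param_dicts import get_optim_params

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(8, 8)
        self.head = nn.Linear(8, 2)

model = Toy()
cfg = [
    {'params': '^(?=.*backbone).*$', 'lr': 1e-5},  # every parameter whose name contains "backbone"
]
param_groups = get_optim_params(cfg, model)  # unmatched head.* params go to a default group
optimizer = torch.optim.AdamW(param_groups, lr=1e-4)
```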
--------------------------------------------------------------------------------
/assets/TENSORRT_DEB_LAMBDA.AI.md:
--------------------------------------------------------------------------------
1 |
2 | Manual for installing TensorRT on Lambda.ai instances
3 |
4 |
5 | ## Quick Start
6 | ### Lambda.ai
7 | 1. Go to [Lambda.ai](https://lambda.ai) and create an account.
8 | 2. Log in to your Lambda.ai account.
9 | 3. Click the `Launch instance` button, located at the top right of the website.
10 | 4. Select an instance. To replicate our results from the appendix, select `1x A10 (24 GB PCIe)`.
11 |
12 | ### CUDA Installation
13 | The Lambda Stack installs a pre-packaged version of CUDA with only what's needed for typical deep learning workflows.
14 | The `.deb` TensorRT installation, however, expects the full CUDA Toolkit to already be installed in the standard way via NVIDIA's `.deb` repo.
15 | That is why TensorRT only installs cleanly after the full CUDA Toolkit is in place:
16 | installing it first ensures all the expected binaries, libraries, and metadata are available for TensorRT.
17 |
18 | 1. Check which CUDA version your Lambda.ai instance is using
19 | ```shell
20 | nvidia-smi
21 | ```
22 | We got the following output
23 | ```shell
24 | +-----------------------------------------------------------------------------------------+
25 | | NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 |
26 | |-----------------------------------------+------------------------+----------------------+
27 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
28 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
29 | | | | MIG M. |
30 | |=========================================+========================+======================|
31 | | 0 NVIDIA A10 On | 00000000:06:00.0 Off | 0 |
32 | | 0% 28C P8 9W / 150W | 1MiB / 23028MiB | 0% Default |
33 | | | | N/A |
34 | +-----------------------------------------+------------------------+----------------------+
35 |
36 | +-----------------------------------------------------------------------------------------+
37 | | Processes: |
38 | | GPU GI CI PID Type Process name GPU Memory |
39 | | ID ID Usage |
40 | |=========================================================================================|
41 | | No running processes found |
42 | +-----------------------------------------------------------------------------------------+
43 | ```
44 |
45 | 2. Install the CUDA toolkit with the correct version (in our case 12.8)
46 | ```shell
47 | # cuda installation
48 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
49 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
50 | sudo apt-get update
51 | sudo apt-get -y install cuda-toolkit-12-8
52 | ```
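Optionally, confirm the toolkit is visible. On a fresh shell `nvcc` may not be on your `PATH` yet; the `.deb` install typically places it under `/usr/local/cuda-12.8/bin`:
```shell
# optional sanity check after the toolkit install
/usr/local/cuda-12.8/bin/nvcc --version
```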
53 |
54 | 3. Install TensorRT
55 |
56 | The `.deb` installation installs the latest TensorRT release.
57 | ```shell
58 | #tensorrt installation
59 | wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/local_repo/nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8_1.0-1_amd64.deb
60 | sudo dpkg -i nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8_1.0-1_amd64.deb
61 | sudo cp /var/nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8/nv-tensorrt-local-AD7406A2-keyring.gpg /usr/share/keyrings/
62 | sudo apt-get update
63 | sudo apt-get install tensorrt
64 | ```
65 |
66 | The complete installation takes approximately 10-15 minutes.
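To confirm the TensorRT packages landed (the exact package list depends on the version you installed), you can list them with `dpkg`:
```shell
# optional: list the installed TensorRT packages
dpkg -l | grep -i tensorrt
```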
67 |
68 | ## Installing DETRPose
69 | ### Quick Start
70 | ```shell
71 | git clone https://github.com/SebastianJanampa/DETRPose.git
72 | cd DETRPose
73 | pip install -r requirements.txt
74 | ```
75 |
76 | ### Data Preparation
77 | ```shell
78 | pip install gdown # to download files from google drive
79 | gdown 1VprytECcLtU4tKP32SYi_7oDRbw7yUTL # images
80 | unzip images.zip
81 | ```
82 |
83 | ### Usage
84 | ```shell
85 | pip install onnx onnxsim
86 | pip install -r tools/benchmark/requirements.txt
87 |
88 | export model=l #n, s, m, l, x
89 | mkdir trt_engines
90 | ```
91 | 1. Download official weights
92 | ```shell
93 | wget https://github.com/SebastianJanampa/DETRPose/releases/download/model_weights/detrpose_hgnetv2_${model}.pth
94 | ```
95 | 2. Export onnx
96 | ```shell
97 | python tools/deployment/export_onnx.py --check -c configs/detrpose/detrpose_hgnetv2_${model}.py -r detrpose_hgnetv2_${model}.pth
98 | ```
99 | 3. Export tensorrt
100 | ```shell
101 | alias trtexec="/usr/src/tensorrt/bin/trtexec"
102 | trtexec --onnx="onnx_engines/detrpose_hgnetv2_${model}.onnx" --saveEngine="trt_engines/detrpose_hgnetv2_${model}.engine" --fp16
103 | ```
104 | 4. Benchmark
105 | ```shell
106 | python tools/benchmark/trt_benchmark.py --infer_dir ./images --engine_dir trt_engines
107 | ```
--------------------------------------------------------------------------------
/tools/visualization/backbone_encoder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 IDEA. All Rights Reserved.
2 | # ------------------------------------------------------------------------
3 | import os, sys
4 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..'))
5 |
6 | import argparse
7 |
8 | import matplotlib as mpl
9 | import matplotlib.pyplot as plt
10 | from matplotlib.backends.backend_agg import FigureCanvasAgg
11 |
12 | import torch
13 | from torch.utils.data import DataLoader
14 |
15 | from util.slconfig import SLConfig
16 | import util.misc as utils
17 |
18 | import datasets
19 | from datasets import build_dataset, BatchImageCollateFunction
20 |
21 |
22 | def create(args, classname):
23 | # we use register to maintain models from catdet6 on.
24 | from models.registry import MODULE_BUILD_FUNCS
25 | class_module = getattr(args, classname)
26 | assert class_module in MODULE_BUILD_FUNCS._module_dict
27 | build_func = MODULE_BUILD_FUNCS.get(class_module)
28 | return build_func(args)
29 |
30 | def main(args):
31 | cfg = SLConfig.fromfile(args.config)
32 | device = args.device
33 |
34 | setattr(cfg, 'coco_path', args.data_path)
35 | setattr(cfg, 'batch_size_train', 1)
36 | setattr(cfg, 'batch_size_val', 1)
37 |
38 | if 'HGNetv2' in cfg.backbone:
39 | cfg.pretrained = False
40 |
41 | # build model
42 | model, _ = create(cfg, 'modelname')
43 | model.to(device)
44 |
45 | dataset_val = build_dataset(image_set='val', args=cfg)
46 |
47 | sampler_val = torch.utils.data.SequentialSampler(dataset_val)
48 |
49 | data_loader_val = DataLoader(dataset_val, 1, sampler=sampler_val, drop_last=False, collate_fn=BatchImageCollateFunction(), num_workers=4)
50 |
51 | if args.resume:
52 | checkpoint = torch.load(args.resume, map_location='cpu')
53 | if 'ema' in checkpoint:
54 | state = checkpoint['ema']['module']
55 | else:
56 | state = checkpoint['model']
57 |
58 | # NOTE load train mode state -> convert to deploy mode
59 | model.load_state_dict(state)
60 |
61 | # folder path
62 | main_folder = cfg.output_dir
63 | if 'data/wireframe_processed' in args.data_path:
64 | backbone_dir = f'{main_folder}/visualization/backbone_wireframe'
65 | encoder_dir = f'{main_folder}/visualization/encoder_wireframe'
66 |
67 | elif 'data/york_processed' in args.data_path:
68 | backbone_dir = f'{main_folder}/visualization/backbone_york'
69 | encoder_dir = f'{main_folder}/visualization/encoder_york'
70 | else:
71 |         raise ValueError('Dataset does not exist. We support only the wireframe and york datasets')
72 |
73 | os.makedirs(backbone_dir , exist_ok=True)
74 | os.makedirs(encoder_dir, exist_ok=True)
75 |
76 | with torch.no_grad():
77 |
78 | for i, (samples, targets) in enumerate(data_loader_val):
79 | samples = samples.to(device)
80 |
81 | enc_feature_maps = []
82 | backbone_feature_maps = []
83 | hooks = [
84 | model.backbone.register_forward_hook(
85 | lambda self, input, output: backbone_feature_maps.append(output)
86 | ),
87 | model.encoder.register_forward_hook(
88 | lambda self, input, output: enc_feature_maps.append(output)
89 | ),
90 | ]
91 | model(samples)
92 |
93 | for hook in hooks:
94 | hook.remove()
95 |
96 | back_feats = backbone_feature_maps[0]
97 | enc_feats = enc_feature_maps[0]
98 |
99 | curr_img_id = targets[0]['image_id'].tolist()[0]
100 |
101 | for j, back_feat in enumerate(back_feats):
102 | down = j + 1
103 |
104 | back_feat = back_feat[0].mean(0).cpu()
105 |
106 | fig = plt.figure(figsize=(16, 16))
107 | plt.axis('off')
108 | plt.imshow(back_feat)
109 | plt.savefig(
110 | f"{backbone_dir}/{curr_img_id}_ds_{down}.png",
111 | bbox_inches='tight',
112 | pad_inches=0,
113 | dpi=200
114 | )
115 | plt.close()
116 |
117 | for j, enc_feat in enumerate(enc_feats):
118 | down = j + 1
119 |
120 | enc_feat = enc_feat[0].mean(0).cpu()
121 |
122 | fig = plt.figure(figsize=(16, 16))
123 | plt.axis('off')
124 | plt.imshow(enc_feat)
125 | plt.savefig(
126 | f"{encoder_dir}/{curr_img_id}_ds_{down}.png",
127 | bbox_inches='tight',
128 | pad_inches=0,
129 | dpi=200
130 | )
131 | plt.close()
132 |
133 | # check condition to stop program
134 | if args.num_images is not None and i + 1 >= args.num_images:
135 | break
136 |
137 |
138 | if __name__ == '__main__':
139 | parser = argparse.ArgumentParser('Visualization of Deformable Line Attention')
140 | parser.add_argument('-c', '--config', type=str, required=True)
141 | parser.add_argument('-r', '--resume', default='', help='resume from checkpoint')
142 | parser.add_argument('-p', '--data-path', type=str, default='data/wireframe_processed', help='data path')
143 | parser.add_argument('-d', '--device', type=str, default='cpu')
144 | parser.add_argument('-n', '--num_images', type=int, help='total number of images to plot')
145 | args = parser.parse_args()
146 | main(args)
147 |
--------------------------------------------------------------------------------
/src/data/coco.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | """
3 | COCO dataset which returns image_id for evaluation.
4 |
5 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
6 | """
7 |
8 | from pathlib import Path
9 | import cv2
10 | import numpy as np
11 | import torch
12 | import torch.utils.data
13 | from PIL import Image
14 | from pycocotools import mask as coco_mask
15 | from pycocotools.coco import COCO
16 | # import datasets.transforms as T
17 |
18 | __all__ = ['build']
19 |
20 |
21 | class CocoDetection(torch.utils.data.Dataset):
22 | def __init__(self, img_folder, ann_file, transforms, return_masks=False):
23 | super(CocoDetection, self).__init__()
24 | self._transforms = transforms
25 | self.prepare = ConvertCocoPolysToMask(return_masks)
26 |
27 | self.img_folder = Path(img_folder)
28 | self.coco = COCO(ann_file)
29 | imgIds = sorted(self.coco.getImgIds())
30 |
31 | if "train" in ann_file:
32 | self.all_imgIds = []
33 | for image_id in imgIds:
34 | if self.coco.getAnnIds(imgIds=image_id) == []:
35 | continue
36 | ann_ids = self.coco.getAnnIds(imgIds=image_id)
37 | target = self.coco.loadAnns(ann_ids)
38 | num_keypoints = [obj["num_keypoints"] for obj in target]
39 | if sum(num_keypoints) == 0:
40 | continue
41 | self.all_imgIds.append(image_id)
42 | else:
43 | self.all_imgIds = []
44 | for image_id in imgIds:
45 | self.all_imgIds.append(image_id)
46 |
47 | def set_epoch(self, epoch):
48 | self._epoch = epoch
49 |
50 | @property
51 | def epoch(self):
52 | return self._epoch if hasattr(self, '_epoch') else -1
53 |
54 | def __len__(self):
55 | return len(self.all_imgIds)
56 |
57 | def load_item(self, idx):
58 | image_id = self.all_imgIds[idx]
59 | ann_ids = self.coco.getAnnIds(imgIds=image_id)
60 | target = self.coco.loadAnns(ann_ids)
61 |
62 | target = {'image_id': image_id, 'annotations': target}
63 | img = Image.open(self.img_folder / self.coco.loadImgs(image_id)[0]['file_name'])
64 | img, target = self.prepare(img, target)
65 | return img, target
66 |
67 | def __getitem__(self, idx):
68 | img, target = self.load_item(idx)
69 | if self._transforms is not None:
70 | img, target = self._transforms(img, target, self)
71 | return img, target
72 |
73 |
74 | def convert_coco_poly_to_mask(segmentations, height, width):
75 | masks = []
76 | for polygons in segmentations:
77 | rles = coco_mask.frPyObjects(polygons, height, width)
78 | mask = coco_mask.decode(rles)
79 | if len(mask.shape) < 3:
80 | mask = mask[..., None]
81 | mask = torch.as_tensor(mask, dtype=torch.uint8)
82 | mask = mask.any(dim=2)
83 | masks.append(mask)
84 | if masks:
85 | masks = torch.stack(masks, dim=0)
86 | else:
87 | masks = torch.zeros((0, height, width), dtype=torch.uint8)
88 | return masks
89 |
90 |
91 | class ConvertCocoPolysToMask(object):
92 | def __init__(self, return_masks=False):
93 | self.return_masks = return_masks
94 |
95 | def __call__(self, image, target):
96 | w, h = image.size
97 |
98 | img_array = np.array(image)
99 | if len(img_array.shape) == 2:
100 | img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
101 | image = Image.fromarray(img_array)
102 | image_id = target["image_id"]
103 | image_id = torch.tensor([image_id])
104 | anno = target["annotations"]
105 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
106 | anno = [obj for obj in anno if obj['num_keypoints'] != 0]
107 | keypoints = [obj["keypoints"] for obj in anno]
108 | boxes = [obj["bbox"] for obj in anno]
109 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32).reshape(-1, 17, 3)
110 | # guard against no boxes via resizing
111 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
112 | boxes[:, 2:] += boxes[:, :2]
113 | boxes[:, 0::2].clamp_(min=0, max=w)
114 | boxes[:, 1::2].clamp_(min=0, max=h)
115 | classes = [obj["category_id"] for obj in anno]
116 | classes = torch.tensor(classes, dtype=torch.int64)
117 | if self.return_masks:
118 | segmentations = [obj["segmentation"] for obj in anno]
119 | masks = convert_coco_poly_to_mask(segmentations, h, w)
120 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
121 | boxes = boxes[keep]
122 | classes = classes[keep]
123 | keypoints = keypoints[keep]
124 | if self.return_masks:
125 | masks = masks[keep]
126 | target = {}
127 | target["boxes"] = boxes
128 | target["labels"] = classes
129 | if self.return_masks:
130 | target["masks"] = masks
131 | target["image_id"] = image_id
132 | if keypoints is not None:
133 | target["keypoints"] = keypoints
134 | # for conversion to coco api
135 | area = torch.tensor([obj["area"] for obj in anno])
136 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
137 | target["area"] = area[keep]
138 | target["iscrowd"] = iscrowd[keep]
139 | target["orig_size"] = torch.as_tensor([int(w), int(h)])
140 | target["size"] = torch.as_tensor([int(h), int(w)])
141 | return image, target
142 |
143 |
144 |
--------------------------------------------------------------------------------
/tools/inference/onnx_inf.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """
4 | import os
5 | import cv2
6 | import glob
7 | import numpy as np
8 | import onnxruntime as ort
9 | import torch
10 | import torchvision.transforms as T
11 |
12 | from PIL import Image, ImageDraw
13 | from copy import deepcopy
14 | from annotator import Annotator
15 | from annotator_crowdpose import AnnotatorCrowdpose
16 |
17 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose}
18 |
19 | def process_image(sess, im_pil):
20 | w, h = im_pil.size
21 | orig_size = torch.tensor([w, h])[None]
22 |
23 | transforms = T.Compose(
24 | [
25 | T.Resize((640, 640)),
26 | T.ToTensor(),
27 | ]
28 | )
29 | im_data = transforms(im_pil).unsqueeze(0)
30 | annotator = annotators[annotator_type](deepcopy(im_pil))
31 |
32 |
33 | output = sess.run(
34 | output_names=None,
35 | input_feed={"images": im_data.numpy(), "orig_target_sizes": orig_size.numpy()},
36 | )
37 |
38 | scores, labels, keypoints = output
39 | scores, labels, keypoints = scores[0], labels[0], keypoints[0]
40 | for kpt, score in zip(keypoints, scores):
41 | if score > thrh:
42 | annotator.kpts(
43 | kpt,
44 | [h, w]
45 | )
46 | annotator.save(f"{OUTPUT_NAME}.jpg")
47 |
48 |
49 | def process_video(sess, video_path):
50 | cap = cv2.VideoCapture(video_path)
51 |
52 | # Get video properties
53 | fps = cap.get(cv2.CAP_PROP_FPS)
54 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
55 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
56 |
57 | # Define the codec and create VideoWriter object
58 | fourcc = cv2.VideoWriter_fourcc(*"mp4v")
59 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h))
60 |
61 | transforms = T.Compose(
62 | [
63 | T.Resize((640, 640)),
64 | T.ToTensor(),
65 | ]
66 | )
67 |
68 | frame_count = 0
69 | print("Processing video frames...")
70 | while cap.isOpened():
71 | ret, frame = cap.read()
72 | if not ret:
73 | break
74 |
75 | # Convert frame to PIL image
76 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
77 |
78 | w, h = frame_pil.size
79 | orig_size = torch.tensor([w, h])[None]
80 | annotator = annotators[annotator_type](deepcopy(frame_pil))
81 |
82 | im_data = transforms(frame_pil).unsqueeze(0)
83 |
84 | output = sess.run(
85 | output_names=None,
86 | input_feed={"images": im_data.numpy(), "orig_target_sizes": orig_size.numpy()},
87 | )
88 |
89 | scores, labels, keypoints = output
90 | scores, labels, keypoints = scores[0], labels[0], keypoints[0]
91 | for kpt, score in zip(keypoints, scores):
92 | if score > thrh:
93 | annotator.kpts(
94 | kpt,
95 | [h, w]
96 | )
97 |
98 | # Convert back to OpenCV image
99 | frame = annotator.result()
100 |
101 | # Write the frame
102 | out.write(frame)
103 | frame_count += 1
104 |
105 | if frame_count % 10 == 0:
106 | print(f"Processed {frame_count} frames...")
107 |
108 | cap.release()
109 | out.release()
110 | print(f"Video processing complete. Result saved as '{OUTPUT_NAME}.mp4'.")
111 |
112 | def process_file(sess, file_path):
113 | # Check if the input file is an image or a video
114 | try:
115 | # Try to open the input as an image
116 | im_pil = Image.open(file_path).convert("RGB")
117 | process_image(sess, im_pil)
118 | except IOError:
119 | # Not an image, process as video
120 | process_video(sess, file_path)
121 |
122 | def main(args):
123 | assert args.annotator.lower() in ['coco', 'crowdpose']
124 | # Global variable
125 | global OUTPUT_NAME, thrh, annotator_type
126 |
127 | """Main function."""
128 | # Load the ONNX model
129 | sess = ort.InferenceSession(args.onnx)
130 | print(f"Using device: {ort.get_device()}")
131 |
132 | input_path = args.input
133 | thrh = 0.5 if args.thrh is None else args.thrh
134 |
135 | annotator_name = args.annotator.lower()
136 | if annotator_name == 'coco':
137 | annotator_type = 'COCO'
138 | elif annotator_name == 'crowdpose':
139 | annotator_type = 'CrowdPose'
140 |
141 |     # Check if the input argument is a file or a folder
142 | file_path = args.input
143 | if os.path.isdir(file_path):
144 | # Process a folder
145 | folder_dir = args.input
146 | output_dir = f"{folder_dir}/output"
147 | os.makedirs(output_dir, exist_ok=True)
148 | paths = list(glob.iglob(f"{folder_dir}/*.*"))
149 | for file_path in paths:
150 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0]
151 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}"
152 | process_file(sess, file_path)
153 | else:
154 | # Process a file
155 |         OUTPUT_NAME = f'onnx_results_{annotator_type}'
156 | process_file(sess, file_path)
157 |
158 | if __name__ == "__main__":
159 | import argparse
160 |
161 | parser = argparse.ArgumentParser()
162 | parser.add_argument("--onnx", type=str, required=True, help="Path to the ONNX model file.")
163 | parser.add_argument("--annotator", type=str, required=True, help="Annotator type: COCO or CrowdPose.")
164 | parser.add_argument("-i", "--input", type=str, required=True, help="Path to the input image or video file.")
165 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None)
166 | args = parser.parse_args()
167 | main(args)
168 |
--------------------------------------------------------------------------------
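Usage note: the script above is driven by argparse, so it can also be invoked from Python. A minimal sketch with placeholder paths (model.onnx and input.jpg are hypothetical; it assumes the interpreter runs from tools/inference/ so the local annotator modules resolve, and that the ONNX file exposes the "images" and "orig_target_sizes" inputs used in process_image above):

from argparse import Namespace
import onnx_inf

# Placeholder paths; thrh=None would fall back to 0.5 inside main().
args = Namespace(onnx="model.onnx", annotator="coco", input="input.jpg", thrh=0.5)
onnx_inf.main(args)  # single image -> saves onnx_results_COCO.jpg (or .mp4 for a video)
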
/src/data/crowdpose.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/)
6 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/)
9 | Copyright (c) 2023 IDEA. All Rights Reserved.
10 | """
11 |
12 | import json
13 | from pathlib import Path
14 | import cv2
15 | import numpy as np
16 | import torch
17 | import torch.utils.data
18 | from PIL import Image
19 | from xtcocotools.coco import COCO
20 | from xtcocotools import mask as coco_mask  # used by convert_coco_poly_to_mask below
21 | class CrowdPoseDetection(torch.utils.data.Dataset):
22 | def __init__(self, img_folder, ann_file, transforms, return_masks=False):
23 | super(CrowdPoseDetection, self).__init__()
24 | self._transforms = transforms
25 | self.prepare = ConvertCocoPolysToMask(return_masks)
26 |
27 | self.img_folder = Path(img_folder)
28 | self.coco = COCO(ann_file)
29 | imgIds = sorted(self.coco.getImgIds())
30 |
31 | if "train" in ann_file:
32 | self.all_imgIds = []
33 | for image_id in imgIds:
34 | if self.coco.getAnnIds(imgIds=image_id) == []:
35 | continue
36 | ann_ids = self.coco.getAnnIds(imgIds=image_id)
37 | target = self.coco.loadAnns(ann_ids)
38 | num_keypoints = [obj["num_keypoints"] for obj in target]
39 | if sum(num_keypoints) == 0:
40 | continue
41 | self.all_imgIds.append(image_id)
42 | else:
43 | self.all_imgIds = []
44 | for image_id in imgIds:
45 | self.all_imgIds.append(image_id)
46 |
47 | def set_epoch(self, epoch):
48 | self._epoch = epoch
49 |
50 | @property
51 | def epoch(self):
52 | return self._epoch if hasattr(self, '_epoch') else -1
53 |
54 | def __len__(self):
55 | return len(self.all_imgIds)
56 |
57 | def load_item(self, idx):
58 | image_id = self.all_imgIds[idx]
59 | ann_ids = self.coco.getAnnIds(imgIds=image_id)
60 | target = self.coco.loadAnns(ann_ids)
61 |
62 | target = {'image_id': image_id, 'annotations': target}
63 | img = Image.open(self.img_folder / self.coco.loadImgs(image_id)[0]['file_name'])
64 | img, target = self.prepare(img, target)
65 | return img, target
66 |
67 | def __getitem__(self, idx):
68 | img, target = self.load_item(idx)
69 | if self._transforms is not None:
70 | img, target = self._transforms(img, target, self)
71 | return img, target
72 |
73 |
74 | def convert_coco_poly_to_mask(segmentations, height, width):
75 | masks = []
76 | for polygons in segmentations:
77 | rles = coco_mask.frPyObjects(polygons, height, width)
78 | mask = coco_mask.decode(rles)
79 | if len(mask.shape) < 3:
80 | mask = mask[..., None]
81 | mask = torch.as_tensor(mask, dtype=torch.uint8)
82 | mask = mask.any(dim=2)
83 | masks.append(mask)
84 | if masks:
85 | masks = torch.stack(masks, dim=0)
86 | else:
87 | masks = torch.zeros((0, height, width), dtype=torch.uint8)
88 | return masks
89 |
90 |
91 | class ConvertCocoPolysToMask(object):
92 | def __init__(self, return_masks=False):
93 | self.return_masks = return_masks
94 |
95 | def __call__(self, image, target):
96 | w, h = image.size
97 |
98 | img_array = np.array(image)
99 | if len(img_array.shape) == 2:
100 | img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
101 | image = Image.fromarray(img_array)
102 | image_id = target["image_id"]
103 | image_id = torch.tensor([image_id])
104 | anno = target["annotations"]
105 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
106 | anno = [obj for obj in anno if obj['num_keypoints'] != 0]
107 | keypoints = [obj["keypoints"] for obj in anno]
108 | boxes = [obj["bbox"] for obj in anno]
109 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32).reshape(-1, 14, 3)
110 | # guard against no boxes via resizing
111 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
112 | boxes[:, 2:] += boxes[:, :2]
113 | boxes[:, 0::2].clamp_(min=0, max=w)
114 | boxes[:, 1::2].clamp_(min=0, max=h)
115 | classes = [obj["category_id"] for obj in anno]
116 | classes = torch.tensor(classes, dtype=torch.int64)
117 | if self.return_masks:
118 | segmentations = [obj["segmentation"] for obj in anno]
119 | masks = convert_coco_poly_to_mask(segmentations, h, w)
120 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
121 | boxes = boxes[keep]
122 | classes = classes[keep]
123 | keypoints = keypoints[keep]
124 | if self.return_masks:
125 | masks = masks[keep]
126 | target = {}
127 | target["boxes"] = boxes
128 | target["labels"] = classes
129 | if self.return_masks:
130 | target["masks"] = masks
131 | target["image_id"] = image_id
132 | if keypoints is not None:
133 | target["keypoints"] = keypoints
134 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
135 | target["iscrowd"] = iscrowd[keep]
136 | target["orig_size"] = torch.as_tensor([int(w), int(h)])
137 | target["size"] = torch.as_tensor([int(h), int(w)])
138 | return image, target
139 |
140 |
--------------------------------------------------------------------------------
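A rough usage sketch for the dataset class above (the image folder and annotation file name are placeholders; with transforms=None the raw prepared target is returned):

# Hypothetical paths to a CrowdPose-style image folder and annotation JSON.
dataset = CrowdPoseDetection(
    img_folder="data/crowdpose/images",
    ann_file="data/crowdpose/annotations/crowdpose_trainval.json",
    transforms=None,
)
img, target = dataset[0]                         # PIL image and a dict of tensors
print(len(dataset), target["keypoints"].shape)   # keypoints come back as (num_persons, 14, 3)
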
/src/models/detrpose/matcher.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/)
6 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/)
9 | Copyright (c) 2023 IDEA. All Rights Reserved.
10 | """
11 |
12 | import torch
13 | from scipy.optimize import linear_sum_assignment
14 | from torch import nn
15 | import numpy as np
16 |
17 |
18 | class HungarianMatcher(nn.Module):
19 | def __init__(self, cost_class: float = 1, focal_alpha=0.25,
20 | cost_keypoints=1.0, cost_oks=0.01, num_body_points=17):
21 | super().__init__()
22 | self.cost_class = cost_class
23 |
24 | self.cost_keypoints = cost_keypoints
25 | self.cost_oks = cost_oks
26 | self.focal_alpha = focal_alpha
27 | self.num_body_points = num_body_points
28 |
29 | if num_body_points==17:
30 | self.sigmas = np.array([
31 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
32 | 1.07, .87, .87, .89, .89
33 | ], dtype=np.float32) / 10.0
34 |
35 | elif num_body_points==14:
36 | self.sigmas = np.array([
37 | .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89,
38 | .79, .79
39 | ]) / 10.0
40 | else:
41 | raise NotImplementedError
42 |
43 | @torch.no_grad()
44 | def forward(self, outputs, targets):
45 | bs, num_queries = outputs["pred_logits"].shape[:2]
46 |
47 | # We flatten to compute the cost matrices in a batch
48 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes]
49 | out_keypoints = outputs["pred_keypoints"].flatten(0, 1) # [batch_size * num_queries, 51]
50 |
51 | # Also concat the target labels and boxes
52 | tgt_ids = torch.cat([v["labels"] for v in targets])
53 | tgt_keypoints = torch.cat([v["keypoints"] for v in targets]) # nkp, 51
54 |         tgt_area = torch.cat([v["area"] for v in targets]) # ngt
55 |
56 | # Compute the classification cost.
57 | alpha = self.focal_alpha
58 | gamma = 2.0
59 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
60 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
61 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
62 |
63 | # compute the keypoint costs
64 | Z_pred = out_keypoints[:, 0:(self.num_body_points * 2)]
65 | Z_gt = tgt_keypoints[:, 0:(self.num_body_points * 2)]
66 | V_gt: torch.Tensor = tgt_keypoints[:, (self.num_body_points * 2):]
67 | if Z_pred.sum() > 0:
68 | sigmas = Z_pred.new_tensor(self.sigmas)
69 | variances = (sigmas * 2) ** 2
70 | kpt_preds = Z_pred.reshape(-1, Z_pred.size(-1) // 2, 2)
71 | kpt_gts = Z_gt.reshape(-1, Z_gt.size(-1) // 2, 2)
72 | squared_distance = (kpt_preds[:, None, :, 0] - kpt_gts[None, :, :, 0]) ** 2 + \
73 | (kpt_preds[:, None, :, 1] - kpt_gts[None, :, :, 1]) ** 2
74 | squared_distance0 = squared_distance / (tgt_area[:, None] * variances[None, :] * 2)
75 | squared_distance1 = torch.exp(-squared_distance0)
76 | squared_distance1 = squared_distance1 * V_gt
77 | oks = squared_distance1.sum(dim=-1) / (V_gt.sum(dim=-1) + 1e-6)
78 | oks = oks.clamp(min=1e-6)
79 | cost_oks = 1 - oks
80 |
81 | cost_keypoints = torch.abs(Z_pred[:, None, :] - Z_gt[None]) # npred, ngt, 34
82 | cost_keypoints = cost_keypoints * V_gt.repeat_interleave(2, dim=1)[None]
83 | cost_keypoints = cost_keypoints.sum(-1)
84 | C = self.cost_class * cost_class + self.cost_keypoints * cost_keypoints + self.cost_oks * cost_oks
85 | C = C.view(bs, num_queries, -1).cpu()
86 |
87 | else:
88 | cost_keypoints = cost_oks = 0
89 | C = self.cost_class * cost_class + self.cost_keypoints * cost_keypoints + self.cost_oks * cost_oks
90 | C = C.view(bs, num_queries, -1).cpu()
91 |
92 | # Final cost matrix
93 | sizes = [len(v["boxes"]) for v in targets]
94 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
95 |
96 | if tgt_ids.shape[0] > 0:
97 | cost_mean_dict = {
98 | 'class': cost_class.mean(),
99 | "keypoints": cost_keypoints.mean()
100 | }
101 | else:
102 |             # for the case when there are no ground-truth boxes
103 | cost_mean_dict = {
104 | 'class': torch.zeros_like(cost_class.mean()),
105 | 'keypoints': torch.zeros_like(cost_keypoints.mean()),
106 | }
107 |
108 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in
109 | indices]#, cost_mean_dict
110 |
111 | def build_matcher(args):
112 | assert args.matcher_type in ['HungarianMatcher'], "Unknown args.matcher_type: {}".format(
113 | args.matcher_type)
114 | if args.matcher_type == 'HungarianMatcher':
115 | return HungarianMatcher(
116 | cost_class=args.set_cost_class, focal_alpha=args.focal_alpha, cost_keypoints=args.set_cost_keypoints, cost_oks=args.set_cost_oks, num_body_points=args.num_body_points)
117 | else:
118 | raise NotImplementedError("Unknown args.matcher_type: {}".format(args.matcher_type))
--------------------------------------------------------------------------------
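A minimal smoke test for the matcher above, using dummy tensors and COCO's 17 keypoints (the cost weights are illustrative, not the values from the released configs; the import path assumes running from the repo root):

import torch
from src.models.detrpose.matcher import HungarianMatcher

K = 17                                                     # num_body_points
outputs = {
    "pred_logits": torch.randn(1, 10, 2),                  # bs=1, 10 queries, 2 classes
    "pred_keypoints": torch.rand(1, 10, 2 * K),            # (x1, y1, ..., xK, yK), normalized
}
targets = [{
    "labels": torch.tensor([0]),
    "boxes": torch.tensor([[0.1, 0.1, 0.4, 0.5]]),         # only len() is used, for splitting C
    "keypoints": torch.cat([torch.rand(1, 2 * K), torch.ones(1, K)], dim=1),  # coords + visibility
    "area": torch.tensor([0.12]),
}]
matcher = HungarianMatcher(cost_class=2.0, cost_keypoints=10.0, cost_oks=4.0, num_body_points=K)
indices = matcher(outputs, targets)                        # one (pred_idx, tgt_idx) pair per image
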
/src/misc/dist_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import atexit
3 | import json
4 | import torch
5 | import torch.nn as nn
6 | import torch.distributed as dist
7 |
8 | from torch.utils.data import DistributedSampler
9 | from torch.nn.parallel import DataParallel as DP
10 | from torch.nn.parallel import DistributedDataParallel as DDP
11 |
12 | from ..data.dataloader import DataLoader
13 |
14 | def is_dist_avail_and_initialized():
15 | if not dist.is_available():
16 | return False
17 | if not dist.is_initialized():
18 | return False
19 | return True
20 |
21 |
22 | def get_world_size():
23 | if not is_dist_avail_and_initialized():
24 | return 1
25 | return dist.get_world_size()
26 |
27 |
28 | def get_rank():
29 | if not is_dist_avail_and_initialized():
30 | return 0
31 | return dist.get_rank()
32 |
33 |
34 | def is_main_process():
35 | return get_rank() == 0
36 |
37 |
38 | def save_on_master(*args, **kwargs):
39 | if is_main_process():
40 | torch.save(*args, **kwargs)
41 |
42 |
43 | def init_distributed_mode(args):
44 | if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and
45 | args.rank = int(os.environ["RANK"])
46 | args.world_size = int(os.environ['WORLD_SIZE'])
47 | args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
48 | # local_world_size = int(os.environ['WORLD_SIZE'])
49 | # args.world_size = args.world_size * local_world_size
50 | # args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
51 | # args.rank = args.rank * local_world_size + args.local_rank
52 | # print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank))
53 | # print(json.dumps(dict(os.environ), indent=2))
54 | elif 'SLURM_PROCID' in os.environ:
55 | args.rank = int(os.environ['SLURM_PROCID'])
56 | args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID'])
57 | args.world_size = int(os.environ['SLURM_NPROCS'])
58 |
59 | # print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count()))
60 | # print("os.environ['SLURM_JOB_NODELIST']:", os.environ['SLURM_JOB_NODELIST'])
61 | # print(json.dumps(dict(os.environ), indent=2))
62 | # print('args:')
63 | # print(json.dumps(vars(args), indent=2))
64 | else:
65 | print('Not using distributed mode')
66 | args.distributed = False
67 | args.world_size = 1
68 | args.rank = 0
69 | args.local_rank = 0
70 | return
71 |
72 | print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank))
73 | args.distributed = True
74 | torch.cuda.set_device(args.local_rank)
75 | args.dist_backend = 'nccl'
76 | print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)
77 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
78 | world_size=args.world_size, rank=args.rank)
79 | print("Before torch.distributed.barrier()")
80 | torch.distributed.barrier()
81 | print("End torch.distributed.barrier()")
82 | setup_for_distributed(args.rank == 0)
83 |
84 | def setup_for_distributed(is_master):
85 | """
86 | This function disables printing when not in master process
87 | """
88 | import builtins as __builtin__
89 | builtin_print = __builtin__.print
90 |
91 | def print(*args, **kwargs):
92 | force = kwargs.pop('force', False)
93 | if is_master or force:
94 | builtin_print(*args, **kwargs)
95 |
96 | __builtin__.print = print
97 |
98 | def warp_loader(loader, shuffle=False):
99 | if is_dist_avail_and_initialized():
100 | sampler = DistributedSampler(loader.dataset, shuffle=shuffle)
101 | loader = DataLoader(loader.dataset,
102 | loader.batch_size,
103 | sampler=sampler,
104 | drop_last=loader.drop_last,
105 | collate_fn=loader.collate_fn,
106 | pin_memory=loader.pin_memory,
107 | num_workers=loader.num_workers)
108 | return loader
109 |
110 |
111 | def warp_model(
112 | model: torch.nn.Module,
113 | sync_bn: bool=False,
114 | dist_mode: str='ddp',
115 | find_unused_parameters: bool=False,
116 | compile: bool=False,
117 | compile_mode: str='reduce-overhead',
118 | **kwargs
119 | ):
120 | if is_dist_avail_and_initialized():
121 | rank = get_rank()
122 | model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model
123 | if dist_mode == 'dp':
124 | model = DP(model, device_ids=[rank], output_device=rank)
125 | elif dist_mode == 'ddp':
126 | model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=find_unused_parameters)
127 | else:
128 |             raise AttributeError(f'Unknown dist_mode: {dist_mode}')
129 |
130 | if compile:
131 | model = torch.compile(model, mode=compile_mode)
132 |
133 | return model
134 |
135 | @atexit.register
136 | def cleanup():
137 | """cleanup distributed environment"""
138 | if is_dist_avail_and_initialized():
139 | torch.distributed.barrier()
140 | torch.distributed.destroy_process_group()
141 |
142 |
143 | def is_parallel(model) -> bool:
144 | # Returns True if model is of type DP or DDP
145 | return type(model) in (
146 | torch.nn.parallel.DataParallel,
147 | torch.nn.parallel.DistributedDataParallel,
148 | )
149 |
150 |
151 | def de_parallel(model) -> nn.Module:
152 | # De-parallelize a model: returns single-GPU model if model is of type DP or DDP
153 | return model.module if is_parallel(model) else model
154 |
--------------------------------------------------------------------------------
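Outside torchrun/SLURM these helpers degrade to single-process behavior, which makes them easy to sanity-check; a small sketch (assumes the interpreter runs from the repo root so src is importable):

import argparse
import torch.nn as nn
from src.misc import dist_utils

args = argparse.Namespace(dist_url="env://")
dist_utils.init_distributed_mode(args)   # no WORLD_SIZE/SLURM env vars -> "Not using distributed mode"
print(args.distributed, dist_utils.get_world_size(), dist_utils.get_rank())   # False 1 0

model = dist_utils.warp_model(nn.Linear(8, 8))   # returned unchanged when not distributed
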
/src/models/detrpose/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/)
6 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/)
9 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved.
10 | ---------------------------------------------------------------------------------
11 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/)
12 | Copyright (c) 2023 IDEA. All Rights Reserved.
13 | """
14 |
15 | import torch
16 | import random
17 | from torch import nn, Tensor
18 | import os
19 | import numpy as np
20 | import math
21 | import torch.nn.functional as F
22 | from torch import nn
23 |
24 |
25 | def gen_encoder_output_proposals(memory:Tensor, spatial_shapes:Tensor):
26 | """
27 | Input:
28 | - memory: bs, \sum{hw}, d_model
29 | - spatial_shapes: nlevel, 2
30 | - learnedwh: 2
31 | Output:
32 | - output_memory: bs, \sum{hw}, d_model
33 | - output_proposals: bs, \sum{hw}, 4
34 | """
35 | N_, S_, C_ = memory.shape
36 | base_scale = 4.0
37 | proposals = []
38 | _cur = 0
39 | for lvl, (H_, W_) in enumerate(spatial_shapes):
40 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
41 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device),
42 | indexing='ij')
43 | grid = torch.stack([grid_x, grid_y], -1) # H_, W_, 2
44 |
45 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / torch.tensor([W_, H_], dtype=torch.float32, device=memory.device)
46 |
47 | proposal = grid.view(N_, -1, 2)
48 | proposals.append(proposal)
49 | _cur += (H_ * W_)
50 | output_proposals = torch.cat(proposals, 1)
51 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
52 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid
53 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
54 |
55 | output_memory = memory
56 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
57 |
58 | return output_memory, output_proposals
59 |
60 |
61 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
62 | """
63 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
64 | Args:
65 | inputs: A float tensor of arbitrary shape.
66 | The predictions for each example.
67 | targets: A float tensor with the same shape as inputs. Stores the binary
68 | classification label for each element in inputs
69 | (0 for the negative class and 1 for the positive class).
70 | alpha: (optional) Weighting factor in range (0,1) to balance
71 | positive vs negative examples. Default = -1 (no weighting).
72 | gamma: Exponent of the modulating factor (1 - p_t) to
73 | balance easy vs hard examples.
74 | Returns:
75 | Loss tensor
76 | """
77 | prob = inputs.sigmoid()
78 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
79 | p_t = prob * targets + (1 - prob) * (1 - targets)
80 | loss = ce_loss * ((1 - p_t) ** gamma)
81 |
82 | if alpha >= 0:
83 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
84 | loss = alpha_t * loss
85 |
86 |
87 | return loss.mean(1).sum() / num_boxes
88 |
89 | class MLP(nn.Module):
90 | """ Very simple multi-layer perceptron (also called FFN)"""
91 |
92 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
93 | super().__init__()
94 | self.num_layers = num_layers
95 | h = [hidden_dim] * (num_layers - 1)
96 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
97 |
98 | def forward(self, x):
99 | for i, layer in enumerate(self.layers):
100 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
101 | return x
102 |
103 | def _get_activation_fn(activation, d_model=256, batch_dim=0):
104 | """Return an activation function given a string"""
105 | if activation == "relu":
106 | return F.relu
107 | if activation == "gelu":
108 | return F.gelu
109 | if activation == "glu":
110 | return F.glu
111 | if activation == "prelu":
112 | return nn.PReLU()
113 | if activation == "selu":
114 | return F.selu
115 |     raise RuntimeError(F"activation should be relu/gelu/glu/prelu/selu, not {activation}.")
116 |
117 |
118 | def gen_sineembed_for_position(pos_tensor):
119 | # n_query, bs, _ = pos_tensor.size()
120 | # sineembed_tensor = torch.zeros(n_query, bs, 256)
121 | scale = 2 * math.pi
122 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
123 | dim_t = 10000 ** (2 * (dim_t // 2) / 128)
124 | x_embed = pos_tensor[:, :, 0] * scale
125 | y_embed = pos_tensor[:, :, 1] * scale
126 | pos_x = x_embed[:, :, None] / dim_t
127 | pos_y = y_embed[:, :, None] / dim_t
128 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
129 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
130 | if pos_tensor.size(-1) == 2:
131 | pos = torch.cat((pos_y, pos_x), dim=2)
132 | elif pos_tensor.size(-1) == 4:
133 | w_embed = pos_tensor[:, :, 2] * scale
134 | pos_w = w_embed[:, :, None] / dim_t
135 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
136 |
137 | h_embed = pos_tensor[:, :, 3] * scale
138 | pos_h = h_embed[:, :, None] / dim_t
139 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
140 |
141 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
142 | else:
143 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
144 | return pos
145 |
146 |
147 | def inverse_sigmoid(x, eps=1e-3):
148 | x = x.clamp(min=0, max=1)
149 | x1 = x.clamp(min=eps)
150 | x2 = (1 - x).clamp(min=eps)
151 | return torch.log(x1/x2)
--------------------------------------------------------------------------------
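Two of the helpers above are easy to check in isolation; a quick sketch with dummy tensors (assumes MLP and inverse_sigmoid are imported from this module):

import torch

# MLP: ReLU between hidden layers, no activation on the output layer
mlp = MLP(input_dim=256, hidden_dim=256, output_dim=2 * 17, num_layers=3)
x = torch.randn(2, 100, 256)          # e.g. (batch, queries, d_model)
print(mlp(x).shape)                   # torch.Size([2, 100, 34])

# inverse_sigmoid is a clamped logit, so sigmoid() recovers mid-range inputs
p = torch.tensor([0.25, 0.50, 0.75])
print(inverse_sigmoid(p).sigmoid())   # ~[0.25, 0.50, 0.75]
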
/tools/inference/annotator_crowdpose.py:
--------------------------------------------------------------------------------
1 | #########################################################################################
2 | # Modified from:
3 | # Ultralytics
4 | # https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/plotting.py
5 | #########################################################################################
6 |
7 | import math
8 | import warnings
9 | from pathlib import Path
10 | from typing import Callable, Dict, List, Optional, Union
11 |
12 | import cv2
13 | import numpy as np
14 | import torch
15 | from PIL import Image, ImageDraw, ImageFont
16 | from PIL import __version__ as pil_version
17 |
18 | from annotator import Annotator, Colors
19 |
20 |
21 | colors = Colors() # create instance for 'from utils.plots import colors'
22 |
23 | class AnnotatorCrowdpose(Annotator):
24 | """
25 | Ultralytics Annotator for train/val mosaics and JPGs and predictions annotations.
26 |
27 | Attributes:
28 | im (Image.Image | np.ndarray): The image to annotate.
29 | pil (bool): Whether to use PIL or cv2 for drawing annotations.
30 | font (ImageFont.truetype | ImageFont.load_default): Font used for text annotations.
31 | lw (float): Line width for drawing.
32 | skeleton (List[List[int]]): Skeleton structure for keypoints.
33 | limb_color (List[int]): Color palette for limbs.
34 | kpt_color (List[int]): Color palette for keypoints.
35 | dark_colors (set): Set of colors considered dark for text contrast.
36 | light_colors (set): Set of colors considered light for text contrast.
37 |
38 | Examples:
39 | >>> from ultralytics.utils.plotting import Annotator
40 | >>> im0 = cv2.imread("test.png")
41 | >>> annotator = Annotator(im0, line_width=10)
42 | >>> annotator.box_label([10, 10, 100, 100], "person", (255, 0, 0))
43 | """
44 |
45 | def __init__(
46 | self,
47 | im,
48 | line_width: Optional[int] = None,
49 | font_size: Optional[int] = None,
50 | font: str = "Arial.ttf",
51 | pil: bool = False,
52 | example: str = "abc",
53 | ):
54 | """Initialize the Annotator class with image and line width along with color palette for keypoints and limbs."""
55 | super().__init__(im, line_width, font_size, font, pil, example)
56 |
57 | # Pose Crowdpose
58 | self.skeleton = [
59 | # limbs
60 | [12, 10],
61 | [10, 8],
62 | [11, 9],
63 | [9, 7],
64 | # torso
65 | [8, 7],
66 | [8, 2],
67 | [7, 1],
68 | # arms
69 | [14, 1],
70 | [14, 2],
71 | [1, 3],
72 | [3, 5],
73 | [2, 4],
74 | [4, 6],
75 | # head
76 | [14, 13],
77 | ]
78 |
79 | self.limb_color = colors.pose_palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 0, 16]]
80 | self.kpt_color = colors.pose_palette[[0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 16, 0]]
81 | # 9, 9, 9, 9, 9, 9, 9, 0, 16, 16, 0, 0, 0, 0, 0, 0]]
82 | self.dark_colors = {
83 | (235, 219, 11),
84 | (243, 243, 243),
85 | (183, 223, 0),
86 | (221, 111, 255),
87 | (0, 237, 204),
88 | (68, 243, 0),
89 | (255, 255, 0),
90 | (179, 255, 1),
91 | (11, 255, 162),
92 | }
93 | self.light_colors = {
94 | (255, 42, 4),
95 | (79, 68, 255),
96 | (255, 0, 189),
97 | (255, 180, 0),
98 | (186, 0, 221),
99 | (0, 192, 38),
100 | (255, 36, 125),
101 | (104, 0, 123),
102 | (108, 27, 255),
103 | (47, 109, 252),
104 | (104, 31, 17),
105 | }
106 |
107 | # def kpts(
108 | # self,
109 | # kpts,
110 | # shape: tuple = (640, 640),
111 | # radius: Optional[int] = None,
112 | # kpt_line: bool = True,
113 | # conf_thres: float = 0.25,
114 | # kpt_color: Optional[tuple] = None,
115 | # ):
116 | # """
117 | # Plot keypoints on the image.
118 |
119 | # Args:
120 | # kpts (torch.Tensor): Keypoints, shape [17, 3] (x, y, confidence).
121 | # shape (tuple, optional): Image shape (h, w).
122 | # radius (int, optional): Keypoint radius.
123 | # kpt_line (bool, optional): Draw lines between keypoints.
124 | # conf_thres (float, optional): Confidence threshold.
125 | # kpt_color (tuple, optional): Keypoint color (B, G, R).
126 |
127 | # Note:
128 | # - `kpt_line=True` currently only supports human pose plotting.
129 | # - Modifies self.im in-place.
130 | # - If self.pil is True, converts image to numpy array and back to PIL.
131 | # """
132 | # radius = radius if radius is not None else self.lw
133 | # if self.pil:
134 | # # Convert to numpy first
135 | # self.im = np.asarray(self.im).copy()
136 | # nkpt, ndim = kpts.shape
137 | # is_pose = nkpt == 17 and ndim in {2, 3}
138 | # kpt_line &= is_pose # `kpt_line=True` for now only supports human pose plotting
139 | # for i, k in enumerate(kpts):
140 | # color_k = kpt_color or (self.kpt_color[i].tolist() if is_pose else colors(i))
141 | # x_coord, y_coord = k[0], k[1]
142 | # if x_coord % shape[1] != 0 and y_coord % shape[0] != 0:
143 | # if len(k) == 3:
144 | # conf = k[2]
145 | # if conf < conf_thres:
146 | # continue
147 | # cv2.circle(self.im, (int(x_coord), int(y_coord)), radius, color_k, -1, lineType=cv2.LINE_AA)
148 |
149 | # if kpt_line:
150 | # ndim = kpts.shape[-1]
151 | # for i, sk in enumerate(self.skeleton):
152 | # pos1 = (int(kpts[(sk[0] - 1), 0]), int(kpts[(sk[0] - 1), 1]))
153 | # pos2 = (int(kpts[(sk[1] - 1), 0]), int(kpts[(sk[1] - 1), 1]))
154 | # if ndim == 3:
155 | # conf1 = kpts[(sk[0] - 1), 2]
156 | # conf2 = kpts[(sk[1] - 1), 2]
157 | # if conf1 < conf_thres or conf2 < conf_thres:
158 | # continue
159 | # if pos1[0] % shape[1] == 0 or pos1[1] % shape[0] == 0 or pos1[0] < 0 or pos1[1] < 0:
160 | # continue
161 | # if pos2[0] % shape[1] == 0 or pos2[1] % shape[0] == 0 or pos2[0] < 0 or pos2[1] < 0:
162 | # continue
163 | # cv2.line(
164 | # self.im,
165 | # pos1,
166 | # pos2,
167 | # kpt_color or self.limb_color[i].tolist(),
168 | # thickness=int(np.ceil(self.lw / 2)),
169 | # lineType=cv2.LINE_AA,
170 | # )
171 | # if self.pil:
172 | # # Convert im back to PIL and update draw
173 | # self.fromarray(self.im)
174 |
--------------------------------------------------------------------------------
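The class above is used exactly like the base Annotator in the inference scripts further down in this repo; a short sketch with a placeholder image and random keypoints (run from tools/inference/ so the local modules resolve):

import torch
from PIL import Image
from annotator_crowdpose import AnnotatorCrowdpose

im = Image.new("RGB", (640, 480))                        # placeholder image
annotator = AnnotatorCrowdpose(im)
kpts = torch.rand(14, 3) * torch.tensor([640, 480, 1])   # (x, y, conf) for the 14 CrowdPose joints
annotator.kpts(kpts, [480, 640])                         # same call pattern as in torch_inf/onnx_inf
annotator.save("annotated.jpg")
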
/tools/visualization/line_attention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 IDEA. All Rights Reserved.
2 | # ------------------------------------------------------------------------
3 | import os, sys
4 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..'))
5 |
6 | import argparse
7 |
8 | import matplotlib as mpl
9 | import matplotlib.pyplot as plt
10 | from matplotlib.backends.backend_agg import FigureCanvasAgg
11 |
12 | import torch
13 | from torch.utils.data import DataLoader
14 |
15 | from util.slconfig import SLConfig
16 |
17 | import datasets
18 | from datasets import build_dataset, BatchImageCollateFunction
19 |
20 |
21 | def create(args, classname):
22 | # we use register to maintain models from catdet6 on.
23 | from models.registry import MODULE_BUILD_FUNCS
24 | class_module = getattr(args, classname)
25 | assert class_module in MODULE_BUILD_FUNCS._module_dict
26 | build_func = MODULE_BUILD_FUNCS.get(class_module)
27 | return build_func(args)
28 |
29 | def main(args):
30 | cfg = SLConfig.fromfile(args.config)
31 | device = args.device
32 |
33 | setattr(cfg, 'coco_path', args.data_path)
34 | setattr(cfg, 'batch_size_train', 1)
35 | setattr(cfg, 'batch_size_val', 1)
36 |
37 | if 'HGNetv2' in cfg.backbone:
38 | cfg.pretrained = False
39 |
40 | # build model
41 | model, _ = create(cfg, 'modelname')
42 | model.to(device)
43 |
44 | criterion = create(cfg, 'criterionname')
45 |
46 | dataset_val = build_dataset(image_set='val', args=cfg)
47 |
48 | sampler_val = torch.utils.data.SequentialSampler(dataset_val)
49 |
50 | data_loader_val = DataLoader(dataset_val, 1, sampler=sampler_val, drop_last=False, collate_fn=BatchImageCollateFunction(), num_workers=4)
51 |
52 | if args.resume:
53 | checkpoint = torch.load(args.resume, map_location='cpu')
54 | if 'ema' in checkpoint:
55 | state = checkpoint['ema']['module']
56 | else:
57 | state = checkpoint['model']
58 |
59 | # NOTE load train mode state -> convert to deploy mode
60 | model.load_state_dict(state)
61 |
62 | # change to device
63 | model.to(device)
64 |
65 | # transformer parameters
66 | len_q = cfg.num_queries
67 | nheads = cfg.nheads
68 | num_sampling_points = cfg.dec_n_points
69 | num_points_scale = torch.tensor([1/n for n in num_sampling_points for _ in range(n)], dtype=torch.float32).reshape(-1, 1)
70 |
71 | # folder path
72 | main_folder = cfg.output_dir
73 | if 'data/wireframe_processed' in args.data_path:
74 | append_path = f'{main_folder}/visualization/line_attention_wireframe'
75 |
76 | elif 'data/york_processed' in args.data_path:
77 | append_path = f'{main_folder}/visualization/line_attention_york'
78 |     os.makedirs(append_path, exist_ok=True)
79 |
80 | with torch.no_grad():
81 |
82 | for i, (samples, targets) in enumerate(data_loader_val):
83 | samples = samples.to(device)
84 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
85 |
86 | sampling_ratios = []
87 | reference_points = []
88 | attention_weights = []
89 | hooks = [
90 | model.decoder.decoder.layers[-1].cross_attn.sampling_ratios.register_forward_hook(
91 | lambda self, input, output: sampling_ratios.append(output[0])
92 | ),
93 | model.decoder.decoder.layers[-1].cross_attn.attention_weights.register_forward_hook(
94 | lambda self, input, output: attention_weights.append(output[0])
95 | ),
96 | model.decoder.decoder.register_forward_hook(
97 | lambda self, input, output: reference_points.append(output[0])
98 | ),
99 | ]
100 |
101 | output = model(samples, None)
102 |
103 | [(src_idx, tgt_idx)] = criterion(output, targets, return_indices=True)
104 |
105 | for hook in hooks:
106 | hook.remove()
107 |
108 | sampling_ratios = sampling_ratios[0].cpu().view(1, len_q, nheads, sum(num_sampling_points), 1)
109 | attention_weights = attention_weights[0].cpu().view(1, len_q, nheads, sum(num_sampling_points))
110 | attention_weights = torch.nn.functional.softmax(attention_weights, dim=-1)
111 |
112 | reference_points = reference_points[0][-2:-1].cpu().transpose(1, 2)
113 |
114 | vector = reference_points[:, :, None, :, :2] - reference_points[:, :, None, :, 2:]
115 | center = 0.5 * (reference_points[:, :, None, :, :2] + reference_points[:, :, None, :, 2:])
116 |
117 | sampling_locations = center + sampling_ratios * num_points_scale * vector * 0.5
118 |
119 | # Plot image
120 | img = samples[0].permute(1, 2, 0).cpu()
121 | img = (img - img.min()) / (img.max() - img.min())
122 | fig, ax = plt.subplots()
123 | ax.imshow(img, extent=[0, 1, 1, 0])
124 |
125 | reference_points = reference_points.transpose(1, 2)[0, 0]
126 | sampling_locations = sampling_locations[0]
127 | attention_weights = attention_weights[0]
128 |
129 | # choose the query idx
130 | line_idx = src_idx[tgt_idx == 0][0]
131 | reference_points = reference_points[line_idx]
132 | sampling_locations = sampling_locations[line_idx]
133 | attention_weights = attention_weights[line_idx]
134 |
135 | # sampling points
136 | for j in range(nheads):
137 | x1, y1 = sampling_locations[j].split(1, dim=-1)
138 | pos = ax.scatter(x1, y1, marker='*', c=attention_weights[j], cmap='jet', zorder=2)
139 | cbar = fig.colorbar(pos, ax=ax)
140 | cbar.ax.tick_params(size=0)
141 | cbar.set_ticks([])
142 |
143 | # reference lines
144 | x1, y1, x2, y2 = reference_points.split(1, dim=-1)
145 | ax.plot((x1[0], x2[0]), (y1[0], y2[0]), c='k', marker='o', zorder=3)
146 |
147 | plt.axis([0, 1, 1, 0])
148 | plt.axis(False)
149 |
150 |
151 | curr_img_id = targets[0]['image_id'].tolist()[0]
152 | plt.savefig(f'{append_path}/{curr_img_id}.png', bbox_inches="tight", pad_inches=0.0, dpi=100)
153 | plt.close()
154 |
155 | # check condition to stop program
156 | if args.num_images is not None and i + 1 >= args.num_images:
157 | break
158 |
159 |
160 | if __name__ == '__main__':
161 | parser = argparse.ArgumentParser('Visualization of Deformable Line Attention')
162 | parser.add_argument('-c', '--config', type=str, required=True)
163 | parser.add_argument('-r', '--resume', default='', help='resume from checkpoint')
164 | parser.add_argument('-p', '--data-path', type=str, default='data/wireframe_processed', help='data path')
165 | parser.add_argument('-d', '--device', type=str, default='cpu')
166 | parser.add_argument('-n', '--num_images', type=int, help='total number of images to plot')
167 | args = parser.parse_args()
168 | main(args)
169 |
--------------------------------------------------------------------------------
/tools/inference/torch_inf.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """
4 | import os
5 | import sys
6 | import glob
7 |
8 | import cv2 # Added for video processing
9 | import numpy as np
10 | import torch
11 | import torch.nn as nn
12 | import torchvision.transforms as T
13 |
14 | from PIL import Image, ImageDraw
15 | from copy import deepcopy
16 | from annotator import Annotator
17 | from annotator_crowdpose import AnnotatorCrowdpose
18 |
19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
20 | from src.core import LazyConfig, instantiate
21 |
22 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose}
23 |
24 | def process_image(model, device, file_path):
25 | im_pil = Image.open(file_path).convert("RGB")
26 | w, h = im_pil.size
27 | orig_size = torch.tensor([[w, h]]).to(device)
28 | annotator = annotators[annotator_type](deepcopy(im_pil))
29 |
30 | transforms = T.Compose(
31 | [
32 | T.Resize((640, 640)),
33 | T.ToTensor(),
34 | ]
35 | )
36 | im_data = transforms(im_pil).unsqueeze(0).to(device)
37 |
38 | output = model(im_data, orig_size)
39 |
40 | scores, labels, keypoints = output
41 | scores, labels, keypoints = scores[0], labels[0], keypoints[0]
42 | for kpt, score in zip(keypoints, scores):
43 | if score > thrh:
44 | annotator.kpts(
45 | kpt,
46 | [h, w]
47 | )
48 | annotator.save(f"{OUTPUT_NAME}.jpg")
49 |
50 |
51 | def process_video(model, device, file_path):
52 | cap = cv2.VideoCapture(file_path)
53 |
54 | # Get video properties
55 | fps = cap.get(cv2.CAP_PROP_FPS)
56 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
57 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
58 |
59 | # Define the codec and create VideoWriter object
60 | fourcc = cv2.VideoWriter_fourcc(*"mp4v")
61 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h))
62 |
63 | transforms = T.Compose(
64 | [
65 | T.Resize((640, 640)),
66 | T.ToTensor(),
67 | ]
68 | )
69 |
70 | frame_count = 0
71 | print("Processing video frames...")
72 | while cap.isOpened():
73 | ret, frame = cap.read()
74 | if not ret:
75 | break
76 |
77 | # Convert frame to PIL image
78 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
79 |
80 | w, h = frame_pil.size
81 | orig_size = torch.tensor([[w, h]]).to(device)
82 |
83 | annotator = annotators[annotator_type](deepcopy(frame_pil))
84 |
85 | im_data = transforms(frame_pil).unsqueeze(0).to(device)
86 |
87 | output = model(im_data, orig_size)
88 |
89 | scores, labels, keypoints = output
90 | scores, labels, keypoints = scores[0], labels[0], keypoints[0]
91 | for kpt, score in zip(keypoints, scores):
92 | if score > thrh:
93 | annotator.kpts(
94 | kpt,
95 | [h, w]
96 | )
97 |
98 | # Convert back to OpenCV image
99 | frame = annotator.result()
100 |
101 | # Write the frame
102 | out.write(frame)
103 | frame_count += 1
104 |
105 | if frame_count % 10 == 0:
106 | print(f"Processed {frame_count} frames...")
107 |
108 | cap.release()
109 | out.release()
110 |     print(f"Video processing complete. Result saved as '{OUTPUT_NAME}.mp4'.")
111 |
112 | def process_file(model, device, file_path):
113 |     # Check if the input file is an image or a video
114 | if os.path.splitext(file_path)[-1].lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
115 | # Process as image
116 | process_image(model, device, file_path)
117 | print("Image processing complete.")
118 | else:
119 | # Process as video
120 | process_video(model, device, file_path)
121 | print("Video processing complete.")
122 |
123 | def create(args, classname):
124 | # we use register to maintain models from catdet6 on.
125 | from models.registry import MODULE_BUILD_FUNCS
126 | class_module = getattr(args, classname)
127 | assert class_module in MODULE_BUILD_FUNCS._module_dict
128 | build_func = MODULE_BUILD_FUNCS.get(class_module)
129 | return build_func(args)
130 |
131 | def main(args):
132 | # Global variable
133 | global OUTPUT_NAME, thrh, annotator_type
134 |
135 | """Main function"""
136 | cfg = LazyConfig.load(args.config)
137 |
138 | if hasattr(cfg.model.backbone, 'pretrained'):
139 | cfg.model.backbone.pretrained = False
140 |
141 | model = instantiate(cfg.model)
142 | postprocessor = instantiate(cfg.postprocessor)
143 |
144 | num_body_points = model.transformer.num_body_points
145 | if num_body_points == 17:
146 | annotator_type = 'COCO'
147 | elif num_body_points == 14:
148 | annotator_type = 'CrowdPose'
149 | else:
150 | raise Exception(f'Not implemented annotator for model with {num_body_points} keypoints')
151 |
152 | if args.resume:
153 | checkpoint = torch.load(args.resume, map_location='cpu', weights_only=False)
154 | if 'ema' in checkpoint:
155 | state = checkpoint['ema']['module']
156 | else:
157 | state = checkpoint['model']
158 |
159 | # NOTE load train mode state -> convert to deploy mode
160 | model.load_state_dict(state)
161 |
162 | else:
163 | # raise AttributeError('Only support resume to load model.state_dict by now.')
164 | print('not load model.state_dict, use default init state dict...')
165 |
166 | class Model(nn.Module):
167 | def __init__(self):
168 | super().__init__()
169 | self.model = model.deploy()
170 | self.postprocessor = postprocessor.deploy()
171 |
172 | def forward(self, images, orig_target_sizes):
173 | outputs = self.model(images)
174 | outputs = self.postprocessor(outputs, orig_target_sizes)
175 | return outputs
176 |
177 | device = args.device
178 | model = Model().to(device)
179 | thrh = 0.5 if args.thrh is None else args.thrh
180 |
181 |     # Check if the input argument is a file or a folder
182 | file_path = args.input
183 | if os.path.isdir(file_path):
184 | # Process a folder
185 | folder_dir = args.input
186 | output_dir = f"{folder_dir}/output"
187 | os.makedirs(output_dir, exist_ok=True)
188 | paths = list(glob.iglob(f"{folder_dir}/*.*"))
189 | for file_path in paths:
190 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0]
191 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}"
192 | process_file(model, device, file_path)
193 | else:
194 | # Process a file
195 | OUTPUT_NAME = f'torch_results_{annotator_type}'
196 | process_file(model, device, file_path)
197 |
198 |
199 | if __name__ == "__main__":
200 | import argparse
201 |
202 | parser = argparse.ArgumentParser()
203 | parser.add_argument("-c", "--config", type=str, required=True)
204 | parser.add_argument("-r", "--resume", type=str, required=True)
205 | parser.add_argument("-d", "--device", type=str, default="cpu")
206 | parser.add_argument("-i", "--input", type=str, required=True)
207 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None)
208 | args = parser.parse_args()
209 | main(args)
210 |
--------------------------------------------------------------------------------
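As with the ONNX script, the PyTorch entry point can be driven from Python; a minimal sketch with placeholder paths (the config must match the checkpoint so the model's keypoint count selects the right annotator; run from tools/inference/ so the local modules and the repo root resolve):

from argparse import Namespace
import torch_inf

torch_inf.main(Namespace(
    config="path/to/detrpose_config.py",   # placeholder config path
    resume="path/to/checkpoint.pth",       # placeholder checkpoint
    device="cpu",
    input="input.jpg",
    thrh=None,                             # falls back to 0.5 inside main()
))
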
/src/models/detrpose/ms_deform_attn.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import math
10 |
11 | import torch
12 | from torch import nn
13 | import torch.nn.functional as F
14 | from torch.nn.init import xavier_uniform_, constant_
15 |
16 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
17 | # N_, S_, M_, D_ = value.shape
18 | _, D_ , _= value[0].shape
19 | N_, Lq_, M_, L_, P_, _ = sampling_locations.shape
20 | # value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
21 |
22 | sampling_grids = 2 * sampling_locations - 1
23 | sampling_grids = sampling_grids.transpose(1, 2).flatten(0, 1)
24 |
25 | sampling_value_list = []
26 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
27 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
28 | value_l_ = value[lid_].unflatten(2, (H_, W_))
29 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
30 | sampling_grid_l_ = sampling_grids[:, :, lid_]
31 | # N_*M_, D_, Lq_, P_
32 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
33 | mode='bilinear', padding_mode='zeros', align_corners=False)
34 | sampling_value_list.append(sampling_value_l_)
35 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
36 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
37 | output = (torch.concat(sampling_value_list, dim=-1) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
38 | return output.transpose(1, 2)#.contiguous()
39 |
40 |
41 | class MSDeformAttn(nn.Module):
42 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
43 | """
44 | Multi-Scale Deformable Attention Module
45 | :param d_model hidden dimension
46 | :param n_levels number of feature levels
47 | :param n_heads number of attention heads
48 | :param n_points number of sampling points per attention head per feature level
49 | """
50 | super().__init__()
51 | if d_model % n_heads != 0:
52 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
53 | _d_per_head = d_model // n_heads
54 |
55 | self.d_model = d_model
56 | self.n_levels = n_levels
57 | self.n_heads = n_heads
58 | self.n_points = n_points
59 |
60 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
61 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
62 | # self.value_proj = nn.Linear(d_model, d_model)
63 | # self.output_proj = nn.Linear(d_model, d_model)
64 |
65 | self.use_4D_normalizer = use_4D_normalizer
66 |
67 | self._reset_parameters()
68 |
69 | def _reset_parameters(self):
70 | constant_(self.sampling_offsets.weight.data, 0.)
71 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
72 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
73 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
74 | for i in range(self.n_points):
75 | grid_init[:, :, i, :] *= i % 4 + 1
76 | with torch.no_grad():
77 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
78 | if self.n_points % 4 != 0:
79 | constant_(self.sampling_offsets.bias, 0.)
80 | constant_(self.attention_weights.weight.data, 0.)
81 | constant_(self.attention_weights.bias.data, 0.)
82 | # xavier_uniform_(self.value_proj.weight.data)
83 | # constant_(self.value_proj.bias.data, 0.)
84 | # xavier_uniform_(self.output_proj.weight.data)
85 | # constant_(self.output_proj.bias.data, 0.)
86 |
87 | def forward(self, query, reference_points, value, input_spatial_shapes):
88 | """
89 | :param query (N, Length_{query}, C)
90 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
91 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
92 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
93 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
94 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
95 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
96 |
97 | :return output (N, Length_{query}, C)
98 | """
99 | N, Len_q, _ = query.shape
100 | # assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
101 |
102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
105 |
106 | # N, Len_q, n_heads, n_levels, n_points, 2
107 | reference_points = torch.transpose(reference_points, 2, 3).flatten(1, 2)
108 |
109 | if reference_points.shape[-1] == 2:
110 | offset_normalizer = torch.tensor(input_spatial_shapes, device=query.device)
111 | offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.n_levels, 1, 2)
112 | sampling_locations = reference_points[:, :, None, :, None, :] \
113 | + sampling_offsets / offset_normalizer
114 | elif reference_points.shape[-1] == 4:
115 | if self.use_4D_normalizer:
116 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
117 | sampling_locations = reference_points[:, :, None, :, None, :2] \
118 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
119 | else:
120 | sampling_locations = reference_points[:, :, None, :, None, :2] \
121 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
122 | else:
123 | raise ValueError(
124 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
125 |
126 | output = ms_deform_attn_core_pytorch(
127 | value, input_spatial_shapes, sampling_locations, attention_weights)
128 | return output
129 |
--------------------------------------------------------------------------------
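To make the expected tensor layout concrete, here is a dummy forward pass through the module above. The value layout (a list of per-level tensors already split into heads, shaped (N * n_heads, d_model // n_heads, H * W)) follows what ms_deform_attn_core_pytorch expects; how the surrounding decoder actually constructs it may differ:

import torch

d_model, n_heads, n_levels, n_points = 256, 8, 4, 4
attn = MSDeformAttn(d_model=d_model, n_levels=n_levels, n_heads=n_heads, n_points=n_points)

N, Len_q = 2, 100
spatial_shapes = [(80, 80), (40, 40), (20, 20), (10, 10)]
query = torch.randn(N, Len_q, d_model)
value = [torch.randn(N * n_heads, d_model // n_heads, H * W) for H, W in spatial_shapes]
# normalized 2D reference points; the extra singleton dim is flattened inside forward()
reference_points = torch.rand(N, Len_q, n_levels, 1, 2)

out = attn(query, reference_points, value, spatial_shapes)
print(out.shape)   # torch.Size([2, 100, 256])
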
/src/solver/engine.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/)
6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from DETR (https://github.com/facebookresearch/detr/blob/main/engine.py)
9 | Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
10 | """
11 |
12 | import math
13 | import sys
14 | from typing import Iterable
15 |
16 | import torch
17 | from ..misc import logger as utils
18 | from ..misc import dist_utils
19 |
20 | GIGABYTE = 1024 ** 3
21 |
22 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
23 | data_loader: Iterable, optimizer: torch.optim.Optimizer,
24 | batch_size:int, grad_accum_steps:int,
25 | device: torch.device, epoch: int, max_norm: float = 0, writer=None,
26 | lr_scheduler=None, warmup_scheduler=None, ema=None, args=None):
27 | scaler = torch.amp.GradScaler(str(device), enabled=args.amp)
28 | model.train()
29 | criterion.train()
30 | metric_logger = utils.MetricLogger(delimiter=" ")
31 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
32 | header = 'Epoch: [{}]'.format(epoch)
33 | print_freq = args.print_freq
34 |
35 | sub_batch_size = batch_size // args.grad_accum_steps
36 |
37 | print("Grad accum steps: ", args.grad_accum_steps)
38 | print("Batch size/GPU: ", batch_size)
39 | print("Total batch size: ", batch_size * dist_utils.get_world_size())
40 |
41 | optimizer.zero_grad()
42 |
43 |
44 | for i, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
45 | samples = samples.to(device)
46 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
47 |
48 | global_step = epoch * len(data_loader) + i
49 |
50 | for j in range(args.grad_accum_steps):
51 | start_idx = j * sub_batch_size
52 | final_idx = start_idx + sub_batch_size
53 | new_samples = samples[start_idx:final_idx]
54 | new_samples = new_samples.to(device)
55 | new_targets = [{k: v.to(device) for k, v in t.items()} for t in targets[start_idx:final_idx]]
56 |
57 | with torch.amp.autocast(str(device), enabled=args.amp):
58 | outputs = model(new_samples, new_targets)
59 |
60 | with torch.amp.autocast(str(device), enabled=False):
61 | loss_dict = criterion(outputs, new_targets)
62 | losses = sum(loss_dict.values())
63 |
64 | if args.amp:
65 | scaler.scale(losses).backward()
66 | else:
67 | losses.backward()
68 |
69 | # reduce losses over all GPUs for logging purposes
70 | loss_dict_reduced = utils.reduce_dict(loss_dict)
71 | losses_reduced_scaled = sum(loss_dict_reduced.values())
72 |
73 | loss_value = losses_reduced_scaled.item()
74 |
75 | if not math.isfinite(loss_value):
76 | print("Loss is {}, stopping training".format(loss_value))
77 | print(loss_dict_reduced)
78 | sys.exit(1)
79 |
80 |
81 | # amp backward function
82 | if args.amp:
83 | if max_norm > 0:
84 | scaler.unscale_(optimizer)
85 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
86 | scaler.step(optimizer)
87 | scaler.update()
88 | else:
89 | # original backward function
90 | if max_norm > 0:
91 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
92 | optimizer.step()
93 |
94 | # ema
95 | if ema is not None:
96 | ema.update(model)
97 |
98 | if warmup_scheduler is not None:
99 | warmup_scheduler.step()
100 |
101 |
102 | metric_logger.update(loss=loss_value, **loss_dict_reduced)
103 | metric_logger.update(lr=optimizer.param_groups[0]["lr"])
104 |
105 |
106 | if writer and dist_utils.is_main_process() and global_step % 10 == 0:
107 | writer.add_scalar('Loss/total', loss_value, global_step)
108 | for j, pg in enumerate(optimizer.param_groups):
109 | writer.add_scalar(f'Lr/pg_{j}', pg['lr'], global_step)
110 | for k, v in loss_dict_reduced.items():
111 | writer.add_scalar(f'Loss/{k}', v.item(), global_step)
112 | free, total = torch.cuda.mem_get_info(device)
113 |             mem_used_GB = (total - free) / GIGABYTE
114 |             writer.add_scalar('Info/memory', mem_used_GB, global_step)
115 |
116 | optimizer.zero_grad()
117 |
118 | # gather the stats from all processes
119 | metric_logger.synchronize_between_processes()
120 | print("Averaged stats:", metric_logger)
121 | return {k: meter.global_avg for k, meter in metric_logger.meters.items() if meter.count > 0}
122 |
123 |
124 |
125 |
126 | @torch.no_grad()
127 | def evaluate(model, postprocessors, coco_evaluator, data_loader, device, writer=None, save_results=False):
128 | model.eval()
129 | if coco_evaluator is not None:
130 | coco_evaluator.cleanup()
131 |
132 | metric_logger = utils.MetricLogger(delimiter=" ")
133 | header = 'Test:'
134 | res_json = []
135 |
136 | for samples, targets in metric_logger.log_every(data_loader, 10, header):
137 | samples = samples.to(device)
138 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
139 |
140 | outputs = model(samples, targets)
141 |
142 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
143 | results = postprocessors(outputs, orig_target_sizes)
144 |
145 | res = {target['image_id'].item(): output for target, output in zip(targets, results)}
146 | if coco_evaluator is not None:
147 | coco_evaluator.update(res)
148 |
149 | if save_results:
150 | for k, v in res.items():
151 | scores = v['scores']
152 | labels = v['labels']
153 | keypoints = v['keypoints']
154 |
155 | for s, l, kpt in zip(scores, labels, keypoints):
156 | res_json.append(
157 | {
158 | "image_id": k,
159 | "category_id": l.item(),
160 | "keypoints": kpt.round(decimals=4).tolist(),
161 | "score": s.item()
162 | }
163 | )
164 |
165 | # gather the stats from all processes
166 | metric_logger.synchronize_between_processes()
167 | print("Averaged stats:", metric_logger)
168 | if coco_evaluator is not None:
169 | coco_evaluator.synchronize_between_processes()
170 |
171 | if save_results:
172 | return res_json
173 |
174 | # accumulate predictions from all images
175 | if coco_evaluator is not None:
176 | coco_evaluator.accumulate()
177 | coco_evaluator.summarize()
178 |
179 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items() if meter.count > 0}
180 | if coco_evaluator is not None:
181 | stats['coco_eval_keypoints'] = coco_evaluator.coco_eval['keypoints'].stats.tolist()
182 | return stats
183 |
--------------------------------------------------------------------------------
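
Editor's note (illustrative): the train/evaluate loop above combines torch.amp autocast with a GradScaler, gradient clipping, an EMA model and a warmup scheduler. Below is a minimal, self-contained sketch of just the AMP + gradient-clipping update pattern, assuming a recent PyTorch with the torch.amp API; the model, data and hyperparameters are placeholders, not this repository's configuration. In the repository, ema.update(model) and warmup_scheduler.step() slot in right after scaler.update(), as the loop above shows.

    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_amp = device.type == "cuda"

    model = torch.nn.Linear(16, 4).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    scaler = torch.amp.GradScaler(device.type, enabled=use_amp)
    max_norm = 0.1

    for step in range(5):
        x = torch.randn(8, 16, device=device)
        y = torch.randint(0, 4, (8,), device=device)

        with torch.amp.autocast(device.type, enabled=use_amp):
            loss = criterion(model(x), y)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)                                    # unscale before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        scaler.step(optimizer)                                        # skipped internally if gradients overflowed
        scaler.update()
        optimizer.zero_grad()
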
/src/data/dataloader.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/)
6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/)
9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved.
10 | ---------------------------------------------------------------------------------
11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/)
12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved.
13 | """
14 |
15 | import torch
16 | import torch.nn.functional as F
17 | import torch.utils.data as data
18 | from torchvision.transforms.functional import resize
19 | import random
20 |
21 | from PIL import Image, ImageDraw
22 | import os
23 |
24 | from copy import deepcopy
25 |
26 | # ANSI color codes, used only for printing
27 | RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m"
28 |
29 |
30 | class DataLoader(data.DataLoader):
31 | def __repr__(self) -> str:
32 | format_string = self.__class__.__name__ + "("
33 | for n in ['dataset', 'batch_size', 'num_workers', 'drop_last', 'collate_fn']:
34 | format_string += "\n"
35 | format_string += " {0}: {1}".format(n, getattr(self, n))
36 | format_string += "\n)"
37 | return format_string
38 |
39 | def set_epoch(self, epoch):
40 | self._epoch = epoch
41 | self.dataset.set_epoch(epoch)
42 | self.collate_fn.set_epoch(epoch)
43 |
44 | @property
45 | def epoch(self):
46 | return self._epoch if hasattr(self, '_epoch') else -1
47 |
48 | @property
49 | def shuffle(self):
50 | return self._shuffle
51 |
52 | @shuffle.setter
53 | def shuffle(self, shuffle):
54 | assert isinstance(shuffle, bool), 'shuffle must be a boolean'
55 | self._shuffle = shuffle
56 |
57 |
58 | class BaseCollateFunction(object):
59 | def set_epoch(self, epoch):
60 | self._epoch = epoch
61 |
62 | @property
63 | def epoch(self):
64 | return self._epoch if hasattr(self, '_epoch') else -1
65 |
66 | def __call__(self, items):
67 | raise NotImplementedError('')
68 |
69 |
70 | def generate_scales(base_size, base_size_repeat):
71 | scale_repeat = (base_size - int(base_size * 0.75 / 32) * 32) // 32
72 | scales = [int(base_size * 0.75 / 32) * 32 + i * 32 for i in range(scale_repeat)]
73 | scales += [base_size] * base_size_repeat
74 | scales += [int(base_size * 1.25 / 32) * 32 - i * 32 for i in range(scale_repeat)]
75 | return scales
76 |
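# Illustrative example (values not from the original file): with base_size=640 and
# base_size_repeat=3, the helper above yields 32-pixel-aligned sizes covering roughly
# 0.75x-1.25x of the base size:
#   generate_scales(640, 3)
#   -> [480, 512, 544, 576, 608, 640, 640, 640, 800, 768, 736, 704, 672]
# BatchImageCollateFunction sorts this list, so multi-scale training samples sizes
# from 480 to 800 in steps of 32, with 640 over-represented.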
77 |
78 | class BatchImageCollateFunction(BaseCollateFunction):
79 | def __init__(
80 | self,
81 | stop_epoch=None,
82 | ema_restart_decay=0.9999,
83 | base_size=640,
84 | base_size_repeat=None,
85 | mixup_prob=0.0,
86 | mixup_epochs=[0, 0],
87 | vis_folder='./vis_dataset/',
88 | vis_save=False
89 | ) -> None:
90 | super().__init__()
91 | self.base_size = base_size
92 | self.scales = generate_scales(base_size, base_size_repeat) if base_size_repeat is not None else None
93 | if self.scales is not None:
94 | self.scales.sort()
95 | print(GREEN + "Multi-scaling uses the following size: " + RESET, self.scales)
96 | self.stop_epoch = stop_epoch if stop_epoch is not None else 100000000
97 | self.ema_restart_decay = ema_restart_decay
98 |
99 | self.mixup_prob = mixup_prob
100 | self.mixup_epochs = mixup_epochs
101 | self.print_info_flag = True
102 |
103 | self.vis_save = vis_save
104 | self.vis_folder = vis_folder
105 | self.vis_image_number = 0
106 | self.max_vis_image_number = 10
107 |
108 | if vis_save:
109 | os.makedirs(self.vis_folder, exist_ok=True)
110 |
111 | def __call__(self, items):
112 | images = torch.cat([x[0][None] for x in items], dim=0)
113 | targets = [x[1] for x in items]
114 |
115 | images, targets = self.apply_mixup(images, targets)
116 |
117 | if self.scales is not None and self.epoch < self.stop_epoch:
118 | sz = random.choice(self.scales)
119 | images = resize(images, [sz, sz])
120 | return images, targets
121 |
122 | def apply_mixup(self, images, targets):
123 | """
124 | Applies Mixup augmentation to the batch if conditions are met.
125 |
126 | Args:
127 | images (torch.Tensor): Batch of images.
128 | targets (list[dict]): List of target dictionaries corresponding to images.
129 |
130 | Returns:
131 | tuple: Updated images and targets
132 | """
133 | # Log when Mixup is permanently disabled
134 | if self.epoch == self.mixup_epochs[-1] and self.print_info_flag:
135 | print(f" ### Attention --- Mixup is closed after epoch@ {self.epoch} ###")
136 | self.print_info_flag = False
137 |
138 | # Apply Mixup if within specified epoch range and probability threshold
139 | if random.random() < self.mixup_prob and self.mixup_epochs[0] <= self.epoch < self.mixup_epochs[1]:
140 | # Generate mixup ratio
141 | beta = round(random.uniform(0.45, 0.55), 6)
142 |
143 | # Mix images
144 | images = images.roll(shifts=1, dims=0).mul_(1.0 - beta).add_(images.mul(beta))
145 |
146 | # Prepare targets for Mixup
147 | shifted_targets = targets[-1:] + targets[:-1]
148 | updated_targets = deepcopy(targets)
149 |
150 | for i in range(len(targets)):
151 | # Combine boxes, keypoints, labels, and areas from the original and shifted targets
152 | updated_targets[i]['boxes'] = torch.cat([targets[i]['boxes'], shifted_targets[i]['boxes']], dim=0)
153 | updated_targets[i]['keypoints'] = torch.cat([targets[i]['keypoints'], shifted_targets[i]['keypoints']], dim=0)
154 | updated_targets[i]['labels'] = torch.cat([targets[i]['labels'], shifted_targets[i]['labels']], dim=0)
155 | updated_targets[i]['area'] = torch.cat([targets[i]['area'], shifted_targets[i]['area']], dim=0)
156 |
157 | # Add mixup ratio to targets
158 | updated_targets[i]['mixup'] = torch.tensor(
159 | [beta] * len(targets[i]['labels']) + [1.0 - beta] * len(shifted_targets[i]['labels']),
160 | dtype=torch.float32
161 | )
162 | targets = updated_targets
163 |
164 | if self.vis_save and self.vis_image_number < self.max_vis_image_number:
165 | for i in range(len(updated_targets)):
166 | image_tensor = images[i]
167 | image_tensor_uint8 = ((image_tensor - image_tensor.min()) / (image_tensor.max() - image_tensor.min()) * 255).type(torch.uint8)
168 | image_numpy = image_tensor_uint8.numpy().transpose((1, 2, 0))
169 | pilImage = Image.fromarray(image_numpy)
170 | draw = ImageDraw.Draw(pilImage)
171 | print('mix_vis:', i, 'boxes.len=', len(updated_targets[i]['boxes']))
172 | for box in updated_targets[i]['boxes']:
173 | draw.rectangle([int(box[0]*640 - (box[2]*640)/2), int(box[1]*640 - (box[3]*640)/2),
174 | int(box[0]*640 + (box[2]*640)/2), int(box[1]*640 + (box[3]*640)/2)], outline=(255,255,0))
175 | for pose in updated_targets[i]['keypoints']:
176 | num_pose_point = pose.shape[0] // 3
177 | pose_ = pose[:-num_pose_point].reshape(-1, 2)
178 | for p in pose_:
179 | if sum(p) != 0:
180 | draw.circle((p[0]*640, p[1]*640), 4, fill='blue')
181 |
182 |
183 | pilImage.save(self.vis_folder + f"example_{self.vis_image_number}_" + str(i) + "_"+ str(len(updated_targets[i]['boxes'])) +'_out.jpg')
184 | self.vis_image_number += 1
185 |
186 | return images, targets
187 |
--------------------------------------------------------------------------------
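
Editor's note (illustrative): a minimal sketch of how the DataLoader and BatchImageCollateFunction above fit together. The toy dataset, batch size and mixup schedule are placeholders, not this repository's configuration; the sketch only relies on the dataset returning (image_tensor, target_dict) pairs and implementing set_epoch().

    import torch
    from src.data.dataloader import DataLoader, BatchImageCollateFunction  # assumed module path

    class ToyPoseDataset(torch.utils.data.Dataset):
        """Tiny stand-in; the real datasets live in src/data/coco.py and crowdpose.py."""
        def __len__(self):
            return 8

        def __getitem__(self, idx):
            image = torch.rand(3, 640, 640)
            target = {
                'boxes': torch.tensor([[0.5, 0.5, 0.2, 0.4]]),   # cxcywh, normalized
                'keypoints': torch.rand(1, 17 * 3),              # x1,y1,...,xK,yK followed by K visibilities
                'labels': torch.zeros(1, dtype=torch.long),
                'area': torch.tensor([0.2 * 0.4 * 0.53]),
            }
            return image, target

        def set_epoch(self, epoch):
            self.epoch = epoch

    collate = BatchImageCollateFunction(base_size=640, base_size_repeat=3,
                                        mixup_prob=0.5, mixup_epochs=[4, 29])
    loader = DataLoader(ToyPoseDataset(), batch_size=4, shuffle=True,
                        num_workers=0, drop_last=True, collate_fn=collate)

    for epoch in range(2):
        loader.set_epoch(epoch)      # propagated to both the dataset and the collate function
        for images, targets in loader:
            pass                     # feed (images, targets) to the model here
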
/src/misc/logger.py:
--------------------------------------------------------------------------------
1 | import time
2 | import pickle
3 | import datetime
4 | from collections import defaultdict, deque
5 | from typing import Dict
6 |
7 | import torch
8 | import torch.distributed as dist
9 |
10 | from .dist_utils import is_dist_avail_and_initialized, get_world_size
11 |
12 |
13 | class SmoothedValue(object):
14 | """Track a series of values and provide access to smoothed values over a
15 | window or the global series average.
16 | """
17 |
18 | def __init__(self, window_size=20, fmt=None):
19 | if fmt is None:
20 | fmt = "{median:.4f} ({global_avg:.4f})"
21 | self.deque = deque(maxlen=window_size)
22 | self.total = 0.0
23 | self.count = 0
24 | self.fmt = fmt
25 |
26 | def update(self, value, n=1):
27 | self.deque.append(value)
28 | self.count += n
29 | self.total += value * n
30 |
31 | def synchronize_between_processes(self):
32 | """
33 | Warning: does not synchronize the deque!
34 | """
35 | if not is_dist_avail_and_initialized():
36 | return
37 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
38 | dist.barrier()
39 | dist.all_reduce(t)
40 | t = t.tolist()
41 | self.count = int(t[0])
42 | self.total = t[1]
43 |
44 | @property
45 | def median(self):
46 | d = torch.tensor(list(self.deque))
47 | if d.shape[0] == 0:
48 | return 0
49 | return d.median().item()
50 |
51 | @property
52 | def avg(self):
53 | d = torch.tensor(list(self.deque), dtype=torch.float32)
54 | return d.mean().item()
55 |
56 | @property
57 | def global_avg(self):
58 | return self.total / self.count
59 |
60 | @property
61 | def max(self):
62 | return max(self.deque)
63 |
64 | @property
65 | def value(self):
66 | return self.deque[-1]
67 |
68 | def __str__(self):
69 | return self.fmt.format(
70 | median=self.median,
71 | avg=self.avg,
72 | global_avg=self.global_avg,
73 | max=self.max,
74 | value=self.value)
75 |
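# Example (illustrative): a meter that reports the median/average over the last 20
# updates as well as the running global average:
#
#   v = SmoothedValue(fmt="{median:.4f} ({global_avg:.4f})")
#   for i in range(100):
#       v.update(float(i))
#   print(v)             # median over the last 20 values, plus the global average over all 100
#   print(v.global_avg)  # 49.5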
76 |
77 | def all_gather(data):
78 | """
79 | Run all_gather on arbitrary picklable data (not necessarily tensors)
80 | Args:
81 | data: any picklable object
82 | Returns:
83 | list[data]: list of data gathered from each rank
84 | """
85 | world_size = get_world_size()
86 | if world_size == 1:
87 | return [data]
88 |
89 | # serialized to a Tensor
90 | buffer = pickle.dumps(data)
91 | storage = torch.ByteStorage.from_buffer(buffer)
92 | tensor = torch.ByteTensor(storage).to("cuda")
93 |
94 | # obtain Tensor size of each rank
95 | local_size = torch.tensor([tensor.numel()], device="cuda")
96 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
97 | dist.all_gather(size_list, local_size)
98 | size_list = [int(size.item()) for size in size_list]
99 | max_size = max(size_list)
100 |
101 | # receiving Tensor from all ranks
102 | # we pad the tensor because torch all_gather does not support
103 | # gathering tensors of different shapes
104 | tensor_list = []
105 | for _ in size_list:
106 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
107 | if local_size != max_size:
108 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
109 | tensor = torch.cat((tensor, padding), dim=0)
110 | dist.all_gather(tensor_list, tensor)
111 |
112 | data_list = []
113 | for size, tensor in zip(size_list, tensor_list):
114 | buffer = tensor.cpu().numpy().tobytes()[:size]
115 | data_list.append(pickle.loads(buffer))
116 |
117 | return data_list
118 |
119 |
120 | def reduce_dict(input_dict, average=True):
121 | """
122 | Args:
123 | input_dict (dict): all the values will be reduced
124 | average (bool): whether to do average or sum
125 | Reduce the values in the dictionary from all processes so that all processes
126 | have the averaged results. Returns a dict with the same fields as
127 | input_dict, after reduction.
128 | """
129 | world_size = get_world_size()
130 | if world_size < 2:
131 | return input_dict
132 | with torch.no_grad():
133 | names = []
134 | values = []
135 | # sort the keys so that they are consistent across processes
136 | for k in sorted(input_dict.keys()):
137 | names.append(k)
138 | values.append(input_dict[k])
139 | values = torch.stack(values, dim=0)
140 | dist.all_reduce(values)
141 | if average:
142 | values /= world_size
143 | reduced_dict = {k: v for k, v in zip(names, values)}
144 | return reduced_dict
145 |
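# Example (illustrative): with 2 processes and average=True,
#   rank 0: {'loss_a': tensor(0.4), 'loss_b': tensor(1.2)}
#   rank 1: {'loss_a': tensor(0.6), 'loss_b': tensor(0.8)}
# every rank ends up with {'loss_a': 0.5, 'loss_b': 1.0}; with a single process the
# input dict is returned unchanged.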
146 |
147 | class MetricLogger(object):
148 | def __init__(self, delimiter="\t"):
149 | self.meters = defaultdict(SmoothedValue)
150 | self.delimiter = delimiter
151 |
152 | def update(self, **kwargs):
153 | for k, v in kwargs.items():
154 | if isinstance(v, torch.Tensor):
155 | v = v.item()
156 | assert isinstance(v, (float, int))
157 | self.meters[k].update(v)
158 |
159 | def __getattr__(self, attr):
160 | if attr in self.meters:
161 | return self.meters[attr]
162 | if attr in self.__dict__:
163 | return self.__dict__[attr]
164 | raise AttributeError("'{}' object has no attribute '{}'".format(
165 | type(self).__name__, attr))
166 |
167 | def __str__(self):
168 | loss_str = []
169 | for name, meter in self.meters.items():
170 | if meter.count > 0:
171 | loss_str.append(
172 | "{}: {}".format(name, str(meter))
173 | )
174 | return self.delimiter.join(loss_str)
175 |
176 | def synchronize_between_processes(self):
177 | for meter in self.meters.values():
178 | meter.synchronize_between_processes()
179 |
180 | def add_meter(self, name, meter):
181 | self.meters[name] = meter
182 |
183 | def log_every(self, iterable, print_freq, header=None, logger=None):
184 | if logger is None:
185 | print_func = print
186 | else:
187 | print_func = logger.info
188 |
189 | i = 0
190 | if not header:
191 | header = ''
192 | start_time = time.time()
193 | end = time.time()
194 | iter_time = SmoothedValue(fmt='{avg:.4f}')
195 | data_time = SmoothedValue(fmt='{avg:.4f}')
196 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
197 | if torch.cuda.is_available():
198 | log_msg = self.delimiter.join([
199 | header,
200 | '[{0' + space_fmt + '}/{1}]',
201 | 'eta: {eta}',
202 | '{meters}',
203 | 'time: {time}',
204 | 'data: {data}',
205 | 'max mem: {memory:.0f}'
206 | ])
207 | else:
208 | log_msg = self.delimiter.join([
209 | header,
210 | '[{0' + space_fmt + '}/{1}]',
211 | 'eta: {eta}',
212 | '{meters}',
213 | 'time: {time}',
214 | 'data: {data}'
215 | ])
216 | MB = 1024.0 * 1024.0
217 | for obj in iterable:
218 | data_time.update(time.time() - end)
219 | yield obj
220 | iter_time.update(time.time() - end)
221 | if i % print_freq == 0 or i == len(iterable) - 1:
222 | eta_seconds = iter_time.global_avg * (len(iterable) - i)
223 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
224 | if torch.cuda.is_available():
225 | print_func(log_msg.format(
226 | i, len(iterable), eta=eta_string,
227 | meters=str(self),
228 | time=str(iter_time), data=str(data_time),
229 | memory=torch.cuda.max_memory_allocated() / MB))
230 | else:
231 | print_func(log_msg.format(
232 | i, len(iterable), eta=eta_string,
233 | meters=str(self),
234 | time=str(iter_time), data=str(data_time)))
235 | i += 1
236 | end = time.time()
237 | total_time = time.time() - start_time
238 | total_time_str = str(datetime.timedelta(seconds=int(total_time)))
239 | print_func('{} Total time: {} ({:.4f} s / it)'.format(
240 | header, total_time_str, total_time / len(iterable)))
241 |
--------------------------------------------------------------------------------
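
Editor's note (illustrative): a minimal sketch of how MetricLogger and SmoothedValue above are typically driven; the import path, delimiter and meter names are assumptions, not this repository's exact usage.

    import torch
    from src.misc.logger import MetricLogger, SmoothedValue  # assumed module path

    logger = MetricLogger(delimiter="  ")
    logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))

    data = [(torch.rand(2, 3, 640, 640), None) for _ in range(25)]
    for samples, _ in logger.log_every(data, print_freq=10, header='Epoch: [0]'):
        loss = torch.rand(()).item()        # stand-in for the real loss computation
        logger.update(loss=loss, lr=1e-4)

    logger.synchronize_between_processes()  # no-op unless torch.distributed is initialized
    print("Averaged stats:", logger)
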
/tools/benchmark/trt_benchmark.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """
4 |
5 | import tensorrt as trt
6 | import pycuda.driver as cuda
7 | from utils import TimeProfiler
8 | import numpy as np
9 | import os
10 | import time
11 | import torch
12 |
13 | from collections import namedtuple, OrderedDict
14 | import glob
15 | import argparse
16 | from dataset import Dataset
17 | from tqdm import tqdm
18 |
19 |
20 | def parse_args():
21 | parser = argparse.ArgumentParser(description='Benchmark TensorRT engines on a folder of images')
22 | parser.add_argument('--infer_dir',
23 | type=str,
24 | default='./data/COCO2017/val2017',
25 | help="Directory for images to perform inference on.")
26 | parser.add_argument("--engine_dir",
27 | type=str,
28 | default='trt_engines',
29 | help="Directory containing model engine files.")
30 | parser.add_argument('--busy',
31 | action='store_true',
32 | help="Flag to indicate that other processes may be running.")
33 | args = parser.parse_args()
34 | return args
35 |
36 | class TRTInference(object):
37 | def __init__(self, engine_path, device='cuda', backend='torch', max_batch_size=32, verbose=False):
38 | self.engine_path = engine_path
39 | self.device = device
40 | self.backend = backend
41 | self.max_batch_size = max_batch_size
42 |
43 | self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)
44 | self.engine = self.load_engine(engine_path)
45 | self.context = self.engine.create_execution_context()
46 | self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
47 | self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())
48 | self.input_names = self.get_input_names()
49 | self.output_names = self.get_output_names()
50 |
51 | if self.backend == 'cuda':
52 | self.stream = cuda.Stream()
53 | self.time_profile = TimeProfiler()
54 | self.time_profile_dataset = TimeProfiler()
55 | self.yolo = 'yolo' in engine_path
56 |
57 | def init(self):
58 | self.dynamic = False
59 |
60 | def load_engine(self, path):
61 | trt.init_libnvinfer_plugins(self.logger, '')
62 | with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
63 | return runtime.deserialize_cuda_engine(f.read())
64 |
65 | def get_input_names(self):
66 | names = []
67 | for _, name in enumerate(self.engine):
68 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
69 | names.append(name)
70 | return names
71 |
72 | def get_output_names(self):
73 | names = []
74 | for _, name in enumerate(self.engine):
75 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
76 | names.append(name)
77 | return names
78 |
79 | def get_bindings(self, engine, context, max_batch_size=32, device=None):
80 | Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
81 | bindings = OrderedDict()
82 | for i, name in enumerate(engine):
83 | shape = engine.get_tensor_shape(name)
84 | dtype = trt.nptype(engine.get_tensor_dtype(name))
85 |
86 | if shape[0] == -1:
87 | dynamic = True
88 | shape[0] = max_batch_size
89 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
90 | context.set_input_shape(name, shape)
91 |
92 | if self.backend == 'cuda':
93 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
94 | data = np.random.randn(*shape).astype(dtype)
95 | ptr = cuda.mem_alloc(data.nbytes)
96 | bindings[name] = Binding(name, dtype, shape, data, ptr)
97 | else:
98 | data = cuda.pagelocked_empty(trt.volume(shape), dtype)
99 | ptr = cuda.mem_alloc(data.nbytes)
100 | bindings[name] = Binding(name, dtype, shape, data, ptr)
101 | else:
102 | data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
103 | bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())
104 | return bindings
105 |
106 | def run_torch(self, blob):
107 | for n in self.input_names:
108 | if self.bindings[n].shape != blob[n].shape:
109 | self.context.set_input_shape(n, blob[n].shape)
110 | self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)
111 |
112 | self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
113 | self.context.execute_v2(list(self.bindings_addr.values()))
114 | outputs = {n: self.bindings[n].data for n in self.output_names}
115 | return outputs
116 |
117 | def async_run_cuda(self, blob):
118 | for n in self.input_names:
119 | cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)
120 |
121 | bindings_addr = [int(v) for _, v in self.bindings_addr.items()]
122 | self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)
123 |
124 | outputs = {}
125 | for n in self.output_names:
126 | cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
127 | outputs[n] = self.bindings[n].data
128 |
129 | self.stream.synchronize()
130 |
131 | return outputs
132 |
133 | def __call__(self, blob):
134 | if self.backend == 'torch':
135 | return self.run_torch(blob)
136 | elif self.backend == 'cuda':
137 | return self.async_run_cuda(blob)
138 |
139 | def synchronize(self):
140 | if self.backend == 'torch' and torch.cuda.is_available():
141 | torch.cuda.synchronize()
142 | elif self.backend == 'cuda':
143 | self.stream.synchronize()
144 |
145 | def warmup(self, blob, n):
146 | for _ in range(n):
147 | _ = self(blob)
148 |
149 | def speed(self, blob, n, nonempty_process=False):
150 | times = []
151 | self.time_profile_dataset.reset()
152 | for i in tqdm(range(n), desc="Running Inference", unit="iteration"):
153 | self.time_profile.reset()
154 | with self.time_profile_dataset:
155 | img = blob[i]
156 | if img['images'] is not None:
157 | img['image'] = img['input'] = img['images'].unsqueeze(0)
158 | else:
159 | img['images'] = img['input'] = img['image'].unsqueeze(0)
160 | with self.time_profile:
161 | _ = self(img)
162 | times.append(self.time_profile.total)
163 |
164 | # end-to-end model only
165 | if not self.yolo:
166 | print('end-to-end')
167 | times = sorted(times)
168 | if len(times) > 100 and nonempty_process:
169 | times = times[:100]
170 |
171 | avg_time = sum(times) / len(times) # Calculate the average of the remaining times
172 | return avg_time
173 |
174 | def main():
175 | FLAGS = parse_args()
176 | dataset = Dataset(FLAGS.infer_dir)
177 | im = torch.ones(1, 3, 640, 640).cuda()
178 | blob = {
179 | 'image': im,
180 | 'images': im,
181 | 'input': im,
182 | 'im_shape': torch.tensor([640, 640]).to(im.device),
183 | 'scale_factor': torch.tensor([1, 1]).to(im.device),
184 | 'orig_target_sizes': torch.tensor([[640, 640]]).to(im.device),
185 | }
186 |
187 | engine_files = glob.glob(os.path.join(FLAGS.engine_dir, "*.engine"))
188 | results = []
189 |
190 | for engine_file in engine_files:
191 | print(f"Testing engine: {engine_file}")
192 | model = TRTInference(engine_file, max_batch_size=1, verbose=False)
193 | model.init()
194 | model.warmup(blob, 400)
195 | t = []
196 | for _ in range(1):
197 | t.append(model.speed(dataset, 1000, FLAGS.busy))
198 | avg_latency = 1000 * torch.tensor(t).mean()
199 | results.append((engine_file, avg_latency))
200 | print(f"Engine: {engine_file}, Latency: {avg_latency:.2f} ms")
201 |
202 | del model
203 | torch.cuda.empty_cache()
204 | time.sleep(1)
205 |
206 | sorted_results = sorted(results, key=lambda x: x[1])
207 | for engine_file, latency in sorted_results:
208 | print(f"Engine: {engine_file}, Latency: {latency:.2f} ms")
209 |
210 | if __name__ == '__main__':
211 | main()
212 |
--------------------------------------------------------------------------------
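
Editor's note: a typical invocation of the benchmark above, using the argparse defaults from parse_args, is

    python tools/benchmark/trt_benchmark.py --engine_dir trt_engines --infer_dir ./data/COCO2017/val2017

Every *.engine file found in --engine_dir is warmed up for 400 iterations and timed over 1000 dataset images; the engines are finally printed sorted by average latency. Pass --busy when other processes share the GPU, in which case the timings are sorted and only the 100 fastest iterations are averaged.
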
/tools/inference/trt_inf.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """
4 |
5 | import os
6 | import time
7 | import glob
8 | import collections
9 | import contextlib
10 | from collections import OrderedDict
11 |
12 | import cv2 # Added for video processing
13 | import numpy as np
14 | import tensorrt as trt
15 | import torch
16 | import torchvision.transforms as T
17 |
18 | from PIL import Image, ImageDraw
19 | from copy import deepcopy
20 | from annotator import Annotator
21 | from annotator_crowdpose import AnnotatorCrowdpose
22 |
23 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose}
24 |
25 |
26 | class TimeProfiler(contextlib.ContextDecorator):
27 | def __init__(self):
28 | self.total = 0
29 |
30 | def __enter__(self):
31 | self.start = self.time()
32 | return self
33 |
34 | def __exit__(self, type, value, traceback):
35 | self.total += self.time() - self.start
36 |
37 | def reset(self):
38 | self.total = 0
39 |
40 | def time(self):
41 | if torch.cuda.is_available():
42 | torch.cuda.synchronize()
43 | return time.time()
44 |
45 |
46 | class TRTInference(object):
47 | def __init__(
48 | self, engine_path, device="cuda:0", backend="torch", max_batch_size=32, verbose=False
49 | ):
50 | self.engine_path = engine_path
51 | self.device = device
52 | self.backend = backend
53 | self.max_batch_size = max_batch_size
54 |
55 | self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)
56 |
57 | self.engine = self.load_engine(engine_path)
58 | self.context = self.engine.create_execution_context()
59 | self.bindings = self.get_bindings(
60 | self.engine, self.context, self.max_batch_size, self.device
61 | )
62 | self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())
63 | self.input_names = self.get_input_names()
64 | self.output_names = self.get_output_names()
65 | self.time_profile = TimeProfiler()
66 |
67 | def load_engine(self, path):
68 | trt.init_libnvinfer_plugins(self.logger, "")
69 | with open(path, "rb") as f, trt.Runtime(self.logger) as runtime:
70 | return runtime.deserialize_cuda_engine(f.read())
71 |
72 | def get_input_names(self):
73 | names = []
74 | for _, name in enumerate(self.engine):
75 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
76 | names.append(name)
77 | return names
78 |
79 | def get_output_names(self):
80 | names = []
81 | for _, name in enumerate(self.engine):
82 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
83 | names.append(name)
84 | return names
85 |
86 | def get_bindings(self, engine, context, max_batch_size=32, device=None) -> OrderedDict:
87 | Binding = collections.namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr"))
88 | bindings = OrderedDict()
89 |
90 | for i, name in enumerate(engine):
91 | shape = engine.get_tensor_shape(name)
92 | dtype = trt.nptype(engine.get_tensor_dtype(name))
93 |
94 | if shape[0] == -1:
95 | shape[0] = max_batch_size
96 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
97 | context.set_input_shape(name, shape)
98 |
99 | data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
100 | bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())
101 |
102 | return bindings
103 |
104 | def run_torch(self, blob):
105 | for n in self.input_names:
106 | if blob[n].dtype is not self.bindings[n].data.dtype:
107 | blob[n] = blob[n].to(dtype=self.bindings[n].data.dtype)
108 | if self.bindings[n].shape != blob[n].shape:
109 | self.context.set_input_shape(n, blob[n].shape)
110 | self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)
111 |
112 | assert self.bindings[n].data.dtype == blob[n].dtype, "{} dtype mismatch".format(n)
113 |
114 | self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
115 | self.context.execute_v2(list(self.bindings_addr.values()))
116 | outputs = {n: self.bindings[n].data for n in self.output_names}
117 |
118 | return outputs
119 |
120 | def __call__(self, blob):
121 | if self.backend == "torch":
122 | return self.run_torch(blob)
123 | else:
124 | raise NotImplementedError("Only 'torch' backend is implemented.")
125 |
126 | def synchronize(self):
127 | if self.backend == "torch" and torch.cuda.is_available():
128 | torch.cuda.synchronize()
129 |
130 | def process_image(m, file_path, device):
131 | im_pil = Image.open(file_path).convert("RGB")
132 | w, h = im_pil.size
133 | orig_size = torch.tensor([w, h])[None].to(device)
134 |
135 | transforms = T.Compose(
136 | [
137 | T.Resize((640, 640)),
138 | T.ToTensor(),
139 | ]
140 | )
141 | im_data = transforms(im_pil)[None]
142 | annotator = annotators[annotator_type](deepcopy(im_pil))
143 |
144 | blob = {
145 | "images": im_data.to(device),
146 | "orig_target_sizes": orig_size.to(device),
147 | }
148 |
149 | output = m(blob)
150 |
151 | scores, labels, keypoints = output.values()
152 | scores, labels, keypoints = scores[0], labels[0], keypoints[0]
153 | for kpt, score in zip(keypoints, scores):
154 | if score > thrh:
155 | annotator.kpts(
156 | kpt,
157 | [h, w]
158 | )
159 | annotator.save(f"{OUTPUT_NAME}.jpg")
160 |
161 | def process_video(m, file_path, device):
162 | cap = cv2.VideoCapture(file_path)
163 |
164 | # Get video properties
165 | fps = cap.get(cv2.CAP_PROP_FPS)
166 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
167 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
168 |
169 | # Define the codec and create VideoWriter object
170 | fourcc = cv2.VideoWriter_fourcc(*"mp4v")
171 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h))
172 |
173 | transforms = T.Compose(
174 | [
175 | T.Resize((640, 640)),
176 | T.ToTensor(),
177 | ]
178 | )
179 |
180 | frame_count = 0
181 | print("Processing video frames...")
182 | while cap.isOpened():
183 | ret, frame = cap.read()
184 | if not ret:
185 | break
186 |
187 | # Convert frame to PIL image
188 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
189 |
190 | w, h = frame_pil.size
191 | orig_size = torch.tensor([w, h], device=device)[None]
192 | annotator = annotators[annotator_type](deepcopy(frame_pil))
193 |
194 | im_data = transforms(frame_pil)[None]
195 |
196 | blob = {
197 | "images": im_data.to(device),
198 | "orig_target_sizes": orig_size,
199 | }
200 |
201 | output = m(blob)
202 |
203 | scores, labels, keypoints = output.values()
204 | scores, labels, keypoints = scores[0], labels[0], keypoints[0]
205 | for kpt, score in zip(keypoints, scores):
206 | if score > thrh:
207 | annotator.kpts(
208 | kpt,
209 | [h, w]
210 | )
211 |
212 | # Convert back to OpenCV image
213 | frame = annotator.result()
214 |
215 | # Write the frame
216 | out.write(frame)
217 | frame_count += 1
218 |
219 | if frame_count % 100 == 0:
220 | print(f"Processed {frame_count} frames...")
221 |
222 | cap.release()
223 | out.release()
224 | print(f"Video processing complete. Result saved as '{OUTPUT_NAME}.mp4'.")
225 |
226 | def process_file(m, file_path, device):
227 | # Check if the input file is an image or a video
228 | if os.path.splitext(file_path)[-1].lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
229 | # Process as image
230 | process_image(m, file_path, device)
231 | else:
232 | # Process as video
233 | process_video(m, file_path, device)
234 |
235 | if __name__ == "__main__":
236 | import argparse
237 |
238 | parser = argparse.ArgumentParser()
239 | parser.add_argument("-trt", "--trt", type=str, required=True)
240 | parser.add_argument("--annotator", type=str, required=True, help="Annotator type: COCO or CrowdPose.")
241 | parser.add_argument("-i", "--input", type=str, required=True)
242 | parser.add_argument("-d", "--device", type=str, default="cuda:0")
243 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None)
244 |
245 | args = parser.parse_args()
246 |
247 | assert args.annotator.lower() in ['coco', 'crowdpose']
248 |
249 | # Global variables
250 | global OUTPUT_NAME, thrh, annotator_type
251 | thrh = 0.5 if args.thrh is None else args.thrh
252 |
253 | annotator_name = args.annotator.lower()
254 | if annotator_name == 'coco':
255 | annotator_type = 'COCO'
256 | elif annotator_name == 'crowdpose':
257 | annotator_type = 'CrowdPose'
258 |
259 | m = TRTInference(args.trt, device=args.device)
260 |
261 | # Check if the input argument is a file or a folder
262 | file_path = args.input
263 | if os.path.isdir(file_path):
264 | # Process a folder
265 | folder_dir = args.input
266 | if folder_dir[-1] == '/':
267 | folder_dir = folder_dir[:-1]
268 | output_dir = f"{folder_dir}/output"
269 | os.makedirs(output_dir, exist_ok=True)
270 | paths = list(glob.iglob(f"{folder_dir}/*.*"))
271 | for file_path in paths:
272 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0]
273 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}"
274 | process_file(m, file_path, args.device)
275 | else:
276 | # Process a file
277 | OUTPUT_NAME = f'trt_results_{annotator_type}'
278 | process_file(m, file_path, args.device)
--------------------------------------------------------------------------------
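
Editor's note: an example invocation of the inference script above (the engine filename is illustrative):

    python tools/inference/trt_inf.py --trt model.engine --annotator coco -i examples/example1.jpg -d cuda:0 -t 0.5

The --input argument may be a single image, a video, or a folder. For a folder, annotated copies are written to an output/ subfolder next to the inputs; for a single file, the result is saved as trt_results_<annotator>.jpg (or .mp4 for videos). --thrh sets the keypoint score threshold and defaults to 0.5.
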
/src/models/detrpose/dn_component.py:
--------------------------------------------------------------------------------
1 | """
2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation
3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved.
4 | ---------------------------------------------------------------------------------
5 | Modified from DINO (https://github.com/IDEA-Research/DINO/)
6 | Copyright (c) 2022 IDEA. All Rights Reserved.
7 | ---------------------------------------------------------------------------------
8 | Modified from DN-DETR (https://github.com/IDEA-Research/DN-DETR/)
9 | Copyright (c) 2022 IDEA. All Rights Reserved.
10 | """
11 |
12 |
13 | import torch
14 | from .utils import inverse_sigmoid
15 | import torch.nn.functional as F
16 | import numpy as np
17 |
18 | def get_sigmas(num_keypoints, device):
19 | if num_keypoints == 17:
20 | sigmas = np.array([
21 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
22 | 1.07, .87, .87, .89, .89
23 | ], dtype=np.float32) / 10.0
24 | elif num_keypoints == 14:
25 | sigmas = np.array([
26 | .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89,
27 | .79, .79
28 | ]) / 10.0
29 | elif num_keypoints == 3:
30 | sigmas = np.array([
31 | 1.07, 1.07, 0.67
32 | ]) / 10.0
33 | else:
34 | raise ValueError(f'Unsupported keypoints number {num_keypoints}')
35 | sigmas = np.concatenate([[0.1], sigmas]) # for the center of the human
36 | sigmas = torch.tensor(sigmas, device=device, dtype=torch.float32)
37 | return sigmas[None, :, None]
38 |
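# Note: these are per-keypoint OKS sigmas (the 17-keypoint set is COCO's, the
# 14-keypoint set matches CrowdPose); the prepended 0.1 is the sigma used for the
# synthetic "human center" point. The returned tensor has shape
# (1, num_keypoints + 1, 1) so it broadcasts over (query, keypoint, coordinate)
# when the denoising noise is scaled below.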
39 |
40 | def prepare_for_cdn(dn_args, training, num_queries, num_classes, num_keypoints, hidden_dim, label_enc, pose_enc, img_dim, device):
41 | """
42 | A major difference of DINO from DN-DETR is that the author process pattern embedding pattern embedding in its detector
43 | forward function and use learnable tgt embedding, so we change this function a little bit.
44 | :param dn_args: targets, dn_number, label_noise_ratio
45 | :param training: if it is training or inference
46 | :param num_queries: number of queires
47 | :param num_classes: number of classes
48 | :param hidden_dim: transformer hidden dim
49 | :param label_enc: encode labels in dn
50 | :return:
51 | """
52 | if training:
53 | targets, dn_number, label_noise_ratio = dn_args
54 | # positive and negative dn queries
55 | dn_number = dn_number * 2
56 | known = [(torch.ones_like(t['labels'])) for t in targets]
57 | batch_size = len(known)
58 | known_num = [sum(k) for k in known]
59 |
60 | if int(max(known_num)) == 0:
61 | return None, None, None, None
62 |
63 | dn_number = dn_number // (int(max(known_num) * 2))
64 | dn_number = 1 if dn_number == 0 else dn_number
65 |
66 | unmask_bbox = unmask_label = torch.cat(known)
67 |
68 | # instance label denoise
69 | labels = torch.cat([t['labels'] for t in targets])
70 | batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])
71 |
72 | known_indice = torch.nonzero(unmask_label + unmask_bbox)
73 | known_indice = known_indice.view(-1)
74 | known_indice = known_indice.repeat(2 * dn_number, 1).view(-1)
75 |
76 | known_labels = labels.repeat(2 * dn_number, 1).view(-1)
77 | known_labels_expaned = known_labels.clone()
78 |
79 | known_labels_poses_expaned = torch.arange(num_keypoints, dtype=torch.long, device=device)
80 | known_labels_poses_expaned = known_labels_poses_expaned[None].repeat(len(known_labels), 1)
81 |
82 | known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1)
83 |
84 | if label_noise_ratio > 0:
85 | p = torch.rand_like(known_labels_expaned.float())
86 | chosen_indice = torch.nonzero(p < (label_noise_ratio * 0.5)).view(-1) # half of bbox prob
87 | new_label = torch.randint_like(chosen_indice, 0, num_classes) # randomly put a new one here
88 | known_labels_expaned.scatter_(0, chosen_indice, new_label)
89 |
90 | # weights = torch.ones((len(chosen_indice), num_keypoints), device=p.device)
91 | # new_label_pose = torch.multinomial(weights, num_samples=num_keypoints, replacement=False)
92 | # known_labels_poses_expaned.scatter_(0, chosen_indice.unsqueeze(-1).repeat(1, num_keypoints), new_label_pose)
93 |
94 | # keypoint noise
95 | boxes = torch.cat([t['boxes'] for t in targets])
96 | xy = (boxes[:, :2] + boxes[:, 2:]) / 2.
97 | keypoints = torch.cat([t['keypoints'] for t in targets])
98 | if 'area' in targets[0]:
99 | areas = torch.cat([t['area'] for t in targets])
100 | else:
101 | areas = boxes[:, 2] * boxes[:, 3] * 0.53
102 | poses = keypoints[:, 0:(num_keypoints * 2)]
103 | poses = torch.cat([xy, poses], dim=1)
104 | non_viz = keypoints[:, (num_keypoints * 2):] == 0
105 | non_viz = torch.cat((torch.ones_like(non_viz[:, 0:1]).bool(), non_viz), dim=1)
106 | vars = (2 * get_sigmas(num_keypoints, device)) ** 2
107 |
108 |
109 | known_poses = poses.repeat(2 * dn_number, 1).reshape(-1, num_keypoints+1, 2)
110 | known_areas = areas.repeat(2 * dn_number)[..., None, None] # normalized [0, 1]
111 | known_areas = known_areas * img_dim[0] * img_dim[1] # scaled [0, h*w]
112 | known_non_viz = non_viz.repeat(2 * dn_number, 1)
113 |
114 | single_pad = int(max(known_num))
115 | pad_size = int(single_pad * 2 * dn_number)
116 | positive_idx = torch.tensor(range(len(poses))).long().cuda().unsqueeze(0).repeat(dn_number, 1)
117 | positive_idx += (torch.tensor(range(dn_number)) * len(poses) * 2).long().cuda().unsqueeze(1)
118 | positive_idx = positive_idx.flatten()
119 | negative_idx = positive_idx + len(poses)
120 |
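# Noise magnitude, expressed through OKS (following the COCO OKS definition
# OKS = exp(-d^2 / (2 * s^2 * k^2)) with s^2 the object area and k = 2*sigma):
# rand_alpha is first drawn as -log(OKS), i.e. OKS in [0.5, 1.0] for positive
# queries and [0.1, 0.5] for negative queries. Multiplying by 2 * area * (2*sigma)^2
# and taking the square root turns it into the pixel displacement d that produces
# exactly that OKS, and dividing by max(img_dim) maps it back to normalized
# coordinates. Non-visible keypoints receive no noise.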
121 | eps = np.finfo('float32').eps
122 | rand_vector = torch.rand_like(known_poses)
123 | rand_vector = F.normalize(rand_vector, dim=-1) # unit direction per keypoint: ||rand_vector|| = 1 along the last (x, y) dim
124 | rand_alpha = torch.zeros_like(known_poses[..., :1]).uniform_(-np.log(1), -np.log(0.5))
125 | rand_alpha[negative_idx] = rand_alpha[negative_idx].uniform_(-np.log(0.5), -np.log(0.1))
126 | rand_alpha *= 2 * (known_areas + eps) * vars ## This is distance **2
127 | rand_alpha = torch.sqrt(rand_alpha) / max(img_dim)
128 | # rand_alpha = rand_alpha ** 1.25 ## This is distance
129 | rand_alpha[known_non_viz] = 0.
130 |
131 | known_poses_expand = known_poses + rand_alpha * rand_vector
132 |
133 | m = known_labels_expaned.long().to(device)
134 | input_label_embed = label_enc(m)
135 | # input_label_pose_embed = pose_enc(known_labels_poses_expaned)
136 | input_label_pose_embed = pose_enc.weight[None].repeat(known_poses_expand.size(0), 1, 1)
137 | input_label_embed = torch.cat([input_label_embed.unsqueeze(1), input_label_pose_embed], dim=1)
138 | input_label_embed = input_label_embed.flatten(1)
139 |
140 | input_pose_embed = inverse_sigmoid(known_poses_expand)
141 |
142 | padding_label = torch.zeros(pad_size, hidden_dim * (num_keypoints + 1)).cuda()
143 | padding_pose = torch.zeros(pad_size, num_keypoints+1).cuda()
144 |
145 | input_query_label = padding_label.repeat(batch_size, 1, 1)
146 | input_query_pose = padding_pose[...,None].repeat(batch_size, 1, 1, 2)
147 |
148 | map_known_indice = torch.tensor([], device=device)
149 | if len(known_num):
150 | map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3]
151 | map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long()
152 | if len(known_bid):
153 | input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed
154 | input_query_pose[(known_bid.long(), map_known_indice)] = input_pose_embed
155 |
156 | tgt_size = pad_size + num_queries
157 | attn_mask = torch.ones(tgt_size, tgt_size, device=device) < 0
158 | # match query cannot see the reconstruct
159 | attn_mask[pad_size:, :pad_size] = True
160 | # reconstruct cannot see each other
161 | for i in range(dn_number):
162 | if i == 0:
163 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
164 | if i == dn_number - 1:
165 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True
166 | else:
167 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
168 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True
169 | # import matplotlib.pyplot as plt
170 | # plt.imshow(~attn_mask.detach().cpu().numpy(), cmap='gray')
171 | # plt.show()
172 |
173 | dn_meta = {
174 | 'pad_size': pad_size,
175 | 'num_dn_group': dn_number,
176 | }
177 | else:
178 | # inference path: no denoising queries are constructed
179 | input_query_label = None
180 | input_query_pose = None
181 | attn_mask = None
182 | dn_meta = None
183 | return input_query_label, input_query_pose, attn_mask, dn_meta
184 | return input_query_label.unflatten(-1, (-1, hidden_dim)), input_query_pose, attn_mask, dn_meta
185 |
186 |
187 | def dn_post_process(outputs_class, outputs_keypoints, dn_meta, aux_loss, _set_aux_loss):
188 | """
189 | post process of dn after output from the transformer
190 | put the dn part in the dn_meta
191 | """
192 | if dn_meta and dn_meta['pad_size'] > 0:
193 | output_known_class = outputs_class[:, :, :dn_meta['pad_size'], :]
194 | output_known_keypoints = outputs_keypoints[:, :, :dn_meta['pad_size'], :]
195 | outputs_class = outputs_class[:, :, dn_meta['pad_size']:, :]
196 | outputs_keypoints = outputs_keypoints[:, :, dn_meta['pad_size']:, :]
197 | out = {'pred_logits': output_known_class[-1], 'pred_keypoints': output_known_keypoints[-1]}
198 | if aux_loss:
199 | out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_keypoints)
200 | dn_meta['output_known_lbs_keypoints'] = out
201 | return outputs_class, outputs_keypoints
202 |
--------------------------------------------------------------------------------
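
Editor's note (illustrative): a small sketch of the tensor layout that dn_post_process above assumes. Along the query axis, the first pad_size entries are the denoising (dn) queries and the remaining num_queries entries are the matching queries. The shapes and dn_meta values below are toy numbers, and the import path is an assumption.

    import torch
    from src.models.detrpose.dn_component import dn_post_process  # assumed module path

    num_layers, bs, pad_size, num_queries, num_classes = 3, 2, 4, 6, 2
    outputs_class = torch.randn(num_layers, bs, pad_size + num_queries, num_classes)
    outputs_keypoints = torch.randn(num_layers, bs, pad_size + num_queries, 10)
    dn_meta = {'pad_size': pad_size, 'num_dn_group': 1}

    cls, kpts = dn_post_process(outputs_class, outputs_keypoints, dn_meta,
                                aux_loss=False, _set_aux_loss=None)

    assert cls.shape[2] == num_queries and kpts.shape[2] == num_queries
    assert dn_meta['output_known_lbs_keypoints']['pred_logits'].shape == (bs, pad_size, num_classes)
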