├── .gitattributes ├── src ├── solver │ ├── __init__.py │ └── engine.py ├── nn │ ├── __init__.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── warmup.py │ │ └── ema.py │ └── backbone │ │ ├── __init__.py │ │ └── resnet.py ├── misc │ ├── __init__.py │ ├── profiler.py │ ├── keypoint_ops.py │ ├── mask_ops.py │ ├── metrics.py │ ├── keypoint_loss.py │ ├── box_ops.py │ ├── get_param_dicts.py │ ├── dist_utils.py │ └── logger.py ├── core │ ├── __init__.py │ ├── utils.py │ └── instantiate.py ├── data │ ├── __init__.py │ ├── container.py │ ├── coco.py │ ├── crowdpose.py │ └── dataloader.py └── models │ └── detrpose │ ├── __init__.py │ ├── detrpose.py │ ├── postprocesses.py │ ├── matcher.py │ ├── utils.py │ ├── ms_deform_attn.py │ └── dn_component.py ├── assets ├── metrics.png ├── lambda_logo1.png ├── lambda_logo2.png ├── TENSORRT_CONTAINER_LAMBDA.AI.md └── TENSORRT_DEB_LAMBDA.AI.md ├── examples ├── example1.jpg └── example2.jpg ├── tools ├── benchmark │ ├── requirements.txt │ ├── utils.py │ ├── get_info.py │ ├── dataset.py │ ├── torch_benchmark.py │ └── trt_benchmark.py ├── deployment │ ├── export_tensorrt.py │ ├── export_yolo_w_nms.py │ └── export_onnx.py ├── visualization │ ├── backbone_encoder.py │ └── line_attention.py └── inference │ ├── onnx_inf.py │ ├── annotator_crowdpose.py │ ├── torch_inf.py │ └── trt_inf.py ├── requirements.txt ├── .gitignore ├── configs └── detrpose │ ├── detrpose_hgnetv2_l.py │ ├── detrpose_hgnetv2_l_crowdpose.py │ ├── detrpose_hgnetv2_x.py │ ├── detrpose_hgnetv2_x_crowdpose.py │ ├── detrpose_hgnetv2_m.py │ ├── detrpose_hgnetv2_s.py │ ├── detrpose_hgnetv2_m_crowdpose.py │ ├── detrpose_hgnetv2_s_crowdpose.py │ ├── detrpose_hgnetv2_n.py │ ├── detrpose_hgnetv2_n_crowdpose.py │ └── include │ ├── detrpose_hgnetv2.py │ ├── dataset.py │ └── dataset_crowdpose.py └── train.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-documentation 2 | -------------------------------------------------------------------------------- /src/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer -------------------------------------------------------------------------------- /src/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import * 2 | from .optimizer import * 3 | -------------------------------------------------------------------------------- /src/misc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved -------------------------------------------------------------------------------- /src/nn/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .warmup import LinearWarmup 2 | from .ema import ModelEMA -------------------------------------------------------------------------------- /assets/metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/metrics.png -------------------------------------------------------------------------------- /examples/example1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/examples/example1.jpg -------------------------------------------------------------------------------- /examples/example2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/examples/example2.jpg -------------------------------------------------------------------------------- /src/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .instantiate import instantiate 2 | from .lazy import LazyConfig, LazyCall -------------------------------------------------------------------------------- /assets/lambda_logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/lambda_logo1.png -------------------------------------------------------------------------------- /assets/lambda_logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SebastianJanampa/DETRPose/HEAD/assets/lambda_logo2.png -------------------------------------------------------------------------------- /tools/benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | onnxscript 3 | onnxsim 4 | tensorrt 5 | pycuda 6 | tqdm 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | transformers 3 | cloudpickle 4 | pycocotools 5 | xtcocotools 6 | omegaconf 7 | calflops 8 | iopath 9 | scipy 10 | numpy==1.23.5 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | weight/ 2 | data/COCO2017 3 | data/crowdpose 4 | output/ 5 | official_weights/ 6 | onnx_engines/ 7 | trt_engines/ 8 | clean_pth_files.py 9 | **/__pycache__/** 10 | examples/output/ 11 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco import CocoDetection 2 | # from .coco_eval import CocoEvaluator 3 | 4 | from .crowdpose import CrowdPoseDetection 5 | # from .crowdpose_eval import CrowdPoseEvaluator 6 | 7 | # from .dataloader import DataLoader, BatchImageCollateFunction 8 | -------------------------------------------------------------------------------- /src/models/detrpose/__init__.py: -------------------------------------------------------------------------------- 1 | from .matcher 
import HungarianMatcher 2 | from .detrpose import DETRPose 3 | from .criterion import Criterion 4 | from .transformer import Transformer 5 | from .postprocesses import PostProcess 6 | from .hybrid_encoder import HybridEncoder 7 | -------------------------------------------------------------------------------- /tools/deployment/export_tensorrt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | input_folder = 'onnx_engines' 4 | input_files = [f for f in os.listdir(input_folder)] 5 | 6 | output_folder = 'trt_engines' 7 | output_files = [f.replace('onnx', 'engine') for f in input_files] 8 | 9 | os.makedirs(output_folder, exist_ok=True) 10 | 11 | trtexec="/usr/src/tensorrt/bin/trtexec" 12 | 13 | for f_in, f_out in zip(input_files, output_files): 14 | cmd = f'{trtexec} --onnx="{input_folder}/{f_in}" --saveEngine="{output_folder}/{f_out}" --fp16' 15 | print(f'running:\t{cmd}') 16 | os.system(cmd) -------------------------------------------------------------------------------- /src/nn/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Conditional DETR 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copied from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | # ------------------------------------------------------------------------ 9 | 10 | from .resnet import ResNet 11 | from .hgnetv2 import HGNetv2 12 | -------------------------------------------------------------------------------- /src/misc/profiler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from calflops import calculate_flops 3 | from typing import Tuple 4 | 5 | def stats( 6 | model, 7 | input_shape: Tuple=(1, 3, 640, 640), ) -> Tuple[int, dict]: 8 | 9 | model_for_info = copy.deepcopy(model).deploy() 10 | 11 | flops, macs, _ = calculate_flops(model=model_for_info, 12 | input_shape=input_shape, 13 | output_as_string=True, 14 | output_precision=4, 15 | print_detailed=False) 16 | params = sum(p.numel() for p in model_for_info.parameters()) 17 | del model_for_info 18 | return {'flops': flops, 'macs': macs, 'params': params} 19 | -------------------------------------------------------------------------------- /src/misc/keypoint_ops.py: -------------------------------------------------------------------------------- 1 | import torch, os 2 | 3 | def keypoint_xyxyzz_to_xyzxyz(keypoints: torch.Tensor): 4 | """_summary_ 5 | 6 | Args: 7 | keypoints (torch.Tensor): ..., 51 8 | """ 9 | res = torch.zeros_like(keypoints) 10 | num_points = keypoints.shape[-1] // 3 11 | Z = keypoints[..., :2*num_points] 12 | V = keypoints[..., 2*num_points:] 13 | res[...,0::3] = Z[..., 0::2] 14 | res[...,1::3] = Z[..., 1::2] 15 | res[...,2::3] = V[...] 
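    # res is now interleaved as (x1, y1, v1, x2, y2, v2, ...): the x/y pairs come from the
    # first 2*num_points entries of the input, the visibility flags from the trailing block.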
16 | return res 17 | 18 | def keypoint_xyzxyz_to_xyxyzz(keypoints: torch.Tensor): 19 | """_summary_ 20 | 21 | Args: 22 | keypoints (torch.Tensor): ..., 51 23 | """ 24 | res = torch.zeros_like(keypoints) 25 | num_points = keypoints.shape[-1] // 3 26 | res[...,0:2*num_points:2] = keypoints[..., 0::3] 27 | res[...,1:2*num_points:2] = keypoints[..., 1::3] 28 | res[...,2*num_points:] = keypoints[..., 2::3] 29 | return res -------------------------------------------------------------------------------- /tools/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import contextlib 3 | import numpy as np 4 | from PIL import Image 5 | from collections import OrderedDict 6 | 7 | import onnx 8 | import torch 9 | 10 | 11 | def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): 12 | '''--loadInputs='image:input_tensor.bin' 13 | ''' 14 | im = Image.open(path).resize(size) 15 | data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255. 16 | data.tofile(output_name) 17 | 18 | 19 | class TimeProfiler(contextlib.ContextDecorator): 20 | def __init__(self, ): 21 | self.total = 0 22 | 23 | def __enter__(self, ): 24 | self.start = self.time() 25 | return self 26 | 27 | def __exit__(self, type, value, traceback): 28 | self.total += self.time() - self.start 29 | 30 | def reset(self, ): 31 | self.total = 0 32 | 33 | def time(self, ): 34 | if torch.cuda.is_available(): 35 | torch.cuda.synchronize() 36 | return time.time() 37 | -------------------------------------------------------------------------------- /tools/deployment/export_yolo_w_nms.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ultralytics import YOLO 3 | 4 | def main(args): 5 | output_folder = 'trt_engines' 6 | os.makedirs(output_folder, exist_ok=True) 7 | 8 | model = YOLO(f"{args.name}.pt") 9 | model.export(format="engine", nms=True, iou=args.iou_threshold, conf=args.score_threshold, half=True, dynamic=False) 10 | 11 | with open(f"{args.name}.engine", "rb") as f: 12 | meta_len = int.from_bytes(f.read(4), byteorder="little") 13 | f.seek(meta_len + 4) 14 | engine = f.read() 15 | 16 | new_name = f"{args.name}_" + str(args.iou_threshold).split('.')[1] + '_' + str(args.score_threshold).split('.')[1] 17 | with open(f"{output_folder}/{new_name}.engine", "wb") as f: 18 | f.write(engine) 19 | 20 | if __name__ == "__main__": 21 | import argparse 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--name", type=str, default="yolo11n_tuned") 25 | parser.add_argument("--score_threshold", type=float, default=0.01) 26 | parser.add_argument("--iou_threshold", type=float, default=0.7) 27 | args = parser.parse_args() 28 | 29 | main(args) -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_l.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_l" 11 | training_params.epochs = 52 # 48 + 4 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 
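# Illustrative sketch (not part of the original config): `L(...)` only records a target and
# its arguments; objects are built later via `instantiate`, e.g. as done in
# tools/benchmark/get_info.py:
#
#   from src.core import LazyConfig, instantiate
#   cfg = LazyConfig.load("configs/detrpose/detrpose_hgnetv2_l.py")
#   model = instantiate(cfg.model)   # builds DETRPose from the lazy spec
#
# How the Trainer wires `ema` and `optimizer` to the model is assumed here, not shown.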
18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | -------------------------------------------------------------------------------- /src/misc/mask_ops.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | 3 | # needed due to empty tensor bug in pytorch and torchvision 0.5 4 | import torchvision 5 | __torchvision_need_compat_flag = float(torchvision.__version__.split('.')[1]) < 7 6 | if __torchvision_need_compat_flag: 7 | from torchvision.ops import _new_empty_tensor 8 | from torchvision.ops.misc import _output_size 9 | 10 | def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): 11 | # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor 12 | """ 13 | Equivalent to nn.functional.interpolate, but with support for empty batch sizes. 14 | This will eventually be supported natively by PyTorch, and this 15 | class can go away. 16 | """ 17 | if __torchvision_need_compat_flag < 0.7: 18 | if input.numel() > 0: 19 | return torch.nn.functional.interpolate( 20 | input, size, scale_factor, mode, align_corners 21 | ) 22 | 23 | output_shape = _output_size(2, input, size, scale_factor) 24 | output_shape = list(input.shape[:-2]) + list(output_shape) 25 | return _new_empty_tensor(input, output_shape) 26 | else: 27 | return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_l_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_l_crowdpose" 11 | training_params.epochs = 64 # 48 + 16 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_x.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import 
dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_x" 11 | training_params.epochs = 52 # 48 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.00005 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[1000], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B5' 43 | model.encoder.hidden_dim = 384 44 | model.encoder.dim_feedforward = 2048 45 | model.transformer.hidden_dim = 384 46 | # model.transformer.feat_channels = [384, 384, 384] 47 | model.transformer.reg_scale = 8 48 | -------------------------------------------------------------------------------- /tools/benchmark/get_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 3 | """ 4 | 5 | import os, sys 6 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 7 | from src.core import LazyConfig, instantiate 8 | 9 | import argparse 10 | from calflops import calculate_flops 11 | 12 | import torch 13 | import torch.nn as nn 14 | 15 | def custom_repr(self): 16 | return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' 17 | original_repr = torch.Tensor.__repr__ 18 | torch.Tensor.__repr__ = custom_repr 19 | 20 | def main(args, ): 21 | """main 22 | """ 23 | cfg = LazyConfig.load(args.config_file) 24 | 25 | if hasattr(cfg.model.backbone, 'pretrained'): 26 | cfg.model.backbone.pretrained = False 27 | 28 | model = instantiate(cfg.model) 29 | 30 | model = model.deploy() 31 | model.eval() 32 | 33 | flops, macs, _ = calculate_flops(model=model, 34 | input_shape=(1, 3, 640, 640), 35 | output_as_string=True, 36 | output_precision=4) 37 | params = sum(p.numel() for p in model.parameters()) 38 | print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params)) 39 | 40 | 41 | if __name__ == '__main__': 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--config_file', '-c', default= "configs/linea/linea_hgnetv2_lpy", type=str) 45 | args = parser.parse_args() 46 | 47 | main(args) 48 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_x_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_x_crowdpose" 11 | training_params.epochs = 52 # 48 + 4 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = 
L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B5' 47 | model.encoder.hidden_dim = 384 48 | model.encoder.dim_feedforward = 2048 49 | model.transformer.hidden_dim = 384 50 | # model.transformer.feat_channels = [384, 384, 384] 51 | model.transformer.reg_scale = 8 52 | -------------------------------------------------------------------------------- /src/models/detrpose/detrpose.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 
13 | """ 14 | 15 | from torch import nn 16 | 17 | class DETRPose(nn.Module): 18 | def __init__( 19 | self, 20 | backbone, 21 | encoder, 22 | transformer 23 | ): 24 | super().__init__() 25 | self.backbone = backbone 26 | self.encoder = encoder 27 | self.transformer = transformer 28 | 29 | def deploy(self): 30 | self.eval() 31 | for m in self.modules(): 32 | if hasattr(m, "convert_to_deploy"): 33 | m.convert_to_deploy() 34 | return self 35 | 36 | def forward(self, samples, targets=None): 37 | feats = self.backbone(samples) 38 | feats = self.encoder(feats) 39 | out = self.transformer(feats, targets, samples if self.training else None) 40 | return out 41 | 42 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_m.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_m" 11 | training_params.epochs = 64 # 60 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.00001 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[15], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B2' 43 | model.backbone.use_lab = True 44 | model.encoder.in_channels = [384, 768, 1536] 45 | model.encoder.depth_mult = 0.67 46 | model.transformer.num_decoder_layers = 4 47 | 48 | dataset_train.dataset.transforms.policy = { 49 | 'name': 'stop_epoch', 50 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 51 | 'epoch': [5, 35, 60] # 60 / 2 + 5 = 35 52 | } 53 | dataset_train.collate_fn.base_size_repeat = 6 54 | dataset_train.collate_fn.stop_epoch = 60 55 | -------------------------------------------------------------------------------- /src/nn/optimizer/warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | from torch.optim.lr_scheduler import LRScheduler 7 | 8 | 9 | class Warmup(object): 10 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None: 11 | self.lr_scheduler = lr_scheduler 12 | self.warmup_end_values = [pg['lr'] for pg in lr_scheduler.optimizer.param_groups] 13 | self.last_step = last_step 14 | self.warmup_duration = warmup_duration 15 | self.step() 16 | 17 | def state_dict(self): 18 | return {k: v for k, v in self.__dict__.items() if k != 'lr_scheduler'} 19 | 20 | def load_state_dict(self, state_dict): 21 | self.__dict__.update(state_dict) 22 | 23 | def get_warmup_factor(self, step, **kwargs): 24 | raise NotImplementedError 25 | 26 | def step(self, ): 27 | self.last_step += 1 28 | if self.last_step >= self.warmup_duration: 29 | return 30 | factor = self.get_warmup_factor(self.last_step) 31 | for i, pg in enumerate(self.lr_scheduler.optimizer.param_groups): 32 | pg['lr'] = factor * self.warmup_end_values[i] 33 | 34 | def finished(self, ): 35 | if self.last_step >= self.warmup_duration: 36 | return True 37 | return False 38 | 39 | 40 | class LinearWarmup(Warmup): 41 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None: 42 | super().__init__(lr_scheduler, warmup_duration, last_step) 43 | 44 | def get_warmup_factor(self, step): 45 | return min(1.0, (step + 1) / self.warmup_duration) -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_s.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_s" 11 | training_params.epochs = 100 # 96 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.0001 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[1000], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B0' 43 | model.backbone.use_lab = True 44 | model.encoder.in_channels = [256, 512, 1024] 45 | model.encoder.depth_mult=0.34 46 | model.encoder.expansion=0.5 47 | model.transformer.num_decoder_layers = 3 48 | 49 | dataset_train.dataset.transforms.policy = { 50 | 'name': 'stop_epoch', 51 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 52 | 'epoch': [5, 53, 96] # 96 / 2 + 5 = 53 53 | } 54 | dataset_train.collate_fn.base_size_repeat = 20 55 | dataset_train.collate_fn.stop_epoch = 96 56 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_m_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, 
dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_m_crowdpose" 11 | training_params.epochs = 76 # 72 + 4 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B2' 47 | model.backbone.use_lab = True 48 | model.encoder.in_channels = [384, 768, 1536] 49 | model.encoder.depth_mult = 0.67 50 | model.transformer.num_decoder_layers = 4 51 | 52 | dataset_train.dataset.transforms.policy = { 53 | 'name': 'stop_epoch', 54 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 55 | 'epoch': [5, 41, 72] # 72 / 2 + 5 = 35 56 | } 57 | dataset_train.collate_fn.base_size_repeat = 6 58 | dataset_train.collate_fn.stop_epoch = 72 59 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_s_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_s_crowdpose" 11 | training_params.epochs = 176 # 156 + 20 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B0' 47 | model.backbone.use_lab = True 48 | model.encoder.in_channels = [256, 512, 1024] 49 | model.encoder.depth_mult=0.34 50 | model.encoder.expansion=0.5 51 | model.transformer.num_decoder_layers = 3 52 | 53 | dataset_train.dataset.transforms.policy = { 54 | 'name': 'stop_epoch', 55 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 56 | 'epoch': [5, 83, 156] # 156 / 2 + 5 = 83 57 | } 58 | dataset_train.collate_fn.base_size_repeat = 20 59 | dataset_train.collate_fn.stop_epoch = 156 60 | -------------------------------------------------------------------------------- /src/core/utils.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from typing import Any 4 | from iopath.common.file_io import PathManager as PathManagerBase 5 | 6 | PathManager = PathManagerBase() 7 | 8 | def _convert_target_to_string(t: Any) -> str: 9 | """ 10 | Inverse of ``locate()``. 11 | 12 | Args: 13 | t: any object with ``__module__`` and ``__qualname__`` 14 | """ 15 | module, qualname = t.__module__, t.__qualname__ 16 | 17 | # Compress the path to this object, e.g. ``module.submodule._impl.class`` 18 | # may become ``module.submodule.class``, if the later also resolves to the same 19 | # object. This simplifies the string, and also is less affected by moving the 20 | # class implementation. 21 | module_parts = module.split(".") 22 | for k in range(1, len(module_parts)): 23 | prefix = ".".join(module_parts[:k]) 24 | candidate = f"{prefix}.{qualname}" 25 | try: 26 | if locate(candidate) is t: 27 | return candidate 28 | except ImportError: 29 | pass 30 | return f"{module}.{qualname}" 31 | 32 | 33 | def locate(name: str) -> Any: 34 | """ 35 | Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``, 36 | such as "module.submodule.class_name". 37 | 38 | Raise Exception if it cannot be found. 39 | """ 40 | obj = pydoc.locate(name) 41 | 42 | # Some cases (e.g. torch.optim.sgd.SGD) not handled correctly 43 | # by pydoc.locate. Try a private function from hydra. 44 | if obj is None: 45 | try: 46 | # from hydra.utils import get_method - will print many errors 47 | from hydra.utils import _locate 48 | except ImportError as e: 49 | raise ImportError(f"Cannot dynamically locate object {name}!") from e 50 | else: 51 | obj = _locate(name) # it raises if fails 52 | 53 | return obj 54 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_n.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_n" 11 | training_params.epochs = 160 # 96 + 4 12 | training_params.use_ema = True 13 | training_params.grad_accum_steps = 1 14 | 15 | ema = L(ModelEMA)( 16 | decay=0.9999, 17 | warmups=2000 18 | ) 19 | 20 | # optimizer params 21 | optimizer = L(optim.AdamW)( 22 | params=L(get_optim_params)( 23 | cfg=[ 24 | { 25 | 'params': '^(?=.*backbone).*$', 26 | 'lr': 0.0001 27 | }, 28 | ], 29 | # model=model 30 | ), 31 | lr=0.0001, 32 | betas=[0.9, 0.999], 33 | weight_decay=0.0001 34 | ) 35 | 36 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 37 | # optimizer=optimizer, 38 | milestones=[1000], 39 | gamma=0.1 40 | ) 41 | 42 | model.backbone.name = 'B0' 43 | model.backbone.use_lab = True 44 | model.backbone.return_idx = [2, 3] 45 | model.encoder.in_channels = [512, 1024] 46 | model.encoder.feat_strides = [16, 32] 47 | model.encoder.n_levels = 2 48 | model.encoder.use_encoder_idx = [1] 49 | model.encoder.depth_mult = 0.5 50 | model.encoder.expansion = 0.34 51 | model.encoder.hidden_dim = 128 52 | model.encoder.dim_feedforward = 512 53 | model.transformer.num_decoder_layers = 3 54 | 
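# The nano variant keeps only the last two backbone stages (return_idx=[2, 3]), so the
# transformer settings below mirror the two-level encoder configuration above
# (feat_strides=[16, 32], two feature levels).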
model.transformer.num_feature_levels = 2 55 | model.transformer.dim_feedforward = 512 56 | model.transformer.feat_strides = [16, 32] 57 | model.transformer.hidden_dim = 128 58 | model.transformer.dec_n_points= 6 59 | 60 | dataset_train.dataset.transforms.policy = { 61 | 'name': 'stop_epoch', 62 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 63 | 'epoch': [5, 83, 156] # 156 / 2 + 5 = 83 64 | } 65 | dataset_train.collate_fn.base_size_repeat = None 66 | dataset_train.collate_fn.stop_epoch = 156 67 | -------------------------------------------------------------------------------- /configs/detrpose/detrpose_hgnetv2_n_crowdpose.py: -------------------------------------------------------------------------------- 1 | from .include.detrpose_hgnetv2 import model, criterion, training_params, postprocessor 2 | from .include.dataset_crowdpose import dataset_train, dataset_val, dataset_test, evaluator 3 | 4 | from src.core import LazyCall as L 5 | from src.nn.optimizer import ModelEMA 6 | from src.misc.get_param_dicts import get_optim_params 7 | 8 | from torch import optim 9 | 10 | training_params.output_dir = "output/detrpose_hgnetv2_n_crowdpose" 11 | training_params.epochs = 284 # 264 + 20 12 | training_params.use_ema = True 13 | 14 | ema = L(ModelEMA)( 15 | decay=0.9999, 16 | warmups=2000 17 | ) 18 | 19 | # optimizer params 20 | optimizer = L(optim.AdamW)( 21 | params=L(get_optim_params)( 22 | cfg=[ 23 | { 24 | 'params': '^(?=.*backbone).*$', 25 | 'lr': 0.00001 26 | }, 27 | ], 28 | # model=model 29 | ), 30 | lr=0.0001, 31 | betas=[0.9, 0.999], 32 | weight_decay=0.0001 33 | ) 34 | 35 | lr_scheduler = L(optim.lr_scheduler.MultiStepLR)( 36 | # optimizer=optimizer, 37 | milestones=[1000], 38 | gamma=0.1 39 | ) 40 | 41 | model.transformer.num_body_points=14 42 | criterion.matcher.num_body_points=14 43 | criterion.num_body_points=14 44 | postprocessor.num_body_points=14 45 | 46 | model.backbone.name = 'B0' 47 | model.backbone.use_lab = True 48 | model.backbone.return_idx = [2, 3] 49 | model.encoder.in_channels = [512, 1024] 50 | model.encoder.feat_strides = [16, 32] 51 | model.encoder.n_levels = 2 52 | model.encoder.use_encoder_idx = [1] 53 | model.encoder.depth_mult = 0.5 54 | model.encoder.expansion = 0.34 55 | model.encoder.hidden_dim = 128 56 | model.encoder.dim_feedforward = 512 57 | model.transformer.num_decoder_layers = 3 58 | model.transformer.num_feature_levels = 2 59 | model.transformer.dim_feedforward = 512 60 | model.transformer.feat_strides = [16, 32] 61 | model.transformer.hidden_dim = 128 62 | model.transformer.dec_n_points= 6 63 | 64 | dataset_train.dataset.transforms.policy = { 65 | 'name': 'stop_epoch', 66 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 67 | 'epoch': [5, 137, 264] # 264 / 2 + 5 = 137 68 | } 69 | dataset_train.collate_fn.base_size_repeat = None 70 | dataset_train.collate_fn.stop_epoch = 264 71 | -------------------------------------------------------------------------------- /src/models/detrpose/postprocesses.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 6 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 
7 | --------------------------------------------------------------------------------- 8 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 9 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 12 | Copyright (c) 2023 IDEA. All Rights Reserved. 13 | """ 14 | 15 | import torch 16 | from torch import nn 17 | from torchvision.ops.boxes import nms 18 | 19 | 20 | class PostProcess(nn.Module): 21 | """ This module converts the model's output into the format expected by the coco api""" 22 | def __init__(self, num_select=60, num_body_points=17) -> None: 23 | super().__init__() 24 | self.num_select = num_select 25 | self.num_body_points = num_body_points 26 | self.deploy_mode = False 27 | 28 | @torch.no_grad() 29 | def forward(self, outputs, target_sizes): 30 | num_select = self.num_select 31 | out_logits, out_keypoints= outputs['pred_logits'], outputs['pred_keypoints'] 32 | 33 | prob = out_logits.sigmoid() 34 | topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), num_select, dim=1) 35 | scores = topk_values 36 | 37 | # keypoints 38 | topk_keypoints = (topk_indexes.float() // out_logits.shape[2]).long() 39 | labels = topk_indexes % out_logits.shape[2] 40 | 41 | if self.deploy_mode: 42 | keypoints = torch.gather(out_keypoints, 1, topk_keypoints[..., None, None].expand(1, num_select, self.num_body_points, 2)) 43 | keypoints = keypoints * target_sizes[:, None, None, :] 44 | return scores, labels, keypoints 45 | 46 | keypoints = torch.gather(out_keypoints, 1, topk_keypoints.unsqueeze(-1).repeat(1, 1, self.num_body_points*2)) 47 | keypoints = keypoints * target_sizes.repeat(1, self.num_body_points)[:, None, :] 48 | keypoints_res = keypoints.unflatten(-1, (-1, 2)) 49 | keypoints_res = torch.cat( 50 | [keypoints_res, torch.ones_like(keypoints_res[..., 0:1])], 51 | dim=-1).flatten(-2) 52 | 53 | results = [{'scores': s, 'labels': l, 'keypoints': k} for s, l, k in zip(scores, labels, keypoints_res)] 54 | return results 55 | 56 | def deploy(self, ): 57 | self.eval() 58 | self.deploy_mode = True 59 | return self 60 | -------------------------------------------------------------------------------- /src/misc/metrics.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import json 3 | import torch 4 | 5 | def inverse_sigmoid(x, eps=1e-5): 6 | x = x.clamp(min=0, max=1) 7 | x1 = x.clamp(min=eps) 8 | x2 = (1 - x).clamp(min=eps) 9 | return torch.log(x1/x2) 10 | 11 | 12 | class BestMetricSingle(): 13 | def __init__(self, init_res=0.0, better='large') -> None: 14 | self.init_res = init_res 15 | self.best_res = init_res 16 | self.best_ep = -1 17 | 18 | self.better = better 19 | assert better in ['large', 'small'] 20 | 21 | def isbetter(self, new_res, old_res): 22 | if self.better == 'large': 23 | return new_res > old_res 24 | if self.better == 'small': 25 | return new_res < old_res 26 | 27 | def update(self, new_res, ep): 28 | if self.isbetter(new_res, self.best_res): 29 | self.best_res = new_res 30 | self.best_ep = ep 31 | return True 32 | return False 33 | 34 | def __str__(self) -> str: 35 | return "best_res: {}\t best_ep: {}".format(self.best_res, self.best_ep) 36 | 37 | def __repr__(self) -> str: 38 | return self.__str__() 39 | 40 | def summary(self) -> dict: 41 | return { 42 | 'best_res': self.best_res, 43 | 'best_ep': self.best_ep, 44 | 
} 45 | 46 | 47 | class BestMetricHolder(): 48 | def __init__(self, init_res=0.0, better='large', use_ema=False) -> None: 49 | self.best_all = BestMetricSingle(init_res, better) 50 | self.use_ema = use_ema 51 | if use_ema: 52 | self.best_ema = BestMetricSingle(init_res, better) 53 | self.best_regular = BestMetricSingle(init_res, better) 54 | 55 | 56 | def update(self, new_res, epoch, is_ema=False): 57 | """ 58 | return if the results is the best. 59 | """ 60 | if not self.use_ema: 61 | return self.best_all.update(new_res, epoch) 62 | else: 63 | if is_ema: 64 | self.best_ema.update(new_res, epoch) 65 | return self.best_all.update(new_res, epoch) 66 | else: 67 | self.best_regular.update(new_res, epoch) 68 | return self.best_all.update(new_res, epoch) 69 | 70 | def summary(self): 71 | if not self.use_ema: 72 | return self.best_all.summary() 73 | 74 | res = {} 75 | res.update({f'all_{k}':v for k,v in self.best_all.summary().items()}) 76 | res.update({f'regular_{k}':v for k,v in self.best_regular.summary().items()}) 77 | res.update({f'ema_{k}':v for k,v in self.best_ema.summary().items()}) 78 | return res 79 | 80 | def __repr__(self) -> str: 81 | return json.dumps(self.summary(), indent=2) 82 | 83 | def __str__(self) -> str: 84 | return self.__repr__() -------------------------------------------------------------------------------- /configs/detrpose/include/detrpose_hgnetv2.py: -------------------------------------------------------------------------------- 1 | from src.core import LazyCall as L 2 | from src.models.detrpose import ( 3 | DETRPose, 4 | HybridEncoder, 5 | Transformer, 6 | PostProcess, 7 | Criterion, 8 | HungarianMatcher, 9 | ) 10 | 11 | from src.nn import HGNetv2 12 | 13 | training_params = { 14 | "clip_max_norm": 0.1, 15 | "save_checkpoint_interval": 1, 16 | "grad_accum_steps": 2, 17 | "print_freq": 100, 18 | 'sync_bn': True, 19 | 'use_ema': False, 20 | 'dist_url': 'env://', 21 | } 22 | 23 | eval_spatial_size = (640, 640) 24 | hidden_dim = 256 25 | n_levels = 3 26 | feat_strides = [8, 16, 32] 27 | num_classes = 2 28 | 29 | model = L(DETRPose)( 30 | backbone=L(HGNetv2)( 31 | name='B4', 32 | use_lab=False, 33 | return_idx=[1, 2, 3], 34 | freeze_stem_only=True, 35 | freeze_at=-1, 36 | freeze_norm=True, 37 | pretrained=True, 38 | ), 39 | encoder=L(HybridEncoder)( 40 | in_channels=[512, 1024, 2048], 41 | feat_strides=feat_strides, 42 | n_levels=n_levels, 43 | hidden_dim=hidden_dim, 44 | nhead=8, 45 | dim_feedforward=1024, 46 | dropout=0.0, 47 | enc_act='gelu', 48 | expansion=1.0, 49 | depth_mult=1.0, 50 | act='silu', 51 | temperatureH=20, 52 | temperatureW=20, 53 | eval_spatial_size= eval_spatial_size 54 | ), 55 | transformer=L(Transformer)( 56 | hidden_dim=hidden_dim, 57 | dropout=0.0, 58 | nhead=8, 59 | num_queries=60, 60 | dim_feedforward=1024, 61 | num_decoder_layers=6, 62 | normalize_before=False, 63 | return_intermediate_dec=True, 64 | activation='relu', 65 | num_feature_levels=3, 66 | dec_n_points=4, 67 | learnable_tgt_init=True, 68 | two_stage_type='standard', 69 | num_body_points=17, 70 | aux_loss=True, 71 | num_classes=num_classes, 72 | dec_pred_class_embed_share = False, 73 | dec_pred_pose_embed_share = False, 74 | two_stage_class_embed_share=False, 75 | two_stage_bbox_embed_share=False, 76 | cls_no_bias = False, 77 | # new parameters 78 | feat_strides=[8, 16, 32], 79 | eval_spatial_size=eval_spatial_size, 80 | reg_max=32, 81 | reg_scale=4 82 | ), 83 | ) 84 | 85 | criterion = L(Criterion)( 86 | num_classes=num_classes, 87 | weight_dict={'loss_vfl': 2.0, 'loss_keypoints': 
10.0, 'loss_oks': 4.0}, 88 | focal_alpha=0.25, 89 | losses=['vfl', 'keypoints'], 90 | matcher=L(HungarianMatcher)( 91 | cost_class=2.0, 92 | cost_keypoints=10.0, 93 | cost_oks=4.0, 94 | focal_alpha=0.25 95 | ), 96 | num_body_points=17 97 | ) 98 | 99 | postprocessor = L(PostProcess)(num_select=60, num_body_points=17) 100 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from omegaconf import OmegaConf 3 | 4 | from src.solver import Trainer 5 | from src.misc import dist_utils 6 | from src.core import LazyConfig, instantiate 7 | 8 | def get_args_parser(): 9 | parser = argparse.ArgumentParser('Set transformer detector', add_help=False) 10 | parser.add_argument('--config_file', '-c', type=str, required=True) 11 | parser.add_argument('--options', 12 | nargs='+', 13 | help='override some settings in the used config, the key-value pair ' 14 | 'in xxx=yyy format will be merged into config file.') 15 | parser.add_argument('--device', default='cuda', 16 | help='device to use for training / testing') 17 | parser.add_argument('--seed', default=42, type=int) 18 | parser.add_argument('--resume', default=None, help='resume from checkpoint') 19 | parser.add_argument('--pretrain', default=None, help='apply transfer learning to the backbone and encoder using DFINE weights') 20 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N', 21 | help='start epoch') 22 | parser.add_argument('--eval', action='store_true') 23 | parser.add_argument('--test', action='store_true') 24 | parser.add_argument('--find_unused_params', action='store_true') 25 | 26 | # distributed training parameters 27 | parser.add_argument('--world_size', default=1, type=int, 28 | help='number of distributed processes') 29 | parser.add_argument('--rank', default=0, type=int, 30 | help='number of distributed processes') 31 | parser.add_argument("--local_rank", type=int, help='local rank for DistributedDataParallel') 32 | parser.add_argument('--amp', action='store_true', 33 | help="Train with mixed precision") 34 | 35 | return parser 36 | 37 | def main(args): 38 | cfg = LazyConfig.load(args.config_file) 39 | 40 | updates = OmegaConf.create() 41 | for k, v in args.__dict__.items(): 42 | if k not in ["options"] and v is not None: 43 | updates[k] = v 44 | cfg.training_params = OmegaConf.merge(cfg.training_params, updates) 45 | 46 | if args.options: 47 | cfg = LazyConfig.apply_overrides(cfg, args.options) 48 | print(cfg) 49 | 50 | solver = Trainer(cfg) 51 | 52 | assert not(args.eval and args.test), "you can't do evaluation and test at the same time" 53 | 54 | if args.eval: 55 | if hasattr(cfg.model.backbone, 'pretrained'): 56 | cfg.model.backbone.pretrained = False 57 | solver.eval() 58 | elif args.test: 59 | if hasattr(cfg.model.backbone, 'pretrained'): 60 | cfg.model.backbone.pretrained = False 61 | solver.test() 62 | else: 63 | solver.fit() 64 | dist_utils.cleanup() 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser('RT-GroupPose training and evaluation script', parents=[get_args_parser()]) 68 | args = parser.parse_args() 69 | main(args) 70 | -------------------------------------------------------------------------------- /assets/TENSORRT_CONTAINER_LAMBDA.AI.md: -------------------------------------------------------------------------------- 1 |

2 | # Manual for installing TensorRT containers on Lambda.ai instances 3 |

4 | 5 | ## Quick Start 6 | ### Lambda.ai 7 | 1. Go to [Lambda.ai](https://lambda.ai) and create an account. 8 | 2. Log in to your Lambda.ai account. 9 | 3. Click on the `Launch instance' button. It is located on the top right side of the website. 10 | 4. Select an instance. To replicate our results from the appendix, select `8x Tesla V100 (16 GB)` 11 | 12 | ### TensorRT Container Installation 13 | 1. Docker setup 14 | ```shell 15 | sudo usermod -aG docker $USER 16 | newgrp docker 17 | ``` 18 | 3. Installing Nvidia DeepLearning container 19 | ```shell 20 | curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ 21 | && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ 22 | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ 23 | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list 24 | ``` 25 | 2. Installing TensorRT docker container 26 | ```shell 27 | docker pull nvcr.io/nvidia/tensorrt:24.04-py3 28 | docker run --gpus all -it --rm nvcr.io/nvidia/tensorrt:24.04-py3 29 | ``` 30 | 31 | 3. Install the CUDA toolkit with the correct version (in our case 12.8) 32 | ```shell 33 | # cuda installation 34 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 35 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 36 | sudo apt-get update 37 | sudo apt-get -y install cuda-toolkit-12-8 38 | ``` 39 | 40 | The complete installation takes approximately 5 minutes. 41 | 42 | ## Installing DETRPose 43 | ### Quick Start 44 | ```shell 45 | git clone https://github.com/SebastianJanampa/DETRPose.git 46 | cd DETRPose 47 | pip install -r requirements.txt 48 | apt-get update && apt-get install libgl1 49 | ``` 50 | 51 | ### Data Preparation 52 | ``` 53 | pip install gdown # to download files from google drive 54 | gdown 1VprytECcLtU4tKP32SYi_7oDRbw7yUTL # images 55 | unzip images.zip 56 | ``` 57 | 58 | ### Usage 59 | ```shell 60 | pip install onnx onnxsim 61 | pip install -r tools/benchmark/requirements.txt 62 | 63 | export model=l #n, s, m, l, x 64 | mkdir trt_engines 65 | ``` 66 | 1. Download official weights 67 | ```shell 68 | wget https://github.com/SebastianJanampa/DETRPose/releases/download/model_weights/detrpose_hgnetv2_${model}.pth 69 | ``` 70 | 2. Export onnx 71 | ```shell 72 | python tools/deployment/export_onnx.py --check -c configs/detrpose/detrpose_hgnetv2_${model}.py -r detrpose_hgnetv2_${model}.pth 73 | ``` 74 | 3. Export tensorrt 75 | ```shell 76 | trtexec --onnx="onnx_engines/detrpose_hgnetv2_${model}.onnx" --saveEngine="trt_engines/detrpose_hgnetv2_${model}.engine" --fp16 77 | ``` 78 | 4. 
Benchmark 79 | ```shell 80 | python tools/benchmark/trt_benchmark.py --infer_dir ./images --engine_dir trt_engines 81 | ``` -------------------------------------------------------------------------------- /configs/detrpose/include/dataset.py: -------------------------------------------------------------------------------- 1 | from src.core import LazyCall as L 2 | from src.data import CocoDetection 3 | from src.data.dataloader import ( 4 | BatchImageCollateFunction, 5 | DataLoader 6 | ) 7 | from src.data.coco_eval import CocoEvaluator 8 | from src.data.container import Compose 9 | import src.data.transforms as T 10 | 11 | from .detrpose_hgnetv2 import eval_spatial_size 12 | 13 | from omegaconf import OmegaConf 14 | 15 | scales = [(640, 640)] 16 | max_size = 1333 17 | scales2_resize = [400, 500, 600] 18 | 19 | __all__ = ["dataset_train", "dataset_val", "dataset_test", "evaluator"] 20 | 21 | dataset_train = L(DataLoader)( 22 | dataset=L(CocoDetection)( 23 | img_folder="./data/COCO2017/train2017", 24 | ann_file="./data/COCO2017/annotations/person_keypoints_train2017.json", 25 | transforms=L(Compose)( 26 | policy={ 27 | 'name': 'stop_epoch', 28 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 29 | 'epoch': [5, 29, 48] 30 | }, 31 | mosaic_prob=0.5, 32 | transforms1=L(T.Mosaic)(output_size=320, probability=1.0), 33 | transforms2=L(T.RandomZoomOut)(p=0.5), 34 | transforms3=L(T.RandomHorizontalFlip)(), 35 | transforms4=L(T.ColorJitter)(), 36 | transforms5=L(T.RandomResize)(sizes=scales, max_size=max_size), 37 | transforms6=L(T.ToTensor)(), 38 | transforms7=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 39 | ), 40 | 41 | ), 42 | total_batch_size=16, 43 | collate_fn=L(BatchImageCollateFunction)( 44 | base_size=eval_spatial_size[0], 45 | base_size_repeat=4, 46 | stop_epoch=48, 47 | ), 48 | num_workers=4, 49 | shuffle=True, 50 | drop_last=True, 51 | pin_memory=True 52 | ) 53 | 54 | dataset_val = L(DataLoader)( 55 | dataset=L(CocoDetection)( 56 | img_folder="./data/COCO2017/val2017", 57 | ann_file="./data/COCO2017/annotations/person_keypoints_val2017.json", 58 | transforms=L(Compose)( 59 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 60 | transforms2=L(T.ToTensor)(), 61 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 62 | ), 63 | ), 64 | total_batch_size=32, 65 | collate_fn=L(BatchImageCollateFunction)( 66 | base_size=eval_spatial_size[0], 67 | ), 68 | num_workers=4, 69 | shuffle=False, 70 | drop_last=False, 71 | pin_memory=True 72 | ) 73 | 74 | dataset_test = L(DataLoader)( 75 | dataset=L(CocoDetection)( 76 | img_folder="./data/COCO2017/test2017", 77 | ann_file="./data/COCO2017/annotations/image_info_test-dev2017.json", 78 | transforms=L(Compose)( 79 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 80 | transforms2=L(T.ToTensor)(), 81 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 82 | ), 83 | ), 84 | total_batch_size=32, 85 | collate_fn=L(BatchImageCollateFunction)( 86 | base_size=eval_spatial_size[0], 87 | ), 88 | num_workers=4, 89 | shuffle=False, 90 | drop_last=False, 91 | pin_memory=True 92 | ) 93 | 94 | evaluator = L(CocoEvaluator)( 95 | ann_file="./data/COCO2017/annotations/person_keypoints_val2017.json", 96 | iou_types=['keypoints'], 97 | useCats=True 98 | ) 99 | 100 | -------------------------------------------------------------------------------- /configs/detrpose/include/dataset_crowdpose.py: -------------------------------------------------------------------------------- 1 | from src.core import 
LazyCall as L 2 | from src.data import CrowdPoseDetection 3 | from src.data.dataloader import ( 4 | BatchImageCollateFunction, 5 | DataLoader 6 | ) 7 | from src.data.crowdpose_eval import CrowdPoseEvaluator 8 | from src.data.container import Compose 9 | import src.data.transforms as T 10 | import src.data.transforms_crowdpose as CrowdT 11 | 12 | from .detrpose_hgnetv2 import eval_spatial_size 13 | 14 | from omegaconf import OmegaConf 15 | 16 | scales = [(640, 640)] 17 | max_size = 1333 18 | scales2_resize = [400, 500, 600] 19 | 20 | __all__ = ["dataset_train", "dataset_val", "dataset_test", "evaluator"] 21 | 22 | dataset_train = L(DataLoader)( 23 | dataset=L(CrowdPoseDetection)( 24 | img_folder="./data/crowdpose/images", 25 | ann_file="./data/crowdpose/annotations/crowdpose_trainval.json", 26 | transforms=L(Compose)( 27 | policy={ 28 | 'name': 'stop_epoch', 29 | 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'], 30 | 'epoch': [5, 29, 60] 31 | }, 32 | mosaic_prob=0.5, 33 | transforms1=L(T.Mosaic)(output_size=320, probability=1.0), 34 | transforms2=L(T.RandomZoomOut)(p=0.5), 35 | transforms3=L(CrowdT.RandomHorizontalFlip)(p=0.5), 36 | transforms4=L(T.ColorJitter)(), 37 | transforms5=L(T.RandomResize)(sizes=scales, max_size=max_size), 38 | transforms6=L(T.ToTensor)(), 39 | transforms7=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 40 | ), 41 | 42 | ), 43 | total_batch_size=16, 44 | collate_fn=L(BatchImageCollateFunction)( 45 | base_size=eval_spatial_size[0], 46 | base_size_repeat=4, 47 | stop_epoch=60, 48 | ), 49 | num_workers=4, 50 | shuffle=True, 51 | drop_last=True, 52 | pin_memory=True 53 | ) 54 | 55 | dataset_val = L(DataLoader)( 56 | dataset=L(CrowdPoseDetection)( 57 | img_folder="./data/crowdpose/images", 58 | ann_file="./data/crowdpose/annotations/crowdpose_test.json", 59 | transforms=L(Compose)( 60 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 61 | transforms2=L(T.ToTensor)(), 62 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 63 | ), 64 | ), 65 | total_batch_size=32, 66 | collate_fn=L(BatchImageCollateFunction)( 67 | base_size=eval_spatial_size[0], 68 | ), 69 | num_workers=4, 70 | shuffle=False, 71 | drop_last=False, 72 | pin_memory=True 73 | ) 74 | 75 | dataset_test = L(DataLoader)( 76 | dataset=L(CrowdPoseDetection)( 77 | img_folder="./data/crowdpose/images", 78 | ann_file="./data/crowdpose/annotations/crowdpose_test.json", 79 | transforms=L(Compose)( 80 | transforms1=L(T.RandomResize)(sizes=[eval_spatial_size], max_size=max_size), 81 | transforms2=L(T.ToTensor)(), 82 | transforms3=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]) 83 | ), 84 | ), 85 | total_batch_size=32, 86 | collate_fn=L(BatchImageCollateFunction)( 87 | base_size=eval_spatial_size[0], 88 | ), 89 | num_workers=4, 90 | shuffle=False, 91 | drop_last=False, 92 | pin_memory=True 93 | ) 94 | 95 | evaluator = L(CrowdPoseEvaluator)( 96 | ann_file="./data/crowdpose/annotations/crowdpose_test.json", 97 | iou_types=['keypoints_crowd'], 98 | useCats=True 99 | ) 100 | 101 | -------------------------------------------------------------------------------- /src/core/instantiate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
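# `_convert_target_to_string` and `locate` (defined in src/core/utils.py above) provide the
# object <-> "module.qualname" mapping that `instantiate` relies on; this module appears to
# be adapted from detectron2's lazy-config utilities.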
2 | 3 | import collections.abc as abc 4 | import dataclasses 5 | import logging 6 | from typing import Any 7 | 8 | from .utils import _convert_target_to_string, locate 9 | 10 | __all__ = ["dump_dataclass", "instantiate"] 11 | 12 | 13 | def dump_dataclass(obj: Any): 14 | """ 15 | Dump a dataclass recursively into a dict that can be later instantiated. 16 | 17 | Args: 18 | obj: a dataclass object 19 | 20 | Returns: 21 | dict 22 | """ 23 | assert dataclasses.is_dataclass(obj) and not isinstance( 24 | obj, type 25 | ), "dump_dataclass() requires an instance of a dataclass." 26 | ret = {"_target_": _convert_target_to_string(type(obj))} 27 | for f in dataclasses.fields(obj): 28 | v = getattr(obj, f.name) 29 | if dataclasses.is_dataclass(v): 30 | v = dump_dataclass(v) 31 | if isinstance(v, (list, tuple)): 32 | v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v] 33 | ret[f.name] = v 34 | return ret 35 | 36 | 37 | def instantiate(cfg): 38 | """ 39 | Recursively instantiate objects defined in dictionaries by 40 | "_target_" and arguments. 41 | 42 | Args: 43 | cfg: a dict-like object with "_target_" that defines the caller, and 44 | other keys that define the arguments 45 | 46 | Returns: 47 | object instantiated by cfg 48 | """ 49 | from omegaconf import ListConfig, DictConfig, OmegaConf 50 | 51 | if isinstance(cfg, ListConfig): 52 | lst = [instantiate(x) for x in cfg] 53 | return ListConfig(lst, flags={"allow_objects": True}) 54 | if isinstance(cfg, list): 55 | # Specialize for list, because many classes take 56 | # list[objects] as arguments, such as ResNet, DatasetMapper 57 | return [instantiate(x) for x in cfg] 58 | 59 | # If input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config), 60 | # instantiate it to the actual dataclass. 61 | if isinstance(cfg, DictConfig) and dataclasses.is_dataclass(cfg._metadata.object_type): 62 | return OmegaConf.to_object(cfg) 63 | 64 | if isinstance(cfg, abc.Mapping) and "_target_" in cfg: 65 | # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all, 66 | # but faster: https://github.com/facebookresearch/hydra/issues/1200 67 | cfg = {k: instantiate(v) for k, v in cfg.items()} 68 | cls = cfg.pop("_target_") 69 | cls = instantiate(cls) 70 | 71 | if isinstance(cls, str): 72 | cls_name = cls 73 | cls = locate(cls_name) 74 | assert cls is not None, cls_name 75 | else: 76 | try: 77 | cls_name = cls.__module__ + "." + cls.__qualname__ 78 | except Exception: 79 | # target could be anything, so the above could fail 80 | cls_name = str(cls) 81 | assert callable(cls), f"_target_ {cls} does not define a callable object" 82 | try: 83 | return cls(**cfg) 84 | except TypeError: 85 | logger = logging.getLogger(__name__) 86 | logger.error(f"Error when instantiating {cls_name}!") 87 | raise 88 | return cfg # return as-is if don't know what to do -------------------------------------------------------------------------------- /tools/benchmark/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
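Image-folder dataset used by the benchmark scripts: images are resized and padded to 640x640 and returned together with their original sizes.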
4 | """ 5 | 6 | import os 7 | import glob 8 | from PIL import Image 9 | 10 | import torch 11 | import torch.utils.data as data 12 | import torchvision 13 | import torchvision.transforms as T 14 | import torchvision.transforms.functional as F 15 | 16 | Image.MAX_IMAGE_PIXELS = None 17 | 18 | class ToTensor(T.ToTensor): 19 | def __init__(self) -> None: 20 | super().__init__() 21 | 22 | def __call__(self, pic): 23 | if isinstance(pic, torch.Tensor): 24 | return pic 25 | return super().__call__(pic) 26 | 27 | class PadToSize(T.Pad): 28 | def __init__(self, size, fill=0, padding_mode='constant'): 29 | super().__init__(0, fill, padding_mode) 30 | self.size = size 31 | self.fill = fill 32 | 33 | def __call__(self, img): 34 | """ 35 | Args: 36 | img (PIL Image or Tensor): Image to be padded. 37 | 38 | Returns: 39 | PIL Image or Tensor: Padded image. 40 | """ 41 | w, h = F.get_image_size(img) 42 | padding = (0, 0, self.size[0] - w, self.size[1] - h) 43 | return F.pad(img, padding, self.fill, self.padding_mode) 44 | 45 | 46 | class Dataset(data.Dataset): 47 | def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: 48 | super().__init__() 49 | 50 | self.device = device 51 | self.size = 640 52 | 53 | self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) 54 | 55 | if preprocess is None: 56 | self.preprocess = T.Compose([ 57 | T.Resize(size=639, max_size=640), 58 | PadToSize(size=(640, 640), fill=114), 59 | ToTensor(), 60 | T.ConvertImageDtype(torch.float), 61 | ]) 62 | else: 63 | self.preprocess = preprocess 64 | 65 | def __len__(self, ): 66 | return len(self.im_path_list) 67 | 68 | def __getitem__(self, index): 69 | # im = Image.open(self.img_path_list[index]).convert('RGB') 70 | im = torchvision.io.read_file(self.im_path_list[index]) 71 | im = torchvision.io.decode_image(im, mode=torchvision.io.ImageReadMode.RGB).to(self.device) 72 | _, h, w = im.shape # c,h,w 73 | 74 | im = self.preprocess(im) 75 | 76 | blob = { 77 | 'images': im, 78 | # 'im_shape': torch.tensor([self.size, self.size]).to(im.device), 79 | # 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), 80 | 'orig_target_sizes': torch.tensor([w, h]).to(im.device), 81 | } 82 | 83 | return blob 84 | 85 | @staticmethod 86 | def post_process(): 87 | pass 88 | 89 | @staticmethod 90 | def collate_fn(): 91 | pass 92 | 93 | 94 | def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): 95 | '''show result 96 | Keys: 97 | 'num_dets', 'det_boxes', 'det_scores', 'det_classes' 98 | ''' 99 | for i in range(blob['image'].shape[0]): 100 | det_scores = outputs['det_scores'][i] 101 | det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] 102 | 103 | im = (blob['image'][i] * 255).to(torch.uint8) 104 | im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) 105 | Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') 106 | -------------------------------------------------------------------------------- /src/misc/keypoint_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | 5 | def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas): 6 | sigmas = kpt_preds.new_tensor(sigmas) 7 | variances = (sigmas * 2)**2 8 | 9 | assert kpt_preds.size(0) == kpt_gts.size(0) 10 | kpt_preds = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2) 11 | kpt_gts = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2) 12 | 13 | 
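    # Object Keypoint Similarity per prediction/GT pair: exp(-d^2 / (2 * area * (2*sigma)^2))
    # for each keypoint, averaged over the visible keypoints only (kpt_valids mask).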
squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \ 14 | (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2 15 | squared_distance0 = squared_distance / (kpt_areas[:, None] * variances[None, :] * 2) 16 | squared_distance1 = torch.exp(-squared_distance0) 17 | squared_distance1 = squared_distance1 * kpt_valids 18 | oks = squared_distance1.sum(dim=1) / (kpt_valids.sum(dim=1)+1e-6) 19 | 20 | return oks 21 | 22 | def oks_loss(pred, 23 | target, 24 | valid=None, 25 | area=None, 26 | linear=False, 27 | sigmas=None, 28 | eps=1e-6): 29 | oks = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps) 30 | if linear: 31 | loss = oks 32 | else: 33 | loss = -oks.log() 34 | return loss 35 | 36 | 37 | class OKSLoss(nn.Module): 38 | def __init__(self, 39 | linear=False, 40 | num_keypoints=17, 41 | eps=1e-6, 42 | reduction='mean', 43 | loss_weight=1.0): 44 | super(OKSLoss, self).__init__() 45 | self.linear = linear 46 | self.eps = eps 47 | self.reduction = reduction 48 | self.loss_weight = loss_weight 49 | if num_keypoints == 17: 50 | self.sigmas = np.array([ 51 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 52 | 1.07, .87, .87, .89, .89 53 | ], dtype=np.float32) / 10.0 54 | elif num_keypoints == 14: 55 | self.sigmas = np.array([ 56 | .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, 57 | .79, .79 58 | ]) / 10.0 59 | elif num_keypoints == 3: 60 | self.sigmas = np.array([ 61 | 1.07, 1.07, 0.67 62 | ]) / 10.0 63 | else: 64 | raise ValueError(f'Unsupported keypoints number {num_keypoints}') 65 | 66 | def forward(self, 67 | pred, 68 | target, 69 | valid, 70 | area, 71 | weight=None, 72 | avg_factor=None, 73 | reduction_override=None): 74 | assert reduction_override in (None, 'none', 'mean', 'sum') 75 | reduction = ( 76 | reduction_override if reduction_override else self.reduction) 77 | if (weight is not None) and (not torch.any(weight > 0)) and ( 78 | reduction != 'none'): 79 | if pred.dim() == weight.dim() + 1: 80 | weight = weight.unsqueeze(1) 81 | return (pred * weight).sum() # 0 82 | if weight is not None and weight.dim() > 1: 83 | # TODO: remove this in the future 84 | # reduce the weight of shape (n, 4) to (n,) to match the 85 | # iou_loss of shape (n,) 86 | assert weight.shape == pred.shape 87 | weight = weight.mean(-1) 88 | loss = self.loss_weight * oks_loss( 89 | pred, 90 | target, 91 | valid=valid, 92 | area=area, 93 | linear=self.linear, 94 | sigmas=self.sigmas, 95 | eps=self.eps) 96 | return loss -------------------------------------------------------------------------------- /tools/benchmark/torch_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
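Measures end-to-end PyTorch latency (model + postprocessor) of DETRPose on a folder of images.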
3 | """ 4 | import time 5 | import torch 6 | from torch import nn 7 | import torch.backends.cudnn as cudnn 8 | cudnn.benchmark = True 9 | 10 | import argparse 11 | from dataset import Dataset 12 | from tqdm import tqdm 13 | 14 | import os, sys 15 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 16 | from src.core import LazyConfig, instantiate 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description='Argument Parser Example') 20 | parser.add_argument('--config_file', '-c', default='./configs/detrpose/detrpose_hgnetv2_l.py', type=str, ) 21 | parser.add_argument('--resume', '-r', type=str, ) 22 | parser.add_argument('--infer_dir', 23 | type=str, 24 | default='./data/COCO2017/val2017', 25 | help="Directory for images to perform inference on.") 26 | args = parser.parse_args() 27 | return args 28 | 29 | @torch.no_grad() 30 | def warmup(model, data, img_size, n): 31 | for _ in range(n): 32 | _ = model(data, img_size) 33 | 34 | @torch.no_grad() 35 | def speed(model, data, n): 36 | times = [] 37 | for i in tqdm(range(n), desc="Running Inference", unit="iteration"): 38 | blob = data[i] 39 | samples, target_sizes = blob['images'].unsqueeze(0), blob['orig_target_sizes'] 40 | torch.cuda.synchronize() 41 | t_ = time.perf_counter() 42 | _ = model(samples, target_sizes) 43 | torch.cuda.synchronize() 44 | t = time.perf_counter() - t_ 45 | times.append(t) 46 | 47 | # end-to-end model only 48 | times = sorted(times) 49 | if len(times) > 100: 50 | times = times[:100] 51 | return sum(times) / len(times) 52 | 53 | def main(): 54 | FLAGS = parse_args() 55 | dataset = Dataset(FLAGS.infer_dir) 56 | blob = torch.ones(1, 3, 640, 640).cuda() 57 | 58 | img_size = torch.tensor([[640, 640]], device='cuda') 59 | 60 | cfg = LazyConfig.load(FLAGS.config_file) 61 | 62 | if hasattr(cfg.model.backbone, 'pretrained'): 63 | cfg.model.backbone.pretrained = False 64 | 65 | model = instantiate(cfg.model) 66 | postprocessor = instantiate(cfg.postprocessor) 67 | 68 | if FLAGS.resume: 69 | checkpoint = torch.load(FLAGS.resume, map_location='cpu') 70 | if 'ema' in checkpoint: 71 | state = checkpoint['ema']['module'] 72 | else: 73 | state = checkpoint['model'] 74 | 75 | # NOTE load train mode state -> convert to deploy mode 76 | linea.load_state_dict(state) 77 | 78 | else: 79 | # raise AttributeError('Only support resume to load model.state_dict by now.') 80 | print('not load model.state_dict, use default init state dict...') 81 | 82 | class Model(nn.Module): 83 | def __init__(self, ) -> None: 84 | super().__init__() 85 | self.model = model.deploy() 86 | self.postprocessor = postprocessor.deploy() 87 | 88 | def forward(self, images, orig_target_sizes): 89 | outputs = self.model(images) 90 | outputs = self.postprocessor(outputs, orig_target_sizes) 91 | return outputs 92 | 93 | model = Model().cuda() 94 | 95 | warmup(model, blob, img_size, 400) 96 | t = [] 97 | for _ in range(1): 98 | t.append(speed(model, dataset, 1000)) 99 | avg_latency = 1000 * torch.tensor(t).mean() 100 | print(f"model: {FLAGS.config_file}, Latency: {avg_latency:.2f} ms") 101 | 102 | del model 103 | torch.cuda.empty_cache() 104 | time.sleep(1) 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /src/nn/optimizer/ema.py: -------------------------------------------------------------------------------- 1 | """ 2 | D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement 3 | Copyright (c) 2024 
The D-FINE Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 6 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 7 | """ 8 | 9 | import math 10 | from copy import deepcopy 11 | 12 | import torch 13 | import torch.nn as nn 14 | 15 | from ...misc import dist_utils 16 | 17 | __all__ = ["ModelEMA"] 18 | 19 | 20 | class ModelEMA(object): 21 | """ 22 | Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models 23 | Keep a moving average of everything in the model state_dict (parameters and buffers). 24 | This is intended to allow functionality like 25 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 26 | A smoothed version of the weights is necessary for some training schemes to perform well. 27 | This class is sensitive where it is initialized in the sequence of model init, 28 | GPU assignment and distributed training wrappers. 29 | """ 30 | 31 | def __init__( 32 | self, model: nn.Module, decay: float = 0.9999, warmups: int = 1000, start: int = 0 33 | ): 34 | super().__init__() 35 | 36 | self.module = deepcopy(dist_utils.de_parallel(model)).eval() 37 | # if next(model.parameters()).device.type != 'cpu': 38 | # self.module.half() # FP16 EMA 39 | 40 | self.decay = decay 41 | self.warmups = warmups 42 | self.before_start = 0 43 | self.start = start 44 | self.updates = 0 # number of EMA updates 45 | if warmups == 0: 46 | self.decay_fn = lambda x: decay 47 | else: 48 | self.decay_fn = lambda x: decay * ( 49 | 1 - math.exp(-x / warmups) 50 | ) # decay exponential ramp (to help early epochs) 51 | 52 | for p in self.module.parameters(): 53 | p.requires_grad_(False) 54 | 55 | def update(self, model: nn.Module): 56 | if self.before_start < self.start: 57 | self.before_start += 1 58 | return 59 | # Update EMA parameters 60 | with torch.no_grad(): 61 | self.updates += 1 62 | d = self.decay_fn(self.updates) 63 | msd = dist_utils.de_parallel(model).state_dict() 64 | for k, v in self.module.state_dict().items(): 65 | if v.dtype.is_floating_point: 66 | v *= d 67 | v += (1 - d) * msd[k].detach() 68 | 69 | def to(self, *args, **kwargs): 70 | self.module = self.module.to(*args, **kwargs) 71 | return self 72 | 73 | def state_dict( 74 | self, 75 | ): 76 | return dict(module=self.module.state_dict(), updates=self.updates) 77 | 78 | def load_state_dict(self, state, strict=True): 79 | self.module.load_state_dict(state["module"], strict=strict) 80 | if "updates" in state: 81 | self.updates = state["updates"] 82 | 83 | def forwad( 84 | self, 85 | ): 86 | raise RuntimeError("ema...") 87 | 88 | def extra_repr(self) -> str: 89 | return f"decay={self.decay}, warmups={self.warmups}, name=ema" 90 | 91 | 92 | class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): 93 | """Maintains moving averages of model parameters using an exponential decay. 94 | ``ema_avg = decay * avg_model_param + (1 - decay) * model_param`` 95 | `torch.optim.swa_utils.AveragedModel `_ 96 | is used to compute the EMA. 
97 | """ 98 | 99 | def __init__(self, model, decay, device="cpu", use_buffers=True): 100 | self.decay_fn = lambda x: decay * (1 - math.exp(-x / 2000)) 101 | 102 | def ema_avg(avg_model_param, model_param, num_averaged): 103 | decay = self.decay_fn(num_averaged) 104 | return decay * avg_model_param + (1 - decay) * model_param 105 | 106 | super().__init__(model, device, ema_avg, use_buffers=use_buffers) 107 | -------------------------------------------------------------------------------- /tools/deployment/export_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | --------------------------------------------------------------------------------- 3 | Modified from D-FINE 4 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 5 | --------------------------------------------------------------------------------- 6 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 7 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 13 | from src.core import LazyConfig, instantiate 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | def main(args, ): 19 | """main 20 | """ 21 | cfg = LazyConfig.load(args.config_file) 22 | 23 | if hasattr(cfg.model.backbone, 'pretrained'): 24 | cfg.model.backbone.pretrained = False 25 | 26 | model = instantiate(cfg.model) 27 | postprocessor = instantiate(cfg.postprocessor) 28 | 29 | if args.resume: 30 | checkpoint = torch.load(args.resume, map_location='cpu', weights_only=False) 31 | if 'ema' in checkpoint: 32 | state = checkpoint['ema']['module'] 33 | else: 34 | state = checkpoint['model'] 35 | 36 | # NOTE load train mode state -> convert to deploy mode 37 | model.load_state_dict(state) 38 | 39 | else: 40 | # raise AttributeError('Only support resume to load model.state_dict by now.') 41 | print('not load model.state_dict, use default init state dict...') 42 | 43 | model = model.deploy() 44 | model.eval() 45 | 46 | class Model(nn.Module): 47 | def __init__(self, ) -> None: 48 | super().__init__() 49 | self.model = model 50 | self.postprocessor = postprocessor.deploy() 51 | 52 | def forward(self, images, orig_target_sizes): 53 | outputs = self.model(images) 54 | outputs = self.postprocessor(outputs, orig_target_sizes) 55 | return outputs 56 | 57 | model = Model() 58 | 59 | data = torch.rand(1, 3, 640, 640) 60 | size = torch.tensor([[640, 640]]) 61 | _ = model(data, size) 62 | 63 | dynamic_axes = { 64 | 'images': {0: 'N', }, 65 | 'orig_target_sizes': {0: 'N'} 66 | } 67 | 68 | outout_folder = 'onnx_engines' 69 | os.makedirs(outout_folder , exist_ok=True) 70 | output_file = args.config_file.split('/')[-1].replace('py', 'onnx') 71 | output_file = f'{outout_folder}/{output_file}' 72 | # args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx' 73 | 74 | torch.onnx.export( 75 | model, 76 | (data, size), 77 | output_file, 78 | input_names=['images', 'orig_target_sizes'], 79 | output_names=['scores', 'labels', 'keypoints'], 80 | dynamic_axes=dynamic_axes, 81 | opset_version=16, 82 | # dynamo=True, 83 | # external_data=False, 84 | # verify=True, 85 | # report=True, 86 | verbose=False, 87 | do_constant_folding=True, 88 | ) 89 | 90 | if args.check: 91 | import onnx 92 | onnx_model = onnx.load(output_file) 93 | onnx.checker.check_model(onnx_model) 94 | print('Check export onnx model done...') 95 | 96 | if args.simplify: 97 | import onnx 98 | import onnxsim 99 | dynamic = 
True 100 | # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None 101 | input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None 102 | onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes) 103 | onnx.save(onnx_model_simplify, output_file) 104 | print(f'Simplify onnx model {check}...') 105 | 106 | 107 | if __name__ == '__main__': 108 | 109 | import argparse 110 | parser = argparse.ArgumentParser() 111 | parser.add_argument('--config_file', '-c', default='configs/linea/linea_l.py', type=str, ) 112 | parser.add_argument('--resume', '-r', type=str, ) 113 | parser.add_argument('--check', action='store_true', default=True,) 114 | parser.add_argument('--simplify', action='store_true', default=True,) 115 | args = parser.parse_args() 116 | main(args) 117 | -------------------------------------------------------------------------------- /src/misc/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch, os 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 29 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 30 | 31 | wh = (rb - lt).clamp(min=0) # [N,M,2] 32 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 33 | 34 | union = area1[:, None] + area2 - inter 35 | 36 | iou = inter / (union + 1e-6) 37 | return iou, union 38 | 39 | 40 | def generalized_box_iou(boxes1, boxes2): 41 | """ 42 | Generalized IoU from https://giou.stanford.edu/ 43 | 44 | The boxes should be in [x0, y0, x1, y1] format 45 | 46 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 47 | and M = len(boxes2) 48 | """ 49 | # degenerate boxes gives inf / nan results 50 | # so do an early check 51 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 52 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 53 | iou, union = box_iou(boxes1, boxes2) 54 | 55 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 56 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 57 | 58 | wh = (rb - lt).clamp(min=0) # [N,M,2] 59 | area = wh[:, :, 0] * wh[:, :, 1] 60 | 61 | return iou - (area - union) / (area + 1e-6) 62 | 63 | 64 | 65 | # modified from torchvision to also return the union 66 | def box_iou_pairwise(boxes1, boxes2): 67 | area1 = box_area(boxes1) 68 | area2 = box_area(boxes2) 69 | 70 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 71 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 72 | 73 | wh = (rb - lt).clamp(min=0) # [N,2] 74 | inter = wh[:, 0] * wh[:, 1] # [N] 75 | 76 | union = area1 + area2 - inter 77 | 78 | iou = inter / union 79 | return iou, union 80 | 81 | 82 | def generalized_box_iou_pairwise(boxes1, boxes2): 83 | """ 84 | Generalized IoU from https://giou.stanford.edu/ 85 | 86 | Input: 87 | - boxes1, boxes2: N,4 88 | Output: 
89 | - giou: N, 4 90 | """ 91 | # degenerate boxes gives inf / nan results 92 | # so do an early check 93 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 94 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 95 | assert boxes1.shape == boxes2.shape 96 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 97 | 98 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 99 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 100 | 101 | wh = (rb - lt).clamp(min=0) # [N,2] 102 | area = wh[:, 0] * wh[:, 1] 103 | 104 | return iou - (area - union) / area 105 | 106 | def masks_to_boxes(masks): 107 | """Compute the bounding boxes around the provided masks 108 | 109 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 110 | 111 | Returns a [N, 4] tensors, with the boxes in xyxy format 112 | """ 113 | if masks.numel() == 0: 114 | return torch.zeros((0, 4), device=masks.device) 115 | 116 | h, w = masks.shape[-2:] 117 | 118 | y = torch.arange(0, h, dtype=torch.float) 119 | x = torch.arange(0, w, dtype=torch.float) 120 | y, x = torch.meshgrid(y, x) 121 | 122 | x_mask = (masks * x.unsqueeze(0)) 123 | x_max = x_mask.flatten(1).max(-1)[0] 124 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 125 | 126 | y_mask = (masks * y.unsqueeze(0)) 127 | y_max = y_mask.flatten(1).max(-1)[0] 128 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 129 | 130 | return torch.stack([x_min, y_min, x_max, y_max], 1) 131 | 132 | if __name__ == '__main__': 133 | x = torch.rand(5, 4) 134 | y = torch.rand(3, 4) 135 | iou, union = box_iou(x, y) -------------------------------------------------------------------------------- /src/data/container.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from D-DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 
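Compose container that applies the wrapped transforms according to an epoch-based policy, e.g. Mosaic / RandomCrop / RandomZoomOut are only enabled within a configured range of epochs.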
13 | """ 14 | 15 | from omegaconf import ListConfig 16 | import random 17 | 18 | class Compose(object): 19 | def __init__(self, policy=None, mosaic_prob=0.0, **transforms): 20 | self.transforms = [] 21 | for transform in transforms.values(): 22 | self.transforms.append(transform) 23 | 24 | self.mosaic_prob = mosaic_prob 25 | 26 | if policy is None: 27 | self.policy = {'name': 'default'} 28 | else: 29 | self.policy = policy 30 | if self.mosaic_prob > 0: 31 | print(" ### Mosaic with Prob.@{} and RandomZoomOut/RandomCrop existed ### ".format(self.mosaic_prob)) 32 | print(" ### ImgTransforms Epochs: {} ### ".format(policy['epoch'])) 33 | print(' ### Policy_ops@{} ###'.format(policy['ops'])) 34 | 35 | ### warnings ## 36 | self.warning_mosaic_start = True 37 | 38 | def __call__(self, image, target, dataset=None): 39 | return self.get_forward(self.policy['name'])(image, target, dataset) 40 | 41 | def get_forward(self, name): 42 | forwards = { 43 | 'default': self.default_forward, 44 | 'stop_epoch': self.stop_epoch_forward, 45 | } 46 | return forwards[name] 47 | 48 | def default_forward(self, image, target, dataset=None): 49 | for transform in self.transforms: 50 | image, target = transform(image, target) 51 | return image, target 52 | 53 | def stop_epoch_forward(self, image, target, dataset=None): 54 | cur_epoch = dataset.epoch 55 | policy_ops = self.policy['ops'] 56 | policy_epoch = self.policy['epoch'] 57 | 58 | if isinstance(policy_epoch, (list, ListConfig)) and len(policy_epoch) == 3: 59 | if policy_epoch[0] <= cur_epoch < policy_epoch[1]: 60 | with_mosaic = random.random() <= self.mosaic_prob # Probility for Mosaic 61 | else: 62 | with_mosaic = False 63 | 64 | for transform in self.transforms: 65 | if (type(transform).__name__ in policy_ops and cur_epoch < policy_epoch[0]): # first stage: NoAug 66 | pass 67 | elif (type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch[-1]): # last stage: NoAug 68 | pass 69 | else: 70 | # Using Mosaic for [policy_epoch[0], policy_epoch[1]] with probability 71 | if (type(transform).__name__ == 'Mosaic' and not with_mosaic): 72 | pass 73 | # Mosaic and Zoomout/IoUCrop can not be co-existed in the same sample 74 | elif (type(transform).__name__ == 'RandomZoomOut' or type(transform).__name__ == 'RandomCrop') and with_mosaic: 75 | pass 76 | else: 77 | if type(transform).__name__ == 'Mosaic': 78 | if self.warning_mosaic_start: 79 | # It shows in which epochs mosaic is being used 80 | print(f' ### Mosaic is being used @ epoch {cur_epoch}...') 81 | self.warning_mosaic_start = False 82 | image, target = transform(image, target, dataset) 83 | else: 84 | image, target = transform(image, target) 85 | else: 86 | for transform in self.transforms: 87 | image, target = transform(image, target) 88 | 89 | return image, target 90 | 91 | def __repr__(self): 92 | format_string = self.__class__.__name__ + "(" 93 | for t in self.transforms: 94 | format_string += "\n" 95 | format_string += " {0}".format(t) 96 | format_string += "\n)" 97 | return format_string -------------------------------------------------------------------------------- /src/nn/backbone/resnet.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Conditional DETR 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copied from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Backbone modules. 12 | """ 13 | import os 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torchvision 18 | from torch import nn 19 | from torchvision.models._utils import IntermediateLayerGetter 20 | from typing import Dict, List 21 | 22 | class FrozenBatchNorm2d(torch.nn.Module): 23 | """ 24 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 25 | 26 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 27 | without which any other models than torchvision.models.resnet[18,34,50,101] 28 | produce nans. 29 | """ 30 | 31 | def __init__(self, n): 32 | super(FrozenBatchNorm2d, self).__init__() 33 | self.register_buffer("weight", torch.ones(n)) 34 | self.register_buffer("bias", torch.zeros(n)) 35 | self.register_buffer("running_mean", torch.zeros(n)) 36 | self.register_buffer("running_var", torch.ones(n)) 37 | 38 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 39 | missing_keys, unexpected_keys, error_msgs): 40 | num_batches_tracked_key = prefix + 'num_batches_tracked' 41 | if num_batches_tracked_key in state_dict: 42 | del state_dict[num_batches_tracked_key] 43 | 44 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 45 | state_dict, prefix, local_metadata, strict, 46 | missing_keys, unexpected_keys, error_msgs) 47 | 48 | def forward(self, x): 49 | # move reshapes to the beginning 50 | # to make it fuser-friendly 51 | w = self.weight.reshape(1, -1, 1, 1) 52 | b = self.bias.reshape(1, -1, 1, 1) 53 | rv = self.running_var.reshape(1, -1, 1, 1) 54 | rm = self.running_mean.reshape(1, -1, 1, 1) 55 | eps = 1e-5 56 | scale = w * (rv + eps).rsqrt() 57 | bias = b - rm * scale 58 | return x * scale + bias 59 | 60 | 61 | class BackboneBase(nn.Module): 62 | def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_indices: list): 63 | super().__init__() 64 | for name, parameter in backbone.named_parameters(): 65 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 66 | parameter.requires_grad_(False) 67 | 68 | return_layers = {} 69 | for idx, layer_index in enumerate(return_interm_indices): 70 | return_layers.update({"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)}) 71 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 72 | self.num_channels = num_channels 73 | 74 | def forward(self, input): 75 | xs = self.body(input) 76 | return xs.values() 77 | 78 | 79 | class ResNet(BackboneBase): 80 | """ResNet backbone with frozen BatchNorm.""" 81 | def __init__(self, name: str, 82 | train_backbone: bool, 83 | dilation: bool, 84 | return_interm_indices:list, 85 | batch_norm=FrozenBatchNorm2d, 86 | pretrained=False, 87 | ): 88 | if name in ['resnet18', 'resnet34', 'resnet50', 'resnet101']: 89 | backbone = getattr(torchvision.models, name)( 90 | replace_stride_with_dilation=[False, False, dilation], 91 | pretrained=pretrained, norm_layer=batch_norm) 92 | else: 93 | raise NotImplementedError("Why you can get here with name {}".format(name)) 94 | # num_channels = 512 if name in ('resnet18', 
'resnet34') else 2048 95 | assert name not in ('resnet18', 'resnet34'), "Only resnet50 and resnet101 are available." 96 | assert return_interm_indices in [[0,1,2,3], [1,2,3], [2, 3], [3]] 97 | num_channels_all = [256, 512, 1024, 2048] 98 | num_channels = num_channels_all[4-len(return_interm_indices):] 99 | super().__init__(backbone, train_backbone, num_channels, return_interm_indices) 100 | 101 | -------------------------------------------------------------------------------- /src/misc/get_param_dicts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import torch.nn as nn 4 | 5 | import re 6 | 7 | 8 | def get_optim_params(cfg: list, model: nn.Module): 9 | """ 10 | E.g.: 11 | ^(?=.*a)(?=.*b).*$ means including a and b 12 | ^(?=.*(?:a|b)).*$ means including a or b 13 | ^(?=.*a)(?!.*b).*$ means including a, but not b 14 | """ 15 | 16 | param_groups = [] 17 | visited = [] 18 | 19 | cfg_ = [] 20 | for pg in cfg: 21 | cfg_.append(dict(pg)) 22 | 23 | for pg in cfg_: 24 | pattern = pg['params'] 25 | params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0} 26 | pg['params'] = params.values() 27 | param_groups.append(pg) 28 | visited.extend(list(params.keys())) 29 | 30 | names = [k for k, v in model.named_parameters() if v.requires_grad] 31 | 32 | if len(visited) < len(names): 33 | unseen = set(names) - set(visited) 34 | params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen} 35 | param_groups.append({'params': params.values()}) 36 | visited.extend(list(params.keys())) 37 | 38 | assert len(visited) == len(names), '' 39 | 40 | return param_groups 41 | 42 | def match_name_keywords(n: str, name_keywords: list): 43 | out = False 44 | for b in name_keywords: 45 | if b in n: 46 | out = True 47 | break 48 | return out 49 | 50 | 51 | def get_param_dict(args, model_without_ddp: nn.Module): 52 | try: 53 | param_dict_type = args.param_dict_type 54 | except: 55 | param_dict_type = 'default' 56 | assert param_dict_type in ['default', 'ddetr_in_mmdet', 'large_wd'] 57 | 58 | # by default 59 | if param_dict_type == 'default': 60 | param_dicts = [ 61 | {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, 62 | { 63 | "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], 64 | "lr": args.lr_backbone, 65 | } 66 | ] 67 | return param_dicts 68 | 69 | if param_dict_type == 'ddetr_in_mmdet': 70 | param_dicts = [ 71 | { 72 | "params": 73 | [p for n, p in model_without_ddp.named_parameters() 74 | if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad], 75 | "lr": args.lr, 76 | }, 77 | { 78 | "params": [p for n, p in model_without_ddp.named_parameters() 79 | if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad], 80 | "lr": args.lr_backbone, 81 | }, 82 | { 83 | "params": [p for n, p in model_without_ddp.named_parameters() 84 | if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad], 85 | "lr": args.lr * args.lr_linear_proj_mult, 86 | } 87 | ] 88 | return param_dicts 89 | 90 | if param_dict_type == 'large_wd': 91 | param_dicts = [ 92 | { 93 | "params": 94 | [p for n, p in model_without_ddp.named_parameters() 95 | if not match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 96 | }, 97 | { 98 | "params": [p for 
n, p in model_without_ddp.named_parameters() 99 | if match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 100 | "lr": args.lr_backbone, 101 | "weight_decay": 0.0, 102 | }, 103 | { 104 | "params": [p for n, p in model_without_ddp.named_parameters() 105 | if match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 106 | "lr": args.lr_backbone, 107 | "weight_decay": args.weight_decay, 108 | }, 109 | { 110 | "params": 111 | [p for n, p in model_without_ddp.named_parameters() 112 | if not match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 113 | "lr": args.lr, 114 | "weight_decay": 0.0, 115 | } 116 | ] 117 | 118 | # print("param_dicts: {}".format(param_dicts)) 119 | 120 | return param_dicts 121 | -------------------------------------------------------------------------------- /assets/TENSORRT_DEB_LAMBDA.AI.md: -------------------------------------------------------------------------------- 1 |
2 | Manual to install TensorRT in Lambda.ai instances 3 | 
4 | 5 | ## Quick Start 6 | ### Lambda.ai 7 | 1. Go to [Lambda.ai](https://lambda.ai) and create an account. 8 | 2. Log in to your Lambda.ai account. 9 | 3. Click on the `Launch instance' button. It is located on the top right side of the website. 10 | 4. Select an instance. To replicate our results from the appendix, select `1x A10 (24 GB PCle)` 11 | 12 | ### CUDA Installation 13 | The Lambda Stack installs a pre-packaged version of CUDA with only whats needed for typical deep learning workflows. 14 | But the `.deb` TensorRT installation expects the full CUDA Toolkit to already be installed in the system in the standard way via NVIDIAs `.deb` repo. 15 | Thats why your TensorRT installation only succeeded after installing CUDA. 16 | This ensured all the expected binaries, libraries, and metadata were in place for TensorRT to install cleanly. 17 | 18 | 1. Check which CUDA version your Lambda.ai instance is using 19 | ```shell 20 | nvidia-smi 21 | ``` 22 | We got the following output 23 | ```shell 24 | +-----------------------------------------------------------------------------------------+ 25 | | NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | 26 | |-----------------------------------------+------------------------+----------------------+ 27 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 28 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 29 | | | | MIG M. | 30 | |=========================================+========================+======================| 31 | | 0 NVIDIA A10 On | 00000000:06:00.0 Off | 0 | 32 | | 0% 28C P8 9W / 150W | 1MiB / 23028MiB | 0% Default | 33 | | | | N/A | 34 | +-----------------------------------------+------------------------+----------------------+ 35 | 36 | +-----------------------------------------------------------------------------------------+ 37 | | Processes: | 38 | | GPU GI CI PID Type Process name GPU Memory | 39 | | ID ID Usage | 40 | |=========================================================================================| 41 | | No running processes found | 42 | +-----------------------------------------------------------------------------------------+ 43 | ``` 44 | 45 | 2. Install the CUDA toolkit with the correct version (in our case 12.8) 46 | ```shell 47 | # cuda installation 48 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 49 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 50 | sudo apt-get update 51 | sudo apt-get -y install cuda-toolkit-12-8 52 | ``` 53 | 54 | 3. Install TensorRT 55 | 56 | When you use the `.deb` installation, you will install the latest TensorRT. 57 | ```shell 58 | #tensorrt installation 59 | wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/local_repo/nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8_1.0-1_amd64.deb 60 | sudo dpkg -i nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8_1.0-1_amd64.deb 61 | sudo cp /var/nv-tensorrt-local-repo-ubuntu2204-10.9.0-cuda-12.8/nv-tensorrt-local-AD7406A2-keyring.gpg /usr/share/keyrings/ 62 | sudo apt-get update 63 | sudo apt-get install tensorrt 64 | ``` 65 | 66 | The complete installation takes approximately 10-15 minutes. 
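Optionally, sanity-check the installation before building any engines. A minimal check from Python, assuming the Python bindings (`python3-libnvinfer`) were pulled in by the `tensorrt` meta-package:
```python
# Minimal sketch: confirm the TensorRT Python bindings load and report the expected version.
import tensorrt as trt

print(trt.__version__)  # expected to start with 10.9 for this guide
builder = trt.Builder(trt.Logger(trt.Logger.WARNING))
print("Builder created:", builder is not None)
```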
67 | 68 | ## Installing DETRPose 69 | ### Quick Start 70 | ```shell 71 | git clone https://github.com/SebastianJanampa/DETRPose.git 72 | cd DETRPose 73 | pip install -r requirements.txt 74 | ``` 75 | 76 | ### Data Preparation 77 | ``` 78 | pip install gdown # to download files from google drive 79 | gdown 1VprytECcLtU4tKP32SYi_7oDRbw7yUTL # images 80 | unzip images.zip 81 | ``` 82 | 83 | ### Usage 84 | ```shell 85 | pip install onnx onnxsim 86 | pip install -r tools/benchmark/requirements.txt 87 | 88 | export model=l #n, s, m, l, x 89 | mkdir trt_engines 90 | ``` 91 | 1. Download official weights 92 | ```shell 93 | wget https://github.com/SebastianJanampa/DETRPose/releases/download/model_weights/detrpose_hgnetv2_${model}.pth 94 | ``` 95 | 2. Export onnx 96 | ```shell 97 | python tools/deployment/export_onnx.py --check -c configs/detrpose/detrpose_hgnetv2_${model}.py -r detrpose_hgnetv2_${model}.pth 98 | ``` 99 | 3. Export tensorrt 100 | ```shell 101 | alias trtexec="/usr/src/tensorrt/bin/trtexec" 102 | trtexec --onnx="onnx_engines/detrpose_hgnetv2_${model}.onnx" --saveEngine="trt_engines/detrpose_hgnetv2_${model}.engine" --fp16 103 | ``` 104 | 4. Benchmark 105 | ```shell 106 | python tools/benchmark/trt_benchmark.py --infer_dir ./images --engine_dir trt_engines 107 | ``` -------------------------------------------------------------------------------- /tools/visualization/backbone_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 IDEA. All Rights Reserved. 2 | # ------------------------------------------------------------------------ 3 | import os, sys 4 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 5 | 6 | import argparse 7 | 8 | import matplotlib as mpl 9 | import matplotlib.pyplot as plt 10 | from matplotlib.backends.backend_agg import FigureCanvasAgg 11 | 12 | import torch 13 | from torch.utils.data import DataLoader 14 | 15 | from util.slconfig import SLConfig 16 | import util.misc as utils 17 | 18 | import datasets 19 | from datasets import build_dataset, BatchImageCollateFunction 20 | 21 | 22 | def create(args, classname): 23 | # we use register to maintain models from catdet6 on. 
24 | from models.registry import MODULE_BUILD_FUNCS 25 | class_module = getattr(args, classname) 26 | assert class_module in MODULE_BUILD_FUNCS._module_dict 27 | build_func = MODULE_BUILD_FUNCS.get(class_module) 28 | return build_func(args) 29 | 30 | def main(args): 31 | cfg = SLConfig.fromfile(args.config) 32 | device = args.device 33 | 34 | setattr(cfg, 'coco_path', args.data_path) 35 | setattr(cfg, 'batch_size_train', 1) 36 | setattr(cfg, 'batch_size_val', 1) 37 | 38 | if 'HGNetv2' in cfg.backbone: 39 | cfg.pretrained = False 40 | 41 | # build model 42 | model, _ = create(cfg, 'modelname') 43 | model.to(device) 44 | 45 | dataset_val = build_dataset(image_set='val', args=cfg) 46 | 47 | sampler_val = torch.utils.data.SequentialSampler(dataset_val) 48 | 49 | data_loader_val = DataLoader(dataset_val, 1, sampler=sampler_val, drop_last=False, collate_fn=BatchImageCollateFunction(), num_workers=4) 50 | 51 | if args.resume: 52 | checkpoint = torch.load(args.resume, map_location='cpu') 53 | if 'ema' in checkpoint: 54 | state = checkpoint['ema']['module'] 55 | else: 56 | state = checkpoint['model'] 57 | 58 | # NOTE load train mode state -> convert to deploy mode 59 | model.load_state_dict(state) 60 | 61 | # folder path 62 | main_folder = cfg.output_dir 63 | if 'data/wireframe_processed' in args.data_path: 64 | backbone_dir = f'{main_folder}/visualization/backbone_wireframe' 65 | encoder_dir = f'{main_folder}/visualization/encoder_wireframe' 66 | 67 | elif 'data/york_processed' in args.data_path: 68 | backbone_dir = f'{main_folder}/visualization/backbone_york' 69 | encoder_dir = f'{main_folder}/visualization/encoder_york' 70 | else: 71 | raise 'Dataset does not exist. We support only wireframe and york datasets' 72 | 73 | os.makedirs(backbone_dir , exist_ok=True) 74 | os.makedirs(encoder_dir, exist_ok=True) 75 | 76 | with torch.no_grad(): 77 | 78 | for i, (samples, targets) in enumerate(data_loader_val): 79 | samples = samples.to(device) 80 | 81 | enc_feature_maps = [] 82 | backbone_feature_maps = [] 83 | hooks = [ 84 | model.backbone.register_forward_hook( 85 | lambda self, input, output: backbone_feature_maps.append(output) 86 | ), 87 | model.encoder.register_forward_hook( 88 | lambda self, input, output: enc_feature_maps.append(output) 89 | ), 90 | ] 91 | model(samples) 92 | 93 | for hook in hooks: 94 | hook.remove() 95 | 96 | back_feats = backbone_feature_maps[0] 97 | enc_feats = enc_feature_maps[0] 98 | 99 | curr_img_id = targets[0]['image_id'].tolist()[0] 100 | 101 | for j, back_feat in enumerate(back_feats): 102 | down = j + 1 103 | 104 | back_feat = back_feat[0].mean(0).cpu() 105 | 106 | fig = plt.figure(figsize=(16, 16)) 107 | plt.axis('off') 108 | plt.imshow(back_feat) 109 | plt.savefig( 110 | f"{backbone_dir}/{curr_img_id}_ds_{down}.png", 111 | bbox_inches='tight', 112 | pad_inches=0, 113 | dpi=200 114 | ) 115 | plt.close() 116 | 117 | for j, enc_feat in enumerate(enc_feats): 118 | down = j + 1 119 | 120 | enc_feat = enc_feat[0].mean(0).cpu() 121 | 122 | fig = plt.figure(figsize=(16, 16)) 123 | plt.axis('off') 124 | plt.imshow(enc_feat) 125 | plt.savefig( 126 | f"{encoder_dir}/{curr_img_id}_ds_{down}.png", 127 | bbox_inches='tight', 128 | pad_inches=0, 129 | dpi=200 130 | ) 131 | plt.close() 132 | 133 | # check condition to stop program 134 | if args.num_images is not None and i + 1 >= args.num_images: 135 | break 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser('Visualization of Deformable Line Attention') 140 | parser.add_argument('-c', '--config', 
type=str, required=True) 141 | parser.add_argument('-r', '--resume', default='', help='resume from checkpoint') 142 | parser.add_argument('-p', '--data-path', type=str, default='data/wireframe_processed', help='data path') 143 | parser.add_argument('-d', '--device', type=str, default='cpu') 144 | parser.add_argument('-n', '--num_images', type=int, help='total number of images to plot') 145 | args = parser.parse_args() 146 | main(args) 147 | -------------------------------------------------------------------------------- /src/data/coco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | COCO dataset which returns image_id for evaluation. 4 | 5 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 6 | """ 7 | 8 | from pathlib import Path 9 | import cv2 10 | import numpy as np 11 | import torch 12 | import torch.utils.data 13 | from PIL import Image 14 | from pycocotools import mask as coco_mask 15 | from pycocotools.coco import COCO 16 | # import datasets.transforms as T 17 | 18 | __all__ = ['build'] 19 | 20 | 21 | class CocoDetection(torch.utils.data.Dataset): 22 | def __init__(self, img_folder, ann_file, transforms, return_masks=False): 23 | super(CocoDetection, self).__init__() 24 | self._transforms = transforms 25 | self.prepare = ConvertCocoPolysToMask(return_masks) 26 | 27 | self.img_folder = Path(img_folder) 28 | self.coco = COCO(ann_file) 29 | imgIds = sorted(self.coco.getImgIds()) 30 | 31 | if "train" in ann_file: 32 | self.all_imgIds = [] 33 | for image_id in imgIds: 34 | if self.coco.getAnnIds(imgIds=image_id) == []: 35 | continue 36 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 37 | target = self.coco.loadAnns(ann_ids) 38 | num_keypoints = [obj["num_keypoints"] for obj in target] 39 | if sum(num_keypoints) == 0: 40 | continue 41 | self.all_imgIds.append(image_id) 42 | else: 43 | self.all_imgIds = [] 44 | for image_id in imgIds: 45 | self.all_imgIds.append(image_id) 46 | 47 | def set_epoch(self, epoch): 48 | self._epoch = epoch 49 | 50 | @property 51 | def epoch(self): 52 | return self._epoch if hasattr(self, '_epoch') else -1 53 | 54 | def __len__(self): 55 | return len(self.all_imgIds) 56 | 57 | def load_item(self, idx): 58 | image_id = self.all_imgIds[idx] 59 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 60 | target = self.coco.loadAnns(ann_ids) 61 | 62 | target = {'image_id': image_id, 'annotations': target} 63 | img = Image.open(self.img_folder / self.coco.loadImgs(image_id)[0]['file_name']) 64 | img, target = self.prepare(img, target) 65 | return img, target 66 | 67 | def __getitem__(self, idx): 68 | img, target = self.load_item(idx) 69 | if self._transforms is not None: 70 | img, target = self._transforms(img, target, self) 71 | return img, target 72 | 73 | 74 | def convert_coco_poly_to_mask(segmentations, height, width): 75 | masks = [] 76 | for polygons in segmentations: 77 | rles = coco_mask.frPyObjects(polygons, height, width) 78 | mask = coco_mask.decode(rles) 79 | if len(mask.shape) < 3: 80 | mask = mask[..., None] 81 | mask = torch.as_tensor(mask, dtype=torch.uint8) 82 | mask = mask.any(dim=2) 83 | masks.append(mask) 84 | if masks: 85 | masks = torch.stack(masks, dim=0) 86 | else: 87 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 88 | return masks 89 | 90 | 91 | class ConvertCocoPolysToMask(object): 92 | def __init__(self, return_masks=False): 93 | self.return_masks = 
return_masks 94 | 95 | def __call__(self, image, target): 96 | w, h = image.size 97 | 98 | img_array = np.array(image) 99 | if len(img_array.shape) == 2: 100 | img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB) 101 | image = Image.fromarray(img_array) 102 | image_id = target["image_id"] 103 | image_id = torch.tensor([image_id]) 104 | anno = target["annotations"] 105 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 106 | anno = [obj for obj in anno if obj['num_keypoints'] != 0] 107 | keypoints = [obj["keypoints"] for obj in anno] 108 | boxes = [obj["bbox"] for obj in anno] 109 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32).reshape(-1, 17, 3) 110 | # guard against no boxes via resizing 111 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 112 | boxes[:, 2:] += boxes[:, :2] 113 | boxes[:, 0::2].clamp_(min=0, max=w) 114 | boxes[:, 1::2].clamp_(min=0, max=h) 115 | classes = [obj["category_id"] for obj in anno] 116 | classes = torch.tensor(classes, dtype=torch.int64) 117 | if self.return_masks: 118 | segmentations = [obj["segmentation"] for obj in anno] 119 | masks = convert_coco_poly_to_mask(segmentations, h, w) 120 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 121 | boxes = boxes[keep] 122 | classes = classes[keep] 123 | keypoints = keypoints[keep] 124 | if self.return_masks: 125 | masks = masks[keep] 126 | target = {} 127 | target["boxes"] = boxes 128 | target["labels"] = classes 129 | if self.return_masks: 130 | target["masks"] = masks 131 | target["image_id"] = image_id 132 | if keypoints is not None: 133 | target["keypoints"] = keypoints 134 | # for conversion to coco api 135 | area = torch.tensor([obj["area"] for obj in anno]) 136 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 137 | target["area"] = area[keep] 138 | target["iscrowd"] = iscrowd[keep] 139 | target["orig_size"] = torch.as_tensor([int(w), int(h)]) 140 | target["size"] = torch.as_tensor([int(h), int(w)]) 141 | return image, target 142 | 143 | 144 | -------------------------------------------------------------------------------- /tools/inference/onnx_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
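ONNX Runtime inference for DETRPose: draws the predicted keypoints on an image, a video, or every file in a folder.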
3 | """ 4 | import os 5 | import cv2 6 | import glob 7 | import numpy as np 8 | import onnxruntime as ort 9 | import torch 10 | import torchvision.transforms as T 11 | 12 | from PIL import Image, ImageDraw 13 | from copy import deepcopy 14 | from annotator import Annotator 15 | from annotator_crowdpose import AnnotatorCrowdpose 16 | 17 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose} 18 | 19 | def process_image(sess, im_pil): 20 | w, h = im_pil.size 21 | orig_size = torch.tensor([w, h])[None] 22 | 23 | transforms = T.Compose( 24 | [ 25 | T.Resize((640, 640)), 26 | T.ToTensor(), 27 | ] 28 | ) 29 | im_data = transforms(im_pil).unsqueeze(0) 30 | annotator = annotators[annotator_type](deepcopy(im_pil)) 31 | 32 | 33 | output = sess.run( 34 | output_names=None, 35 | input_feed={"images": im_data.numpy(), "orig_target_sizes": orig_size.numpy()}, 36 | ) 37 | 38 | scores, labels, keypoints = output 39 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 40 | for kpt, score in zip(keypoints, scores): 41 | if score > thrh: 42 | annotator.kpts( 43 | kpt, 44 | [h, w] 45 | ) 46 | annotator.save(f"{OUTPUT_NAME}.jpg") 47 | 48 | 49 | def process_video(sess, video_path): 50 | cap = cv2.VideoCapture(video_path) 51 | 52 | # Get video properties 53 | fps = cap.get(cv2.CAP_PROP_FPS) 54 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 55 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 56 | 57 | # Define the codec and create VideoWriter object 58 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 59 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h)) 60 | 61 | transforms = T.Compose( 62 | [ 63 | T.Resize((640, 640)), 64 | T.ToTensor(), 65 | ] 66 | ) 67 | 68 | frame_count = 0 69 | print("Processing video frames...") 70 | while cap.isOpened(): 71 | ret, frame = cap.read() 72 | if not ret: 73 | break 74 | 75 | # Convert frame to PIL image 76 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 77 | 78 | w, h = frame_pil.size 79 | orig_size = torch.tensor([w, h])[None] 80 | annotator = annotators[annotator_type](deepcopy(frame_pil)) 81 | 82 | im_data = transforms(frame_pil).unsqueeze(0) 83 | 84 | output = sess.run( 85 | output_names=None, 86 | input_feed={"images": im_data.numpy(), "orig_target_sizes": orig_size.numpy()}, 87 | ) 88 | 89 | scores, labels, keypoints = output 90 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 91 | for kpt, score in zip(keypoints, scores): 92 | if score > thrh: 93 | annotator.kpts( 94 | kpt, 95 | [h, w] 96 | ) 97 | 98 | # Convert back to OpenCV image 99 | frame = annotator.result() 100 | 101 | # Write the frame 102 | out.write(frame) 103 | frame_count += 1 104 | 105 | if frame_count % 10 == 0: 106 | print(f"Processed {frame_count} frames...") 107 | 108 | cap.release() 109 | out.release() 110 | print(f"Video processing complete. 
Result saved as '{OUTPUT_NAME}.mp4'.") 111 | 112 | def process_file(sess, file_path): 113 | # Check if the input file is an image or a video 114 | try: 115 | # Try to open the input as an image 116 | im_pil = Image.open(file_path).convert("RGB") 117 | process_image(sess, im_pil) 118 | except IOError: 119 | # Not an image, process as video 120 | process_video(sess, file_path) 121 | 122 | def main(args): 123 | assert args.annotator.lower() in ['coco', 'crowdpose'] 124 | # Global variable 125 | global OUTPUT_NAME, thrh, annotator_type 126 | 127 | """Main function.""" 128 | # Load the ONNX model 129 | sess = ort.InferenceSession(args.onnx) 130 | print(f"Using device: {ort.get_device()}") 131 | 132 | input_path = args.input 133 | thrh = 0.5 if args.thrh is None else args.thrh 134 | 135 | annotator_name = args.annotator.lower() 136 | if annotator_name == 'coco': 137 | annotator_type = 'COCO' 138 | elif annotator_name == 'crowdpose': 139 | annotator_type = 'CrowdPose' 140 | 141 | # Check if the input argumnet is a file or a folder 142 | file_path = args.input 143 | if os.path.isdir(file_path): 144 | # Process a folder 145 | folder_dir = args.input 146 | output_dir = f"{folder_dir}/output" 147 | os.makedirs(output_dir, exist_ok=True) 148 | paths = list(glob.iglob(f"{folder_dir}/*.*")) 149 | for file_path in paths: 150 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0] 151 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}" 152 | process_file(sess, file_path) 153 | else: 154 | # Process a file 155 | OUTPUT_NAME = f'onxx_results_{annotator_type}' 156 | process_file(sess, file_path) 157 | 158 | if __name__ == "__main__": 159 | import argparse 160 | 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument("--onnx", type=str, required=True, help="Path to the ONNX model file.") 163 | parser.add_argument("--annotator", type=str, required=True, help="Annotator type: COCO or CrowdPose.") 164 | parser.add_argument("-i", "--input", type=str, required=True, help="Path to the input image or video file.") 165 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None) 166 | args = parser.parse_args() 167 | main(args) 168 | -------------------------------------------------------------------------------- /src/data/crowdpose.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 6 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 9 | Copyright (c) 2023 IDEA. All Rights Reserved. 
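CrowdPose dataset wrapper that returns (image, target) pairs with 14 keypoints per annotated person.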
10 | """ 11 | 12 | import json 13 | from pathlib import Path 14 | import cv2 15 | import numpy as np 16 | import torch 17 | import torch.utils.data 18 | from PIL import Image 19 | from xtcocotools.coco import COCO 20 | 21 | class CrowdPoseDetection(torch.utils.data.Dataset): 22 | def __init__(self, img_folder, ann_file, transforms, return_masks=False): 23 | super(CrowdPoseDetection, self).__init__() 24 | self._transforms = transforms 25 | self.prepare = ConvertCocoPolysToMask(return_masks) 26 | 27 | self.img_folder = Path(img_folder) 28 | self.coco = COCO(ann_file) 29 | imgIds = sorted(self.coco.getImgIds()) 30 | 31 | if "train" in ann_file: 32 | self.all_imgIds = [] 33 | for image_id in imgIds: 34 | if self.coco.getAnnIds(imgIds=image_id) == []: 35 | continue 36 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 37 | target = self.coco.loadAnns(ann_ids) 38 | num_keypoints = [obj["num_keypoints"] for obj in target] 39 | if sum(num_keypoints) == 0: 40 | continue 41 | self.all_imgIds.append(image_id) 42 | else: 43 | self.all_imgIds = [] 44 | for image_id in imgIds: 45 | self.all_imgIds.append(image_id) 46 | 47 | def set_epoch(self, epoch): 48 | self._epoch = epoch 49 | 50 | @property 51 | def epoch(self): 52 | return self._epoch if hasattr(self, '_epoch') else -1 53 | 54 | def __len__(self): 55 | return len(self.all_imgIds) 56 | 57 | def load_item(self, idx): 58 | image_id = self.all_imgIds[idx] 59 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 60 | target = self.coco.loadAnns(ann_ids) 61 | 62 | target = {'image_id': image_id, 'annotations': target} 63 | img = Image.open(self.img_folder / self.coco.loadImgs(image_id)[0]['file_name']) 64 | img, target = self.prepare(img, target) 65 | return img, target 66 | 67 | def __getitem__(self, idx): 68 | img, target = self.load_item(idx) 69 | if self._transforms is not None: 70 | img, target = self._transforms(img, target, self) 71 | return img, target 72 | 73 | 74 | def convert_coco_poly_to_mask(segmentations, height, width): 75 | masks = [] 76 | for polygons in segmentations: 77 | rles = coco_mask.frPyObjects(polygons, height, width) 78 | mask = coco_mask.decode(rles) 79 | if len(mask.shape) < 3: 80 | mask = mask[..., None] 81 | mask = torch.as_tensor(mask, dtype=torch.uint8) 82 | mask = mask.any(dim=2) 83 | masks.append(mask) 84 | if masks: 85 | masks = torch.stack(masks, dim=0) 86 | else: 87 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 88 | return masks 89 | 90 | 91 | class ConvertCocoPolysToMask(object): 92 | def __init__(self, return_masks=False): 93 | self.return_masks = return_masks 94 | 95 | def __call__(self, image, target): 96 | w, h = image.size 97 | 98 | img_array = np.array(image) 99 | if len(img_array.shape) == 2: 100 | img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB) 101 | image = Image.fromarray(img_array) 102 | image_id = target["image_id"] 103 | image_id = torch.tensor([image_id]) 104 | anno = target["annotations"] 105 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 106 | anno = [obj for obj in anno if obj['num_keypoints'] != 0] 107 | keypoints = [obj["keypoints"] for obj in anno] 108 | boxes = [obj["bbox"] for obj in anno] 109 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32).reshape(-1, 14, 3) 110 | # guard against no boxes via resizing 111 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 112 | boxes[:, 2:] += boxes[:, :2] 113 | boxes[:, 0::2].clamp_(min=0, max=w) 114 | boxes[:, 1::2].clamp_(min=0, max=h) 115 | classes = [obj["category_id"] for obj 
in anno] 116 | classes = torch.tensor(classes, dtype=torch.int64) 117 | if self.return_masks: 118 | segmentations = [obj["segmentation"] for obj in anno] 119 | masks = convert_coco_poly_to_mask(segmentations, h, w) 120 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 121 | boxes = boxes[keep] 122 | classes = classes[keep] 123 | keypoints = keypoints[keep] 124 | if self.return_masks: 125 | masks = masks[keep] 126 | target = {} 127 | target["boxes"] = boxes 128 | target["labels"] = classes 129 | if self.return_masks: 130 | target["masks"] = masks 131 | target["image_id"] = image_id 132 | if keypoints is not None: 133 | target["keypoints"] = keypoints 134 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 135 | target["iscrowd"] = iscrowd[keep] 136 | target["orig_size"] = torch.as_tensor([int(w), int(h)]) 137 | target["size"] = torch.as_tensor([int(h), int(w)]) 138 | return image, target 139 | 140 | -------------------------------------------------------------------------------- /src/models/detrpose/matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 6 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 9 | Copyright (c) 2023 IDEA. All Rights Reserved. 10 | """ 11 | 12 | import torch 13 | from scipy.optimize import linear_sum_assignment 14 | from torch import nn 15 | import numpy as np 16 | 17 | 18 | class HungarianMatcher(nn.Module): 19 | def __init__(self, cost_class: float = 1, focal_alpha=0.25, 20 | cost_keypoints=1.0, cost_oks=0.01, num_body_points=17): 21 | super().__init__() 22 | self.cost_class = cost_class 23 | 24 | self.cost_keypoints = cost_keypoints 25 | self.cost_oks = cost_oks 26 | self.focal_alpha = focal_alpha 27 | self.num_body_points = num_body_points 28 | 29 | if num_body_points==17: 30 | self.sigmas = np.array([ 31 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 32 | 1.07, .87, .87, .89, .89 33 | ], dtype=np.float32) / 10.0 34 | 35 | elif num_body_points==14: 36 | self.sigmas = np.array([ 37 | .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, 38 | .79, .79 39 | ]) / 10.0 40 | else: 41 | raise NotImplementedError 42 | 43 | @torch.no_grad() 44 | def forward(self, outputs, targets): 45 | bs, num_queries = outputs["pred_logits"].shape[:2] 46 | 47 | # We flatten to compute the cost matrices in a batch 48 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] 49 | out_keypoints = outputs["pred_keypoints"].flatten(0, 1) # [batch_size * num_queries, 51] 50 | 51 | # Also concat the target labels and boxes 52 | tgt_ids = torch.cat([v["labels"] for v in targets]) 53 | tgt_keypoints = torch.cat([v["keypoints"] for v in targets]) # nkp, 51 54 | tgt_area = torch.cat([v["area"] for v in targets]) # nkp, 51 55 | 56 | # Compute the classification cost. 
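        # Focal-style classification cost (descriptive comment): for every (query, target)
        # pair we take the positive focal term minus the negative focal term at the target's
        # class, so queries that are confidently wrong pay a larger cost than uncertain ones.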
57 | alpha = self.focal_alpha 58 | gamma = 2.0 59 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 60 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 61 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 62 | 63 | # compute the keypoint costs 64 | Z_pred = out_keypoints[:, 0:(self.num_body_points * 2)] 65 | Z_gt = tgt_keypoints[:, 0:(self.num_body_points * 2)] 66 | V_gt: torch.Tensor = tgt_keypoints[:, (self.num_body_points * 2):] 67 | if Z_pred.sum() > 0: 68 | sigmas = Z_pred.new_tensor(self.sigmas) 69 | variances = (sigmas * 2) ** 2 70 | kpt_preds = Z_pred.reshape(-1, Z_pred.size(-1) // 2, 2) 71 | kpt_gts = Z_gt.reshape(-1, Z_gt.size(-1) // 2, 2) 72 | squared_distance = (kpt_preds[:, None, :, 0] - kpt_gts[None, :, :, 0]) ** 2 + \ 73 | (kpt_preds[:, None, :, 1] - kpt_gts[None, :, :, 1]) ** 2 74 | squared_distance0 = squared_distance / (tgt_area[:, None] * variances[None, :] * 2) 75 | squared_distance1 = torch.exp(-squared_distance0) 76 | squared_distance1 = squared_distance1 * V_gt 77 | oks = squared_distance1.sum(dim=-1) / (V_gt.sum(dim=-1) + 1e-6) 78 | oks = oks.clamp(min=1e-6) 79 | cost_oks = 1 - oks 80 | 81 | cost_keypoints = torch.abs(Z_pred[:, None, :] - Z_gt[None]) # npred, ngt, 34 82 | cost_keypoints = cost_keypoints * V_gt.repeat_interleave(2, dim=1)[None] 83 | cost_keypoints = cost_keypoints.sum(-1) 84 | C = self.cost_class * cost_class + self.cost_keypoints * cost_keypoints + self.cost_oks * cost_oks 85 | C = C.view(bs, num_queries, -1).cpu() 86 | 87 | else: 88 | cost_keypoints = cost_oks = 0 89 | C = self.cost_class * cost_class + self.cost_keypoints * cost_keypoints + self.cost_oks * cost_oks 90 | C = C.view(bs, num_queries, -1).cpu() 91 | 92 | # Final cost matrix 93 | sizes = [len(v["boxes"]) for v in targets] 94 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 95 | 96 | if tgt_ids.shape[0] > 0: 97 | cost_mean_dict = { 98 | 'class': cost_class.mean(), 99 | "keypoints": cost_keypoints.mean() 100 | } 101 | else: 102 | # for the cases when no grounding truth boxes 103 | cost_mean_dict = { 104 | 'class': torch.zeros_like(cost_class.mean()), 105 | 'keypoints': torch.zeros_like(cost_keypoints.mean()), 106 | } 107 | 108 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in 109 | indices]#, cost_mean_dict 110 | 111 | def build_matcher(args): 112 | assert args.matcher_type in ['HungarianMatcher'], "Unknown args.matcher_type: {}".format( 113 | args.matcher_type) 114 | if args.matcher_type == 'HungarianMatcher': 115 | return HungarianMatcher( 116 | cost_class=args.set_cost_class, focal_alpha=args.focal_alpha, cost_keypoints=args.set_cost_keypoints, cost_oks=args.set_cost_oks, num_body_points=args.num_body_points) 117 | else: 118 | raise NotImplementedError("Unknown args.matcher_type: {}".format(args.matcher_type)) -------------------------------------------------------------------------------- /src/misc/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import atexit 3 | import json 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | 8 | from torch.utils.data import DistributedSampler 9 | from torch.nn.parallel import DataParallel as DP 10 | from torch.nn.parallel import DistributedDataParallel as DDP 11 | 12 | from ..data.dataloader import DataLoader 13 | 14 | def is_dist_avail_and_initialized(): 15 | if not 
dist.is_available(): 16 | return False 17 | if not dist.is_initialized(): 18 | return False 19 | return True 20 | 21 | 22 | def get_world_size(): 23 | if not is_dist_avail_and_initialized(): 24 | return 1 25 | return dist.get_world_size() 26 | 27 | 28 | def get_rank(): 29 | if not is_dist_avail_and_initialized(): 30 | return 0 31 | return dist.get_rank() 32 | 33 | 34 | def is_main_process(): 35 | return get_rank() == 0 36 | 37 | 38 | def save_on_master(*args, **kwargs): 39 | if is_main_process(): 40 | torch.save(*args, **kwargs) 41 | 42 | 43 | def init_distributed_mode(args): 44 | if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and 45 | args.rank = int(os.environ["RANK"]) 46 | args.world_size = int(os.environ['WORLD_SIZE']) 47 | args.gpu = args.local_rank = int(os.environ['LOCAL_RANK']) 48 | # local_world_size = int(os.environ['WORLD_SIZE']) 49 | # args.world_size = args.world_size * local_world_size 50 | # args.gpu = args.local_rank = int(os.environ['LOCAL_RANK']) 51 | # args.rank = args.rank * local_world_size + args.local_rank 52 | # print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank)) 53 | # print(json.dumps(dict(os.environ), indent=2)) 54 | elif 'SLURM_PROCID' in os.environ: 55 | args.rank = int(os.environ['SLURM_PROCID']) 56 | args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID']) 57 | args.world_size = int(os.environ['SLURM_NPROCS']) 58 | 59 | # print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count())) 60 | # print("os.environ['SLURM_JOB_NODELIST']:", os.environ['SLURM_JOB_NODELIST']) 61 | # print(json.dumps(dict(os.environ), indent=2)) 62 | # print('args:') 63 | # print(json.dumps(vars(args), indent=2)) 64 | else: 65 | print('Not using distributed mode') 66 | args.distributed = False 67 | args.world_size = 1 68 | args.rank = 0 69 | args.local_rank = 0 70 | return 71 | 72 | print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank)) 73 | args.distributed = True 74 | torch.cuda.set_device(args.local_rank) 75 | args.dist_backend = 'nccl' 76 | print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True) 77 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 78 | world_size=args.world_size, rank=args.rank) 79 | print("Before torch.distributed.barrier()") 80 | torch.distributed.barrier() 81 | print("End torch.distributed.barrier()") 82 | setup_for_distributed(args.rank == 0) 83 | 84 | def setup_for_distributed(is_master): 85 | """ 86 | This function disables printing when not in master process 87 | """ 88 | import builtins as __builtin__ 89 | builtin_print = __builtin__.print 90 | 91 | def print(*args, **kwargs): 92 | force = kwargs.pop('force', False) 93 | if is_master or force: 94 | builtin_print(*args, **kwargs) 95 | 96 | __builtin__.print = print 97 | 98 | def warp_loader(loader, shuffle=False): 99 | if is_dist_avail_and_initialized(): 100 | sampler = DistributedSampler(loader.dataset, shuffle=shuffle) 101 | loader = DataLoader(loader.dataset, 102 | loader.batch_size, 103 | sampler=sampler, 104 | drop_last=loader.drop_last, 105 | collate_fn=loader.collate_fn, 106 | pin_memory=loader.pin_memory, 107 | num_workers=loader.num_workers) 108 | return loader 109 | 110 | 111 | def warp_model( 112 | model: torch.nn.Module, 113 | sync_bn: bool=False, 114 | dist_mode: str='ddp', 115 | 
find_unused_parameters: bool=False, 116 | compile: bool=False, 117 | compile_mode: str='reduce-overhead', 118 | **kwargs 119 | ): 120 | if is_dist_avail_and_initialized(): 121 | rank = get_rank() 122 | model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model 123 | if dist_mode == 'dp': 124 | model = DP(model, device_ids=[rank], output_device=rank) 125 | elif dist_mode == 'ddp': 126 | model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=find_unused_parameters) 127 | else: 128 | raise AttributeError('') 129 | 130 | if compile: 131 | model = torch.compile(model, mode=compile_mode) 132 | 133 | return model 134 | 135 | @atexit.register 136 | def cleanup(): 137 | """cleanup distributed environment""" 138 | if is_dist_avail_and_initialized(): 139 | torch.distributed.barrier() 140 | torch.distributed.destroy_process_group() 141 | 142 | 143 | def is_parallel(model) -> bool: 144 | # Returns True if model is of type DP or DDP 145 | return type(model) in ( 146 | torch.nn.parallel.DataParallel, 147 | torch.nn.parallel.DistributedDataParallel, 148 | ) 149 | 150 | 151 | def de_parallel(model) -> nn.Module: 152 | # De-parallelize a model: returns single-GPU model if model is of type DP or DDP 153 | return model.module if is_parallel(model) else model 154 | -------------------------------------------------------------------------------- /src/models/detrpose/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 6 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from GroupPose (https://github.com/Michel-liu/GroupPose/) 9 | Copyright (c) 2023 GroupPose Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from ED-Pose (https://github.com/IDEA-Research/ED-Pose/) 12 | Copyright (c) 2023 IDEA. All Rights Reserved. 
13 | """ 14 | 15 | import torch 16 | import random 17 | from torch import nn, Tensor 18 | import os 19 | import numpy as np 20 | import math 21 | import torch.nn.functional as F 22 | from torch import nn 23 | 24 | 25 | def gen_encoder_output_proposals(memory:Tensor, spatial_shapes:Tensor): 26 | """ 27 | Input: 28 | - memory: bs, \sum{hw}, d_model 29 | - spatial_shapes: nlevel, 2 30 | - learnedwh: 2 31 | Output: 32 | - output_memory: bs, \sum{hw}, d_model 33 | - output_proposals: bs, \sum{hw}, 4 34 | """ 35 | N_, S_, C_ = memory.shape 36 | base_scale = 4.0 37 | proposals = [] 38 | _cur = 0 39 | for lvl, (H_, W_) in enumerate(spatial_shapes): 40 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 41 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device), 42 | indexing='ij') 43 | grid = torch.stack([grid_x, grid_y], -1) # H_, W_, 2 44 | 45 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / torch.tensor([W_, H_], dtype=torch.float32, device=memory.device) 46 | 47 | proposal = grid.view(N_, -1, 2) 48 | proposals.append(proposal) 49 | _cur += (H_ * W_) 50 | output_proposals = torch.cat(proposals, 1) 51 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 52 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid 53 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 54 | 55 | output_memory = memory 56 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 57 | 58 | return output_memory, output_proposals 59 | 60 | 61 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 62 | """ 63 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 64 | Args: 65 | inputs: A float tensor of arbitrary shape. 66 | The predictions for each example. 67 | targets: A float tensor with the same shape as inputs. Stores the binary 68 | classification label for each element in inputs 69 | (0 for the negative class and 1 for the positive class). 70 | alpha: (optional) Weighting factor in range (0,1) to balance 71 | positive vs negative examples. Default = -1 (no weighting). 72 | gamma: Exponent of the modulating factor (1 - p_t) to 73 | balance easy vs hard examples. 
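    Note:
        The per-element loss is FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t);
        the implementation below averages over dim 1 and normalizes the sum by num_boxes.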
74 | Returns: 75 | Loss tensor 76 | """ 77 | prob = inputs.sigmoid() 78 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 79 | p_t = prob * targets + (1 - prob) * (1 - targets) 80 | loss = ce_loss * ((1 - p_t) ** gamma) 81 | 82 | if alpha >= 0: 83 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 84 | loss = alpha_t * loss 85 | 86 | 87 | return loss.mean(1).sum() / num_boxes 88 | 89 | class MLP(nn.Module): 90 | """ Very simple multi-layer perceptron (also called FFN)""" 91 | 92 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 93 | super().__init__() 94 | self.num_layers = num_layers 95 | h = [hidden_dim] * (num_layers - 1) 96 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 97 | 98 | def forward(self, x): 99 | for i, layer in enumerate(self.layers): 100 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 101 | return x 102 | 103 | def _get_activation_fn(activation, d_model=256, batch_dim=0): 104 | """Return an activation function given a string""" 105 | if activation == "relu": 106 | return F.relu 107 | if activation == "gelu": 108 | return F.gelu 109 | if activation == "glu": 110 | return F.glu 111 | if activation == "prelu": 112 | return nn.PReLU() 113 | if activation == "selu": 114 | return F.selu 115 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 116 | 117 | 118 | def gen_sineembed_for_position(pos_tensor): 119 | # n_query, bs, _ = pos_tensor.size() 120 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 121 | scale = 2 * math.pi 122 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) 123 | dim_t = 10000 ** (2 * (dim_t // 2) / 128) 124 | x_embed = pos_tensor[:, :, 0] * scale 125 | y_embed = pos_tensor[:, :, 1] * scale 126 | pos_x = x_embed[:, :, None] / dim_t 127 | pos_y = y_embed[:, :, None] / dim_t 128 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 129 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 130 | if pos_tensor.size(-1) == 2: 131 | pos = torch.cat((pos_y, pos_x), dim=2) 132 | elif pos_tensor.size(-1) == 4: 133 | w_embed = pos_tensor[:, :, 2] * scale 134 | pos_w = w_embed[:, :, None] / dim_t 135 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 136 | 137 | h_embed = pos_tensor[:, :, 3] * scale 138 | pos_h = h_embed[:, :, None] / dim_t 139 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 140 | 141 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 142 | else: 143 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 144 | return pos 145 | 146 | 147 | def inverse_sigmoid(x, eps=1e-3): 148 | x = x.clamp(min=0, max=1) 149 | x1 = x.clamp(min=eps) 150 | x2 = (1 - x).clamp(min=eps) 151 | return torch.log(x1/x2) -------------------------------------------------------------------------------- /tools/inference/annotator_crowdpose.py: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Modified from: 3 | # Ultralytics 4 | # https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/plotting.py 5 | ######################################################################################### 6 | 7 | import math 8 | import warnings 9 | from pathlib import Path 10 | from typing import 
Callable, Dict, List, Optional, Union 11 | 12 | import cv2 13 | import numpy as np 14 | import torch 15 | from PIL import Image, ImageDraw, ImageFont 16 | from PIL import __version__ as pil_version 17 | 18 | from annotator import Annotator, Colors 19 | 20 | 21 | colors = Colors() # create instance for 'from utils.plots import colors' 22 | 23 | class AnnotatorCrowdpose(Annotator): 24 | """ 25 | Ultralytics Annotator for train/val mosaics and JPGs and predictions annotations. 26 | 27 | Attributes: 28 | im (Image.Image | np.ndarray): The image to annotate. 29 | pil (bool): Whether to use PIL or cv2 for drawing annotations. 30 | font (ImageFont.truetype | ImageFont.load_default): Font used for text annotations. 31 | lw (float): Line width for drawing. 32 | skeleton (List[List[int]]): Skeleton structure for keypoints. 33 | limb_color (List[int]): Color palette for limbs. 34 | kpt_color (List[int]): Color palette for keypoints. 35 | dark_colors (set): Set of colors considered dark for text contrast. 36 | light_colors (set): Set of colors considered light for text contrast. 37 | 38 | Examples: 39 | >>> from ultralytics.utils.plotting import Annotator 40 | >>> im0 = cv2.imread("test.png") 41 | >>> annotator = Annotator(im0, line_width=10) 42 | >>> annotator.box_label([10, 10, 100, 100], "person", (255, 0, 0)) 43 | """ 44 | 45 | def __init__( 46 | self, 47 | im, 48 | line_width: Optional[int] = None, 49 | font_size: Optional[int] = None, 50 | font: str = "Arial.ttf", 51 | pil: bool = False, 52 | example: str = "abc", 53 | ): 54 | """Initialize the Annotator class with image and line width along with color palette for keypoints and limbs.""" 55 | super().__init__(im, line_width, font_size, font, pil, example) 56 | 57 | # Pose Crowdpose 58 | self.skeleton = [ 59 | # limbs 60 | [12, 10], 61 | [10, 8], 62 | [11, 9], 63 | [9, 7], 64 | # torso 65 | [8, 7], 66 | [8, 2], 67 | [7, 1], 68 | # arms 69 | [14, 1], 70 | [14, 2], 71 | [1, 3], 72 | [3, 5], 73 | [2, 4], 74 | [4, 6], 75 | # head 76 | [14, 13], 77 | ] 78 | 79 | self.limb_color = colors.pose_palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 0, 16]] 80 | self.kpt_color = colors.pose_palette[[0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 16, 0]] 81 | # 9, 9, 9, 9, 9, 9, 9, 0, 16, 16, 0, 0, 0, 0, 0, 0]] 82 | self.dark_colors = { 83 | (235, 219, 11), 84 | (243, 243, 243), 85 | (183, 223, 0), 86 | (221, 111, 255), 87 | (0, 237, 204), 88 | (68, 243, 0), 89 | (255, 255, 0), 90 | (179, 255, 1), 91 | (11, 255, 162), 92 | } 93 | self.light_colors = { 94 | (255, 42, 4), 95 | (79, 68, 255), 96 | (255, 0, 189), 97 | (255, 180, 0), 98 | (186, 0, 221), 99 | (0, 192, 38), 100 | (255, 36, 125), 101 | (104, 0, 123), 102 | (108, 27, 255), 103 | (47, 109, 252), 104 | (104, 31, 17), 105 | } 106 | 107 | # def kpts( 108 | # self, 109 | # kpts, 110 | # shape: tuple = (640, 640), 111 | # radius: Optional[int] = None, 112 | # kpt_line: bool = True, 113 | # conf_thres: float = 0.25, 114 | # kpt_color: Optional[tuple] = None, 115 | # ): 116 | # """ 117 | # Plot keypoints on the image. 118 | 119 | # Args: 120 | # kpts (torch.Tensor): Keypoints, shape [17, 3] (x, y, confidence). 121 | # shape (tuple, optional): Image shape (h, w). 122 | # radius (int, optional): Keypoint radius. 123 | # kpt_line (bool, optional): Draw lines between keypoints. 124 | # conf_thres (float, optional): Confidence threshold. 125 | # kpt_color (tuple, optional): Keypoint color (B, G, R). 126 | 127 | # Note: 128 | # - `kpt_line=True` currently only supports human pose plotting. 129 | # - Modifies self.im in-place. 
130 | # - If self.pil is True, converts image to numpy array and back to PIL. 131 | # """ 132 | # radius = radius if radius is not None else self.lw 133 | # if self.pil: 134 | # # Convert to numpy first 135 | # self.im = np.asarray(self.im).copy() 136 | # nkpt, ndim = kpts.shape 137 | # is_pose = nkpt == 17 and ndim in {2, 3} 138 | # kpt_line &= is_pose # `kpt_line=True` for now only supports human pose plotting 139 | # for i, k in enumerate(kpts): 140 | # color_k = kpt_color or (self.kpt_color[i].tolist() if is_pose else colors(i)) 141 | # x_coord, y_coord = k[0], k[1] 142 | # if x_coord % shape[1] != 0 and y_coord % shape[0] != 0: 143 | # if len(k) == 3: 144 | # conf = k[2] 145 | # if conf < conf_thres: 146 | # continue 147 | # cv2.circle(self.im, (int(x_coord), int(y_coord)), radius, color_k, -1, lineType=cv2.LINE_AA) 148 | 149 | # if kpt_line: 150 | # ndim = kpts.shape[-1] 151 | # for i, sk in enumerate(self.skeleton): 152 | # pos1 = (int(kpts[(sk[0] - 1), 0]), int(kpts[(sk[0] - 1), 1])) 153 | # pos2 = (int(kpts[(sk[1] - 1), 0]), int(kpts[(sk[1] - 1), 1])) 154 | # if ndim == 3: 155 | # conf1 = kpts[(sk[0] - 1), 2] 156 | # conf2 = kpts[(sk[1] - 1), 2] 157 | # if conf1 < conf_thres or conf2 < conf_thres: 158 | # continue 159 | # if pos1[0] % shape[1] == 0 or pos1[1] % shape[0] == 0 or pos1[0] < 0 or pos1[1] < 0: 160 | # continue 161 | # if pos2[0] % shape[1] == 0 or pos2[1] % shape[0] == 0 or pos2[0] < 0 or pos2[1] < 0: 162 | # continue 163 | # cv2.line( 164 | # self.im, 165 | # pos1, 166 | # pos2, 167 | # kpt_color or self.limb_color[i].tolist(), 168 | # thickness=int(np.ceil(self.lw / 2)), 169 | # lineType=cv2.LINE_AA, 170 | # ) 171 | # if self.pil: 172 | # # Convert im back to PIL and update draw 173 | # self.fromarray(self.im) 174 | -------------------------------------------------------------------------------- /tools/visualization/line_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 IDEA. All Rights Reserved. 2 | # ------------------------------------------------------------------------ 3 | import os, sys 4 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 5 | 6 | import argparse 7 | 8 | import matplotlib as mpl 9 | import matplotlib.pyplot as plt 10 | from matplotlib.backends.backend_agg import FigureCanvasAgg 11 | 12 | import torch 13 | from torch.utils.data import DataLoader 14 | 15 | from util.slconfig import SLConfig 16 | 17 | import datasets 18 | from datasets import build_dataset, BatchImageCollateFunction 19 | 20 | 21 | def create(args, classname): 22 | # we use register to maintain models from catdet6 on. 
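    # Descriptive comment: the registry maps the name stored on the config (e.g.
    # cfg.modelname or cfg.criterionname) to a build function; we look it up and
    # call it with the full config to construct the requested module.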
23 | from models.registry import MODULE_BUILD_FUNCS 24 | class_module = getattr(args, classname) 25 | assert class_module in MODULE_BUILD_FUNCS._module_dict 26 | build_func = MODULE_BUILD_FUNCS.get(class_module) 27 | return build_func(args) 28 | 29 | def main(args): 30 | cfg = SLConfig.fromfile(args.config) 31 | device = args.device 32 | 33 | setattr(cfg, 'coco_path', args.data_path) 34 | setattr(cfg, 'batch_size_train', 1) 35 | setattr(cfg, 'batch_size_val', 1) 36 | 37 | if 'HGNetv2' in cfg.backbone: 38 | cfg.pretrained = False 39 | 40 | # build model 41 | model, _ = create(cfg, 'modelname') 42 | model.to(device) 43 | 44 | criterion = create(cfg, 'criterionname') 45 | 46 | dataset_val = build_dataset(image_set='val', args=cfg) 47 | 48 | sampler_val = torch.utils.data.SequentialSampler(dataset_val) 49 | 50 | data_loader_val = DataLoader(dataset_val, 1, sampler=sampler_val, drop_last=False, collate_fn=BatchImageCollateFunction(), num_workers=4) 51 | 52 | if args.resume: 53 | checkpoint = torch.load(args.resume, map_location='cpu') 54 | if 'ema' in checkpoint: 55 | state = checkpoint['ema']['module'] 56 | else: 57 | state = checkpoint['model'] 58 | 59 | # NOTE load train mode state -> convert to deploy mode 60 | model.load_state_dict(state) 61 | 62 | # change to device 63 | model.to(device) 64 | 65 | # transformer parameters 66 | len_q = cfg.num_queries 67 | nheads = cfg.nheads 68 | num_sampling_points = cfg.dec_n_points 69 | num_points_scale = torch.tensor([1/n for n in num_sampling_points for _ in range(n)], dtype=torch.float32).reshape(-1, 1) 70 | 71 | # folder path 72 | main_folder = cfg.output_dir 73 | if 'data/wireframe_processed' in args.data_path: 74 | append_path = f'{main_folder}/visualization/line_attention_wireframe' 75 | 76 | elif 'data/york_processed' in args.data_path: 77 | append_path = f'{main_folder}/visualization/line_attention_york' 78 | os.makedirs(append_path , exist_ok=True) 79 | 80 | with torch.no_grad(): 81 | 82 | for i, (samples, targets) in enumerate(data_loader_val): 83 | samples = samples.to(device) 84 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 85 | 86 | sampling_ratios = [] 87 | reference_points = [] 88 | attention_weights = [] 89 | hooks = [ 90 | model.decoder.decoder.layers[-1].cross_attn.sampling_ratios.register_forward_hook( 91 | lambda self, input, output: sampling_ratios.append(output[0]) 92 | ), 93 | model.decoder.decoder.layers[-1].cross_attn.attention_weights.register_forward_hook( 94 | lambda self, input, output: attention_weights.append(output[0]) 95 | ), 96 | model.decoder.decoder.register_forward_hook( 97 | lambda self, input, output: reference_points.append(output[0]) 98 | ), 99 | ] 100 | 101 | output = model(samples, None) 102 | 103 | [(src_idx, tgt_idx)] = criterion(output, targets, return_indices=True) 104 | 105 | for hook in hooks: 106 | hook.remove() 107 | 108 | sampling_ratios = sampling_ratios[0].cpu().view(1, len_q, nheads, sum(num_sampling_points), 1) 109 | attention_weights = attention_weights[0].cpu().view(1, len_q, nheads, sum(num_sampling_points)) 110 | attention_weights = torch.nn.functional.softmax(attention_weights, dim=-1) 111 | 112 | reference_points = reference_points[0][-2:-1].cpu().transpose(1, 2) 113 | 114 | vector = reference_points[:, :, None, :, :2] - reference_points[:, :, None, :, 2:] 115 | center = 0.5 * (reference_points[:, :, None, :, :2] + reference_points[:, :, None, :, 2:]) 116 | 117 | sampling_locations = center + sampling_ratios * num_points_scale * vector * 0.5 118 | 119 | # Plot image 
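            # Min-max normalize the CHW tensor to [0, 1] and show it in HWC order.
            # extent=[0, 1, 1, 0] draws the image in normalized coordinates (y pointing
            # down), so reference lines and sampling locations can be plotted directly.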
120 | img = samples[0].permute(1, 2, 0).cpu() 121 | img = (img - img.min()) / (img.max() - img.min()) 122 | fig, ax = plt.subplots() 123 | ax.imshow(img, extent=[0, 1, 1, 0]) 124 | 125 | reference_points = reference_points.transpose(1, 2)[0, 0] 126 | sampling_locations = sampling_locations[0] 127 | attention_weights = attention_weights[0] 128 | 129 | # choose the query idx 130 | line_idx = src_idx[tgt_idx == 0][0] 131 | reference_points = reference_points[line_idx] 132 | sampling_locations = sampling_locations[line_idx] 133 | attention_weights = attention_weights[line_idx] 134 | 135 | # sampling points 136 | for j in range(nheads): 137 | x1, y1 = sampling_locations[j].split(1, dim=-1) 138 | pos = ax.scatter(x1, y1, marker='*', c=attention_weights[j], cmap='jet', zorder=2) 139 | cbar = fig.colorbar(pos, ax=ax) 140 | cbar.ax.tick_params(size=0) 141 | cbar.set_ticks([]) 142 | 143 | # reference lines 144 | x1, y1, x2, y2 = reference_points.split(1, dim=-1) 145 | ax.plot((x1[0], x2[0]), (y1[0], y2[0]), c='k', marker='o', zorder=3) 146 | 147 | plt.axis([0, 1, 1, 0]) 148 | plt.axis(False) 149 | 150 | 151 | curr_img_id = targets[0]['image_id'].tolist()[0] 152 | plt.savefig(f'{append_path}/{curr_img_id}.png', bbox_inches="tight", pad_inches=0.0, dpi=100) 153 | plt.close() 154 | 155 | # check condition to stop program 156 | if args.num_images is not None and i + 1 >= args.num_images: 157 | break 158 | 159 | 160 | if __name__ == '__main__': 161 | parser = argparse.ArgumentParser('Visualization of Deformable Line Attention') 162 | parser.add_argument('-c', '--config', type=str, required=True) 163 | parser.add_argument('-r', '--resume', default='', help='resume from checkpoint') 164 | parser.add_argument('-p', '--data-path', type=str, default='data/wireframe_processed', help='data path') 165 | parser.add_argument('-d', '--device', type=str, default='cpu') 166 | parser.add_argument('-n', '--num_images', type=int, help='total number of images to plot') 167 | args = parser.parse_args() 168 | main(args) 169 | -------------------------------------------------------------------------------- /tools/inference/torch_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
3 | """ 4 | import os 5 | import sys 6 | import glob 7 | 8 | import cv2 # Added for video processing 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torchvision.transforms as T 13 | 14 | from PIL import Image, ImageDraw 15 | from copy import deepcopy 16 | from annotator import Annotator 17 | from annotator_crowdpose import AnnotatorCrowdpose 18 | 19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) 20 | from src.core import LazyConfig, instantiate 21 | 22 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose} 23 | 24 | def process_image(model, device, file_path): 25 | im_pil = Image.open(file_path).convert("RGB") 26 | w, h = im_pil.size 27 | orig_size = torch.tensor([[w, h]]).to(device) 28 | annotator = annotators[annotator_type](deepcopy(im_pil)) 29 | 30 | transforms = T.Compose( 31 | [ 32 | T.Resize((640, 640)), 33 | T.ToTensor(), 34 | ] 35 | ) 36 | im_data = transforms(im_pil).unsqueeze(0).to(device) 37 | 38 | output = model(im_data, orig_size) 39 | 40 | scores, labels, keypoints = output 41 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 42 | for kpt, score in zip(keypoints, scores): 43 | if score > thrh: 44 | annotator.kpts( 45 | kpt, 46 | [h, w] 47 | ) 48 | annotator.save(f"{OUTPUT_NAME}.jpg") 49 | 50 | 51 | def process_video(model, device, file_path): 52 | cap = cv2.VideoCapture(file_path) 53 | 54 | # Get video properties 55 | fps = cap.get(cv2.CAP_PROP_FPS) 56 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 57 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 58 | 59 | # Define the codec and create VideoWriter object 60 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 61 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h)) 62 | 63 | transforms = T.Compose( 64 | [ 65 | T.Resize((640, 640)), 66 | T.ToTensor(), 67 | ] 68 | ) 69 | 70 | frame_count = 0 71 | print("Processing video frames...") 72 | while cap.isOpened(): 73 | ret, frame = cap.read() 74 | if not ret: 75 | break 76 | 77 | # Convert frame to PIL image 78 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 79 | 80 | w, h = frame_pil.size 81 | orig_size = torch.tensor([[w, h]]).to(device) 82 | 83 | annotator = annotators[annotator_type](deepcopy(frame_pil)) 84 | 85 | im_data = transforms(frame_pil).unsqueeze(0).to(device) 86 | 87 | output = model(im_data, orig_size) 88 | 89 | scores, labels, keypoints = output 90 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 91 | for kpt, score in zip(keypoints, scores): 92 | if score > thrh: 93 | annotator.kpts( 94 | kpt, 95 | [h, w] 96 | ) 97 | 98 | # Convert back to OpenCV image 99 | frame = annotator.result() 100 | 101 | # Write the frame 102 | out.write(frame) 103 | frame_count += 1 104 | 105 | if frame_count % 10 == 0: 106 | print(f"Processed {frame_count} frames...") 107 | 108 | cap.release() 109 | out.release() 110 | print("Video processing complete. Result saved as 'results_video.mp4'.") 111 | 112 | def process_file(model, device, file_path): 113 | # Check if the input file is an image or a vide 114 | if os.path.splitext(file_path)[-1].lower() in [".jpg", ".jpeg", ".png", ".bmp"]: 115 | # Process as image 116 | process_image(model, device, file_path) 117 | print("Image processing complete.") 118 | else: 119 | # Process as video 120 | process_video(model, device, file_path) 121 | print("Video processing complete.") 122 | 123 | def create(args, classname): 124 | # we use register to maintain models from catdet6 on. 
125 | from models.registry import MODULE_BUILD_FUNCS 126 | class_module = getattr(args, classname) 127 | assert class_module in MODULE_BUILD_FUNCS._module_dict 128 | build_func = MODULE_BUILD_FUNCS.get(class_module) 129 | return build_func(args) 130 | 131 | def main(args): 132 | # Global variable 133 | global OUTPUT_NAME, thrh, annotator_type 134 | 135 | """Main function""" 136 | cfg = LazyConfig.load(args.config) 137 | 138 | if hasattr(cfg.model.backbone, 'pretrained'): 139 | cfg.model.backbone.pretrained = False 140 | 141 | model = instantiate(cfg.model) 142 | postprocessor = instantiate(cfg.postprocessor) 143 | 144 | num_body_points = model.transformer.num_body_points 145 | if num_body_points == 17: 146 | annotator_type = 'COCO' 147 | elif num_body_points == 14: 148 | annotator_type = 'CrowdPose' 149 | else: 150 | raise Exception(f'Not implemented annotator for model with {num_body_points} keypoints') 151 | 152 | if args.resume: 153 | checkpoint = torch.load(args.resume, map_location='cpu', weights_only=False) 154 | if 'ema' in checkpoint: 155 | state = checkpoint['ema']['module'] 156 | else: 157 | state = checkpoint['model'] 158 | 159 | # NOTE load train mode state -> convert to deploy mode 160 | model.load_state_dict(state) 161 | 162 | else: 163 | # raise AttributeError('Only support resume to load model.state_dict by now.') 164 | print('not load model.state_dict, use default init state dict...') 165 | 166 | class Model(nn.Module): 167 | def __init__(self): 168 | super().__init__() 169 | self.model = model.deploy() 170 | self.postprocessor = postprocessor.deploy() 171 | 172 | def forward(self, images, orig_target_sizes): 173 | outputs = self.model(images) 174 | outputs = self.postprocessor(outputs, orig_target_sizes) 175 | return outputs 176 | 177 | device = args.device 178 | model = Model().to(device) 179 | thrh = 0.5 if args.thrh is None else args.thrh 180 | 181 | # Check if the input argumnet is a file or a folder 182 | file_path = args.input 183 | if os.path.isdir(file_path): 184 | # Process a folder 185 | folder_dir = args.input 186 | output_dir = f"{folder_dir}/output" 187 | os.makedirs(output_dir, exist_ok=True) 188 | paths = list(glob.iglob(f"{folder_dir}/*.*")) 189 | for file_path in paths: 190 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0] 191 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}" 192 | process_file(model, device, file_path) 193 | else: 194 | # Process a file 195 | OUTPUT_NAME = f'torch_results_{annotator_type}' 196 | process_file(model, device, file_path) 197 | 198 | 199 | if __name__ == "__main__": 200 | import argparse 201 | 202 | parser = argparse.ArgumentParser() 203 | parser.add_argument("-c", "--config", type=str, required=True) 204 | parser.add_argument("-r", "--resume", type=str, required=True) 205 | parser.add_argument("-d", "--device", type=str, default="cpu") 206 | parser.add_argument("-i", "--input", type=str, required=True) 207 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None) 208 | args = parser.parse_args() 209 | main(args) 210 | -------------------------------------------------------------------------------- /src/models/detrpose/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import math 10 | 11 | import torch 12 | from torch import nn 13 | import torch.nn.functional as F 14 | from torch.nn.init import xavier_uniform_, constant_ 15 | 16 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 17 | # N_, S_, M_, D_ = value.shape 18 | _, D_ , _= value[0].shape 19 | N_, Lq_, M_, L_, P_, _ = sampling_locations.shape 20 | # value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 21 | 22 | sampling_grids = 2 * sampling_locations - 1 23 | sampling_grids = sampling_grids.transpose(1, 2).flatten(0, 1) 24 | 25 | sampling_value_list = [] 26 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 27 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 28 | value_l_ = value[lid_].unflatten(2, (H_, W_)) 29 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 30 | sampling_grid_l_ = sampling_grids[:, :, lid_] 31 | # N_*M_, D_, Lq_, P_ 32 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 33 | mode='bilinear', padding_mode='zeros', align_corners=False) 34 | sampling_value_list.append(sampling_value_l_) 35 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 36 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 37 | output = (torch.concat(sampling_value_list, dim=-1) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 38 | return output.transpose(1, 2)#.contiguous() 39 | 40 | 41 | class MSDeformAttn(nn.Module): 42 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False): 43 | """ 44 | Multi-Scale Deformable Attention Module 45 | :param d_model hidden dimension 46 | :param n_levels number of feature levels 47 | :param n_heads number of attention heads 48 | :param n_points number of sampling points per attention head per feature level 49 | """ 50 | super().__init__() 51 | if d_model % n_heads != 0: 52 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 53 | _d_per_head = d_model // n_heads 54 | 55 | self.d_model = d_model 56 | self.n_levels = n_levels 57 | self.n_heads = n_heads 58 | self.n_points = n_points 59 | 60 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 61 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 62 | # self.value_proj = nn.Linear(d_model, d_model) 63 | # self.output_proj = nn.Linear(d_model, d_model) 64 | 65 | self.use_4D_normalizer = use_4D_normalizer 66 | 67 | self._reset_parameters() 68 | 69 | def _reset_parameters(self): 70 | constant_(self.sampling_offsets.weight.data, 0.) 
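        # Bias init (descriptive comment): give each head a distinct direction on the unit
        # circle and scale the offset magnitude with the point index (cycling 1..4 via
        # `i % 4 + 1`), so heads start by sampling a spread of locations around the
        # reference point; if n_points is not a multiple of 4, the bias is reset to zero below.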
71 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 72 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 73 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 74 | for i in range(self.n_points): 75 | grid_init[:, :, i, :] *= i % 4 + 1 76 | with torch.no_grad(): 77 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 78 | if self.n_points % 4 != 0: 79 | constant_(self.sampling_offsets.bias, 0.) 80 | constant_(self.attention_weights.weight.data, 0.) 81 | constant_(self.attention_weights.bias.data, 0.) 82 | # xavier_uniform_(self.value_proj.weight.data) 83 | # constant_(self.value_proj.bias.data, 0.) 84 | # xavier_uniform_(self.output_proj.weight.data) 85 | # constant_(self.output_proj.bias.data, 0.) 86 | 87 | def forward(self, query, reference_points, value, input_spatial_shapes): 88 | """ 89 | :param query (N, Length_{query}, C) 90 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 91 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 92 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 93 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 94 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 95 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 96 | 97 | :return output (N, Length_{query}, C) 98 | """ 99 | N, Len_q, _ = query.shape 100 | # assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 101 | 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | 106 | # N, Len_q, n_heads, n_levels, n_points, 2 107 | reference_points = torch.transpose(reference_points, 2, 3).flatten(1, 2) 108 | 109 | if reference_points.shape[-1] == 2: 110 | offset_normalizer = torch.tensor(input_spatial_shapes, device=query.device) 111 | offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.n_levels, 1, 2) 112 | sampling_locations = reference_points[:, :, None, :, None, :] \ 113 | + sampling_offsets / offset_normalizer 114 | elif reference_points.shape[-1] == 4: 115 | if self.use_4D_normalizer: 116 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 117 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 118 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5 119 | else: 120 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 121 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 122 | else: 123 | raise ValueError( 124 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 125 | 126 | output = ms_deform_attn_core_pytorch( 127 | value, input_spatial_shapes, sampling_locations, attention_weights) 128 | return output 129 | 
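# Illustrative usage sketch (not part of the original module). It assumes the calling
# convention used in this file: `value` is a list with one tensor per feature level,
# shaped (batch * n_heads, d_model // n_heads, H_l * W_l), and `reference_points` is
# (batch, Len_q, n_levels, 1, 2) in normalized [0, 1] coordinates. The helper name
# `_msdeformattn_shape_check` and the concrete sizes are assumptions for demonstration.
def _msdeformattn_shape_check():
    torch.manual_seed(0)
    bs, len_q, d_model, n_levels, n_heads, n_points = 2, 8, 256, 3, 8, 4
    spatial_shapes = [(32, 32), (16, 16), (8, 8)]
    attn = MSDeformAttn(d_model=d_model, n_levels=n_levels, n_heads=n_heads, n_points=n_points)

    query = torch.rand(bs, len_q, d_model)
    reference_points = torch.rand(bs, len_q, n_levels, 1, 2)
    value = [torch.rand(bs * n_heads, d_model // n_heads, h * w) for h, w in spatial_shapes]

    out = attn(query, reference_points, value, spatial_shapes)
    assert out.shape == (bs, len_q, d_model)
    return out.shape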
-------------------------------------------------------------------------------- /src/solver/engine.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from DETR (https://github.com/facebookresearch/detr/blob/main/engine.py) 9 | Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 10 | """ 11 | 12 | import math 13 | import sys 14 | from typing import Iterable 15 | 16 | import torch 17 | from ..misc import logger as utils 18 | from ..misc import dist_utils 19 | 20 | GIGABYTE = 1024 ** 3 21 | 22 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 23 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 24 | batch_size:int, grad_accum_steps:int, 25 | device: torch.device, epoch: int, max_norm: float = 0, writer=None, 26 | lr_scheduler=None, warmup_scheduler=None, ema=None, args=None): 27 | scaler = torch.amp.GradScaler(str(device), enabled=args.amp) 28 | model.train() 29 | criterion.train() 30 | metric_logger = utils.MetricLogger(delimiter=" ") 31 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 32 | header = 'Epoch: [{}]'.format(epoch) 33 | print_freq = args.print_freq 34 | 35 | sub_batch_size = batch_size // args.grad_accum_steps 36 | 37 | print("Grad accum steps: ", args.grad_accum_steps) 38 | print("Batch size/GPU: ", batch_size) 39 | print("Total batch size: ", batch_size * dist_utils.get_world_size()) 40 | 41 | optimizer.zero_grad() 42 | 43 | 44 | for i, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 45 | samples = samples.to(device) 46 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 47 | 48 | global_step = epoch * len(data_loader) + i 49 | 50 | for j in range(args.grad_accum_steps): 51 | start_idx = j * sub_batch_size 52 | final_idx = start_idx + sub_batch_size 53 | new_samples = samples[start_idx:final_idx] 54 | new_samples = new_samples.to(device) 55 | new_targets = [{k: v.to(device) for k, v in t.items()} for t in targets[start_idx:final_idx]] 56 | 57 | with torch.amp.autocast(str(device), enabled=args.amp): 58 | outputs = model(new_samples, new_targets) 59 | 60 | with torch.amp.autocast(str(device), enabled=False): 61 | loss_dict = criterion(outputs, new_targets) 62 | losses = sum(loss_dict.values()) 63 | 64 | if args.amp: 65 | scaler.scale(losses).backward() 66 | else: 67 | losses.backward() 68 | 69 | # reduce losses over all GPUs for logging purposes 70 | loss_dict_reduced = utils.reduce_dict(loss_dict) 71 | losses_reduced_scaled = sum(loss_dict_reduced.values()) 72 | 73 | loss_value = losses_reduced_scaled.item() 74 | 75 | if not math.isfinite(loss_value): 76 | print("Loss is {}, stopping training".format(loss_value)) 77 | print(loss_dict_reduced) 78 | sys.exit(1) 79 | 80 | 81 | # amp backward function 82 | if args.amp: 83 | if max_norm > 0: 84 | scaler.unscale_(optimizer) 85 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 86 | scaler.step(optimizer) 87 | scaler.update() 88 | else: 89 | # original backward function 90 
| if max_norm > 0: 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 92 | optimizer.step() 93 | 94 | # ema 95 | if ema is not None: 96 | ema.update(model) 97 | 98 | if warmup_scheduler is not None: 99 | warmup_scheduler.step() 100 | 101 | 102 | metric_logger.update(loss=loss_value, **loss_dict_reduced) 103 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 104 | 105 | 106 | if writer and dist_utils.is_main_process() and global_step % 10 == 0: 107 | writer.add_scalar('Loss/total', loss_value, global_step) 108 | for j, pg in enumerate(optimizer.param_groups): 109 | writer.add_scalar(f'Lr/pg_{j}', pg['lr'], global_step) 110 | for k, v in loss_dict_reduced.items(): 111 | writer.add_scalar(f'Loss/{k}', v.item(), global_step) 112 | free, total = torch.cuda.mem_get_info(device) 113 | mem_used_MB = (total - free) / GIGABYTE 114 | writer.add_scalar('Info/memory', mem_used_MB, global_step) 115 | 116 | optimizer.zero_grad() 117 | 118 | # gather the stats from all processes 119 | metric_logger.synchronize_between_processes() 120 | print("Averaged stats:", metric_logger) 121 | return {k: meter.global_avg for k, meter in metric_logger.meters.items() if meter.count > 0} 122 | 123 | 124 | 125 | 126 | @torch.no_grad() 127 | def evaluate(model, postprocessors, coco_evaluator, data_loader, device, writer=None, save_results=False): 128 | model.eval() 129 | if coco_evaluator is not None: 130 | coco_evaluator.cleanup() 131 | 132 | metric_logger = utils.MetricLogger(delimiter=" ") 133 | header = 'Test:' 134 | res_json = [] 135 | 136 | for samples, targets in metric_logger.log_every(data_loader, 10, header): 137 | samples = samples.to(device) 138 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 139 | 140 | outputs = model(samples, targets) 141 | 142 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) 143 | results = postprocessors(outputs, orig_target_sizes) 144 | 145 | res = {target['image_id'].item(): output for target, output in zip(targets, results)} 146 | if coco_evaluator is not None: 147 | coco_evaluator.update(res) 148 | 149 | if save_results: 150 | for k, v in res.items(): 151 | scores = v['scores'] 152 | labels = v['labels'] 153 | keypoints = v['keypoints'] 154 | 155 | for s, l, kpt in zip(scores, labels, keypoints): 156 | res_json.append( 157 | { 158 | "image_id": k, 159 | "category_id": l.item(), 160 | "keypoints": kpt.round(decimals=4).tolist(), 161 | "score": s.item() 162 | } 163 | ) 164 | 165 | # gather the stats from all processes 166 | metric_logger.synchronize_between_processes() 167 | print("Averaged stats:", metric_logger) 168 | if coco_evaluator is not None: 169 | coco_evaluator.synchronize_between_processes() 170 | 171 | if save_results: 172 | return res_json 173 | 174 | # accumulate predictions from all images 175 | if coco_evaluator is not None: 176 | coco_evaluator.accumulate() 177 | coco_evaluator.summarize() 178 | 179 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items() if meter.count > 0} 180 | if coco_evaluator is not None: 181 | stats['coco_eval_keypoints'] = coco_evaluator.coco_eval['keypoints'].stats.tolist() 182 | return stats 183 | -------------------------------------------------------------------------------- /src/data/dataloader.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 
4 | --------------------------------------------------------------------------------- 5 | Modified from D-DEIM (https://github.com/Intellindust-AI-Lab/DEIM/) 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 9 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 10 | --------------------------------------------------------------------------------- 11 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR/) 12 | Copyright (c) 2023 RT-DETR Authors. All Rights Reserved. 13 | """ 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torch.utils.data as data 18 | from torchvision.transforms.functional import resize 19 | import random 20 | 21 | from PIL import Image, ImageDraw 22 | import os 23 | 24 | from copy import deepcopy 25 | 26 | # This only for printing 27 | RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" 28 | 29 | 30 | class DataLoader(data.DataLoader): 31 | def __repr__(self) -> str: 32 | format_string = self.__class__.__name__ + "(" 33 | for n in ['dataset', 'batch_size', 'num_workers', 'drop_last', 'collate_fn']: 34 | format_string += "\n" 35 | format_string += " {0}: {1}".format(n, getattr(self, n)) 36 | format_string += "\n)" 37 | return format_string 38 | 39 | def set_epoch(self, epoch): 40 | self._epoch = epoch 41 | self.dataset.set_epoch(epoch) 42 | self.collate_fn.set_epoch(epoch) 43 | 44 | @property 45 | def epoch(self): 46 | return self._epoch if hasattr(self, '_epoch') else -1 47 | 48 | @property 49 | def shuffle(self): 50 | return self._shuffle 51 | 52 | @shuffle.setter 53 | def shuffle(self, shuffle): 54 | assert isinstance(shuffle, bool), 'shuffle must be a boolean' 55 | self._shuffle = shuffle 56 | 57 | 58 | class BaseCollateFunction(object): 59 | def set_epoch(self, epoch): 60 | self._epoch = epoch 61 | 62 | @property 63 | def epoch(self): 64 | return self._epoch if hasattr(self, '_epoch') else -1 65 | 66 | def __call__(self, items): 67 | raise NotImplementedError('') 68 | 69 | 70 | def generate_scales(base_size, base_size_repeat): 71 | scale_repeat = (base_size - int(base_size * 0.75 / 32) * 32) // 32 72 | scales = [int(base_size * 0.75 / 32) * 32 + i * 32 for i in range(scale_repeat)] 73 | scales += [base_size] * base_size_repeat 74 | scales += [int(base_size * 1.25 / 32) * 32 - i * 32 for i in range(scale_repeat)] 75 | return scales 76 | 77 | 78 | class BatchImageCollateFunction(BaseCollateFunction): 79 | def __init__( 80 | self, 81 | stop_epoch=None, 82 | ema_restart_decay=0.9999, 83 | base_size=640, 84 | base_size_repeat=None, 85 | mixup_prob=0.0, 86 | mixup_epochs=[0, 0], 87 | vis_folder='./vis_dataset/', 88 | vis_save=False 89 | ) -> None: 90 | super().__init__() 91 | self.base_size = base_size 92 | self.scales = generate_scales(base_size, base_size_repeat) if base_size_repeat is not None else None 93 | if self.scales is not None: 94 | self.scales.sort() 95 | print(GREEN + "Multi-scaling uses the following size: " + RESET, self.scales) 96 | self.stop_epoch = stop_epoch if stop_epoch is not None else 100000000 97 | self.ema_restart_decay = ema_restart_decay 98 | 99 | self.mixup_prob = mixup_prob 100 | self.mixup_epochs = mixup_epochs 101 | self.print_info_flag = True 102 | 103 | self.vis_save = vis_save 104 | self.vis_folder = vis_folder 105 | self.vis_image_number = 0 106 | self.max_vis_image_number = 10 107 | 108 | if vis_save: 109 | os.makedirs(self.vis_folder, 
exist_ok=True) 110 | 111 | def __call__(self, items): 112 | images = torch.cat([x[0][None] for x in items], dim=0) 113 | targets = [x[1] for x in items] 114 | 115 | images, targets = self.apply_mixup(images, targets) 116 | 117 | if self.scales is not None and self.epoch < self.stop_epoch: 118 | sz = random.choice(self.scales) 119 | images = resize(images, [sz, sz]) 120 | return images, targets 121 | 122 | def apply_mixup(self, images, targets): 123 | """ 124 | Applies Mixup augmentation to the batch if conditions are met. 125 | 126 | Args: 127 | images (torch.Tensor): Batch of images. 128 | targets (list[dict]): List of target dictionaries corresponding to images. 129 | 130 | Returns: 131 | tuple: Updated images and targets 132 | """ 133 | # Log when Mixup is permanently disabled 134 | if self.epoch == self.mixup_epochs[-1] and self.print_info_flag: 135 | print(f" ### Attention --- Mixup is closed after epoch@ {self.epoch} ###") 136 | self.print_info_flag = False 137 | 138 | # Apply Mixup if within specified epoch range and probability threshold 139 | if random.random() < self.mixup_prob and self.mixup_epochs[0] <= self.epoch < self.mixup_epochs[1]: 140 | # Generate mixup ratio 141 | beta = round(random.uniform(0.45, 0.55), 6) 142 | 143 | # Mix images 144 | images = images.roll(shifts=1, dims=0).mul_(1.0 - beta).add_(images.mul(beta)) 145 | 146 | # Prepare targets for Mixup 147 | shifted_targets = targets[-1:] + targets[:-1] 148 | updated_targets = deepcopy(targets) 149 | 150 | for i in range(len(targets)): 151 | # Combine boxes, labels, and areas from original and shifted targets 152 | updated_targets[i]['boxes'] = torch.cat([targets[i]['boxes'], shifted_targets[i]['boxes']], dim=0) 153 | updated_targets[i]['keypoints'] = torch.cat([targets[i]['keypoints'], shifted_targets[i]['keypoints']], dim=0) 154 | updated_targets[i]['labels'] = torch.cat([targets[i]['labels'], shifted_targets[i]['labels']], dim=0) 155 | updated_targets[i]['area'] = torch.cat([targets[i]['area'], shifted_targets[i]['area']], dim=0) 156 | 157 | # Add mixup ratio to targets 158 | updated_targets[i]['mixup'] = torch.tensor( 159 | [beta] * len(targets[i]['labels']) + [1.0 - beta] * len(shifted_targets[i]['labels']), 160 | dtype=torch.float32 161 | ) 162 | targets = updated_targets 163 | 164 | if self.vis_save and self.vis_image_number < self.max_vis_image_number: 165 | for i in range(len(updated_targets)): 166 | image_tensor = images[i] 167 | image_tensor_uint8 = ((image_tensor - image_tensor.min()) / (image_tensor.max() - image_tensor.min()) * 255).type(torch.uint8) 168 | image_numpy = image_tensor_uint8.numpy().transpose((1, 2, 0)) 169 | pilImage = Image.fromarray(image_numpy) 170 | draw = ImageDraw.Draw(pilImage) 171 | print('mix_vis:', i, 'boxes.len=', len(updated_targets[i]['boxes'])) 172 | for box in updated_targets[i]['boxes']: 173 | draw.rectangle([int(box[0]*640 - (box[2]*640)/2), int(box[1]*640 - (box[3]*640)/2), 174 | int(box[0]*640 + (box[2]*640)/2), int(box[1]*640 + (box[3]*640)/2)], outline=(255,255,0)) 175 | for pose in updated_targets[i]['keypoints']: 176 | num_pose_point = pose.shape[0] // 3 177 | pose_ = pose[:-num_pose_point].reshape(-1, 2) 178 | for p in pose_: 179 | if sum(p) != 0: 180 | draw.circle((p[0]*640, p[1]*640), 4, fill='blue') 181 | 182 | 183 | pilImage.save(self.vis_folder + f"example_{self.vis_image_number}_" + str(i) + "_"+ str(len(updated_targets[i]['boxes'])) +'_out.jpg') 184 | self.vis_image_number += 1 185 | 186 | return images, targets 187 | 
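# ------------------------------------------------------------------------------
# Editor's usage sketch (not part of the original file): a minimal smoke test for
# BatchImageCollateFunction. The epoch setting, tensor shapes and target layout
# below are illustrative assumptions, not the repo's training configuration.
# With base_size=640 and base_size_repeat=3, generate_scales() yields
# [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] after sorting,
# so each batch is resized to a randomly chosen square size from that list.
if __name__ == "__main__":
    collate = BatchImageCollateFunction(base_size=640, base_size_repeat=3, mixup_prob=0.0)
    collate.set_epoch(0)
    items = [
        (
            torch.rand(3, 640, 640),  # image tensor
            {   # 2 person instances; 17 keypoints stored as 2K coords followed by K visibilities
                "boxes": torch.rand(2, 4),
                "keypoints": torch.rand(2, 17 * 3),
                "labels": torch.zeros(2, dtype=torch.long),
                "area": torch.rand(2),
            },
        )
        for _ in range(4)
    ]
    images, targets = collate(items)
    print(images.shape, len(targets))  # e.g. torch.Size([4, 3, 608, 608]) 4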
-------------------------------------------------------------------------------- /src/misc/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import datetime 4 | from collections import defaultdict, deque 5 | from typing import Dict 6 | 7 | import torch 8 | import torch.distributed as dist 9 | 10 | from .dist_utils import is_dist_avail_and_initialized, get_world_size 11 | 12 | 13 | class SmoothedValue(object): 14 | """Track a series of values and provide access to smoothed values over a 15 | window or the global series average. 16 | """ 17 | 18 | def __init__(self, window_size=20, fmt=None): 19 | if fmt is None: 20 | fmt = "{median:.4f} ({global_avg:.4f})" 21 | self.deque = deque(maxlen=window_size) 22 | self.total = 0.0 23 | self.count = 0 24 | self.fmt = fmt 25 | 26 | def update(self, value, n=1): 27 | self.deque.append(value) 28 | self.count += n 29 | self.total += value * n 30 | 31 | def synchronize_between_processes(self): 32 | """ 33 | Warning: does not synchronize the deque! 34 | """ 35 | if not is_dist_avail_and_initialized(): 36 | return 37 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 38 | dist.barrier() 39 | dist.all_reduce(t) 40 | t = t.tolist() 41 | self.count = int(t[0]) 42 | self.total = t[1] 43 | 44 | @property 45 | def median(self): 46 | d = torch.tensor(list(self.deque)) 47 | if d.shape[0] == 0: 48 | return 0 49 | return d.median().item() 50 | 51 | @property 52 | def avg(self): 53 | d = torch.tensor(list(self.deque), dtype=torch.float32) 54 | return d.mean().item() 55 | 56 | @property 57 | def global_avg(self): 58 | return self.total / self.count 59 | 60 | @property 61 | def max(self): 62 | return max(self.deque) 63 | 64 | @property 65 | def value(self): 66 | return self.deque[-1] 67 | 68 | def __str__(self): 69 | return self.fmt.format( 70 | median=self.median, 71 | avg=self.avg, 72 | global_avg=self.global_avg, 73 | max=self.max, 74 | value=self.value) 75 | 76 | 77 | def all_gather(data): 78 | """ 79 | Run all_gather on arbitrary picklable data (not necessarily tensors) 80 | Args: 81 | data: any picklable object 82 | Returns: 83 | list[data]: list of data gathered from each rank 84 | """ 85 | world_size = get_world_size() 86 | if world_size == 1: 87 | return [data] 88 | 89 | # serialized to a Tensor 90 | buffer = pickle.dumps(data) 91 | storage = torch.ByteStorage.from_buffer(buffer) 92 | tensor = torch.ByteTensor(storage).to("cuda") 93 | 94 | # obtain Tensor size of each rank 95 | local_size = torch.tensor([tensor.numel()], device="cuda") 96 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 97 | dist.all_gather(size_list, local_size) 98 | size_list = [int(size.item()) for size in size_list] 99 | max_size = max(size_list) 100 | 101 | # receiving Tensor from all ranks 102 | # we pad the tensor because torch all_gather does not support 103 | # gathering tensors of different shapes 104 | tensor_list = [] 105 | for _ in size_list: 106 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 107 | if local_size != max_size: 108 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 109 | tensor = torch.cat((tensor, padding), dim=0) 110 | dist.all_gather(tensor_list, tensor) 111 | 112 | data_list = [] 113 | for size, tensor in zip(size_list, tensor_list): 114 | buffer = tensor.cpu().numpy().tobytes()[:size] 115 | data_list.append(pickle.loads(buffer)) 116 | 117 | return data_list 
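# Editor's note (usage sketch, not part of the original file): `all_gather` accepts
# arbitrary picklable objects, e.g. per-rank lists of COCO-style result dicts. It
# pickles them into byte tensors, pads every tensor to the largest rank's size
# (torch's all_gather requires equal shapes) and unpickles on every rank; with a
# single process it simply returns [data]. A hypothetical merge step:
#
#     merged = []
#     for per_rank in all_gather(local_results):
#         merged.extend(per_rank)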
118 | 119 | 120 | def reduce_dict(input_dict, average=True): 121 | """ 122 | Args: 123 | input_dict (dict): all the values will be reduced 124 | average (bool): whether to do average or sum 125 | Reduce the values in the dictionary from all processes so that all processes 126 | have the averaged results. Returns a dict with the same fields as 127 | input_dict, after reduction. 128 | """ 129 | world_size = get_world_size() 130 | if world_size < 2: 131 | return input_dict 132 | with torch.no_grad(): 133 | names = [] 134 | values = [] 135 | # sort the keys so that they are consistent across processes 136 | for k in sorted(input_dict.keys()): 137 | names.append(k) 138 | values.append(input_dict[k]) 139 | values = torch.stack(values, dim=0) 140 | dist.all_reduce(values) 141 | if average: 142 | values /= world_size 143 | reduced_dict = {k: v for k, v in zip(names, values)} 144 | return reduced_dict 145 | 146 | 147 | class MetricLogger(object): 148 | def __init__(self, delimiter="\t"): 149 | self.meters = defaultdict(SmoothedValue) 150 | self.delimiter = delimiter 151 | 152 | def update(self, **kwargs): 153 | for k, v in kwargs.items(): 154 | if isinstance(v, torch.Tensor): 155 | v = v.item() 156 | assert isinstance(v, (float, int)) 157 | self.meters[k].update(v) 158 | 159 | def __getattr__(self, attr): 160 | if attr in self.meters: 161 | return self.meters[attr] 162 | if attr in self.__dict__: 163 | return self.__dict__[attr] 164 | raise AttributeError("'{}' object has no attribute '{}'".format( 165 | type(self).__name__, attr)) 166 | 167 | def __str__(self): 168 | loss_str = [] 169 | for name, meter in self.meters.items(): 170 | if meter.count > 0: 171 | loss_str.append( 172 | "{}: {}".format(name, str(meter)) 173 | ) 174 | return self.delimiter.join(loss_str) 175 | 176 | def synchronize_between_processes(self): 177 | for meter in self.meters.values(): 178 | meter.synchronize_between_processes() 179 | 180 | def add_meter(self, name, meter): 181 | self.meters[name] = meter 182 | 183 | def log_every(self, iterable, print_freq, header=None, logger=None): 184 | if logger is None: 185 | print_func = print 186 | else: 187 | print_func = logger.info 188 | 189 | i = 0 190 | if not header: 191 | header = '' 192 | start_time = time.time() 193 | end = time.time() 194 | iter_time = SmoothedValue(fmt='{avg:.4f}') 195 | data_time = SmoothedValue(fmt='{avg:.4f}') 196 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 197 | if torch.cuda.is_available(): 198 | log_msg = self.delimiter.join([ 199 | header, 200 | '[{0' + space_fmt + '}/{1}]', 201 | 'eta: {eta}', 202 | '{meters}', 203 | 'time: {time}', 204 | 'data: {data}', 205 | 'max mem: {memory:.0f}' 206 | ]) 207 | else: 208 | log_msg = self.delimiter.join([ 209 | header, 210 | '[{0' + space_fmt + '}/{1}]', 211 | 'eta: {eta}', 212 | '{meters}', 213 | 'time: {time}', 214 | 'data: {data}' 215 | ]) 216 | MB = 1024.0 * 1024.0 217 | for obj in iterable: 218 | data_time.update(time.time() - end) 219 | yield obj 220 | iter_time.update(time.time() - end) 221 | if i % print_freq == 0 or i == len(iterable) - 1: 222 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 223 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 224 | if torch.cuda.is_available(): 225 | print_func(log_msg.format( 226 | i, len(iterable), eta=eta_string, 227 | meters=str(self), 228 | time=str(iter_time), data=str(data_time), 229 | memory=torch.cuda.max_memory_allocated() / MB)) 230 | else: 231 | print_func(log_msg.format( 232 | i, len(iterable), eta=eta_string, 233 | 
meters=str(self), 234 | time=str(iter_time), data=str(data_time))) 235 | i += 1 236 | end = time.time() 237 | total_time = time.time() - start_time 238 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 239 | print_func('{} Total time: {} ({:.4f} s / it)'.format( 240 | header, total_time_str, total_time / len(iterable))) 241 | -------------------------------------------------------------------------------- /tools/benchmark/trt_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 3 | """ 4 | 5 | import tensorrt as trt 6 | import pycuda.driver as cuda 7 | from utils import TimeProfiler 8 | import numpy as np 9 | import os 10 | import time 11 | import torch 12 | 13 | from collections import namedtuple, OrderedDict 14 | import glob 15 | import argparse 16 | from dataset import Dataset 17 | from tqdm import tqdm 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser(description='Argument Parser Example') 22 | parser.add_argument('--infer_dir', 23 | type=str, 24 | default='./data/COCO2017/val2017', 25 | help="Directory for images to perform inference on.") 26 | parser.add_argument("--engine_dir", 27 | type=str, 28 | default='trt_engines', 29 | help="Directory containing model engine files.") 30 | parser.add_argument('--busy', 31 | action='store_true', 32 | help="Flag to indicate that other processes may be running.") 33 | args = parser.parse_args() 34 | return args 35 | 36 | class TRTInference(object): 37 | def __init__(self, engine_path, device='cuda', backend='torch', max_batch_size=32, verbose=False): 38 | self.engine_path = engine_path 39 | self.device = device 40 | self.backend = backend 41 | self.max_batch_size = max_batch_size 42 | 43 | self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) 44 | self.engine = self.load_engine(engine_path) 45 | self.context = self.engine.create_execution_context() 46 | self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) 47 | self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) 48 | self.input_names = self.get_input_names() 49 | self.output_names = self.get_output_names() 50 | 51 | if self.backend == 'cuda': 52 | self.stream = cuda.Stream() 53 | self.time_profile = TimeProfiler() 54 | self.time_profile_dataset = TimeProfiler() 55 | self.yolo = 'yolo' in engine_path 56 | 57 | def init(self): 58 | self.dynamic = False 59 | 60 | def load_engine(self, path): 61 | trt.init_libnvinfer_plugins(self.logger, '') 62 | with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: 63 | return runtime.deserialize_cuda_engine(f.read()) 64 | 65 | def get_input_names(self): 66 | names = [] 67 | for _, name in enumerate(self.engine): 68 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 69 | names.append(name) 70 | return names 71 | 72 | def get_output_names(self): 73 | names = [] 74 | for _, name in enumerate(self.engine): 75 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: 76 | names.append(name) 77 | return names 78 | 79 | def get_bindings(self, engine, context, max_batch_size=32, device=None): 80 | Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) 81 | bindings = OrderedDict() 82 | for i, name in enumerate(engine): 83 | shape = engine.get_tensor_shape(name) 84 | dtype = trt.nptype(engine.get_tensor_dtype(name)) 85 | 86 | if shape[0] == -1: 87 | dynamic = True 88 | 
shape[0] = max_batch_size 89 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 90 | context.set_input_shape(name, shape) 91 | 92 | if self.backend == 'cuda': 93 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 94 | data = np.random.randn(*shape).astype(dtype) 95 | ptr = cuda.mem_alloc(data.nbytes) 96 | bindings[name] = Binding(name, dtype, shape, data, ptr) 97 | else: 98 | data = cuda.pagelocked_empty(trt.volume(shape), dtype) 99 | ptr = cuda.mem_alloc(data.nbytes) 100 | bindings[name] = Binding(name, dtype, shape, data, ptr) 101 | else: 102 | data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) 103 | bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) 104 | return bindings 105 | 106 | def run_torch(self, blob): 107 | for n in self.input_names: 108 | if self.bindings[n].shape != blob[n].shape: 109 | self.context.set_input_shape(n, blob[n].shape) 110 | self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) 111 | 112 | self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) 113 | self.context.execute_v2(list(self.bindings_addr.values())) 114 | outputs = {n: self.bindings[n].data for n in self.output_names} 115 | return outputs 116 | 117 | def async_run_cuda(self, blob): 118 | for n in self.input_names: 119 | cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream) 120 | 121 | bindings_addr = [int(v) for _, v in self.bindings_addr.items()] 122 | self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle) 123 | 124 | outputs = {} 125 | for n in self.output_names: 126 | cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream) 127 | outputs[n] = self.bindings[n].data 128 | 129 | self.stream.synchronize() 130 | 131 | return outputs 132 | 133 | def __call__(self, blob): 134 | if self.backend == 'torch': 135 | return self.run_torch(blob) 136 | elif self.backend == 'cuda': 137 | return self.async_run_cuda(blob) 138 | 139 | def synchronize(self): 140 | if self.backend == 'torch' and torch.cuda.is_available(): 141 | torch.cuda.synchronize() 142 | elif self.backend == 'cuda': 143 | self.stream.synchronize() 144 | 145 | def warmup(self, blob, n): 146 | for _ in range(n): 147 | _ = self(blob) 148 | 149 | def speed(self, blob, n, nonempty_process=False): 150 | times = [] 151 | self.time_profile_dataset.reset() 152 | for i in tqdm(range(n), desc="Running Inference", unit="iteration"): 153 | self.time_profile.reset() 154 | with self.time_profile_dataset: 155 | img = blob[i] 156 | if img['images'] is not None: 157 | img['image'] = img['input'] = img['images'].unsqueeze(0) 158 | else: 159 | img['images'] = img['input'] = img['image'].unsqueeze(0) 160 | with self.time_profile: 161 | _ = self(img) 162 | times.append(self.time_profile.total) 163 | 164 | # end-to-end model only 165 | if not self.yolo: 166 | print('end-to-end') 167 | times = sorted(times) 168 | if len(times) > 100 and nonempty_process: 169 | times = times[:100] 170 | 171 | avg_time = sum(times) / len(times) # Calculate the average of the remaining times 172 | return avg_time 173 | 174 | def main(): 175 | FLAGS = parse_args() 176 | dataset = Dataset(FLAGS.infer_dir) 177 | im = torch.ones(1, 3, 640, 640).cuda() 178 | blob = { 179 | 'image': im, 180 | 'images': im, 181 | 'input': im, 182 | 'im_shape': torch.tensor([640, 640]).to(im.device), 183 | 'scale_factor': torch.tensor([1, 1]).to(im.device), 184 | 'orig_target_sizes': torch.tensor([[640, 640]]).to(im.device), 185 | } 186 | 187 | engine_files = 
glob.glob(os.path.join(FLAGS.engine_dir, "*.engine")) 188 | results = [] 189 | 190 | for engine_file in engine_files: 191 | print(f"Testing engine: {engine_file}") 192 | model = TRTInference(engine_file, max_batch_size=1, verbose=False) 193 | model.init() 194 | model.warmup(blob, 400) 195 | t = [] 196 | for _ in range(1): 197 | t.append(model.speed(dataset, 1000, FLAGS.busy)) 198 | avg_latency = 1000 * torch.tensor(t).mean() 199 | results.append((engine_file, avg_latency)) 200 | print(f"Engine: {engine_file}, Latency: {avg_latency:.2f} ms") 201 | 202 | del model 203 | torch.cuda.empty_cache() 204 | time.sleep(1) 205 | 206 | sorted_results = sorted(results, key=lambda x: x[1]) 207 | for engine_file, latency in sorted_results: 208 | print(f"Engine: {engine_file}, Latency: {latency:.2f} ms") 209 | 210 | if __name__ == '__main__': 211 | main() 212 | -------------------------------------------------------------------------------- /tools/inference/trt_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 3 | """ 4 | 5 | import os 6 | import time 7 | import glob 8 | import collections 9 | import contextlib 10 | from collections import OrderedDict 11 | 12 | import cv2 # Added for video processing 13 | import numpy as np 14 | import tensorrt as trt 15 | import torch 16 | import torchvision.transforms as T 17 | 18 | from PIL import Image, ImageDraw 19 | from copy import deepcopy 20 | from annotator import Annotator 21 | from annotator_crowdpose import AnnotatorCrowdpose 22 | 23 | annotators = {'COCO': Annotator, 'CrowdPose': AnnotatorCrowdpose} 24 | 25 | 26 | class TimeProfiler(contextlib.ContextDecorator): 27 | def __init__(self): 28 | self.total = 0 29 | 30 | def __enter__(self): 31 | self.start = self.time() 32 | return self 33 | 34 | def __exit__(self, type, value, traceback): 35 | self.total += self.time() - self.start 36 | 37 | def reset(self): 38 | self.total = 0 39 | 40 | def time(self): 41 | if torch.cuda.is_available(): 42 | torch.cuda.synchronize() 43 | return time.time() 44 | 45 | 46 | class TRTInference(object): 47 | def __init__( 48 | self, engine_path, device="cuda:0", backend="torch", max_batch_size=32, verbose=False 49 | ): 50 | self.engine_path = engine_path 51 | self.device = device 52 | self.backend = backend 53 | self.max_batch_size = max_batch_size 54 | 55 | self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) 56 | 57 | self.engine = self.load_engine(engine_path) 58 | self.context = self.engine.create_execution_context() 59 | self.bindings = self.get_bindings( 60 | self.engine, self.context, self.max_batch_size, self.device 61 | ) 62 | self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) 63 | self.input_names = self.get_input_names() 64 | self.output_names = self.get_output_names() 65 | self.time_profile = TimeProfiler() 66 | 67 | def load_engine(self, path): 68 | trt.init_libnvinfer_plugins(self.logger, "") 69 | with open(path, "rb") as f, trt.Runtime(self.logger) as runtime: 70 | return runtime.deserialize_cuda_engine(f.read()) 71 | 72 | def get_input_names(self): 73 | names = [] 74 | for _, name in enumerate(self.engine): 75 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 76 | names.append(name) 77 | return names 78 | 79 | def get_output_names(self): 80 | names = [] 81 | for _, name in enumerate(self.engine): 82 | if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: 83 | 
names.append(name) 84 | return names 85 | 86 | def get_bindings(self, engine, context, max_batch_size=32, device=None) -> OrderedDict: 87 | Binding = collections.namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) 88 | bindings = OrderedDict() 89 | 90 | for i, name in enumerate(engine): 91 | shape = engine.get_tensor_shape(name) 92 | dtype = trt.nptype(engine.get_tensor_dtype(name)) 93 | 94 | if shape[0] == -1: 95 | shape[0] = max_batch_size 96 | if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: 97 | context.set_input_shape(name, shape) 98 | 99 | data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) 100 | bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) 101 | 102 | return bindings 103 | 104 | def run_torch(self, blob): 105 | for n in self.input_names: 106 | if blob[n].dtype is not self.bindings[n].data.dtype: 107 | blob[n] = blob[n].to(dtype=self.bindings[n].data.dtype) 108 | if self.bindings[n].shape != blob[n].shape: 109 | self.context.set_input_shape(n, blob[n].shape) 110 | self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) 111 | 112 | assert self.bindings[n].data.dtype == blob[n].dtype, "{} dtype mismatch".format(n) 113 | 114 | self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) 115 | self.context.execute_v2(list(self.bindings_addr.values())) 116 | outputs = {n: self.bindings[n].data for n in self.output_names} 117 | 118 | return outputs 119 | 120 | def __call__(self, blob): 121 | if self.backend == "torch": 122 | return self.run_torch(blob) 123 | else: 124 | raise NotImplementedError("Only 'torch' backend is implemented.") 125 | 126 | def synchronize(self): 127 | if self.backend == "torch" and torch.cuda.is_available(): 128 | torch.cuda.synchronize() 129 | 130 | def process_image(m, file_path, device): 131 | im_pil = Image.open(file_path).convert("RGB") 132 | w, h = im_pil.size 133 | orig_size = torch.tensor([w, h])[None].to(device) 134 | 135 | transforms = T.Compose( 136 | [ 137 | T.Resize((640, 640)), 138 | T.ToTensor(), 139 | ] 140 | ) 141 | im_data = transforms(im_pil)[None] 142 | annotator = annotators[annotator_type](deepcopy(im_pil)) 143 | 144 | blob = { 145 | "images": im_data.to(device), 146 | "orig_target_sizes": orig_size.to(device), 147 | } 148 | 149 | output = m(blob) 150 | 151 | scores, labels, keypoints = output.values() 152 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 153 | for kpt, score in zip(keypoints, scores): 154 | if score > thrh: 155 | annotator.kpts( 156 | kpt, 157 | [h, w] 158 | ) 159 | annotator.save(f"{OUTPUT_NAME}.jpg") 160 | 161 | def process_video(m, file_path, device): 162 | cap = cv2.VideoCapture(file_path) 163 | 164 | # Get video properties 165 | fps = cap.get(cv2.CAP_PROP_FPS) 166 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 167 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 168 | 169 | # Define the codec and create VideoWriter object 170 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 171 | out = cv2.VideoWriter(f"{OUTPUT_NAME}.mp4", fourcc, fps, (orig_w, orig_h)) 172 | 173 | transforms = T.Compose( 174 | [ 175 | T.Resize((640, 640)), 176 | T.ToTensor(), 177 | ] 178 | ) 179 | 180 | frame_count = 0 181 | print("Processing video frames...") 182 | while cap.isOpened(): 183 | ret, frame = cap.read() 184 | if not ret: 185 | break 186 | 187 | # Convert frame to PIL image 188 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 189 | 190 | w, h = frame_pil.size 191 | orig_size = torch.tensor([w, h], device=device)[None] 192 | 
annotator = annotators[annotator_type](deepcopy(frame_pil)) 193 | 194 | im_data = transforms(frame_pil)[None] 195 | 196 | blob = { 197 | "images": im_data.to(device), 198 | "orig_target_sizes": orig_size, 199 | } 200 | 201 | output = m(blob) 202 | 203 | scores, labels, keypoints = output.values() 204 | scores, labels, keypoints = scores[0], labels[0], keypoints[0] 205 | for kpt, score in zip(keypoints, scores): 206 | if score > thrh: 207 | annotator.kpts( 208 | kpt, 209 | [h, w] 210 | ) 211 | 212 | # Convert back to OpenCV image 213 | frame = annotator.result() 214 | 215 | # Write the frame 216 | out.write(frame) 217 | frame_count += 1 218 | 219 | if frame_count % 100 == 0: 220 | print(f"Processed {frame_count} frames...") 221 | 222 | cap.release() 223 | out.release() 224 | print(f"Video processing complete. Result saved as '{OUTPUT_NAME}.mp4'.") 225 | 226 | def process_file(m, file_path, device): 227 | # Check if the input file is an image or a vide 228 | if os.path.splitext(file_path)[-1].lower() in [".jpg", ".jpeg", ".png", ".bmp"]: 229 | # Process as image 230 | process_image(m, file_path, device) 231 | else: 232 | # Process as video 233 | process_video(m, file_path, device) 234 | 235 | if __name__ == "__main__": 236 | import argparse 237 | 238 | parser = argparse.ArgumentParser() 239 | parser.add_argument("-trt", "--trt", type=str, required=True) 240 | parser.add_argument("--annotator", type=str, required=True, help="Annotator type: COCO or CrowdPose.") 241 | parser.add_argument("-i", "--input", type=str, required=True) 242 | parser.add_argument("-d", "--device", type=str, default="cuda:0") 243 | parser.add_argument("-t", "--thrh", type=float, required=False, default=None) 244 | 245 | args = parser.parse_args() 246 | 247 | assert args.annotator.lower() in ['coco', 'crowdpose'] 248 | 249 | # Global variable 250 | global OUTPUT_NAME, thrh, annotator_type 251 | thrh = 0.5 if args.thrh is None else args.thrh 252 | 253 | annotator_name = args.annotator.lower() 254 | if annotator_name == 'coco': 255 | annotator_type = 'COCO' 256 | elif annotator_name == 'crowdpose': 257 | annotator_type = 'CrowdPose' 258 | 259 | m = TRTInference(args.trt, device=args.device) 260 | 261 | # Check if the input argumnet is a file or a folder 262 | file_path = args.input 263 | if os.path.isdir(file_path): 264 | # Process a folder 265 | folder_dir = args.input 266 | if folder_dir[-1] == '/': 267 | folder_dir = folder_dir[:-1] 268 | output_dir = f"{folder_dir}/output" 269 | os.makedirs(output_dir, exist_ok=True) 270 | paths = list(glob.iglob(f"{folder_dir}/*.*")) 271 | for file_path in paths: 272 | OUTPUT_NAME = file_path.replace(f'{folder_dir}/', f'{output_dir}/').split('.')[0] 273 | OUTPUT_NAME = f"{OUTPUT_NAME}_{annotator_type}" 274 | process_file(m, file_path, args.device) 275 | else: 276 | # Process a file 277 | OUTPUT_NAME = f'trt_results_{annotator_type}' 278 | process_file(m, file_path, args.device) -------------------------------------------------------------------------------- /src/models/detrpose/dn_component.py: -------------------------------------------------------------------------------- 1 | """ 2 | DETRPose: Real-time end-to-end transformer model for multi-person pose estimation 3 | Copyright (c) 2025 The DETRPose Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from DINO (https://github.com/IDEA-Research/DINO/) 6 | Copyright (c) 2022 IDEA. All Rights Reserved. 
7 | ---------------------------------------------------------------------------------
8 | Modified from DN-DETR (https://github.com/IDEA-Research/DN-DETR/)
9 | Copyright (c) 2022 IDEA. All Rights Reserved.
10 | """
11 | 
12 | 
13 | import torch
14 | from .utils import inverse_sigmoid
15 | import torch.nn.functional as F
16 | import numpy as np
17 | 
18 | def get_sigmas(num_keypoints, device):
19 |     if num_keypoints == 17:
20 |         sigmas = np.array([
21 |             .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
22 |             1.07, .87, .87, .89, .89
23 |         ], dtype=np.float32) / 10.0
24 |     elif num_keypoints == 14:
25 |         sigmas = np.array([
26 |             .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89,
27 |             .79, .79
28 |         ]) / 10.0
29 |     elif num_keypoints == 3:
30 |         sigmas = np.array([
31 |             1.07, 1.07, 0.67
32 |         ]) / 10.0
33 |     else:
34 |         raise ValueError(f'Unsupported keypoints number {num_keypoints}')
35 |     sigmas = np.concatenate([[0.1], sigmas])  # for the center of the human
36 |     sigmas = torch.tensor(sigmas, device=device, dtype=torch.float32)
37 |     return sigmas[None, :, None]
38 | 
39 | 
40 | def prepare_for_cdn(dn_args, training, num_queries, num_classes, num_keypoints, hidden_dim, label_enc, pose_enc, img_dim, device):
41 |     """
42 |     A major difference of DINO from DN-DETR is that DINO processes the pattern embedding in its detector's
43 |     forward function and uses a learnable tgt embedding, so this function is changed slightly.
44 |     :param dn_args: tuple of (targets, dn_number, label_noise_ratio)
45 |     :param training: whether the model is in training mode (denoising queries are only built during training)
46 |     :param num_queries: number of matching queries
47 |     :param num_classes: number of classes
48 |     :param num_keypoints: number of keypoints per instance
49 |     :param hidden_dim: transformer hidden dim
50 |     :param label_enc: embedding used to encode (noised) class labels in dn
51 |     :param pose_enc: embedding used for the per-keypoint query content
52 |     :param img_dim: input image size, used to scale normalized areas and coordinates
53 |     :param device: device on which the denoising tensors are created
54 |     :return: input_query_label, input_query_pose, attn_mask, dn_meta
55 |     """
56 |     if training:
57 |         targets, dn_number, label_noise_ratio = dn_args
58 |         # positive and negative dn queries
59 |         dn_number = dn_number * 2
60 |         known = [(torch.ones_like(t['labels'])) for t in targets]
61 |         batch_size = len(known)
62 |         known_num = [sum(k) for k in known]
63 | 
64 |         if int(max(known_num)) == 0:
65 |             return None, None, None, None
66 | 
67 |         dn_number = dn_number // (int(max(known_num) * 2))
68 |         dn_number = 1 if dn_number == 0 else dn_number
69 | 
70 |         unmask_bbox = unmask_label = torch.cat(known)
71 | 
72 |         # instance label denoise
73 |         labels = torch.cat([t['labels'] for t in targets])
74 |         batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])
75 | 
76 |         known_indice = torch.nonzero(unmask_label + unmask_bbox)
77 |         known_indice = known_indice.view(-1)
78 |         known_indice = known_indice.repeat(2 * dn_number, 1).view(-1)
79 | 
80 |         known_labels = labels.repeat(2 * dn_number, 1).view(-1)
81 |         known_labels_expaned = known_labels.clone()
82 | 
83 |         known_labels_poses_expaned = torch.arange(num_keypoints, dtype=torch.long, device=device)
84 |         known_labels_poses_expaned = known_labels_poses_expaned[None].repeat(len(known_labels), 1)
85 | 
86 |         known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1)
87 | 
88 |         if label_noise_ratio > 0:
89 |             p = torch.rand_like(known_labels_expaned.float())
90 |             chosen_indice = torch.nonzero(p < (label_noise_ratio * 0.5)).view(-1)  # half of bbox prob
91 |             new_label = torch.randint_like(chosen_indice, 0, num_classes)  # randomly put a new one here
92 |             known_labels_expaned.scatter_(0, chosen_indice, new_label)
93 | 
94 |             # weights = torch.ones((len(chosen_indice), num_keypoints), device=p.device)
95 |             # new_label_pose = torch.multinomial(weights, num_samples=num_keypoints, replacement=False)
96 |             # known_labels_poses_expaned.scatter_(0, chosen_indice.unsqueeze(-1).repeat(1, num_keypoints), new_label_pose)
97 | 
98 |         # keypoint noise
99 |         boxes = torch.cat([t['boxes'] for t in targets])
100 |         xy = (boxes[:, :2] + boxes[:, 2:]) / 2.
101 |         keypoints = torch.cat([t['keypoints'] for t in targets])
102 |         if 'area' in targets[0]:
103 |             areas = torch.cat([t['area'] for t in targets])
104 |         else:
105 |             areas = boxes[:, 2] * boxes[:, 3] * 0.53
106 |         poses = keypoints[:, 0:(num_keypoints * 2)]
107 |         poses = torch.cat([xy, poses], dim=1)
108 |         non_viz = keypoints[:, (num_keypoints * 2):] == 0
109 |         non_viz = torch.cat((torch.ones_like(non_viz[:, 0:1]).bool(), non_viz), dim=1)
110 |         vars = (2 * get_sigmas(num_keypoints, device)) ** 2
111 | 
112 | 
113 |         known_poses = poses.repeat(2 * dn_number, 1).reshape(-1, num_keypoints+1, 2)
114 |         known_areas = areas.repeat(2 * dn_number)[..., None, None]  # normalized [0, 1]
115 |         known_areas = known_areas * img_dim[0] * img_dim[1]  # scaled [0, h*w]
116 |         known_non_viz = non_viz.repeat(2 * dn_number, 1)
117 | 
118 |         single_pad = int(max(known_num))
119 |         pad_size = int(single_pad * 2 * dn_number)
120 |         positive_idx = torch.tensor(range(len(poses))).long().cuda().unsqueeze(0).repeat(dn_number, 1)
121 |         positive_idx += (torch.tensor(range(dn_number)) * len(poses) * 2).long().cuda().unsqueeze(1)
122 |         positive_idx = positive_idx.flatten()
123 |         negative_idx = positive_idx + len(poses)
124 | 
125 |         eps = np.finfo('float32').eps
126 |         rand_vector = torch.rand_like(known_poses)
127 |         rand_vector = F.normalize(rand_vector, dim=-1)  # ||rand_vector|| = 1 per keypoint; note F.normalize's second positional arg is p, so dim must be passed by keyword
128 |         rand_alpha = torch.zeros_like(known_poses[..., :1]).uniform_(-np.log(1), -np.log(0.5))
129 |         rand_alpha[negative_idx] = rand_alpha[negative_idx].uniform_(-np.log(0.5), -np.log(0.1))
130 |         rand_alpha *= 2 * (known_areas + eps) * vars  ## This is distance **2
131 |         rand_alpha = torch.sqrt(rand_alpha) / max(img_dim)
132 |         # rand_alpha = rand_alpha ** 1.25  ## This is distance
133 |         rand_alpha[known_non_viz] = 0.
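        # Editor's note: the block above appears to sample the denoising offset so that
        # its per-keypoint OKS term exp(-d^2 / (2 * area * (2*sigma)^2)) stays in
        # [0.5, 1.0] for positive queries and in [0.1, 0.5] for negative ones; the sqrt
        # converts the squared pixel distance back to a radius, which is divided by
        # max(img_dim) to live in normalized coordinates, and non-visible keypoints get
        # zero noise. Rough numbers (illustrative): for a person of area 100x100 px in a
        # 640x640 image, the nose (sigma = 0.026) moves by at most ~6 px for a positive
        # query and by up to ~11 px for a negative one.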
130 | 131 | known_poses_expand = known_poses + rand_alpha * rand_vector 132 | 133 | m = known_labels_expaned.long().to(device) 134 | input_label_embed = label_enc(m) 135 | # input_label_pose_embed = pose_enc(known_labels_poses_expaned) 136 | input_label_pose_embed = pose_enc.weight[None].repeat(known_poses_expand.size(0), 1, 1) 137 | input_label_embed = torch.cat([input_label_embed.unsqueeze(1), input_label_pose_embed], dim=1) 138 | input_label_embed = input_label_embed.flatten(1) 139 | 140 | input_pose_embed = inverse_sigmoid(known_poses_expand) 141 | 142 | padding_label = torch.zeros(pad_size, hidden_dim * (num_keypoints + 1)).cuda() 143 | padding_pose = torch.zeros(pad_size, num_keypoints+1).cuda() 144 | 145 | input_query_label = padding_label.repeat(batch_size, 1, 1) 146 | input_query_pose = padding_pose[...,None].repeat(batch_size, 1, 1, 2) 147 | 148 | map_known_indice = torch.tensor([], device=device) 149 | if len(known_num): 150 | map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] 151 | map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long() 152 | if len(known_bid): 153 | input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed 154 | input_query_pose[(known_bid.long(), map_known_indice)] = input_pose_embed 155 | 156 | tgt_size = pad_size + num_queries 157 | attn_mask = torch.ones(tgt_size, tgt_size, device=device) < 0 158 | # match query cannot see the reconstruct 159 | attn_mask[pad_size:, :pad_size] = True 160 | # reconstruct cannot see each other 161 | for i in range(dn_number): 162 | if i == 0: 163 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True 164 | if i == dn_number - 1: 165 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True 166 | else: 167 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True 168 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True 169 | # import matplotlib.pyplot as plt 170 | # plt.imshow(~attn_mask.detach().cpu().numpy(), cmap='gray') 171 | # plt.show() 172 | 173 | dn_meta = { 174 | 'pad_size': pad_size, 175 | 'num_dn_group': dn_number, 176 | } 177 | else: 178 | 179 | input_query_label = None 180 | input_query_bbox = None 181 | attn_mask = None 182 | dn_meta = None 183 | 184 | return input_query_label.unflatten(-1, (-1, hidden_dim)), input_query_pose, attn_mask, dn_meta 185 | 186 | 187 | def dn_post_process(outputs_class, outputs_keypoints, dn_meta, aux_loss, _set_aux_loss): 188 | """ 189 | post process of dn after output from the transformer 190 | put the dn part in the dn_meta 191 | """ 192 | if dn_meta and dn_meta['pad_size'] > 0: 193 | output_known_class = outputs_class[:, :, :dn_meta['pad_size'], :] 194 | output_known_keypoints = outputs_keypoints[:, :, :dn_meta['pad_size'], :] 195 | outputs_class = outputs_class[:, :, dn_meta['pad_size']:, :] 196 | outputs_keypoints = outputs_keypoints[:, :, dn_meta['pad_size']:, :] 197 | out = {'pred_logits': output_known_class[-1], 'pred_keypoints': output_known_keypoints[-1]} 198 | if aux_loss: 199 | out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_keypoints) 200 | dn_meta['output_known_lbs_keypoints'] = out 201 | return outputs_class, outputs_keypoints 202 | --------------------------------------------------------------------------------
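Editor's usage sketch (not part of the repository): how `dn_post_process` separates the denoising ("known") queries from the matching queries and stashes the denoising predictions in `dn_meta`. All shapes, the import path, and the `_set_aux_loss` helper below are illustrative assumptions, not the model's actual configuration.

    import torch
    from src.models.detrpose.dn_component import dn_post_process  # assumed import path; adjust to your package layout

    def _set_aux_loss(cls_layers, kpt_layers):
        # stand-in for the model's aux-loss formatter: one dict per intermediate decoder layer
        return [{"pred_logits": c, "pred_keypoints": k} for c, k in zip(cls_layers[:-1], kpt_layers[:-1])]

    L, B, pad, Q, K = 6, 2, 60, 100, 17              # decoder layers, batch, dn pad size, queries, keypoints
    outputs_class = torch.rand(L, B, pad + Q, 2)     # illustrative class-logit dimension
    outputs_kpts = torch.rand(L, B, pad + Q, 2 * (K + 1))
    dn_meta = {"pad_size": pad, "num_dn_group": 3}

    outputs_class, outputs_kpts = dn_post_process(outputs_class, outputs_kpts, dn_meta, True, _set_aux_loss)
    print(outputs_class.shape)                                          # torch.Size([6, 2, 100, 2])
    print(dn_meta["output_known_lbs_keypoints"]["pred_logits"].shape)   # torch.Size([2, 60, 2])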