├── requirements.txt
├── utils
│   ├── __init__.py
│   ├── nms.py
│   ├── util.py
│   ├── visualization.py
│   ├── yolov6_util.py
│   ├── vitpose_util.py
│   └── inference.py
├── run.py
├── configs
│   ├── base_config.py
│   └── custom_config.py
├── .gitignore
├── README.md
└── LICENSE
/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib == 3.6.2 2 | numpy == 1.24.2 3 | onnxruntime-gpu == 1.13.1 4 | opencv-python == 4.7.0.68 5 | yacs == 0.1.8 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .inference import * 3 | from .visualization import * 4 | from .vitpose_util import * 5 | from .yolov6_util import * 6 | from .nms import * -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import onnxruntime as ort 4 | 5 | from utils.inference import inference_image, inference_video, inference_webcam 6 | from utils import get_config 7 | 8 | 9 | def main(cfg): 10 | YOLOV6_PATH = cfg.yolov6_path 11 | VITPOSE_PATH = cfg.vitpose_path 12 | IMG_PATH = cfg.image_path 13 | VID_PATH = cfg.video_path 14 | WEBCAM = cfg.webcam 15 | 16 | assert (IMG_PATH or VID_PATH or (WEBCAM is not None)), "Argument -img or -vid or -wc must be provided" 17 | 18 | if cfg.cpu: 19 | EP_list = ['CPUExecutionProvider'] 20 | else: 21 | EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] 22 | 23 | yolov6_sess = ort.InferenceSession(YOLOV6_PATH, providers=EP_list) 24 | vitpose_sess = ort.InferenceSession(VITPOSE_PATH, providers=EP_list) 25 | # TODO : implement smooth_net feature 26 | # if cfg.smooth_net: 27 | # smooth_net = ort.InferenceSession('smoothnet-32.onnx', providers=EP_list) 28 | 29 | os.system("") # enable ANSI escape sequences in the terminal (needed on Windows) 30 | 31 | # Inference image 32 | if IMG_PATH: 33 | inference_image(IMG_PATH, yolov6_sess, vitpose_sess, cfg) 34 | 35 | # Inference video from file 36 | if VID_PATH: 37 | inference_video(VID_PATH, yolov6_sess, vitpose_sess, cfg) 38 | 39 | # Inference video from webcam 40 | if WEBCAM is not None: 41 | inference_webcam(WEBCAM, yolov6_sess, vitpose_sess, cfg) 42 | 43 | 44 | if __name__ == "__main__": 45 | cfg = get_config() 46 | 47 | main(cfg)
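# Illustrative check (not part of the repo): onnxruntime silently falls back to
# CPUExecutionProvider when CUDA is unavailable, so it is worth verifying what
# the installed build actually offers before assuming GPU inference.
#
#   import onnxruntime as ort
#   print(ort.get_available_providers())
#   # e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] on a working GPU build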
-------------------------------------------------------------------------------- /utils/nms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def nms(boxes, probs=None, overlapThresh=0.3): 4 | # if there are no boxes, return an empty list 5 | if len(boxes) == 0: 6 | return [] 7 | 8 | # if the bounding boxes are integers, convert them to floats -- this 9 | # is important since we'll be doing a bunch of divisions 10 | if boxes.dtype.kind == "i": 11 | boxes = boxes.astype("float") 12 | 13 | # initialize the list of picked indexes 14 | pick = [] 15 | 16 | # grab the coordinates of the bounding boxes 17 | x1 = boxes[:, 0] 18 | y1 = boxes[:, 1] 19 | x2 = boxes[:, 2] 20 | y2 = boxes[:, 3] 21 | 22 | # compute the area of the bounding boxes and grab the indexes to sort 23 | # (in the case that no probabilities are provided, simply sort on the 24 | # bottom-right y-coordinate) 25 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 26 | idxs = y2 27 | 28 | # if probabilities are provided, sort on them instead 29 | if probs is not None: 30 | idxs = probs 31 | 32 | # sort the indexes 33 | idxs = np.argsort(idxs) 34 | 35 | # keep looping while some indexes still remain in the indexes list 36 | while len(idxs) > 0: 37 | # grab the last index in the indexes list and add the index value 38 | # to the list of picked indexes 39 | last = len(idxs) - 1 40 | i = idxs[last] 41 | pick.append(i) 42 | 43 | # find the largest (x, y) coordinates for the start of the bounding 44 | # box and the smallest (x, y) coordinates for the end of the bounding 45 | # box 46 | xx1 = np.maximum(x1[i], x1[idxs[:last]]) 47 | yy1 = np.maximum(y1[i], y1[idxs[:last]]) 48 | xx2 = np.minimum(x2[i], x2[idxs[:last]]) 49 | yy2 = np.minimum(y2[i], y2[idxs[:last]]) 50 | 51 | # compute the width and height of the bounding box 52 | w = np.maximum(0, xx2 - xx1 + 1) 53 | h = np.maximum(0, yy2 - yy1 + 1) 54 | 55 | # compute the ratio of overlap 56 | overlap = (w * h) / area[idxs[:last]] 57 | 58 | # delete all indexes from the index list that have overlap greater 59 | # than the provided overlap threshold 60 | idxs = np.delete(idxs, np.concatenate(([last], 61 | np.where(overlap > overlapThresh)[0]))) 62 | 63 | # return the indexes that were picked 64 | return np.array(pick)
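# Illustrative usage of nms() (not part of the repo): box 1's intersection with
# box 0 covers ~90% of its area (1521/1681), above the 0.3 threshold, so it is
# suppressed; only indexes [0, 2] survive.
#
#   import numpy as np
#   boxes = np.array([[10., 10., 50., 50.],
#                     [12., 12., 52., 52.],
#                     [100., 100., 140., 140.]])
#   probs = np.array([0.9, 0.8, 0.7])
#   print(nms(boxes, probs, overlapThresh=0.3))  # -> [0 2]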
-------------------------------------------------------------------------------- /configs/base_config.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | 4 | """ 5 | yolov6_path (str): path to YOLOv6 onnx file 6 | vitpose_path (str): path to ViTPose onnx file 7 | 8 | image_path (str): path to image file 9 | video_path (str): path to video file 10 | webcam (str or int): webcam URL or ID, set None for not using 11 | 12 | no_background (bool): draw a black screen instead of the original image, if True 13 | no_bbox (bool): skip drawing bboxes, if True 14 | no_skeleton (bool): skip drawing skeletons, if True 15 | dynamic_drawing (bool): keypoint radius and skeleton width change dynamically with bbox size, if True 16 | smooth_net (bool): reduce jitter in predicted keypoints using SmoothNet; not implemented yet 17 | result_scale (float): set a coefficient to scale the result size, set None to skip scaling 18 | 19 | save (bool): save the result, if True 20 | save_prediction (bool): save the predictions (bbox, pose), if True. 21 | NumPy is needed to read the saved file 22 | set_fps (int): set the fps of the result to be saved, 23 | set None to use the original fps of the video (or 60 fps for webcam) 24 | 25 | conf_thres (float): set a bbox confidence threshold for non-maximum suppression 26 | iou_thres (float): set a bbox iou threshold for non-maximum suppression 27 | max_detection (int): set the maximum number of bboxes 28 | key_conf_thres (float): set a keypoint confidence threshold 29 | no_pad (bool): do not use additional padding, if True 30 | cpu (bool): use CPU for inference, if True 31 | pose_batch_size (int): set pose batch size 32 | yolo_batch_size (int): set yolo batch size; it only works for video 33 | """ 34 | 35 | 36 | _C = CN() 37 | 38 | _C.yolov6_path = 'yolov6m.onnx' 39 | _C.vitpose_path = 'vitpose-b-multi-coco.onnx' 40 | 41 | _C.image_path = '' 42 | _C.video_path = '' 43 | _C.webcam = None 44 | 45 | _C.no_background = False 46 | _C.no_bbox = False 47 | _C.no_skeleton = False 48 | _C.dynamic_drawing = False 49 | _C.smooth_net = False 50 | _C.result_scale = None 51 | 52 | _C.save = False 53 | _C.save_prediction = False 54 | _C.set_fps = None 55 | 56 | _C.conf_thres = 0.25 57 | _C.iou_thres = 0.45 58 | _C.max_detection = 100 59 | _C.key_conf_thres = 0.4 60 | _C.no_pad = False 61 | _C.cpu = False 62 | _C.pose_batch_size = 1 63 | _C.yolo_batch_size = 1 64 | 65 | 66 | cfg = _C
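# Illustrative usage (not part of the repo): the config can also be loaded and
# overridden programmatically instead of through run.py's CLI flags.
#
#   from configs.base_config import cfg
#
#   my_cfg = cfg.clone()             # yacs CfgNode.clone() gives an independent copy
#   my_cfg.video_path = 'dance.mp4'  # same keys as documented in the docstring above
#   my_cfg.save = True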
-------------------------------------------------------------------------------- /configs/custom_config.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | 4 | """ 5 | yolov6_path (str): path to YOLOv6 onnx file 6 | vitpose_path (str): path to ViTPose onnx file 7 | 8 | image_path (str): path to image file 9 | video_path (str): path to video file 10 | webcam (str or int): webcam URL or ID, set None for not using 11 | 12 | no_background (bool): draw a black screen instead of the original image, if True 13 | no_bbox (bool): skip drawing bboxes, if True 14 | no_skeleton (bool): skip drawing skeletons, if True 15 | dynamic_drawing (bool): keypoint radius and skeleton width change dynamically with bbox size, if True 16 | smooth_net (bool): reduce jitter in predicted keypoints using SmoothNet; not implemented yet 17 | result_scale (float): set a coefficient to scale the result size, set None to skip scaling 18 | 19 | save (bool): save the result, if True 20 | save_prediction (bool): save the predictions (bbox, pose), if True. 21 | NumPy is needed to read the saved file 22 | set_fps (int): set the fps of the result to be saved, 23 | set None to use the original fps of the video (or 60 fps for webcam) 24 | 25 | conf_thres (float): set a bbox confidence threshold for non-maximum suppression 26 | iou_thres (float): set a bbox iou threshold for non-maximum suppression 27 | max_detection (int): set the maximum number of bboxes 28 | key_conf_thres (float): set a keypoint confidence threshold 29 | no_pad (bool): do not use additional padding, if True 30 | cpu (bool): use CPU for inference, if True 31 | pose_batch_size (int): set pose batch size 32 | yolo_batch_size (int): set yolo batch size; it only works for video 33 | """ 34 | 35 | 36 | _C = CN() 37 | 38 | _C.yolov6_path = 'yolov6m.onnx' 39 | _C.vitpose_path = 'vitpose-b-multi-coco.onnx' 40 | 41 | _C.image_path = '' 42 | _C.video_path = '' 43 | _C.webcam = None 44 | 45 | _C.no_background = False 46 | _C.no_bbox = True 47 | _C.no_skeleton = False 48 | _C.dynamic_drawing = True 49 | _C.smooth_net = False 50 | _C.result_scale = None 51 | 52 | _C.save = True 53 | _C.save_prediction = False 54 | _C.set_fps = None 55 | 56 | _C.conf_thres = 0.25 57 | _C.iou_thres = 0.45 58 | _C.max_detection = 100 59 | _C.key_conf_thres = 0.15 60 | _C.no_pad = False 61 | _C.cpu = False 62 | _C.pose_batch_size = 1 63 | _C.yolo_batch_size = 1 64 | 65 | 66 | cfg = _C
-------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | from yacs.config import CfgNode as CN 5 | 6 | 7 | def get_config(): 8 | parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS) 9 | parser.add_argument('--yolov6-path', '-yolo', help='yolov6 path') 10 | parser.add_argument('--vitpose-path', '-pose', help='vitpose path') 11 | 12 | parser.add_argument('--image-path', '-img', help='image path') 13 | parser.add_argument('--video-path', '-vid', help='video path') 14 | parser.add_argument('--webcam', '-wc', help='webcam id or webcam URL') 15 | 16 | parser.add_argument('--no-background', '-nobg', action='store_true', help="draw only skeletons or bboxes, background will be black") 17 | parser.add_argument('--no-bbox', '-nobx', action='store_true', help="don't draw bboxes") 18 | parser.add_argument('--no-skeleton', '-nosk', action='store_true', help="don't draw skeletons") 19 | parser.add_argument('--dynamic-drawing', '-dd', action='store_true', help='turn on dynamic drawing') 20 | parser.add_argument('--smooth-net', '-sn', action='store_true', help='use smooth-net for jitter filtering') 21 | parser.add_argument('--result-scale', '-rs', type=float, help='set scale to result') 22 | 23 | parser.add_argument('--save', '-s', action='store_true', help='save drawing result') 24 | parser.add_argument('--save-prediction', '-sp', action='store_true', help='save prediction') 25 | parser.add_argument('--set-fps', '-fps', type=int, help='set fps for result video') 26 | 27 | parser.add_argument('--conf-thres', '-conf', type=float, help='set conf thres for nms') 28 | parser.add_argument('--iou-thres', '-iou', type=float, help='set iou thres for nms') 29 | parser.add_argument('--max-detection', '-max', type=int, help='set max detection for nms') 30 | parser.add_argument('--key-conf-thres', '-kconf', type=float, help='set keypoint conf thres') 31 | parser.add_argument('--no-pad', action='store_true', help="don't use additional padding") 32 | parser.add_argument('--cpu', '-cpu', action='store_true', help="use cpu instead of gpu") 33 | parser.add_argument('--pose-batch-size', '-pbs', type=int, help='set pose batch size') 34 | parser.add_argument('--yolo-batch-size', '-ybs', type=int, help='set yolo batch size') 35 | 36 | parser.add_argument('--config', '-cfg', default=None, help='config path') 37 | 38 | args = parser.parse_args() 39 | args = vars(args) 40 | 41 | if args['config'] is None: 42 | cfg = CN._load_cfg_py_source('configs/base_config.py') 43 | else: 44 | cfg = CN._load_cfg_py_source(args['config']) 45 | 46 | for key, value in args.items(): 47 | cfg[key] = value 48 | 49 | return cfg 50 | 51 | 52 | def print_fps(fps): 53 | bar = int(fps*2) // 5 54 | sub_bar = " ▎▍▋▊"[int(fps*2)%5] # ▉▊▋▍▎ 55 | sys.stdout.write("\033[K") 56 | print(f'- fps:{fps:06.1f} : ' + "▉"*bar + sub_bar, end='\r') -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | ### VisualStudioCode ### 174 | .vscode/* 175 | !.vscode/settings.json 176 | !.vscode/tasks.json 177 | !.vscode/launch.json 178 | !.vscode/extensions.json 179 | !.vscode/*.code-snippets 180 | 181 | # Local History for Visual Studio Code 182 | .history/ 183 | 184 | # Built Visual Studio Code Extensions 185 | *.vsix 186 | 187 | ### VisualStudioCode Patch ### 188 | # Ignore all local history of files 189 | .history 190 | .ionide 191 | 192 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python 193 | 194 | 195 | # **/__pycache__ 196 | **/*.onnx 197 | **/*.mp4 198 | ignore/ 199 | examples/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ViTPose-ONNX 2 | Easy inference for [ViTPose](https://github.com/ViTAE-Transformer/ViTPose) using ONNX 3 |

4 | 5 |

6 | 7 | ## Requirements 8 | ``` 9 | pip install -r requirements.txt 10 | ``` 11 | As you can see in 'requirements.txt', it requires only the five libraries below 12 | - matplotlib 13 | - numpy 14 | - onnxruntime-gpu 15 | - opencv-python 16 | - yacs 17 | 18 | ## Usage 19 | ### Install 20 | ``` 21 | git clone https://github.com/Pukei-Pukei/ViTPose-ONNX.git 22 | cd ViTPose-ONNX 23 | pip install -r requirements.txt 24 | ``` 25 | ### Run 26 | Download [vitpose-b-multi-coco.onnx](https://drive.google.com/drive/folders/1v7tStPJqV4x9vgEW9l_mnwbEuw87exiq?usp=share_link) and [yolov6m.onnx](https://drive.google.com/file/d/1lZ251Y_oG0yNwgFW067HWKsSQAbiLdln/view?usp=share_link), then put them in the ViTPose-ONNX folder 27 | Run the commands below to start inference 28 | ``` 29 | python run.py -img <image-path> 30 | ``` 31 | ``` 32 | python run.py -vid <video-path> 33 | ``` 34 | ``` 35 | python run.py -wc <webcam-id-or-URL> 36 | ``` 37 | ``` 38 | python run.py -cfg <config-path> -vid <video-path> 39 | ``` 40 | ### Example 41 | ``` 42 | python run.py -cfg configs/custom_config.py -vid dance.mp4 -s 43 | ``` 44 | '-s' is the save option 45 | 46 | ## Options 47 | 48 | --yolov6-path, -yolo PATH : Path to YOLOv6 onnx file 49 | --vitpose-path, -pose PATH : Path to ViTPose onnx file 50 | 51 | --image-path, -img PATH : Image path 52 | --video-path, -vid PATH : Video path 53 | --webcam, -wc PATH : Webcam id or webcam URL 54 | 55 | --no-background, -nobg : Background will be a black screen 56 | --no-bbox, -nobx : Don't draw bboxes 57 | --no-skeleton, -nosk : Don't draw skeletons 58 | --dynamic-drawing, -dd : Turn on dynamic drawing, keypoint 59 | radius and skeleton width change 60 | dynamically with bbox size 61 | --result-scale, -rs SIZE : Set a coefficient to scale the size 62 | of the result; set None to skip 63 | scaling 64 | 65 | --save, -s : Save drawing result 66 | --save-prediction, -sp : Save the predictions (bbox, pose); 67 | NumPy is needed to read the saved 68 | file 69 | 70 | --conf-thres, -conf THRES : Set confidence threshold for 71 | non-maximum suppression 72 | --iou-thres, -iou THRES : Set IoU threshold for 73 | non-maximum suppression 74 | --max-detection, -max MAX : Set max detection for non-maximum 75 | suppression 76 | --key-conf-thres, -kconf THRES : Set keypoint confidence threshold 77 | --no-pad : Don't use additional padding 78 | 79 | --cpu, -cpu : Use cpu instead of gpu 80 | --pose-batch-size, -pbs SIZE : Set pose batch size 81 | --yolo-batch-size, -ybs SIZE : Set yolo batch size, 82 | it works only in video 83 | 84 | --config, -cfg : Config path. Use a config file for 85 | easy handling of options. The default config 86 | path is 'configs/base_config.py' 87 | 88 | 89 |
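The options compose freely; for example, the following draws skeletons only (no bboxes) with dynamic drawing, scales the output to half size, and saves the result (the file name is illustrative):
```
python run.py -vid dance.mp4 -nobx -dd -rs 0.5 -s
```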
90 | ## Download ONNX file 91 | 92 | |Model |ONNX |Original Weight for PyTorch| 93 | |:------:|:---------:|:-------------:| 94 | |[ViTPose-B](https://github.com/ViTAE-Transformer/ViTPose#results-from-this-repo-on-ms-coco-val-set-single-task-training)|[GoogleDrive](https://drive.google.com/drive/folders/1v7tStPJqV4x9vgEW9l_mnwbEuw87exiq?usp=share_link)|[Onedrive](https://1drv.ms/u/s!AimBgYV7JjTlgSrlMB093JzJtqq-?e=Jr5S3R)| 95 | |[YOLOv6-M](https://github.com/meituan/YOLOv6#benchmark)|[GoogleDrive](https://drive.google.com/file/d/1lZ251Y_oG0yNwgFW067HWKsSQAbiLdln/view?usp=share_link)|[Download](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m.pt)| 96 | 97 | If you want other versions, refer to [Tutorial](https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html) and get your own ONNX 98 | 99 |
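For reference, the export step in that tutorial boils down to roughly the sketch below (loading the PyTorch `model` is omitted; the 1×3×256×192 input follows the ViTPose-B convention, so adjust it for other models):
```python
import torch

dummy = torch.randn(1, 3, 256, 192)  # (N, C, H, W) for ViTPose-B
torch.onnx.export(model, dummy, 'vitpose-b.onnx',
                  input_names=['input'], output_names=['output'],
                  dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}},
                  opset_version=12)
```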
100 | ## Acknowledgements 101 | 102 | - [ViTPose](https://github.com/ViTAE-Transformer/ViTPose) 103 | 104 | - [YOLOv6](https://github.com/meituan/YOLOv6) 105 | 106 | - [simple-HRNet](https://github.com/stefanopini/simple-HRNet) 107 | 108 | - [(Faster) Non-Maximum Suppression in Python](https://pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/) 109 | 110 | -------------------------------------------------------------------------------- /utils/visualization.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | 6 | __all__ = ["joints_dict", "draw_points_and_skeleton"] 7 | 8 | 9 | def joints_dict(): 10 | joints = { 11 | "coco": { 12 | "keypoints": { 13 | 0: "nose", 14 | 1: "left_eye", 15 | 2: "right_eye", 16 | 3: "left_ear", 17 | 4: "right_ear", 18 | 5: "left_shoulder", 19 | 6: "right_shoulder", 20 | 7: "left_elbow", 21 | 8: "right_elbow", 22 | 9: "left_wrist", 23 | 10: "right_wrist", 24 | 11: "left_hip", 25 | 12: "right_hip", 26 | 13: "left_knee", 27 | 14: "right_knee", 28 | 15: "left_ankle", 29 | 16: "right_ankle" 30 | }, 31 | "skeleton": [ 32 | # # [16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], 33 | # # [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7] 34 | # [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], 35 | # [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6] 36 | [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], 37 | [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], # [3, 5], [4, 6] 38 | [0, 5], [0, 6] 39 | ] 40 | }, 41 | "mpii": { 42 | "keypoints": { 43 | 0: "right_ankle", 44 | 1: "right_knee", 45 | 2: "right_hip", 46 | 3: "left_hip", 47 | 4: "left_knee", 48 | 5: "left_ankle", 49 | 6: "pelvis", 50 | 7: "thorax", 51 | 8: "upper_neck", 52 | 9: "head top", 53 | 10: "right_wrist", 54 | 11: "right_elbow", 55 | 12: "right_shoulder", 56 | 13: "left_shoulder", 57 | 14: "left_elbow", 58 | 15: "left_wrist" 59 | }, 60 | "skeleton": [ 61 | # [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [13, 3], [12, 2], [13, 12], [13, 14], 62 | # [12, 11], [14, 15], [11, 10], # [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7] 63 | [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [3, 6], [2, 6], [6, 7], [7, 8], [8, 9], 64 | [13, 7], [12, 7], [13, 14], [12, 11], [14, 15], [11, 10], 65 | ] 66 | }, 67 | } 68 | return joints 69 | 70 | 71 | def draw_points(image, points, color_palette='tab20', palette_samples=16, confidence_threshold=0.5, xywh=None): 72 | """ 73 | Draws `points` on `image`. 74 | 75 | Args: 76 | image: image in opencv format 77 | points: list of points to be drawn. 78 | Shape: (nof_points, 3) 79 | Format: each point should contain (y, x, confidence) 80 | color_palette: name of a matplotlib color palette 81 | Default: 'tab20' 82 | palette_samples: number of different colors sampled from the `color_palette` 83 | Default: 16 84 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1] 85 | Default: 0.5 86 | 87 | Returns: 88 | A new image with overlaid points 89 | 90 | """ 91 | try: 92 | colors = np.round( 93 | np.array(plt.get_cmap(color_palette).colors) * 255 94 | ).astype(np.uint8)[:, ::-1].tolist() 95 | except AttributeError: # if the palette has no pre-defined colors 96 | colors = np.round( 97 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255 98 | ).astype(np.uint8)[:, -2::-1].tolist() 99 | 100 | if xywh is not None: 101 | circle_size = 7 102 | circle_size = int(np.clip(np.sqrt(sum(xywh[2:]) / 3000)*circle_size, 1, circle_size*2)) 103 | else: 104 | circle_size = max(1, min(image.shape[:2]) // 160) 105 | 106 | 107 | for i, pt in enumerate(points): 108 | if pt[2] > confidence_threshold: 109 | image = cv2.circle(image, (int(pt[1]), int(pt[0])), circle_size, tuple(colors[i % len(colors)]), -1) 110 | 111 | return image 112 | 113 | 114 | def draw_skeleton(image, points, skeleton, color_palette='Set2', palette_samples=8, person_index=0, 115 | confidence_threshold=0.5, xywh=None): 116 | """ 117 | Draws a `skeleton` on `image`. 118 | 119 | Args: 120 | image: image in opencv format 121 | points: list of points to be drawn. 122 | Shape: (nof_points, 3) 123 | Format: each point should contain (y, x, confidence) 124 | skeleton: list of joints to be drawn 125 | Shape: (nof_joints, 2) 126 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points` 127 | color_palette: name of a matplotlib color palette 128 | Default: 'Set2' 129 | palette_samples: number of different colors sampled from the `color_palette` 130 | Default: 8 131 | person_index: index of the person in `image` 132 | Default: 0 133 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. 
Range: [0, 1] 134 | Default: 0.5 135 | 136 | Returns: 137 | A new image with overlaid joints 138 | 139 | """ 140 | try: 141 | colors = np.round( 142 | np.array(plt.get_cmap(color_palette).colors) * 255 143 | ).astype(np.uint8)[:, ::-1].tolist() 144 | except AttributeError: # if the palette has no pre-defined colors 145 | colors = np.round( 146 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255 147 | ).astype(np.uint8)[:, -2::-1].tolist() 148 | 149 | if xywh is not None: 150 | lw = 2 151 | lw = int(np.clip((sum(xywh[2:]) / 300)**(1/6)*lw, 1, lw*2)) 152 | else: 153 | lw = 2 154 | 155 | for i, joint in enumerate(skeleton): 156 | pt1, pt2 = points[joint] 157 | if pt1[2] > confidence_threshold and pt2[2] > confidence_threshold: 158 | image = cv2.line( 159 | image, (int(pt1[1]), int(pt1[0])), (int(pt2[1]), int(pt2[0])), 160 | tuple(colors[person_index % len(colors)]), lw 161 | ) 162 | 163 | return image 164 | 165 | 166 | def draw_points_and_skeleton(image, points, skeleton, points_color_palette='tab20', points_palette_samples=16, 167 | skeleton_color_palette='Set2', skeleton_palette_samples=8, person_index=0, 168 | confidence_threshold=0.5, xywh=None): 169 | """ 170 | Draws `points` and `skeleton` on `image`. 171 | 172 | Args: 173 | image: image in opencv format 174 | points: list of points to be drawn. 175 | Shape: (nof_points, 3) 176 | Format: each point should contain (y, x, confidence) 177 | skeleton: list of joints to be drawn 178 | Shape: (nof_joints, 2) 179 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points` 180 | points_color_palette: name of a matplotlib color palette 181 | Default: 'tab20' 182 | points_palette_samples: number of different colors sampled from the `color_palette` 183 | Default: 16 184 | skeleton_color_palette: name of a matplotlib color palette 185 | Default: 'Set2' 186 | skeleton_palette_samples: number of different colors sampled from the `color_palette` 187 | Default: 8 188 | person_index: index of the person in `image` 189 | Default: 0 190 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. 
Range: [0, 1] 191 | Default: 0.5 192 | 193 | Returns: 194 | A new image with overlaid joints 195 | 196 | """ 197 | image = draw_skeleton(image, points, skeleton, color_palette=skeleton_color_palette, 198 | palette_samples=skeleton_palette_samples, person_index=person_index, 199 | confidence_threshold=confidence_threshold, xywh=xywh) 200 | image = draw_points(image, points, color_palette=points_color_palette, palette_samples=points_palette_samples, 201 | confidence_threshold=confidence_threshold, xywh=xywh) 202 | return image -------------------------------------------------------------------------------- /utils/yolov6_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | from utils.nms import nms 5 | 6 | 7 | def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32, return_int=False): 8 | '''Resize and pad image while meeting stride-multiple constraints.''' 9 | shape = im.shape[:2] # current shape [height, width] 10 | if isinstance(new_shape, int): 11 | new_shape = (new_shape, new_shape) 12 | elif isinstance(new_shape, list) and len(new_shape) == 1: 13 | new_shape = (new_shape[0], new_shape[0]) 14 | 15 | # Scale ratio (new / old) 16 | r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) 17 | if not scaleup: # only scale down, do not scale up (for better val mAP) 18 | r = min(r, 1.0) 19 | 20 | # Compute padding 21 | new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) 22 | dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding 23 | 24 | if auto: # minimum rectangle 25 | dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding 26 | 27 | dw /= 2 # divide padding into 2 sides 28 | dh /= 2 29 | 30 | if shape[::-1] != new_unpad: # resize 31 | im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) 32 | top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) 33 | left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) 34 | im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border 35 | if not return_int: 36 | return im, r, (dw, dh) 37 | else: 38 | return im, r, (left, top) 39 | 40 | 41 | def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255), font=cv2.FONT_HERSHEY_COMPLEX): 42 | # Add one xyxy box to image with label 43 | p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3])) 44 | cv2.rectangle(image, p1, p2, color, thickness=lw, lineType=cv2.LINE_AA) 45 | if label: 46 | lw = 1 47 | tf = max(lw - 1, 1) # font thickness 48 | w, h = cv2.getTextSize(label, 0, fontScale=lw / 3, thickness=tf)[0] # text width, height 49 | outside = p1[1] - h - 3 >= 0 # label fits outside box 50 | p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3 51 | cv2.rectangle(image, p1, p2, color, -1, cv2.LINE_AA) # filled 52 | cv2.putText(image, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), font, lw / 3, txt_color, 53 | thickness=tf, lineType=cv2.LINE_AA) 54 | 55 | 56 | def xywh2xyxy(x): 57 | '''Convert boxes with shape [n, 4] from [x, y, w, h] to [x1, y1, x2, y2] where x1y1 is top-left, x2y2=bottom-right.''' 58 | y = np.copy(x) 59 | y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x 60 | y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y 61 | y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x 62 | y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y 63 | return y 64 | 65 | 66 | def xyxy2xywh(x): 67 | '''Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] 
where xy1=top-left, xy2=bottom-right.''' 68 | y = np.copy(x) 69 | y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center 70 | y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center 71 | y[..., 2] = x[..., 2] - x[..., 0] # width 72 | y[..., 3] = x[..., 3] - x[..., 1] # height 73 | return y 74 | 75 | 76 | def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45): 77 | """Runs Non-Maximum Suppression (NMS) on inference results. 78 | This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775 79 | Args: 80 | prediction: (numpy.ndarray), with shape [N, 5 + num_classes], N is the number of bboxes. 81 | conf_thres: (float) confidence threshold. 82 | iou_thres: (float) iou threshold. 83 | Returns: 84 | output: (numpy.ndarray), list of detections, each item is an array with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. 85 | """ 86 | 87 | pred_candidates = np.logical_and(prediction[..., 4] > conf_thres, np.amax(prediction[..., 5:], axis=-1) > conf_thres) # candidates 88 | # Check the parameters. 89 | assert 0 <= conf_thres <= 1, f'conf_thresh must be in 0.0 to 1.0, however {conf_thres} is provided.' 90 | assert 0 <= iou_thres <= 1, f'iou_thres must be in 0.0 to 1.0, however {iou_thres} is provided.' 91 | 92 | # Function settings. 93 | max_wh = 4096 # maximum box width and height 94 | max_nms = 30000 # maximum number of boxes fed into nms() 95 | 96 | output = [np.zeros((0, 6))] * prediction.shape[0] 97 | for img_idx, x in enumerate(prediction): # image index, image inference 98 | x = x[pred_candidates[img_idx]] # confidence 99 | 100 | # If no box remains, skip the next process. 101 | if not x.shape[0]: 102 | continue 103 | 104 | # multiply class confidence by objectness 105 | x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf 106 | 107 | # (center x, center y, width, height) to (x1, y1, x2, y2) 108 | box = xywh2xyxy(x[:, :4]) 109 | 110 | class_idx = x[:, 5:].argmax(1, keepdims=True) 111 | conf = np.take_along_axis(x[:, 5:], class_idx, axis=1) 112 | x = np.concatenate((box, conf, class_idx.astype('float32')), 1)[conf.flatten() > conf_thres] 113 | 114 | # Check shape 115 | num_box = x.shape[0] # number of boxes 116 | if not num_box: # no boxes kept. 117 | continue 118 | elif num_box > max_nms: # the number of boxes exceeds max_nms. 
119 | # sort by confidence 120 | x = x[np.flip(x[:, 4].argsort(), -1)[:max_nms]] 121 | 122 | # Batched NMS 123 | boxes, scores = x[:, :4], x[:, 4] # boxes (offset by class), scores 124 | keep_box_idx = nms(boxes, scores, iou_thres) 125 | 126 | output[img_idx] = x[keep_box_idx] 127 | 128 | return output 129 | 130 | 131 | def rescale(ori_shape, boxes, target_shape): 132 | '''Rescale the output to the original image shape''' 133 | ratio = min(ori_shape[0] / target_shape[0], ori_shape[1] / target_shape[1]) 134 | padding = (ori_shape[1] - target_shape[1] * ratio) / 2, (ori_shape[0] - target_shape[0] * ratio) / 2 135 | 136 | boxes[:, [0, 2]] -= padding[0] 137 | boxes[:, [1, 3]] -= padding[1] 138 | boxes[:, :4] /= ratio 139 | 140 | boxes[:, 0] = boxes[:, 0].clip(0, target_shape[1]) # x1 141 | boxes[:, 1] = boxes[:, 1].clip(0, target_shape[0]) # y1 142 | boxes[:, 2] = boxes[:, 2].clip(0, target_shape[1]) # x2 143 | boxes[:, 3] = boxes[:, 3].clip(0, target_shape[0]) # y2 144 | 145 | return boxes 146 | 147 | 148 | def preprocess_with_bboxes(original_img, bboxes, detection_img_size, pose_img_size, cfg): 149 | """ 150 | Args: 151 | original_img: (numpy.ndarray) (H, W, C) 152 | bboxes: (numpy.ndarray), normalized bboxes with shape [N, 5 + num_classes], N is the number of bboxes. 153 | detection_img_size: (tuple), (H, W) 154 | pose_img_size: (tuple), (H, W) 155 | Returns: 156 | img_list: (list of numpy.ndarray) 157 | xyxy_list: (list of numpy.ndarray) 158 | conf_list: (list of numpy.ndarray) 159 | """ 160 | 161 | if len(bboxes): 162 | bboxes[:, :4] = rescale(detection_img_size, bboxes[:, :4], original_img.shape) 163 | 164 | img_list = [] 165 | xyxy_list = [] 166 | conf_list = [] 167 | 168 | # bboxes = np.flip(bboxes, axis=0) 169 | for i, (*xyxy, conf, cls) in enumerate(bboxes): 170 | if i >= cfg.max_detection: 171 | break 172 | 173 | # pad to offset the wrong effect in PatchEmbed in vit.py 174 | # in my opinion, a little bit smaller image is better than a little bit truncated image 175 | # padding=4 for base conf 176 | if not cfg.no_pad: 177 | padding = 4 178 | xyxy[2] += padding * (xyxy[2]-xyxy[0]) / pose_img_size[1] 179 | xyxy[3] += padding * (xyxy[3]-xyxy[1]) / pose_img_size[0] 180 | xyxy[2] = np.clip(xyxy[2], xyxy[0], original_img.shape[1]) 181 | xyxy[3] = np.clip(xyxy[3], xyxy[1], original_img.shape[0]) 182 | 183 | # crop image 184 | l, t, r, b = map(int, np.round(xyxy)) 185 | img = original_img[t:b, l:r, :] 186 | 187 | # resize image 188 | img = cv2.resize(img, pose_img_size[::-1], interpolation = cv2.INTER_LINEAR) 189 | 190 | # normalization 191 | img = img / 255.0 192 | mean_std = np.array([[0.485, 0.456, 0.406], [0.229, 0.224, 0.225]]) 193 | img = (img - mean_std[0]) / mean_std[1] 194 | 195 | # convert to torch tensor format 196 | # img = img.transpose(2, 0, 1).astype('float32') # HWC to CHW 197 | 198 | img_list.append(img) 199 | xyxy_list.append(xyxy) 200 | conf_list.append(conf) 201 | 202 | return img_list, xyxy_list, conf_list -------------------------------------------------------------------------------- /utils/vitpose_util.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def _gaussian_blur(heatmaps, kernel=11): 6 | """Modulate heatmap distribution with Gaussian. 
7 | sigma = 0.3*((kernel_size-1)*0.5-1) + 0.8 8 | sigma ~= 3 if k = 17 9 | sigma = 2 if k = 11 10 | sigma ~= 1.5 if k = 7 11 | sigma ~= 1 if k = 3 12 | 13 | Note: 14 | - batch_size: N 15 | - num_keypoints: K 16 | - heatmap height: H 17 | - heatmap width: W 18 | 19 | Args: 20 | heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. 21 | kernel (int): Gaussian kernel size (K) for modulation, which should 22 | match the heatmap gaussian sigma when training. 23 | K=17 for sigma=3 and k=11 for sigma=2. 24 | 25 | Returns: 26 | np.ndarray ([N, K, H, W]): Modulated heatmap distribution. 27 | """ 28 | assert kernel % 2 == 1 29 | 30 | border = (kernel - 1) // 2 31 | batch_size = heatmaps.shape[0] 32 | num_joints = heatmaps.shape[1] 33 | height = heatmaps.shape[2] 34 | width = heatmaps.shape[3] 35 | for i in range(batch_size): 36 | for j in range(num_joints): 37 | origin_max = np.max(heatmaps[i, j]) 38 | dr = np.zeros((height + 2 * border, width + 2 * border), 39 | dtype=np.float32) 40 | dr[border:-border, border:-border] = heatmaps[i, j].copy() 41 | dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) 42 | heatmaps[i, j] = dr[border:-border, border:-border].copy() 43 | heatmaps[i, j] *= origin_max / np.max(heatmaps[i, j]) 44 | return heatmaps 45 | 46 | 47 | def _get_max_preds(heatmaps): 48 | """Get keypoint predictions from score maps. 49 | 50 | Note: 51 | batch_size: N 52 | num_keypoints: K 53 | heatmap height: H 54 | heatmap width: W 55 | 56 | Args: 57 | heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. 58 | 59 | Returns: 60 | tuple: A tuple containing aggregated results. 61 | 62 | - preds (np.ndarray[N, K, 2]): Predicted keypoint location. 63 | - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 64 | """ 65 | assert isinstance(heatmaps, 66 | np.ndarray), ('heatmaps should be numpy.ndarray') 67 | assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' 68 | 69 | N, K, _, W = heatmaps.shape 70 | heatmaps_reshaped = heatmaps.reshape((N, K, -1)) 71 | idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) 72 | maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) 73 | 74 | preds = np.tile(idx, (1, 1, 2)).astype(np.float32) 75 | preds[:, :, 0] = preds[:, :, 0] % W 76 | preds[:, :, 1] = preds[:, :, 1] // W 77 | 78 | preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1) 79 | return preds, maxvals 80 | 81 | 82 | def post_dark_udp(coords, batch_heatmaps, kernel=3): 83 | """DARK post-processing. Implemented with UDP. Paper ref: Huang et al. The 84 | Devil is in the Details: Delving into Unbiased Data Processing for Human 85 | Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate 86 | Representation for Human Pose Estimation (CVPR 2020). 87 | 88 | Note: 89 | - batch size: B 90 | - num keypoints: K 91 | - num persons: N 92 | - height of heatmaps: H 93 | - width of heatmaps: W 94 | 95 | B=1 for bottom_up paradigm where all persons share the same heatmap. 96 | B=N for top_down paradigm where each person has its own heatmaps. 97 | 98 | Args: 99 | coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. 100 | batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps 101 | kernel (int): Gaussian kernel size (K) for modulation. 102 | 103 | Returns: 104 | np.ndarray([N, K, 2]): Refined coordinates. 
105 | """ 106 | if not isinstance(batch_heatmaps, np.ndarray): 107 | batch_heatmaps = batch_heatmaps.cpu().numpy() 108 | B, K, H, W = batch_heatmaps.shape 109 | N = coords.shape[0] 110 | assert (B == 1 or B == N) 111 | for heatmaps in batch_heatmaps: 112 | for heatmap in heatmaps: 113 | cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) 114 | np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) 115 | np.log(batch_heatmaps, batch_heatmaps) 116 | 117 | batch_heatmaps_pad = np.pad( 118 | batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), 119 | mode='edge').flatten() 120 | 121 | index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) 122 | index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) 123 | index = index.astype(int).reshape(-1, 1) 124 | i_ = batch_heatmaps_pad[index] 125 | ix1 = batch_heatmaps_pad[index + 1] 126 | iy1 = batch_heatmaps_pad[index + W + 2] 127 | ix1y1 = batch_heatmaps_pad[index + W + 3] 128 | ix1_y1_ = batch_heatmaps_pad[index - W - 3] 129 | ix1_ = batch_heatmaps_pad[index - 1] 130 | iy1_ = batch_heatmaps_pad[index - 2 - W] 131 | 132 | dx = 0.5 * (ix1 - ix1_) 133 | dy = 0.5 * (iy1 - iy1_) 134 | derivative = np.concatenate([dx, dy], axis=1) 135 | derivative = derivative.reshape(N, K, 2, 1) 136 | dxx = ix1 - 2 * i_ + ix1_ 137 | dyy = iy1 - 2 * i_ + iy1_ 138 | dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) 139 | hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) 140 | hessian = hessian.reshape(N, K, 2, 2) 141 | hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) 142 | coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() 143 | return coords 144 | 145 | 146 | def _taylor(heatmap, coord): 147 | """Distribution aware coordinate decoding method. 148 | 149 | Note: 150 | - heatmap height: H 151 | - heatmap width: W 152 | 153 | Args: 154 | heatmap (np.ndarray[H, W]): Heatmap of a particular joint type. 155 | coord (np.ndarray[2,]): Coordinates of the predicted keypoints. 156 | 157 | Returns: 158 | np.ndarray[2,]: Updated coordinates. 159 | """ 160 | H, W = heatmap.shape[:2] 161 | px, py = int(coord[0]), int(coord[1]) 162 | if 1 < px < W - 2 and 1 < py < H - 2: 163 | dx = 0.5 * (heatmap[py][px + 1] - heatmap[py][px - 1]) 164 | dy = 0.5 * (heatmap[py + 1][px] - heatmap[py - 1][px]) 165 | dxx = 0.25 * ( 166 | heatmap[py][px + 2] - 2 * heatmap[py][px] + heatmap[py][px - 2]) 167 | dxy = 0.25 * ( 168 | heatmap[py + 1][px + 1] - heatmap[py - 1][px + 1] - 169 | heatmap[py + 1][px - 1] + heatmap[py - 1][px - 1]) 170 | dyy = 0.25 * ( 171 | heatmap[py + 2 * 1][px] - 2 * heatmap[py][px] + 172 | heatmap[py - 2 * 1][px]) 173 | derivative = np.array([[dx], [dy]]) 174 | hessian = np.array([[dxx, dxy], [dxy, dyy]]) 175 | if dxx * dyy - dxy**2 != 0: 176 | hessianinv = np.linalg.inv(hessian) 177 | offset = -hessianinv @ derivative 178 | offset = np.squeeze(np.array(offset.T), axis=0) 179 | coord += offset 180 | return coord 181 | 182 | 183 | def transform_preds(coords, center, scale, output_size, use_udp=False): 184 | """Get final keypoint predictions from heatmaps and apply scaling and 185 | translation to map them back to the image. 186 | 187 | Note: 188 | num_keypoints: K 189 | 190 | Args: 191 | coords (np.ndarray[K, ndims]): 192 | 193 | * If ndims=2, corrds are predicted keypoint location. 194 | * If ndims=4, corrds are composed of (x, y, scores, tags) 195 | * If ndims=5, corrds are composed of (x, y, scores, tags, 196 | flipped_tags) 197 | 198 | center (np.ndarray[2, ]): Center of the bounding box (x, y). 
199 | scale (np.ndarray[2, ]): Scale of the bounding box 200 | wrt [width, height]. 201 | output_size (np.ndarray[2, ] | list(2,)): Size of the 202 | destination heatmaps. 203 | use_udp (bool): Use unbiased data processing 204 | 205 | Returns: 206 | np.ndarray: Predicted coordinates in the images. 207 | """ 208 | assert coords.shape[1] in (2, 4, 5) 209 | assert len(center) == 2 210 | assert len(scale) == 2 211 | assert len(output_size) == 2 212 | 213 | # Recover the scale which is normalized by a factor of 200. 214 | # scale = scale * 200.0 215 | 216 | if use_udp: 217 | scale_x = scale[0] / (output_size[0] - 1.0) 218 | scale_y = scale[1] / (output_size[1] - 1.0) 219 | else: 220 | scale_x = scale[0] / output_size[0] 221 | scale_y = scale[1] / output_size[1] 222 | 223 | target_coords = np.ones_like(coords) 224 | target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 225 | target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 226 | 227 | return target_coords 228 | 229 | 230 | def keypoints_from_heatmaps(heatmaps, 231 | center, 232 | scale, 233 | kernel=11, 234 | use_udp=True): 235 | """Get final keypoint predictions from heatmaps and transform them back to 236 | the image. 237 | 238 | Note: 239 | - batch size: N 240 | - num keypoints: K 241 | - heatmap height: H 242 | - heatmap width: W 243 | 244 | Args: 245 | heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. 246 | center (np.ndarray[N, 2]): Center of the bounding box (x, y). 247 | scale (np.ndarray[N, 2]): Scale of the bounding box 248 | wrt height/width. 249 | kernel (int): Gaussian kernel size (K) for modulation, which should 250 | match the heatmap gaussian sigma when training. 251 | K=17 for sigma=3 and k=11 for sigma=2. 252 | use_udp (bool): Use unbiased data processing. 253 | 254 | Returns: 255 | tuple: A tuple containing keypoint predictions and scores. 256 | 257 | - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. 258 | - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 259 | """ 260 | # Avoid being affected 261 | heatmaps = heatmaps.copy() 262 | 263 | # start processing 264 | N, K, H, W = heatmaps.shape 265 | if use_udp: 266 | preds, maxvals = _get_max_preds(heatmaps) 267 | preds = post_dark_udp(preds, heatmaps, kernel=kernel) 268 | else: 269 | preds, maxvals = _get_max_preds(heatmaps) 270 | 271 | # apply Gaussian distribution modulation. 272 | heatmaps = np.log( 273 | np.maximum(_gaussian_blur(heatmaps, kernel), 1e-10)) 274 | for n in range(N): 275 | for k in range(K): 276 | preds[n][k] = _taylor(heatmaps[n][k], preds[n][k]) 277 | 278 | # Transform back to the image 279 | for i in range(N): 280 | preds[i] = transform_preds( 281 | preds[i], center[i], scale[i], [W, H], use_udp=use_udp) 282 | 283 | return preds, maxvals -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Minsik Yoon 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 |   You may obtain a copy of the License at
194 |
195 |       http://www.apache.org/licenses/LICENSE-2.0
196 |
197 |   Unless required by applicable law or agreed to in writing, software
198 |   distributed under the License is distributed on an "AS IS" BASIS,
199 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |   See the License for the specific language governing permissions and
201 |   limitations under the License.
--------------------------------------------------------------------------------
/utils/inference.py:
--------------------------------------------------------------------------------
1 | from time import time, sleep
2 | import queue, threading
3 | import pickle
4 |
5 | import numpy as np
6 | import cv2
7 |
8 | from utils.visualization import draw_points_and_skeleton, joints_dict
9 | from utils.util import print_fps
10 | from utils.vitpose_util import keypoints_from_heatmaps
11 | from utils.yolov6_util import letterbox, non_max_suppression, preprocess_with_bboxes, xyxy2xywh, plot_box_and_label
12 |
13 |
14 | def inference(original_imgs, yolov6_sess, vitpose_sess, cfg, smooth_net=None):
15 |     """
16 |     Args:
17 |         original_imgs: (numpy.ndarray), (B, H, W, C), RGB color format
18 |         yolov6_sess: (onnxruntime.InferenceSession), YOLOv6 detector session
19 |         vitpose_sess: (onnxruntime.InferenceSession), ViTPose estimator session
20 |     Returns:
21 |         infered_imgs: (numpy.ndarray), (B, H, W, C), BGR color format, together with a (bbox_groups, keypoint_groups) prediction tuple
22 |     """
23 |
24 |
25 |     if cfg.no_background:
26 |         backgrounds = np.zeros_like(original_imgs)
27 |     else:
28 |         backgrounds = original_imgs
29 |
30 |     yolov6_img_size = yolov6_sess.get_inputs()[0].shape[-2:]
31 |     vitpose_img_size = vitpose_sess.get_inputs()[0].shape[-2:]
32 |
33 |
34 |     # Preprocess images
35 |     processed_imgs = []
36 |     for img in original_imgs:
37 |         img = letterbox(img, yolov6_img_size, auto=False)[0]
38 |         img = img.astype('float32') / 255
39 |         img = img.transpose(2, 0, 1)
40 |         processed_imgs.append(img)
41 |
42 |     processed_imgs = np.stack(processed_imgs, axis=0)
43 |
44 |
45 |     # Predict bboxes
46 |     preds = []
47 |     input_name = yolov6_sess.get_inputs()[0].name
48 |     for img_batch in np.array_split(processed_imgs, (len(processed_imgs)-1) // cfg.yolo_batch_size + 1):
49 |         preds.append(yolov6_sess.run(None, {input_name: img_batch})[0])
50 |
51 |     preds = np.concatenate(preds)
52 |
53 |
54 |     # Postprocess preds
55 |     preds = preds[..., :6]  # take only human class
56 |     bbox_groups = non_max_suppression(preds, cfg.conf_thres, cfg.iou_thres)
57 |
58 |
59 |     # Preprocess images for ViTPose input
60 |     processed_imgs = []
61 |     xyxy_groups = []
62 |     conf_groups = []
63 |     sections = []
64 |     detection_check_list = []
65 |     for idx, (original_img, bboxes) in enumerate(zip(original_imgs, bbox_groups)):
66 |         img_list, xyxy_list, conf_list = preprocess_with_bboxes(original_img, bboxes, yolov6_img_size, vitpose_img_size, cfg)
67 |
68 |         if len(img_list) != 0:
69 |             processed_imgs.append(np.stack(img_list))
70 |             xyxy_groups.append(np.stack(xyxy_list))
71 |             conf_groups.append(conf_list)
72 |             detection_check_list.append(True)
73 |         else:  # append empty groups too, so the visualization zip below stays aligned per-frame when a frame has no detections
74 |             xyxy_groups.append(np.zeros((0, 4))); conf_groups.append([]); detection_check_list.append(False)
75 |
76 |         sections.append(len(img_list))
77 |
78 |     if sum(sections) == 0:  # nothing detected
79 |         return backgrounds[..., ::-1].copy(), ([], [])
80 |
81 |     processed_imgs = np.concatenate(processed_imgs)
82 |     processed_imgs = processed_imgs.transpose(0, 3, 1, 2).astype('float32')
83 |     sections = np.cumsum(sections)
84 |
85 |
86 |     # Predict keypoints
87 |     heatmaps = []
88 |     num_batch = (len(processed_imgs)-1) // cfg.pose_batch_size + 1
89 |     input_name = vitpose_sess.get_inputs()[0].name
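# Annotation (added, not in the original source): ViTPose runs on the cropped
# person images, so the crops are split into chunks of roughly cfg.pose_batch_size
# to bound memory use; the per-chunk heatmaps are concatenated back into one array.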
90 |     for img_batch in np.array_split(processed_imgs, num_batch):
91 |         heatmaps.append(vitpose_sess.run(None, {input_name: img_batch})[0])
92 |
93 |     heatmaps = np.concatenate(heatmaps)
94 |
95 |     # Postprocess heatmaps
96 |     xywh_groups = xyxy2xywh(np.concatenate(xyxy_groups))
97 |     center_xy = xywh_groups[:, [0, 1]]
98 |     scale_hw = xywh_groups[:, [2, 3]]
99 |
100 |     keypoints, prob = keypoints_from_heatmaps(heatmaps=heatmaps, center=center_xy, scale=scale_hw, use_udp=True)
101 |     keypoints = np.concatenate([keypoints[:, :, ::-1], prob], axis=2)
102 |     keypoint_groups = np.split(keypoints, sections)
103 |
104 |     # Visualization
105 |     pid = 1  # dummy
106 |     infered_imgs = backgrounds[..., ::-1].copy()
107 |
108 |     iteration = zip(keypoint_groups, xyxy_groups, conf_groups, backgrounds[..., ::-1])
109 |     for idx, (keypoints, xyxy_list, conf_list, bg_img) in enumerate(iteration):
110 |         if len(keypoints) == 0:
111 |             continue
112 |
113 |         img = bg_img.copy()
114 |
115 |         if not cfg.no_bbox:
116 |             for xyxy, conf in zip(xyxy_list, conf_list):
117 |                 lw = int(np.ceil((xyxy[2]+xyxy[3]-xyxy[0]-xyxy[1]) * 5 / 3000))
118 |                 plot_box_and_label(img, lw, xyxy, 'person ' + f'{conf*100:0.0f}%', color=(40, 150, 30))
119 |
120 |         if not cfg.no_skeleton:
121 |             for points, xyxy in zip(keypoints, xyxy_list):
122 |                 xywh = xyxy2xywh(xyxy) if cfg.dynamic_drawing else None
123 |                 img = draw_points_and_skeleton(img, points, joints_dict()['coco']['skeleton'], person_index=pid,
124 |                                                points_color_palette='gist_rainbow', skeleton_color_palette='jet',
125 |                                                points_palette_samples=10, confidence_threshold=cfg.key_conf_thres, xywh=xywh)
126 |
127 |         infered_imgs[idx] = img
128 |
129 |     if cfg.result_scale:
130 |         new_imgs = []
131 |         for img in infered_imgs:
132 |             size = (int(img.shape[1] * cfg.result_scale), int(img.shape[0] * cfg.result_scale))
133 |             img = cv2.resize(img, size, interpolation=cv2.INTER_LINEAR)
134 |             new_imgs.append(img)
135 |         infered_imgs = np.stack(new_imgs)
136 |
137 |     return infered_imgs, (bbox_groups, keypoint_groups)
138 |
139 |
140 |
141 |
142 | def inference_image(img_path, yolov6_sess, vitpose_sess, cfg):
143 |     img_origin = cv2.imread(img_path)
144 |     img_origin = img_origin[..., ::-1]  # BGR to RGB
145 |     img_origin = np.expand_dims(img_origin, axis=0)
146 |     img, pred = inference(img_origin, yolov6_sess, vitpose_sess, cfg)
147 |
148 |     print('-'*10 + "\nPress 'Q' key on OpenCV window if you want to close")
149 |     cv2.imshow("OpenCV", img[0])
150 |
151 |     if cfg.save:
152 |         save_name = '.'.join(img_path.split('.')[:-1]) + '_result.jpg'  # extension-agnostic, so a non-.jpg input is never silently overwritten
153 |         cv2.imwrite(save_name, img[0])
154 |     if cfg.save_prediction:
155 |         preds = {'bbox':[], 'pose':[]}
156 |         preds['bbox'].extend(pred[0])
157 |         preds['pose'].extend(pred[1])
158 |         save_name = '.'.join(img_path.split('.')[:-1]) + '_prediction.pkl'
159 |         with open(save_name, 'wb') as f:
160 |             pickle.dump(preds, f)
161 |
162 |     cv2.waitKey(0)
163 |
164 |
165 |
166 |
167 | def inference_video(vid_path, yolov6_sess, vitpose_sess, cfg, smooth_net=None):
168 |     video = cv2.VideoCapture(vid_path)
169 |     frames = []
170 |     preds = {'bbox':[], 'pose':[]}
171 |
172 |     if cfg.save:
173 |         out_name = '.'.join(vid_path.split('.')[:-1]) + '_result.mp4'
174 |         out_fourcc = cv2.VideoWriter_fourcc(*'mp4v')
175 |         if cfg.set_fps is not None:
176 |             out_fps = cfg.set_fps
177 |         else:
178 |             out_fps = video.get(cv2.CAP_PROP_FPS)
179 |         out_size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
180 |         out = cv2.VideoWriter(out_name, out_fourcc, out_fps, out_size)
181 |
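# Annotation (added, not in the original source): the loop below buffers frames
# until cfg.yolo_batch_size of them are available, then runs one batched inference
# call; print_fps multiplies by the batch size so the reading is per-frame
# throughput. A final partial batch (fewer than yolo_batch_size frames when the
# video ends) is not processed.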
182 |     print('-'*10 + "\nPress 'Q' key on OpenCV window if you want to close")
183 |     tic = time()
184 |     while True:
185 |         ret, frame = video.read()
186 |
187 |         if ret:
188 |             frames.append(frame)
189 |             if len(frames) < cfg.yolo_batch_size:
190 |                 continue
191 |
192 |             frames = np.stack(frames)
193 |             frames = frames[..., ::-1]  # BGR to RGB
194 |             results, pred = inference(frames, yolov6_sess, vitpose_sess, cfg)
195 |
196 |             toc = time()
197 |             fps = 1/(toc - tic)
198 |             tic = time()
199 |
200 |             print_fps(fps*cfg.yolo_batch_size)
201 |
202 |             cv2.imshow('OpenCV', results[-1])
203 |
204 |             if cfg.save:
205 |                 for res in results:
206 |                     out.write(res)
207 |             if cfg.save_prediction:
208 |                 preds['bbox'].extend(pred[0])
209 |                 preds['pose'].extend(pred[1])
210 |
211 |             frames = []
212 |
213 |             if cv2.waitKey(1) & 0xFF == ord('q'):
214 |                 break
215 |
216 |         else:
217 |             break
218 |
219 |     if cfg.save_prediction:
220 |         save_name = '.'.join(vid_path.split('.')[:-1]) + '_prediction.pkl'
221 |         with open(save_name, 'wb') as f:
222 |             pickle.dump(preds, f)
223 |
224 |     video.release()
225 |     if cfg.save: out.release()
226 |     cv2.destroyAllWindows()
227 |
228 |
229 |
230 |
231 | def inference_webcam(webcam, yolov6_sess, vitpose_sess, cfg, smooth_net=None):
232 |     event = threading.Event()
233 |
234 |     # bufferless VideoCapture
235 |     cap = AsyncVideoCapture(webcam, event)
236 |     preds = {'bbox':[], 'pose':[]}
237 |
238 |     if cfg.save:
239 |         frame_queue = queue.Queue(1)
240 |
241 |         out_name = 'webcam_result.mp4'
242 |         out_fourcc = cv2.VideoWriter_fourcc(*'mp4v')
243 |         out_fps = cfg.set_fps if cfg.set_fps is not None else 60  # fall back to 60 fps for webcam when set_fps is None, as documented in configs/base_config.py
244 |         out_size = (int(cap.cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
245 |
246 |         out = AsyncVideoWriter(out_name, out_fourcc, out_fps, out_size, cap, frame_queue, event)
247 |
248 |     print('-'*10 + "\nPress 'Q' key on OpenCV window if you want to close")
249 |     tic = time()
250 |     while not cap.is_dead:
251 |         frame = cap.read()
252 |
253 |         frame = frame[..., ::-1]  # BGR to RGB
254 |         frame = np.expand_dims(frame, axis=0)
255 |         frame, pred = inference(frame, yolov6_sess, vitpose_sess, cfg)
256 |
257 |         toc = time()
258 |         fps = 1/(toc - tic)
259 |         tic = time()
260 |
261 |         print_fps(fps)
262 |
263 |         cv2.imshow("OpenCV", frame[0])
264 |
265 |         if cfg.save:
266 |             try:
267 |                 frame_queue.put_nowait(frame[0])
268 |             except queue.Full:
269 |                 pass
270 |
271 |         if cfg.save_prediction:
272 |             preds['bbox'].extend(pred[0])
273 |             preds['pose'].extend(pred[1])
274 |
275 |         if cv2.waitKey(1) & 0xFF == ord('q'):
276 |             cap.event.set()
277 |             break
278 |
279 |     if cfg.save_prediction:
280 |         save_name = 'webcam_prediction.pkl'
281 |         with open(save_name, 'wb') as f:
282 |             pickle.dump(preds, f)
283 |
284 |     time_out = 10.0
285 |     tic = time()
286 |     while (not cap.is_dead) or (cfg.save and not out.is_dead):
287 |         toc = time()
288 |         if toc-tic > time_out:  # give the capture/writer threads up to 10 s to shut down
289 |             break
290 |
291 |     cv2.destroyAllWindows()
292 |
293 |
294 |
295 | class AsyncVideoCapture:
296 |     def __init__(self, webcam, event):
297 |         self.cap = cv2.VideoCapture(webcam)
298 |         if self.cap.isOpened():
299 |             self.event = event
300 |             self.q = queue.Queue()
301 |             t = threading.Thread(target=self._reader)
302 |             t.daemon = True
303 |             t.start()
304 |             self.is_dead = False
305 |         else:
306 |             self.cap.release()
307 |             self.is_dead = True
308 |
309 |     # read frames as soon as they are available, keeping only most recent one
310 |     def _reader(self):
311 |         while True:
312 |             ret, frame = self.cap.read()
313 |
314 |             if (not ret) or self.event.is_set():
315 |                 break
316 |
317 |             if not self.q.empty():
318 |                 try:
319 |                     self.q.get_nowait()  # discard previous (unprocessed) frame
320 |                 except queue.Empty:
321 |                     pass
322 |
323 |             self.q.put(frame)
324 |
325 |         self.cap.release()
326 |         self.is_dead = True
327 |
328 |     def read(self):
329 |         return self.q.get()
330 |
331 |
332 | class AsyncVideoWriter:
333 |     def __init__(self, out_name, out_fourcc, out_fps, out_size, cap, frame_queue, event):
334 |         self.out = cv2.VideoWriter(out_name, out_fourcc, out_fps, out_size)
335 |         if self.out.isOpened():
336 |             self.cap = cap
337 |             self.event = event
338 |             self.last_frame = np.zeros((out_size[1], out_size[0], 3), np.uint8)
339 |             self.frame_queue = frame_queue
340 |             t = threading.Thread(target=self._writer)
341 |             t.daemon = True
342 |             t.start()
343 |             self.period = 1/out_fps
344 |             self.is_dead = False
345 |         else:
346 |             self.out.release()
347 |             self.is_dead = True
348 |
349 |     def _writer(self):
350 |         diff = 0
351 |         tic = time()
352 |         while True:
353 |             if self.event.is_set():
354 |                 break
355 |
356 |             try:
357 |                 self.last_frame = self.frame_queue.get_nowait()  # grab the newest processed frame, if any
358 |             except queue.Empty:
359 |                 pass  # otherwise re-write the previous frame to keep the output at a constant rate
360 |
361 |             self.out.write(self.last_frame)
362 |
363 |             # match writing speed to the desired fps: diff accumulates the timing error and is slept off when positive
364 |             elapsed_time = time() - tic
365 |             tic = time()
366 |             diff += self.period - elapsed_time
367 |             if diff > 0:
368 |                 sleep(diff)
369 |
370 |         self.out.release()
371 |         self.is_dead = True
--------------------------------------------------------------------------------
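Note on the saved predictions: with save_prediction enabled, inference_image, inference_video, and inference_webcam each pickle a dict holding per-frame 'bbox' and 'pose' lists. A minimal sketch for reading such a file back, shown for the 'webcam_prediction.pkl' name used above (the printed shapes are illustrative, not guaranteed):

import pickle

import numpy as np  # required: the stored entries are numpy arrays


with open('webcam_prediction.pkl', 'rb') as f:
    preds = pickle.load(f)

# 'bbox' holds per-frame detection arrays (non_max_suppression output) and
# 'pose' per-frame keypoint arrays of shape (num_people, num_keypoints, 3)
for frame_idx, (bboxes, poses) in enumerate(zip(preds['bbox'], preds['pose'])):
    print(f"frame {frame_idx}: {len(bboxes)} detections, pose shape {np.asarray(poses).shape}")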