├── requirements.txt
├── utils
│   ├── __init__.py
│   ├── nms.py
│   ├── util.py
│   ├── visualization.py
│   ├── yolov6_util.py
│   ├── vitpose_util.py
│   └── inference.py
├── run.py
├── configs
│   ├── base_config.py
│   └── custom_config.py
├── .gitignore
├── README.md
└── LICENSE
/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib == 3.6.2 2 | numpy == 1.24.2 3 | onnxruntime-gpu == 1.13.1 4 | opencv-python == 4.7.0.68 5 | yacs == 0.1.8 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .inference import * 3 | from .visualization import * 4 | from .vitpose_util import * 5 | from .yolov6_util import * 6 | from .nms import * -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import onnxruntime as ort 4 | 5 | from utils.inference import inference_image, inference_video, inference_webcam 6 | from utils import get_config 7 | 8 | 9 | def main(cfg): 10 | YOLOV6_PATH = cfg.yolov6_path 11 | VITPOSE_PATH = cfg.vitpose_path 12 | IMG_PATH = cfg.image_path 13 | VID_PATH = cfg.video_path 14 | WEBCAM = cfg.webcam 15 | 16 | assert (IMG_PATH or VID_PATH or (WEBCAM is not None)), "Argument -img or -vid or -wc must be provided" 17 | 18 | if cfg.cpu: 19 | EP_list = ['CPUExecutionProvider'] 20 | else: 21 | EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] 22 | 23 | yolov6_sess = ort.InferenceSession(YOLOV6_PATH, providers=EP_list) 24 | vitpose_sess = ort.InferenceSession(VITPOSE_PATH, providers=EP_list) 25 | # TODO : implement smooth_net feature 26 | # if cfg.smooth_net: 27 | # smooth_net = ort.InferenceSession('smoothnet-32.onnx', providers=EP_list) 28 | 29 | os.system("") # enable ANSI escape sequences in the terminal (needed on Windows) 30 | 31 | # Inference image 32 | if IMG_PATH: 33 | inference_image(IMG_PATH, yolov6_sess, vitpose_sess, cfg) 34 | 35 | # Inference video from file 36 | if VID_PATH: 37 | inference_video(VID_PATH, yolov6_sess, vitpose_sess, cfg) 38 | 39 | # Inference video from webcam 40 | if WEBCAM is not None: 41 | inference_webcam(WEBCAM, yolov6_sess, vitpose_sess, cfg) 42 | 43 | 44 | if __name__ == "__main__": 45 | cfg = get_config() 46 | 47 | main(cfg)
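# Illustrative check (not part of the repo): onnxruntime silently falls back to
# CPUExecutionProvider when CUDA is unavailable, so it is worth verifying what
# the installed build actually offers before assuming GPU inference.
#
#   import onnxruntime as ort
#   print(ort.get_available_providers())
#   # e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] on a working GPU build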
-------------------------------------------------------------------------------- /utils/nms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def nms(boxes, probs=None, overlapThresh=0.3): 4 | # if there are no boxes, return an empty list 5 | if len(boxes) == 0: 6 | return [] 7 | 8 | # if the bounding boxes are integers, convert them to floats -- this 9 | # is important since we'll be doing a bunch of divisions 10 | if boxes.dtype.kind == "i": 11 | boxes = boxes.astype("float") 12 | 13 | # initialize the list of picked indexes 14 | pick = [] 15 | 16 | # grab the coordinates of the bounding boxes 17 | x1 = boxes[:, 0] 18 | y1 = boxes[:, 1] 19 | x2 = boxes[:, 2] 20 | y2 = boxes[:, 3] 21 | 22 | # compute the area of the bounding boxes and grab the indexes to sort 23 | # (in the case that no probabilities are provided, simply sort on the 24 | # bottom-right y-coordinate) 25 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 26 | idxs = y2 27 | 28 | # if probabilities are provided, sort on them instead 29 | if probs is not None: 30 | idxs = probs 31 | 32 | # sort the indexes 33 | idxs = np.argsort(idxs) 34 | 35 | # keep looping while some indexes still remain in the indexes list 36 | while len(idxs) > 0: 37 | # grab the last index in the indexes list and add the index value 38 | # to the list of picked indexes 39 | last = len(idxs) - 1 40 | i = idxs[last] 41 | pick.append(i) 42 | 43 | # find the largest (x, y) coordinates for the start of the bounding 44 | # box and the smallest (x, y) coordinates for the end of the bounding 45 | # box 46 | xx1 = np.maximum(x1[i], x1[idxs[:last]]) 47 | yy1 = np.maximum(y1[i], y1[idxs[:last]]) 48 | xx2 = np.minimum(x2[i], x2[idxs[:last]]) 49 | yy2 = np.minimum(y2[i], y2[idxs[:last]]) 50 | 51 | # compute the width and height of the bounding box 52 | w = np.maximum(0, xx2 - xx1 + 1) 53 | h = np.maximum(0, yy2 - yy1 + 1) 54 | 55 | # compute the ratio of overlap 56 | overlap = (w * h) / area[idxs[:last]] 57 | 58 | # delete all indexes from the index list that have overlap greater 59 | # than the provided overlap threshold 60 | idxs = np.delete(idxs, np.concatenate(([last], 61 | np.where(overlap > overlapThresh)[0]))) 62 | 63 | # return the indexes that were picked 64 | return np.array(pick)
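# Illustrative usage of nms() (not part of the repo): box 1's intersection with
# box 0 covers ~90% of its area (1521/1681), above the 0.3 threshold, so it is
# suppressed; only indexes [0, 2] survive.
#
#   import numpy as np
#   boxes = np.array([[10., 10., 50., 50.],
#                     [12., 12., 52., 52.],
#                     [100., 100., 140., 140.]])
#   probs = np.array([0.9, 0.8, 0.7])
#   print(nms(boxes, probs, overlapThresh=0.3))  # -> [0 2]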
-------------------------------------------------------------------------------- /configs/base_config.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | 4 | """ 5 | yolov6_path (str): path to YOLOv6 onnx file 6 | vitpose_path (str): path to ViTPose onnx file 7 | 8 | image_path (str): path to image file 9 | video_path (str): path to video file 10 | webcam (str or int): webcam URL or ID, set None for not using 11 | 12 | no_background (bool): draw a black screen instead of the original image, if True 13 | no_bbox (bool): skip drawing bboxes, if True 14 | no_skeleton (bool): skip drawing skeletons, if True 15 | dynamic_drawing (bool): keypoint radius and skeleton width change dynamically with bbox size, if True 16 | smooth_net (bool): reduce jitter in predicted keypoints using SmoothNet; not implemented yet 17 | result_scale (float): set a coefficient to scale the result size, set None to skip scaling 18 | 19 | save (bool): save the result, if True 20 | save_prediction (bool): save the predictions (bbox, pose), if True. 21 | NumPy is needed to read the saved file 22 | set_fps (int): set the fps of the result to be saved, 23 | set None to use the original fps of the video (or 60 fps for webcam) 24 | 25 | conf_thres (float): set a bbox confidence threshold for non-maximum suppression 26 | iou_thres (float): set a bbox iou threshold for non-maximum suppression 27 | max_detection (int): set the maximum number of bboxes 28 | key_conf_thres (float): set a keypoint confidence threshold 29 | no_pad (bool): do not use additional padding, if True 30 | cpu (bool): use CPU for inference, if True 31 | pose_batch_size (int): set pose batch size 32 | yolo_batch_size (int): set yolo batch size; it only works for video 33 | """ 34 | 35 | 36 | _C = CN() 37 | 38 | _C.yolov6_path = 'yolov6m.onnx' 39 | _C.vitpose_path = 'vitpose-b-multi-coco.onnx' 40 | 41 | _C.image_path = '' 42 | _C.video_path = '' 43 | _C.webcam = None 44 | 45 | _C.no_background = False 46 | _C.no_bbox = False 47 | _C.no_skeleton = False 48 | _C.dynamic_drawing = False 49 | _C.smooth_net = False 50 | _C.result_scale = None 51 | 52 | _C.save = False 53 | _C.save_prediction = False 54 | _C.set_fps = None 55 | 56 | _C.conf_thres = 0.25 57 | _C.iou_thres = 0.45 58 | _C.max_detection = 100 59 | _C.key_conf_thres = 0.4 60 | _C.no_pad = False 61 | _C.cpu = False 62 | _C.pose_batch_size = 1 63 | _C.yolo_batch_size = 1 64 | 65 | 66 | cfg = _C
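# Illustrative usage (not part of the repo): the config can also be loaded and
# overridden programmatically instead of through run.py's CLI flags.
#
#   from configs.base_config import cfg
#
#   my_cfg = cfg.clone()             # yacs CfgNode.clone() gives an independent copy
#   my_cfg.video_path = 'dance.mp4'  # same keys as documented in the docstring above
#   my_cfg.save = True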
-------------------------------------------------------------------------------- /configs/custom_config.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | 4 | """ 5 | yolov6_path (str): path to YOLOv6 onnx file 6 | vitpose_path (str): path to ViTPose onnx file 7 | 8 | image_path (str): path to image file 9 | video_path (str): path to video file 10 | webcam (str or int): webcam URL or ID, set None for not using 11 | 12 | no_background (bool): draw a black screen instead of the original image, if True 13 | no_bbox (bool): skip drawing bboxes, if True 14 | no_skeleton (bool): skip drawing skeletons, if True 15 | dynamic_drawing (bool): keypoint radius and skeleton width change dynamically with bbox size, if True 16 | smooth_net (bool): reduce jitter in predicted keypoints using SmoothNet; not implemented yet 17 | result_scale (float): set a coefficient to scale the result size, set None to skip scaling 18 | 19 | save (bool): save the result, if True 20 | save_prediction (bool): save the predictions (bbox, pose), if True. 21 | NumPy is needed to read the saved file 22 | set_fps (int): set the fps of the result to be saved, 23 | set None to use the original fps of the video (or 60 fps for webcam) 24 | 25 | conf_thres (float): set a bbox confidence threshold for non-maximum suppression 26 | iou_thres (float): set a bbox iou threshold for non-maximum suppression 27 | max_detection (int): set the maximum number of bboxes 28 | key_conf_thres (float): set a keypoint confidence threshold 29 | no_pad (bool): do not use additional padding, if True 30 | cpu (bool): use CPU for inference, if True 31 | pose_batch_size (int): set pose batch size 32 | yolo_batch_size (int): set yolo batch size; it only works for video 33 | """ 34 | 35 | 36 | _C = CN() 37 | 38 | _C.yolov6_path = 'yolov6m.onnx' 39 | _C.vitpose_path = 'vitpose-b-multi-coco.onnx' 40 | 41 | _C.image_path = '' 42 | _C.video_path = '' 43 | _C.webcam = None 44 | 45 | _C.no_background = False 46 | _C.no_bbox = True 47 | _C.no_skeleton = False 48 | _C.dynamic_drawing = True 49 | _C.smooth_net = False 50 | _C.result_scale = None 51 | 52 | _C.save = True 53 | _C.save_prediction = False 54 | _C.set_fps = None 55 | 56 | _C.conf_thres = 0.25 57 | _C.iou_thres = 0.45 58 | _C.max_detection = 100 59 | _C.key_conf_thres = 0.15 60 | _C.no_pad = False 61 | _C.cpu = False 62 | _C.pose_batch_size = 1 63 | _C.yolo_batch_size = 1 64 | 65 | 66 | cfg = _C
-------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | from yacs.config import CfgNode as CN 5 | 6 | 7 | def get_config(): 8 | parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS) 9 | parser.add_argument('--yolov6-path', '-yolo', help='yolov6 path') 10 | parser.add_argument('--vitpose-path', '-pose', help='vitpose path') 11 | 12 | parser.add_argument('--image-path', '-img', help='image path') 13 | parser.add_argument('--video-path', '-vid', help='video path') 14 | parser.add_argument('--webcam', '-wc', help='webcam id or webcam URL') 15 | 16 | parser.add_argument('--no-background', '-nobg', action='store_true', help="draw only skeletons or bboxes, background will be black") 17 | parser.add_argument('--no-bbox', '-nobx', action='store_true', help="don't draw bboxes") 18 | parser.add_argument('--no-skeleton', '-nosk', action='store_true', help="don't draw skeletons") 19 | parser.add_argument('--dynamic-drawing', '-dd', action='store_true', help='turn on dynamic drawing') 20 | parser.add_argument('--smooth-net', '-sn', action='store_true', help='use smooth-net for jitter filtering') 21 | parser.add_argument('--result-scale', '-rs', type=float, help='set scale to result') 22 | 23 | parser.add_argument('--save', '-s', action='store_true', help='save drawing result') 24 | parser.add_argument('--save-prediction', '-sp', action='store_true', help='save prediction') 25 | parser.add_argument('--set-fps', '-fps', type=int, help='set fps for result video') 26 | 27 | parser.add_argument('--conf-thres', '-conf', type=float, help='set conf thres for nms') 28 | parser.add_argument('--iou-thres', '-iou', type=float, help='set iou thres for nms') 29 | parser.add_argument('--max-detection', '-max', type=int, help='set max detection for nms') 30 | parser.add_argument('--key-conf-thres', '-kconf', type=float, help='set keypoint conf thres') 31 | parser.add_argument('--no-pad', action='store_true', help="don't use additional padding") 32 | parser.add_argument('--cpu', '-cpu', action='store_true', help="use cpu instead of gpu") 33 | parser.add_argument('--pose-batch-size', '-pbs', type=int, help='set pose batch size') 34 | parser.add_argument('--yolo-batch-size', '-ybs', type=int, help='set yolo batch size') 35 | 36 | parser.add_argument('--config', '-cfg', default=None, help='config path') 37 | 38 | args = parser.parse_args() 39 | args = vars(args) 40 | 41 | if args['config'] is None: 42 | cfg = CN._load_cfg_py_source('configs/base_config.py') 43 | else: 44 | cfg = CN._load_cfg_py_source(args['config']) 45 | 46 | for key, value in args.items(): 47 | cfg[key] = value 48 | 49 | return cfg 50 | 51 | 52 | def print_fps(fps): 53 | bar = int(fps*2) // 5 54 | sub_bar = " ▎▍▋▊"[int(fps*2)%5] # ▉▊▋▍▎ 55 | sys.stdout.write("\033[K") 56 | print(f'- fps:{fps:06.1f} : ' + "▉"*bar + sub_bar, end='\r') -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | ### VisualStudioCode ### 174 | .vscode/* 175 | !.vscode/settings.json 176 | !.vscode/tasks.json 177 | !.vscode/launch.json 178 | !.vscode/extensions.json 179 | !.vscode/*.code-snippets 180 | 181 | # Local History for Visual Studio Code 182 | .history/ 183 | 184 | # Built Visual Studio Code Extensions 185 | *.vsix 186 | 187 | ### VisualStudioCode Patch ### 188 | # Ignore all local history of files 189 | .history 190 | .ionide 191 | 192 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python 193 | 194 | 195 | # **/__pycache__ 196 | **/*.onnx 197 | **/*.mp4 198 | ignore/ 199 | examples/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ViTPose-ONNX 2 | Easy inference for [ViTPose](https://github.com/ViTAE-Transformer/ViTPose) using ONNX 3 |

4 | 5 |

6 | 7 | ## Requirements 8 | ``` 9 | pip install -r requirements.txt 10 | ``` 11 | As you can see in 'requirements.txt', it requires only the five libraries below 12 | - matplotlib 13 | - numpy 14 | - onnxruntime-gpu 15 | - opencv-python 16 | - yacs 17 | 18 | ## Usage 19 | ### Install 20 | ``` 21 | git clone https://github.com/Pukei-Pukei/ViTPose-ONNX.git 22 | cd ViTPose-ONNX 23 | pip install -r requirements.txt 24 | ``` 25 | ### Run 26 | Download [vitpose-b-multi-coco.onnx](https://drive.google.com/drive/folders/1v7tStPJqV4x9vgEW9l_mnwbEuw87exiq?usp=share_link) and [yolov6m.onnx](https://drive.google.com/file/d/1lZ251Y_oG0yNwgFW067HWKsSQAbiLdln/view?usp=share_link), then put them in the ViTPose-ONNX folder 27 | Run the commands below to start inference 28 | ``` 29 | python run.py -img <image-path> 30 | ``` 31 | ``` 32 | python run.py -vid <video-path> 33 | ``` 34 | ``` 35 | python run.py -wc <webcam-id-or-URL> 36 | ``` 37 | ``` 38 | python run.py -cfg <config-path> -vid <video-path> 39 | ``` 40 | ### Example 41 | ``` 42 | python run.py -cfg configs/custom_config.py -vid dance.mp4 -s 43 | ``` 44 | '-s' is the save option 45 | 46 | ## Options 47 | 48 | --yolov6-path, -yolo PATH : Path to YOLOv6 onnx file 49 | --vitpose-path, -pose PATH : Path to ViTPose onnx file 50 | 51 | --image-path, -img PATH : Image path 52 | --video-path, -vid PATH : Video path 53 | --webcam, -wc PATH : Webcam id or webcam URL 54 | 55 | --no-background, -nobg : Background will be a black screen 56 | --no-bbox, -nobx : Don't draw bboxes 57 | --no-skeleton, -nosk : Don't draw skeletons 58 | --dynamic-drawing, -dd : Turn on dynamic drawing, keypoint 59 | radius and skeleton width change 60 | dynamically with bbox size 61 | --result-scale, -rs SIZE : Set a coefficient to scale the size 62 | of the result; set None to skip 63 | scaling 64 | 65 | --save, -s : Save drawing result 66 | --save-prediction, -sp : Save the predictions (bbox, pose); 67 | NumPy is needed to read the saved 68 | file 69 | 70 | --conf-thres, -conf THRES : Set confidence threshold for 71 | non-maximum suppression 72 | --iou-thres, -iou THRES : Set IoU threshold for 73 | non-maximum suppression 74 | --max-detection, -max MAX : Set max detection for non-maximum 75 | suppression 76 | --key-conf-thres, -kconf THRES : Set keypoint confidence threshold 77 | --no-pad : Don't use additional padding 78 | 79 | --cpu, -cpu : Use cpu instead of gpu 80 | --pose-batch-size, -pbs SIZE : Set pose batch size 81 | --yolo-batch-size, -ybs SIZE : Set yolo batch size, 82 | it works only in video 83 | 84 | --config, -cfg : Config path. Use a config file for 85 | easy handling of options. The default config 86 | path is 'configs/base_config.py' 87 | 88 | 89 |
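The options compose freely; for example, the following draws skeletons only (no bboxes) with dynamic drawing, scales the output to half size, and saves the result (the file name is illustrative):
```
python run.py -vid dance.mp4 -nobx -dd -rs 0.5 -s
```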
90 | ## Download ONNX file 91 | 92 | |Model |ONNX |Original Weight for PyTorch| 93 | |:------:|:---------:|:-------------:| 94 | |[ViTPose-B](https://github.com/ViTAE-Transformer/ViTPose#results-from-this-repo-on-ms-coco-val-set-single-task-training)|[GoogleDrive](https://drive.google.com/drive/folders/1v7tStPJqV4x9vgEW9l_mnwbEuw87exiq?usp=share_link)|[Onedrive](https://1drv.ms/u/s!AimBgYV7JjTlgSrlMB093JzJtqq-?e=Jr5S3R)| 95 | |[YOLOv6-M](https://github.com/meituan/YOLOv6#benchmark)|[GoogleDrive](https://drive.google.com/file/d/1lZ251Y_oG0yNwgFW067HWKsSQAbiLdln/view?usp=share_link)|[Download](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m.pt)| 96 | 97 | If you want other versions, refer to [Tutorial](https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html) and get your own ONNX 98 | 99 |
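For reference, the export step in that tutorial boils down to roughly the sketch below (loading the PyTorch `model` is omitted; the 1×3×256×192 input follows the ViTPose-B convention, so adjust it for other models):
```python
import torch

dummy = torch.randn(1, 3, 256, 192)  # (N, C, H, W) for ViTPose-B
torch.onnx.export(model, dummy, 'vitpose-b.onnx',
                  input_names=['input'], output_names=['output'],
                  dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}},
                  opset_version=12)
```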
100 | ## Acknowledgements 101 | 102 | - [ViTPose](https://github.com/ViTAE-Transformer/ViTPose) 103 | 104 | - [YOLOv6](https://github.com/meituan/YOLOv6) 105 | 106 | - [simple-HRNet](https://github.com/stefanopini/simple-HRNet) 107 | 108 | - [(Faster) Non-Maximum Suppression in Python](https://pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/) 109 | 110 | -------------------------------------------------------------------------------- /utils/visualization.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | 6 | __all__ = ["joints_dict", "draw_points_and_skeleton"] 7 | 8 | 9 | def joints_dict(): 10 | joints = { 11 | "coco": { 12 | "keypoints": { 13 | 0: "nose", 14 | 1: "left_eye", 15 | 2: "right_eye", 16 | 3: "left_ear", 17 | 4: "right_ear", 18 | 5: "left_shoulder", 19 | 6: "right_shoulder", 20 | 7: "left_elbow", 21 | 8: "right_elbow", 22 | 9: "left_wrist", 23 | 10: "right_wrist", 24 | 11: "left_hip", 25 | 12: "right_hip", 26 | 13: "left_knee", 27 | 14: "right_knee", 28 | 15: "left_ankle", 29 | 16: "right_ankle" 30 | }, 31 | "skeleton": [ 32 | # # [16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], 33 | # # [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7] 34 | # [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], 35 | # [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6] 36 | [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], 37 | [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], # [3, 5], [4, 6] 38 | [0, 5], [0, 6] 39 | ] 40 | }, 41 | "mpii": { 42 | "keypoints": { 43 | 0: "right_ankle", 44 | 1: "right_knee", 45 | 2: "right_hip", 46 | 3: "left_hip", 47 | 4: "left_knee", 48 | 5: "left_ankle", 49 | 6: "pelvis", 50 | 7: "thorax", 51 | 8: "upper_neck", 52 | 9: "head top", 53 | 10: "right_wrist", 54 | 11: "right_elbow", 55 | 12: "right_shoulder", 56 | 13: "left_shoulder", 57 | 14: "left_elbow", 58 | 15: "left_wrist" 59 | }, 60 | "skeleton": [ 61 | # [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [13, 3], [12, 2], [13, 12], [13, 14], 62 | # [12, 11], [14, 15], [11, 10], # [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7] 63 | [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [3, 6], [2, 6], [6, 7], [7, 8], [8, 9], 64 | [13, 7], [12, 7], [13, 14], [12, 11], [14, 15], [11, 10], 65 | ] 66 | }, 67 | } 68 | return joints 69 | 70 | 71 | def draw_points(image, points, color_palette='tab20', palette_samples=16, confidence_threshold=0.5, xywh=None): 72 | """ 73 | Draws `points` on `image`. 74 | 75 | Args: 76 | image: image in opencv format 77 | points: list of points to be drawn. 78 | Shape: (nof_points, 3) 79 | Format: each point should contain (y, x, confidence) 80 | color_palette: name of a matplotlib color palette 81 | Default: 'tab20' 82 | palette_samples: number of different colors sampled from the `color_palette` 83 | Default: 16 84 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1] 85 | Default: 0.5 86 | 87 | Returns: 88 | A new image with overlaid points 89 | 90 | """ 91 | try: 92 | colors = np.round( 93 | np.array(plt.get_cmap(color_palette).colors) * 255 94 | ).astype(np.uint8)[:, ::-1].tolist() 95 | except AttributeError: # if the palette has no pre-defined colors 96 | colors = np.round( 97 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255 98 | ).astype(np.uint8)[:, -2::-1].tolist() 99 | 100 | if xywh is not None: 101 | circle_size = 7 102 | circle_size = int(np.clip(np.sqrt(sum(xywh[2:]) / 3000)*circle_size, 1, circle_size*2)) 103 | else: 104 | circle_size = max(1, min(image.shape[:2]) // 160) 105 | 106 | 107 | for i, pt in enumerate(points): 108 | if pt[2] > confidence_threshold: 109 | image = cv2.circle(image, (int(pt[1]), int(pt[0])), circle_size, tuple(colors[i % len(colors)]), -1) 110 | 111 | return image 112 | 113 | 114 | def draw_skeleton(image, points, skeleton, color_palette='Set2', palette_samples=8, person_index=0, 115 | confidence_threshold=0.5, xywh=None): 116 | """ 117 | Draws a `skeleton` on `image`. 118 | 119 | Args: 120 | image: image in opencv format 121 | points: list of points to be drawn. 122 | Shape: (nof_points, 3) 123 | Format: each point should contain (y, x, confidence) 124 | skeleton: list of joints to be drawn 125 | Shape: (nof_joints, 2) 126 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points` 127 | color_palette: name of a matplotlib color palette 128 | Default: 'Set2' 129 | palette_samples: number of different colors sampled from the `color_palette` 130 | Default: 8 131 | person_index: index of the person in `image` 132 | Default: 0 133 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. 
Range: [0, 1] 134 | Default: 0.5 135 | 136 | Returns: 137 | A new image with overlaid joints 138 | 139 | """ 140 | try: 141 | colors = np.round( 142 | np.array(plt.get_cmap(color_palette).colors) * 255 143 | ).astype(np.uint8)[:, ::-1].tolist() 144 | except AttributeError: # if the palette has no pre-defined colors 145 | colors = np.round( 146 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255 147 | ).astype(np.uint8)[:, -2::-1].tolist() 148 | 149 | if xywh is not None: 150 | lw = 2 151 | lw = int(np.clip((sum(xywh[2:]) / 300)**(1/6)*lw, 1, lw*2)) 152 | else: 153 | lw = 2 154 | 155 | for i, joint in enumerate(skeleton): 156 | pt1, pt2 = points[joint] 157 | if pt1[2] > confidence_threshold and pt2[2] > confidence_threshold: 158 | image = cv2.line( 159 | image, (int(pt1[1]), int(pt1[0])), (int(pt2[1]), int(pt2[0])), 160 | tuple(colors[person_index % len(colors)]), lw 161 | ) 162 | 163 | return image 164 | 165 | 166 | def draw_points_and_skeleton(image, points, skeleton, points_color_palette='tab20', points_palette_samples=16, 167 | skeleton_color_palette='Set2', skeleton_palette_samples=8, person_index=0, 168 | confidence_threshold=0.5, xywh=None): 169 | """ 170 | Draws `points` and `skeleton` on `image`. 171 | 172 | Args: 173 | image: image in opencv format 174 | points: list of points to be drawn. 175 | Shape: (nof_points, 3) 176 | Format: each point should contain (y, x, confidence) 177 | skeleton: list of joints to be drawn 178 | Shape: (nof_joints, 2) 179 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points` 180 | points_color_palette: name of a matplotlib color palette 181 | Default: 'tab20' 182 | points_palette_samples: number of different colors sampled from the `color_palette` 183 | Default: 16 184 | skeleton_color_palette: name of a matplotlib color palette 185 | Default: 'Set2' 186 | skeleton_palette_samples: number of different colors sampled from the `color_palette` 187 | Default: 8 188 | person_index: index of the person in `image` 189 | Default: 0 190 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. 
Range: [0, 1] 191 | Default: 0.5 192 | 193 | Returns: 194 | A new image with overlaid joints 195 | 196 | """ 197 | image = draw_skeleton(image, points, skeleton, color_palette=skeleton_color_palette, 198 | palette_samples=skeleton_palette_samples, person_index=person_index, 199 | confidence_threshold=confidence_threshold, xywh=xywh) 200 | image = draw_points(image, points, color_palette=points_color_palette, palette_samples=points_palette_samples, 201 | confidence_threshold=confidence_threshold, xywh=xywh) 202 | return image -------------------------------------------------------------------------------- /utils/yolov6_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | from utils.nms import nms 5 | 6 | 7 | def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32, return_int=False): 8 | '''Resize and pad image while meeting stride-multiple constraints.''' 9 | shape = im.shape[:2] # current shape [height, width] 10 | if isinstance(new_shape, int): 11 | new_shape = (new_shape, new_shape) 12 | elif isinstance(new_shape, list) and len(new_shape) == 1: 13 | new_shape = (new_shape[0], new_shape[0]) 14 | 15 | # Scale ratio (new / old) 16 | r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) 17 | if not scaleup: # only scale down, do not scale up (for better val mAP) 18 | r = min(r, 1.0) 19 | 20 | # Compute padding 21 | new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) 22 | dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding 23 | 24 | if auto: # minimum rectangle 25 | dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding 26 | 27 | dw /= 2 # divide padding into 2 sides 28 | dh /= 2 29 | 30 | if shape[::-1] != new_unpad: # resize 31 | im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) 32 | top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) 33 | left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) 34 | im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border 35 | if not return_int: 36 | return im, r, (dw, dh) 37 | else: 38 | return im, r, (left, top) 39 | 40 | 41 | def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255), font=cv2.FONT_HERSHEY_COMPLEX): 42 | # Add one xyxy box to image with label 43 | p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3])) 44 | cv2.rectangle(image, p1, p2, color, thickness=lw, lineType=cv2.LINE_AA) 45 | if label: 46 | lw = 1 47 | tf = max(lw - 1, 1) # font thickness 48 | w, h = cv2.getTextSize(label, 0, fontScale=lw / 3, thickness=tf)[0] # text width, height 49 | outside = p1[1] - h - 3 >= 0 # label fits outside box 50 | p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3 51 | cv2.rectangle(image, p1, p2, color, -1, cv2.LINE_AA) # filled 52 | cv2.putText(image, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), font, lw / 3, txt_color, 53 | thickness=tf, lineType=cv2.LINE_AA) 54 | 55 | 56 | def xywh2xyxy(x): 57 | '''Convert boxes with shape [n, 4] from [x, y, w, h] to [x1, y1, x2, y2] where x1y1 is top-left, x2y2=bottom-right.''' 58 | y = np.copy(x) 59 | y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x 60 | y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y 61 | y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x 62 | y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y 63 | return y 64 | 65 | 66 | def xyxy2xywh(x): 67 | '''Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] 
where xy1=top-left, xy2=bottom-right.''' 68 | y = np.copy(x) 69 | y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center 70 | y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center 71 | y[..., 2] = x[..., 2] - x[..., 0] # width 72 | y[..., 3] = x[..., 3] - x[..., 1] # height 73 | return y 74 | 75 | 76 | def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45): 77 | """Runs Non-Maximum Suppression (NMS) on inference results. 78 | This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775 79 | Args: 80 | prediction: (numpy.ndarray), with shape [N, 5 + num_classes], N is the number of bboxes. 81 | conf_thres: (float) confidence threshold. 82 | iou_thres: (float) iou threshold. 83 | Returns: 84 | output: (numpy.ndarray), list of detections, each item is an array with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. 85 | """ 86 | 87 | pred_candidates = np.logical_and(prediction[..., 4] > conf_thres, np.amax(prediction[..., 5:], axis=-1) > conf_thres) # candidates 88 | # Check the parameters. 89 | assert 0 <= conf_thres <= 1, f'conf_thresh must be in 0.0 to 1.0, however {conf_thres} is provided.' 90 | assert 0 <= iou_thres <= 1, f'iou_thres must be in 0.0 to 1.0, however {iou_thres} is provided.' 91 | 92 | # Function settings. 93 | max_wh = 4096 # maximum box width and height 94 | max_nms = 30000 # maximum number of boxes fed into nms() 95 | 96 | output = [np.zeros((0, 6))] * prediction.shape[0] 97 | for img_idx, x in enumerate(prediction): # image index, image inference 98 | x = x[pred_candidates[img_idx]] # confidence 99 | 100 | # If no box remains, skip the next process. 101 | if not x.shape[0]: 102 | continue 103 | 104 | # multiply class confidence by objectness 105 | x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf 106 | 107 | # (center x, center y, width, height) to (x1, y1, x2, y2) 108 | box = xywh2xyxy(x[:, :4]) 109 | 110 | class_idx = x[:, 5:].argmax(1, keepdims=True) 111 | conf = np.take_along_axis(x[:, 5:], class_idx, axis=1) 112 | x = np.concatenate((box, conf, class_idx.astype('float32')), 1)[conf.flatten() > conf_thres] 113 | 114 | # Check shape 115 | num_box = x.shape[0] # number of boxes 116 | if not num_box: # no boxes kept. 117 | continue 118 | elif num_box > max_nms: # the number of boxes exceeds max_nms. 
119 | # sort by confidence 120 | x = x[np.flip(x[:, 4].argsort(), -1)[:max_nms]] 121 | 122 | # Batched NMS 123 | boxes, scores = x[:, :4], x[:, 4] # boxes (offset by class), scores 124 | keep_box_idx = nms(boxes, scores, iou_thres) 125 | 126 | output[img_idx] = x[keep_box_idx] 127 | 128 | return output 129 | 130 | 131 | def rescale(ori_shape, boxes, target_shape): 132 | '''Rescale the output to the original image shape''' 133 | ratio = min(ori_shape[0] / target_shape[0], ori_shape[1] / target_shape[1]) 134 | padding = (ori_shape[1] - target_shape[1] * ratio) / 2, (ori_shape[0] - target_shape[0] * ratio) / 2 135 | 136 | boxes[:, [0, 2]] -= padding[0] 137 | boxes[:, [1, 3]] -= padding[1] 138 | boxes[:, :4] /= ratio 139 | 140 | boxes[:, 0] = boxes[:, 0].clip(0, target_shape[1]) # x1 141 | boxes[:, 1] = boxes[:, 1].clip(0, target_shape[0]) # y1 142 | boxes[:, 2] = boxes[:, 2].clip(0, target_shape[1]) # x2 143 | boxes[:, 3] = boxes[:, 3].clip(0, target_shape[0]) # y2 144 | 145 | return boxes 146 | 147 | 148 | def preprocess_with_bboxes(original_img, bboxes, detection_img_size, pose_img_size, cfg): 149 | """ 150 | Args: 151 | original_img: (numpy.ndarray) (H, W, C) 152 | bboxes: (numpy.ndarray), normalized bboxes with shape [N, 5 + num_classes], N is the number of bboxes. 153 | detection_img_size: (tuple), (H, W) 154 | pose_img_size: (tuple), (H, W) 155 | Returns: 156 | img_list: (list of numpy.ndarray) 157 | xyxy_list: (list of numpy.ndarray) 158 | conf_list: (list of numpy.ndarray) 159 | """ 160 | 161 | if len(bboxes): 162 | bboxes[:, :4] = rescale(detection_img_size, bboxes[:, :4], original_img.shape) 163 | 164 | img_list = [] 165 | xyxy_list = [] 166 | conf_list = [] 167 | 168 | # bboxes = np.flip(bboxes, axis=0) 169 | for i, (*xyxy, conf, cls) in enumerate(bboxes): 170 | if i >= cfg.max_detection: 171 | break 172 | 173 | # pad to offset the wrong effect in PatchEmbed in vit.py 174 | # in my opinion, a little bit smaller image is better than a little bit truncated image 175 | # padding=4 for base conf 176 | if not cfg.no_pad: 177 | padding = 4 178 | xyxy[2] += padding * (xyxy[2]-xyxy[0]) / pose_img_size[1] 179 | xyxy[3] += padding * (xyxy[3]-xyxy[1]) / pose_img_size[0] 180 | xyxy[2] = np.clip(xyxy[2], xyxy[0], original_img.shape[1]) 181 | xyxy[3] = np.clip(xyxy[3], xyxy[1], original_img.shape[0]) 182 | 183 | # crop image 184 | l, t, r, b = map(int, np.round(xyxy)) 185 | img = original_img[t:b, l:r, :] 186 | 187 | # resize image 188 | img = cv2.resize(img, pose_img_size[::-1], interpolation = cv2.INTER_LINEAR) 189 | 190 | # normalization 191 | img = img / 255.0 192 | mean_std = np.array([[0.485, 0.456, 0.406], [0.229, 0.224, 0.225]]) 193 | img = (img - mean_std[0]) / mean_std[1] 194 | 195 | # convert to torch tensor format 196 | # img = img.transpose(2, 0, 1).astype('float32') # HWC to CHW 197 | 198 | img_list.append(img) 199 | xyxy_list.append(xyxy) 200 | conf_list.append(conf) 201 | 202 | return img_list, xyxy_list, conf_list -------------------------------------------------------------------------------- /utils/vitpose_util.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def _gaussian_blur(heatmaps, kernel=11): 6 | """Modulate heatmap distribution with Gaussian. 
7 | sigma = 0.3*((kernel_size-1)*0.5-1) + 0.8 8 | sigma ~= 3 if k = 17 9 | sigma = 2 if k = 11 10 | sigma ~= 1.5 if k = 7 11 | sigma ~= 1 if k = 3 12 | 13 | Note: 14 | - batch_size: N 15 | - num_keypoints: K 16 | - heatmap height: H 17 | - heatmap width: W 18 | 19 | Args: 20 | heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. 21 | kernel (int): Gaussian kernel size (K) for modulation, which should 22 | match the heatmap gaussian sigma when training. 23 | K=17 for sigma=3 and k=11 for sigma=2. 24 | 25 | Returns: 26 | np.ndarray ([N, K, H, W]): Modulated heatmap distribution. 27 | """ 28 | assert kernel % 2 == 1 29 | 30 | border = (kernel - 1) // 2 31 | batch_size = heatmaps.shape[0] 32 | num_joints = heatmaps.shape[1] 33 | height = heatmaps.shape[2] 34 | width = heatmaps.shape[3] 35 | for i in range(batch_size): 36 | for j in range(num_joints): 37 | origin_max = np.max(heatmaps[i, j]) 38 | dr = np.zeros((height + 2 * border, width + 2 * border), 39 | dtype=np.float32) 40 | dr[border:-border, border:-border] = heatmaps[i, j].copy() 41 | dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) 42 | heatmaps[i, j] = dr[border:-border, border:-border].copy() 43 | heatmaps[i, j] *= origin_max / np.max(heatmaps[i, j]) 44 | return heatmaps 45 | 46 | 47 | def _get_max_preds(heatmaps): 48 | """Get keypoint predictions from score maps. 49 | 50 | Note: 51 | batch_size: N 52 | num_keypoints: K 53 | heatmap height: H 54 | heatmap width: W 55 | 56 | Args: 57 | heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. 58 | 59 | Returns: 60 | tuple: A tuple containing aggregated results. 61 | 62 | - preds (np.ndarray[N, K, 2]): Predicted keypoint location. 63 | - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 64 | """ 65 | assert isinstance(heatmaps, 66 | np.ndarray), ('heatmaps should be numpy.ndarray') 67 | assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' 68 | 69 | N, K, _, W = heatmaps.shape 70 | heatmaps_reshaped = heatmaps.reshape((N, K, -1)) 71 | idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) 72 | maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) 73 | 74 | preds = np.tile(idx, (1, 1, 2)).astype(np.float32) 75 | preds[:, :, 0] = preds[:, :, 0] % W 76 | preds[:, :, 1] = preds[:, :, 1] // W 77 | 78 | preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1) 79 | return preds, maxvals 80 | 81 | 82 | def post_dark_udp(coords, batch_heatmaps, kernel=3): 83 | """DARK post-processing. Implemented with UDP. Paper ref: Huang et al. The 84 | Devil is in the Details: Delving into Unbiased Data Processing for Human 85 | Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate 86 | Representation for Human Pose Estimation (CVPR 2020). 87 | 88 | Note: 89 | - batch size: B 90 | - num keypoints: K 91 | - num persons: N 92 | - height of heatmaps: H 93 | - width of heatmaps: W 94 | 95 | B=1 for bottom_up paradigm where all persons share the same heatmap. 96 | B=N for top_down paradigm where each person has its own heatmaps. 97 | 98 | Args: 99 | coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. 100 | batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps 101 | kernel (int): Gaussian kernel size (K) for modulation. 102 | 103 | Returns: 104 | np.ndarray([N, K, 2]): Refined coordinates. 
105 | """ 106 | if not isinstance(batch_heatmaps, np.ndarray): 107 | batch_heatmaps = batch_heatmaps.cpu().numpy() 108 | B, K, H, W = batch_heatmaps.shape 109 | N = coords.shape[0] 110 | assert (B == 1 or B == N) 111 | for heatmaps in batch_heatmaps: 112 | for heatmap in heatmaps: 113 | cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) 114 | np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) 115 | np.log(batch_heatmaps, batch_heatmaps) 116 | 117 | batch_heatmaps_pad = np.pad( 118 | batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), 119 | mode='edge').flatten() 120 | 121 | index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) 122 | index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) 123 | index = index.astype(int).reshape(-1, 1) 124 | i_ = batch_heatmaps_pad[index] 125 | ix1 = batch_heatmaps_pad[index + 1] 126 | iy1 = batch_heatmaps_pad[index + W + 2] 127 | ix1y1 = batch_heatmaps_pad[index + W + 3] 128 | ix1_y1_ = batch_heatmaps_pad[index - W - 3] 129 | ix1_ = batch_heatmaps_pad[index - 1] 130 | iy1_ = batch_heatmaps_pad[index - 2 - W] 131 | 132 | dx = 0.5 * (ix1 - ix1_) 133 | dy = 0.5 * (iy1 - iy1_) 134 | derivative = np.concatenate([dx, dy], axis=1) 135 | derivative = derivative.reshape(N, K, 2, 1) 136 | dxx = ix1 - 2 * i_ + ix1_ 137 | dyy = iy1 - 2 * i_ + iy1_ 138 | dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) 139 | hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) 140 | hessian = hessian.reshape(N, K, 2, 2) 141 | hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) 142 | coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() 143 | return coords 144 | 145 | 146 | def _taylor(heatmap, coord): 147 | """Distribution aware coordinate decoding method. 148 | 149 | Note: 150 | - heatmap height: H 151 | - heatmap width: W 152 | 153 | Args: 154 | heatmap (np.ndarray[H, W]): Heatmap of a particular joint type. 155 | coord (np.ndarray[2,]): Coordinates of the predicted keypoints. 156 | 157 | Returns: 158 | np.ndarray[2,]: Updated coordinates. 159 | """ 160 | H, W = heatmap.shape[:2] 161 | px, py = int(coord[0]), int(coord[1]) 162 | if 1 < px < W - 2 and 1 < py < H - 2: 163 | dx = 0.5 * (heatmap[py][px + 1] - heatmap[py][px - 1]) 164 | dy = 0.5 * (heatmap[py + 1][px] - heatmap[py - 1][px]) 165 | dxx = 0.25 * ( 166 | heatmap[py][px + 2] - 2 * heatmap[py][px] + heatmap[py][px - 2]) 167 | dxy = 0.25 * ( 168 | heatmap[py + 1][px + 1] - heatmap[py - 1][px + 1] - 169 | heatmap[py + 1][px - 1] + heatmap[py - 1][px - 1]) 170 | dyy = 0.25 * ( 171 | heatmap[py + 2 * 1][px] - 2 * heatmap[py][px] + 172 | heatmap[py - 2 * 1][px]) 173 | derivative = np.array([[dx], [dy]]) 174 | hessian = np.array([[dxx, dxy], [dxy, dyy]]) 175 | if dxx * dyy - dxy**2 != 0: 176 | hessianinv = np.linalg.inv(hessian) 177 | offset = -hessianinv @ derivative 178 | offset = np.squeeze(np.array(offset.T), axis=0) 179 | coord += offset 180 | return coord 181 | 182 | 183 | def transform_preds(coords, center, scale, output_size, use_udp=False): 184 | """Get final keypoint predictions from heatmaps and apply scaling and 185 | translation to map them back to the image. 186 | 187 | Note: 188 | num_keypoints: K 189 | 190 | Args: 191 | coords (np.ndarray[K, ndims]): 192 | 193 | * If ndims=2, corrds are predicted keypoint location. 194 | * If ndims=4, corrds are composed of (x, y, scores, tags) 195 | * If ndims=5, corrds are composed of (x, y, scores, tags, 196 | flipped_tags) 197 | 198 | center (np.ndarray[2, ]): Center of the bounding box (x, y). 
199 | scale (np.ndarray[2, ]): Scale of the bounding box 200 | wrt [width, height]. 201 | output_size (np.ndarray[2, ] | list(2,)): Size of the 202 | destination heatmaps. 203 | use_udp (bool): Use unbiased data processing 204 | 205 | Returns: 206 | np.ndarray: Predicted coordinates in the images. 207 | """ 208 | assert coords.shape[1] in (2, 4, 5) 209 | assert len(center) == 2 210 | assert len(scale) == 2 211 | assert len(output_size) == 2 212 | 213 | # Recover the scale which is normalized by a factor of 200. 214 | # scale = scale * 200.0 215 | 216 | if use_udp: 217 | scale_x = scale[0] / (output_size[0] - 1.0) 218 | scale_y = scale[1] / (output_size[1] - 1.0) 219 | else: 220 | scale_x = scale[0] / output_size[0] 221 | scale_y = scale[1] / output_size[1] 222 | 223 | target_coords = np.ones_like(coords) 224 | target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 225 | target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 226 | 227 | return target_coords 228 | 229 | 230 | def keypoints_from_heatmaps(heatmaps, 231 | center, 232 | scale, 233 | kernel=11, 234 | use_udp=True): 235 | """Get final keypoint predictions from heatmaps and transform them back to 236 | the image. 237 | 238 | Note: 239 | - batch size: N 240 | - num keypoints: K 241 | - heatmap height: H 242 | - heatmap width: W 243 | 244 | Args: 245 | heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. 246 | center (np.ndarray[N, 2]): Center of the bounding box (x, y). 247 | scale (np.ndarray[N, 2]): Scale of the bounding box 248 | wrt height/width. 249 | kernel (int): Gaussian kernel size (K) for modulation, which should 250 | match the heatmap gaussian sigma when training. 251 | K=17 for sigma=3 and k=11 for sigma=2. 252 | use_udp (bool): Use unbiased data processing. 253 | 254 | Returns: 255 | tuple: A tuple containing keypoint predictions and scores. 256 | 257 | - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. 258 | - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 259 | """ 260 | # Avoid being affected 261 | heatmaps = heatmaps.copy() 262 | 263 | # start processing 264 | N, K, H, W = heatmaps.shape 265 | if use_udp: 266 | preds, maxvals = _get_max_preds(heatmaps) 267 | preds = post_dark_udp(preds, heatmaps, kernel=kernel) 268 | else: 269 | preds, maxvals = _get_max_preds(heatmaps) 270 | 271 | # apply Gaussian distribution modulation. 272 | heatmaps = np.log( 273 | np.maximum(_gaussian_blur(heatmaps, kernel), 1e-10)) 274 | for n in range(N): 275 | for k in range(K): 276 | preds[n][k] = _taylor(heatmaps[n][k], preds[n][k]) 277 | 278 | # Transform back to the image 279 | for i in range(N): 280 | preds[i] = transform_preds( 281 | preds[i], center[i], scale[i], [W, H], use_udp=use_udp) 282 | 283 | return preds, maxvals -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Minsik Yoon 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 |   You may obtain a copy of the License at
194 |
195 |       http://www.apache.org/licenses/LICENSE-2.0
196 |
197 |   Unless required by applicable law or agreed to in writing, software
198 |   distributed under the License is distributed on an "AS IS" BASIS,
199 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |   See the License for the specific language governing permissions and
201 |   limitations under the License.
--------------------------------------------------------------------------------
/utils/inference.py:
--------------------------------------------------------------------------------
1 | from time import time, sleep
2 | import queue, threading
3 | import pickle
4 |
5 | import numpy as np
6 | import cv2
7 |
8 | from utils.visualization import draw_points_and_skeleton, joints_dict
9 | from utils.util import print_fps
10 | from utils.vitpose_util import keypoints_from_heatmaps
11 | from utils.yolov6_util import letterbox, non_max_suppression, preprocess_with_bboxes, xyxy2xywh, plot_box_and_label
12 |
13 |
14 | def inference(original_imgs, yolov6_sess, vitpose_sess, cfg, smooth_net=None):
15 |     """
16 |     Args:
17 |         original_imgs: (numpy.ndarray), (B, H, W, C), RGB color format
18 |         yolov6_sess: (onnxruntime.InferenceSession), YOLOv6 detector session
19 |         vitpose_sess: (onnxruntime.InferenceSession), ViTPose estimator session
20 |     Returns:
21 |         infered_imgs: (numpy.ndarray), (B, H, W, C), BGR color format, together with a (bbox_groups, keypoint_groups) prediction tuple
22 |     """
23 |
24 |
25 |     if cfg.no_background:
26 |         backgrounds = np.zeros_like(original_imgs)
27 |     else:
28 |         backgrounds = original_imgs
29 |
30 |     yolov6_img_size = yolov6_sess.get_inputs()[0].shape[-2:]
31 |     vitpose_img_size = vitpose_sess.get_inputs()[0].shape[-2:]
32 |
33 |
34 |     # Preprocess images
35 |     processed_imgs = []
36 |     for img in original_imgs:
37 |         img = letterbox(img, yolov6_img_size, auto=False)[0]
38 |         img = img.astype('float32') / 255
39 |         img = img.transpose(2, 0, 1)
40 |         processed_imgs.append(img)
41 |
42 |     processed_imgs = np.stack(processed_imgs, axis=0)
43 |
44 |
45 |     # Predict bboxes
46 |     preds = []
47 |     input_name = yolov6_sess.get_inputs()[0].name
48 |     for img_batch in np.array_split(processed_imgs, (len(processed_imgs)-1) // cfg.yolo_batch_size + 1):
49 |         preds.append(yolov6_sess.run(None, {input_name: img_batch})[0])
50 |
51 |     preds = np.concatenate(preds)
52 |
53 |
54 |     # Postprocess preds
55 |     preds = preds[..., :6]  # take only human class
56 |     bbox_groups = non_max_suppression(preds, cfg.conf_thres, cfg.iou_thres)
57 |
58 |
59 |     # Preprocess images for ViTPose input
60 |     processed_imgs = []
61 |     xyxy_groups = []
62 |     conf_groups = []
63 |     sections = []
64 |     detection_check_list = []
65 |     for idx, (original_img, bboxes) in enumerate(zip(original_imgs, bbox_groups)):
66 |         img_list, xyxy_list, conf_list = preprocess_with_bboxes(original_img, bboxes, yolov6_img_size, vitpose_img_size, cfg)
67 |
68 |         if len(img_list) != 0:
69 |             processed_imgs.append(np.stack(img_list))
70 |             xyxy_groups.append(np.stack(xyxy_list))
71 |             conf_groups.append(conf_list)
72 |             detection_check_list.append(True)
73 |         else:  # append empty groups too, so the visualization zip below stays aligned per-frame when a frame has no detections
74 |             xyxy_groups.append(np.zeros((0, 4))); conf_groups.append([]); detection_check_list.append(False)
75 |
76 |         sections.append(len(img_list))
77 |
78 |     if sum(sections) == 0:  # nothing detected
79 |         return backgrounds[..., ::-1].copy(), ([], [])
80 |
81 |     processed_imgs = np.concatenate(processed_imgs)
82 |     processed_imgs = processed_imgs.transpose(0, 3, 1, 2).astype('float32')
83 |     sections = np.cumsum(sections)
84 |
85 |
86 |     # Predict keypoints
87 |     heatmaps = []
88 |     num_batch = (len(processed_imgs)-1) // cfg.pose_batch_size + 1
89 |     input_name = vitpose_sess.get_inputs()[0].name
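# Annotation (added, not in the original source): ViTPose runs on the cropped
# person images, so the crops are split into chunks of roughly cfg.pose_batch_size
# to bound memory use; the per-chunk heatmaps are concatenated back into one array.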
90 |     for img_batch in np.array_split(processed_imgs, num_batch):
91 |         heatmaps.append(vitpose_sess.run(None, {input_name: img_batch})[0])
92 |
93 |     heatmaps = np.concatenate(heatmaps)
94 |
95 |     # Postprocess heatmaps
96 |     xywh_groups = xyxy2xywh(np.concatenate(xyxy_groups))
97 |     center_xy = xywh_groups[:, [0, 1]]
98 |     scale_hw = xywh_groups[:, [2, 3]]
99 |
100 |     keypoints, prob = keypoints_from_heatmaps(heatmaps=heatmaps, center=center_xy, scale=scale_hw, use_udp=True)
101 |     keypoints = np.concatenate([keypoints[:, :, ::-1], prob], axis=2)
102 |     keypoint_groups = np.split(keypoints, sections)
103 |
104 |     # Visualization
105 |     pid = 1  # dummy
106 |     infered_imgs = backgrounds[..., ::-1].copy()
107 |
108 |     iteration = zip(keypoint_groups, xyxy_groups, conf_groups, backgrounds[..., ::-1])
109 |     for idx, (keypoints, xyxy_list, conf_list, bg_img) in enumerate(iteration):
110 |         if len(keypoints) == 0:
111 |             continue
112 |
113 |         img = bg_img.copy()
114 |
115 |         if not cfg.no_bbox:
116 |             for xyxy, conf in zip(xyxy_list, conf_list):
117 |                 lw = int(np.ceil((xyxy[2]+xyxy[3]-xyxy[0]-xyxy[1]) * 5 / 3000))
118 |                 plot_box_and_label(img, lw, xyxy, 'person ' + f'{conf*100:0.0f}%', color=(40, 150, 30))
119 |
120 |         if not cfg.no_skeleton:
121 |             for points, xyxy in zip(keypoints, xyxy_list):
122 |                 xywh = xyxy2xywh(xyxy) if cfg.dynamic_drawing else None
123 |                 img = draw_points_and_skeleton(img, points, joints_dict()['coco']['skeleton'], person_index=pid,
124 |                                                points_color_palette='gist_rainbow', skeleton_color_palette='jet',
125 |                                                points_palette_samples=10, confidence_threshold=cfg.key_conf_thres, xywh=xywh)
126 |
127 |         infered_imgs[idx] = img
128 |
129 |     if cfg.result_scale:
130 |         new_imgs = []
131 |         for img in infered_imgs:
132 |             size = (int(img.shape[1] * cfg.result_scale), int(img.shape[0] * cfg.result_scale))
133 |             img = cv2.resize(img, size, interpolation=cv2.INTER_LINEAR)
134 |             new_imgs.append(img)
135 |         infered_imgs = np.stack(new_imgs)
136 |
137 |     return infered_imgs, (bbox_groups, keypoint_groups)
138 |
139 |
140 |
141 |
142 | def inference_image(img_path, yolov6_sess, vitpose_sess, cfg):
143 |     img_origin = cv2.imread(img_path)
144 |     img_origin = img_origin[..., ::-1]  # BGR to RGB
145 |     img_origin = np.expand_dims(img_origin, axis=0)
146 |     img, pred = inference(img_origin, yolov6_sess, vitpose_sess, cfg)
147 |
148 |     print('-'*10 + "\nPress 'Q' key on OpenCV window if you want to close")
149 |     cv2.imshow("OpenCV", img[0])
150 |
151 |     if cfg.save:
152 |         save_name = '.'.join(img_path.split('.')[:-1]) + '_result.jpg'  # extension-agnostic, so a non-.jpg input is never silently overwritten
153 |         cv2.imwrite(save_name, img[0])
154 |     if cfg.save_prediction:
155 |         preds = {'bbox':[], 'pose':[]}
156 |         preds['bbox'].extend(pred[0])
157 |         preds['pose'].extend(pred[1])
158 |         save_name = '.'.join(img_path.split('.')[:-1]) + '_prediction.pkl'
159 |         with open(save_name, 'wb') as f:
160 |             pickle.dump(preds, f)
161 |
162 |     cv2.waitKey(0)
163 |
164 |
165 |
166 |
167 | def inference_video(vid_path, yolov6_sess, vitpose_sess, cfg, smooth_net=None):
168 |     video = cv2.VideoCapture(vid_path)
169 |     frames = []
170 |     preds = {'bbox':[], 'pose':[]}
171 |
172 |     if cfg.save:
173 |         out_name = '.'.join(vid_path.split('.')[:-1]) + '_result.mp4'
174 |         out_fourcc = cv2.VideoWriter_fourcc(*'mp4v')
175 |         if cfg.set_fps is not None:
176 |             out_fps = cfg.set_fps
177 |         else:
178 |             out_fps = video.get(cv2.CAP_PROP_FPS)
179 |         out_size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
180 |         out = cv2.VideoWriter(out_name, out_fourcc, out_fps, out_size)
181 |
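# Annotation (added, not in the original source): the loop below buffers frames
# until cfg.yolo_batch_size of them are available, then runs one batched inference
# call; print_fps multiplies by the batch size so the reading is per-frame
# throughput. A final partial batch (fewer than yolo_batch_size frames when the
# video ends) is not processed.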
182 |     print('-'*10 + "\nPress 'Q' key on OpenCV window if you want to close")
183 |     tic = time()
184 |     while True:
185 |         ret, frame = video.read()
186 |
187 |         if ret:
188 |             frames.append(frame)
189 |             if len(frames) < cfg.yolo_batch_size:
190 |                 continue
191 |
192 |             frames = np.stack(frames)
193 |             frames = frames[..., ::-1]  # BGR to RGB
194 |             results, pred = inference(frames, yolov6_sess, vitpose_sess, cfg)
195 |
196 |             toc = time()
197 |             fps = 1/(toc - tic)
198 |             tic = time()
199 |
200 |             print_fps(fps*cfg.yolo_batch_size)
201 |
202 |             cv2.imshow('OpenCV', results[-1])
203 |
204 |             if cfg.save:
205 |                 for res in results:
206 |                     out.write(res)
207 |             if cfg.save_prediction:
208 |                 preds['bbox'].extend(pred[0])
209 |                 preds['pose'].extend(pred[1])
210 |
211 |             frames = []
212 |
213 |             if cv2.waitKey(1) & 0xFF == ord('q'):
214 |                 break
215 |
216 |         else:
217 |             break
218 |
219 |     if cfg.save_prediction:
220 |         save_name = '.'.join(vid_path.split('.')[:-1]) + '_prediction.pkl'
221 |         with open(save_name, 'wb') as f:
222 |             pickle.dump(preds, f)
223 |
224 |     video.release()
225 |     if cfg.save: out.release()
226 |     cv2.destroyAllWindows()
227 |
228 |
229 |
230 |
231 | def inference_webcam(webcam, yolov6_sess, vitpose_sess, cfg, smooth_net=None):
232 |     event = threading.Event()
233 |
234 |     # bufferless VideoCapture
235 |     cap = AsyncVideoCapture(webcam, event)
236 |     preds = {'bbox':[], 'pose':[]}
237 |
238 |     if cfg.save:
239 |         frame_queue = queue.Queue(1)
240 |
241 |         out_name = 'webcam_result.mp4'
242 |         out_fourcc = cv2.VideoWriter_fourcc(*'mp4v')
243 |         out_fps = cfg.set_fps if cfg.set_fps is not None else 60  # fall back to 60 fps for webcam when set_fps is None, as documented in configs/base_config.py
244 |         out_size = (int(cap.cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
245 |
246 |         out = AsyncVideoWriter(out_name, out_fourcc, out_fps, out_size, cap, frame_queue, event)
247 |
248 |     print('-'*10 + "\nPress 'Q' key on OpenCV window if you want to close")
249 |     tic = time()
250 |     while not cap.is_dead:
251 |         frame = cap.read()
252 |
253 |         frame = frame[..., ::-1]  # BGR to RGB
254 |         frame = np.expand_dims(frame, axis=0)
255 |         frame, pred = inference(frame, yolov6_sess, vitpose_sess, cfg)
256 |
257 |         toc = time()
258 |         fps = 1/(toc - tic)
259 |         tic = time()
260 |
261 |         print_fps(fps)
262 |
263 |         cv2.imshow("OpenCV", frame[0])
264 |
265 |         if cfg.save:
266 |             try:
267 |                 frame_queue.put_nowait(frame[0])
268 |             except queue.Full:
269 |                 pass
270 |
271 |         if cfg.save_prediction:
272 |             preds['bbox'].extend(pred[0])
273 |             preds['pose'].extend(pred[1])
274 |
275 |         if cv2.waitKey(1) & 0xFF == ord('q'):
276 |             cap.event.set()
277 |             break
278 |
279 |     if cfg.save_prediction:
280 |         save_name = 'webcam_prediction.pkl'
281 |         with open(save_name, 'wb') as f:
282 |             pickle.dump(preds, f)
283 |
284 |     time_out = 10.0
285 |     tic = time()
286 |     while (not cap.is_dead) or (cfg.save and not out.is_dead):
287 |         toc = time()
288 |         if toc-tic > time_out:  # give the capture/writer threads up to 10 s to shut down
289 |             break
290 |
291 |     cv2.destroyAllWindows()
292 |
293 |
294 |
295 | class AsyncVideoCapture:
296 |     def __init__(self, webcam, event):
297 |         self.cap = cv2.VideoCapture(webcam)
298 |         if self.cap.isOpened():
299 |             self.event = event
300 |             self.q = queue.Queue()
301 |             t = threading.Thread(target=self._reader)
302 |             t.daemon = True
303 |             t.start()
304 |             self.is_dead = False
305 |         else:
306 |             self.cap.release()
307 |             self.is_dead = True
308 |
309 |     # read frames as soon as they are available, keeping only most recent one
310 |     def _reader(self):
311 |         while True:
312 |             ret, frame = self.cap.read()
313 |
314 |             if (not ret) or self.event.is_set():
315 |                 break
316 |
317 |             if not self.q.empty():
318 |                 try:
319 |                     self.q.get_nowait()  # discard previous (unprocessed) frame
320 |                 except queue.Empty:
321 |                     pass
322 |
323 |             self.q.put(frame)
324 |
325 |         self.cap.release()
326 |         self.is_dead = True
327 |
328 |     def read(self):
329 |         return self.q.get()
330 |
331 |
332 | class AsyncVideoWriter:
333 |     def __init__(self, out_name, out_fourcc, out_fps, out_size, cap, frame_queue, event):
334 |         self.out = cv2.VideoWriter(out_name, out_fourcc, out_fps, out_size)
335 |         if self.out.isOpened():
336 |             self.cap = cap
337 |             self.event = event
338 |             self.last_frame = np.zeros((out_size[1], out_size[0], 3), np.uint8)
339 |             self.frame_queue = frame_queue
340 |             t = threading.Thread(target=self._writer)
341 |             t.daemon = True
342 |             t.start()
343 |             self.period = 1/out_fps
344 |             self.is_dead = False
345 |         else:
346 |             self.out.release()
347 |             self.is_dead = True
348 |
349 |     def _writer(self):
350 |         diff = 0
351 |         tic = time()
352 |         while True:
353 |             if self.event.is_set():
354 |                 break
355 |
356 |             try:
357 |                 self.last_frame = self.frame_queue.get_nowait()  # grab the newest processed frame, if any
358 |             except queue.Empty:
359 |                 pass  # otherwise re-write the previous frame to keep the output at a constant rate
360 |
361 |             self.out.write(self.last_frame)
362 |
363 |             # match writing speed to the desired fps: diff accumulates the timing error and is slept off when positive
364 |             elapsed_time = time() - tic
365 |             tic = time()
366 |             diff += self.period - elapsed_time
367 |             if diff > 0:
368 |                 sleep(diff)
369 |
370 |         self.out.release()
371 |         self.is_dead = True
--------------------------------------------------------------------------------
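Note on the saved predictions: with save_prediction enabled, inference_image, inference_video, and inference_webcam each pickle a dict holding per-frame 'bbox' and 'pose' lists. A minimal sketch for reading such a file back, shown for the 'webcam_prediction.pkl' name used above (the printed shapes are illustrative, not guaranteed):

import pickle

import numpy as np  # required: the stored entries are numpy arrays


with open('webcam_prediction.pkl', 'rb') as f:
    preds = pickle.load(f)

# 'bbox' holds per-frame detection arrays (non_max_suppression output) and
# 'pose' per-frame keypoint arrays of shape (num_people, num_keypoints, 3)
for frame_idx, (bboxes, poses) in enumerate(zip(preds['bbox'], preds['pose'])):
    print(f"frame {frame_idx}: {len(bboxes)} detections, pose shape {np.asarray(poses).shape}")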