├── .gitignore ├── LICENSE ├── README.md ├── config.yaml ├── configs ├── ViTPose_base_coco_256x192.py ├── ViTPose_base_simple_coco_256x192.py ├── ViTPose_huge_coco_256x192.py ├── ViTPose_huge_simple_coco_256x192.py ├── ViTPose_large_coco_256x192.py └── ViTPose_large_simple_coco_256x192.py ├── datasets ├── COCO.py └── HumanPoseEstimation.py ├── examples ├── .DS_Store ├── img1.jpg └── img1_result.jpg ├── inference.py ├── models ├── __init__.py ├── backbone │ └── vit.py ├── head │ ├── topdown_heatmap_base_head.py │ └── topdown_heatmap_simple_head.py ├── losses │ ├── __init__.py │ ├── classfication_loss.py │ ├── heatmap_loss.py │ ├── mesh_loss.py │ ├── mse_loss.py │ ├── multi_loss_factory.py │ └── regression_loss.py ├── model.py └── optimizer.py ├── requirements.txt ├── to_onnx.ipynb ├── train.py └── utils ├── __init__.py ├── dist_util.py ├── logging.py ├── nms ├── __init__.py ├── cpu_nms.c ├── cpu_nms.cpython-37m-x86_64-linux-gnu.so ├── cpu_nms.cpython-39-x86_64-linux-gnu.so ├── cpu_nms.pyx ├── gpu_nms.cpp ├── gpu_nms.cpython-37m-x86_64-linux-gnu.so ├── gpu_nms.cpython-39-x86_64-linux-gnu.so ├── gpu_nms.cu ├── gpu_nms.hpp ├── gpu_nms.pyx ├── nms.py ├── nms_kernel.cu ├── nms_ori.py └── setup_linux.py ├── post_processing ├── __init__.py ├── group.py ├── nms.py ├── one_euro_filter.py └── post_transforms.py ├── top_down_eval.py ├── train_valid_fn.py ├── transform.py ├── util.py └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pth 2 | **/*.pt 3 | **/__pycache__ 4 | **/coco/ 5 | *.onnx 6 | .DS_Store 7 | runs 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ViTPose (simple version w/o mmcv) 2 | An unofficial implementation of `ViTPose` [Y. Xu et al., 2022]
3 | ![result_image](./examples/img1_result.jpg "Result Image") 4 | 5 | ## Usage 6 | ### | **Inference** 7 | ``` 8 | python inference.py --image-path './examples/img1.jpg' 9 | ``` 10 | 11 | ### | **Training** 12 | ``` 13 | python train.py --config-path config.yaml --model-name 'b' 14 | ``` 15 | - `--model-name` must be one of `b`, `l`, `h` 16 | 17 | 18 | ## Note 19 | 1. Download the trained model (.pth) 20 | - [ViTPose-B-Multi-COCO.pth](https://1drv.ms/u/s!AimBgYV7JjTlgSrlMB093JzJtqq-?e=Jr5S3R) 21 | - [ViTPose-L-Multi-COCO.pth](https://1drv.ms/u/s!AimBgYV7JjTlgTBm3dCVmBUbHYT6?e=fHUrTq) 22 | - [ViTPose-H-Multi-COCO.pth](https://1drv.ms/u/s!AimBgYV7JjTlgS5rLeRAJiWobCdh?e=41GsDd) 23 | 2. Set the config according to the trained model 24 | - [ViTPose-B-COCO-256x192](configs/ViTPose_base_coco_256x192.py) 25 | - [ViTPose-L-COCO-256x192](configs/ViTPose_large_coco_256x192.py) 26 | - [ViTPose-H-COCO-256x192](configs/ViTPose_huge_coco_256x192.py) 27 | 28 | --- 29 | ## Reference 30 | All code was written with reference to [the official ViTPose repo](https://github.com/ViTAE-Transformer/ViTPose).
-------------------------------------------------------------------------------- /config.yaml: --------------------------------------------------------------------------------
1 | # Train config --------------------------------------- 2 | log_level: logging.INFO 3 | seed: 0 4 | deterministic: True # run cuDNN in deterministic mode for reproducibility 5 | cudnn_benchmark: True # enable cuDNN benchmark mode (auto-tunes convolution algorithms) 6 | resume_from: "/home/jaehyun/workspace/PoseEstimation/ViTPose_pytorch/vitpose-b-multi-coco.pth" # CKPT path to resume/fine-tune from (change to your own checkpoint) 7 | gpu_ids: [0] 8 | launcher: 'none' # launcher for distributed training, one of ['none', 'pytorch', 'slurm', 'mpi'] 9 | use_amp: True 10 | validate: True 11 | 12 | autoscale_lr: True # automatically scale lr with the number of gpus 13 | 14 | dist_params: 15 | ...
-------------------------------------------------------------------------------- /configs/ViTPose_base_coco_256x192.py: --------------------------------------------------------------------------------
1 | # _base_ = [ 2 | # '../../../../_base_/default_runtime.py', 3 | # '../../../../_base_/datasets/coco.py' 4 | # ] 5 | evaluation = dict(interval=10, metric='mAP', save_best='AP') 6 | 7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, 8 | constructor='LayerDecayOptimizerConstructor', 9 | paramwise_cfg=dict( 10 | num_layers=12, 11 | layer_decay_rate=0.75, 12 | custom_keys={ 13 | 'bias': dict(decay_mult=0.), 14 | 'pos_embed': dict(decay_mult=0.), 15 | 'relative_position_bias_table': dict(decay_mult=0.), 16 | 'norm': dict(decay_mult=0.)
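                        # NOTE (assumption, not part of the original config): 'LayerDecayOptimizerConstructor'
                        # is assumed to apply layer-wise lr decay, scaling each transformer block's lr by
                        # roughly layer_decay_rate ** (num_layers - layer_index), so earlier blocks train
                        # with a smaller lr than the head; the custom_keys above only set weight decay to
                        # zero for bias, pos_embed, relative-position and norm parameters.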
17 | } 18 | ) 19 | ) 20 | 21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) 22 | 23 | # learning policy 24 | lr_config = dict( 25 | policy='step', 26 | warmup='linear', 27 | warmup_iters=500, 28 | warmup_ratio=0.001, 29 | step=[170, 200]) 30 | 31 | total_epochs = 210 32 | target_type = 'GaussianHeatmap' 33 | channel_cfg = dict( 34 | num_output_channels=17, 35 | dataset_joints=17, 36 | dataset_channel=[ 37 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 38 | ], 39 | inference_channel=[ 40 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 41 | ]) 42 | 43 | # model settings 44 | model = dict( 45 | type='TopDown', 46 | pretrained=None, 47 | backbone=dict( 48 | type='ViT', 49 | img_size=(256, 192), 50 | patch_size=16, 51 | embed_dim=768, 52 | depth=12, 53 | num_heads=12, 54 | ratio=1, 55 | use_checkpoint=False, 56 | mlp_ratio=4, 57 | qkv_bias=True, 58 | drop_path_rate=0.3, 59 | ), 60 | keypoint_head=dict( 61 | type='TopdownHeatmapSimpleHead', 62 | in_channels=768, 63 | num_deconv_layers=2, 64 | num_deconv_filters=(256, 256), 65 | num_deconv_kernels=(4, 4), 66 | extra=dict(final_conv_kernel=1, ), 67 | out_channels=channel_cfg['num_output_channels'], 68 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), 69 | train_cfg=dict(), 70 | test_cfg=dict( 71 | flip_test=True, 72 | post_process='default', 73 | shift_heatmap=False, 74 | target_type=target_type, 75 | modulate_kernel=11, 76 | use_udp=True)) 77 | 78 | data_cfg = dict( 79 | image_size=[192, 256], 80 | heatmap_size=[48, 64], 81 | num_output_channels=channel_cfg['num_output_channels'], 82 | num_joints=channel_cfg['dataset_joints'], 83 | dataset_channel=channel_cfg['dataset_channel'], 84 | inference_channel=channel_cfg['inference_channel'], 85 | soft_nms=False, 86 | nms_thr=1.0, 87 | oks_thr=0.9, 88 | vis_thr=0.2, 89 | use_gt_bbox=False, 90 | det_bbox_thr=0.0, 91 | bbox_file='data/coco/person_detection_results/' 92 | 'COCO_val2017_detections_AP_H_56_person.json', 93 | ) 94 | 95 | train_pipeline = [ 96 | dict(type='LoadImageFromFile'), 97 | dict(type='TopDownRandomFlip', flip_prob=0.5), 98 | dict( 99 | type='TopDownHalfBodyTransform', 100 | num_joints_half_body=8, 101 | prob_half_body=0.3), 102 | dict( 103 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), 104 | dict(type='TopDownAffine', use_udp=True), 105 | dict(type='ToTensor'), 106 | dict( 107 | type='NormalizeTensor', 108 | mean=[0.485, 0.456, 0.406], 109 | std=[0.229, 0.224, 0.225]), 110 | dict( 111 | type='TopDownGenerateTarget', 112 | sigma=2, 113 | encoding='UDP', 114 | target_type=target_type), 115 | dict( 116 | type='Collect', 117 | keys=['img', 'target', 'target_weight'], 118 | meta_keys=[ 119 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 120 | 'rotation', 'bbox_score', 'flip_pairs' 121 | ]), 122 | ] 123 | 124 | val_pipeline = [ 125 | dict(type='LoadImageFromFile'), 126 | dict(type='TopDownAffine', use_udp=True), 127 | dict(type='ToTensor'), 128 | dict( 129 | type='NormalizeTensor', 130 | mean=[0.485, 0.456, 0.406], 131 | std=[0.229, 0.224, 0.225]), 132 | dict( 133 | type='Collect', 134 | keys=['img'], 135 | meta_keys=[ 136 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score', 137 | 'flip_pairs' 138 | ]), 139 | ] 140 | 141 | test_pipeline = val_pipeline 142 | 143 | data_root = 'datasets/coco' 144 | data = dict( 145 | samples_per_gpu=32, 146 | workers_per_gpu=4, 147 | val_dataloader=dict(samples_per_gpu=32), 148 | test_dataloader=dict(samples_per_gpu=32), 149 | train=dict( 
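        # NOTE (assumption): the train/val/test entries below follow mmpose's TopDownCocoDataset
        # interface; with samples_per_gpu=32 above, the per-GPU training batch size is 32 and the
        # effective batch size scales with the number of GPUs used.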
150 | type='TopDownCocoDataset', 151 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', 152 | img_prefix=f'{data_root}/train2017/', 153 | data_cfg=data_cfg, 154 | pipeline=train_pipeline), 155 | val=dict( 156 | type='TopDownCocoDataset', 157 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 158 | img_prefix=f'{data_root}/val2017/', 159 | data_cfg=data_cfg, 160 | pipeline=val_pipeline), 161 | test=dict( 162 | type='TopDownCocoDataset', 163 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 164 | img_prefix=f'{data_root}/val2017/', 165 | data_cfg=data_cfg, 166 | pipeline=test_pipeline) 167 | ) 168 | 169 | -------------------------------------------------------------------------------- /configs/ViTPose_base_simple_coco_256x192.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../../_base_/default_runtime.py', 3 | '../../../../_base_/datasets/coco.py' 4 | ] 5 | 6 | evaluation = dict(interval=10, metric='mAP', save_best='AP') 7 | 8 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, 9 | constructor='LayerDecayOptimizerConstructor', 10 | paramwise_cfg=dict( 11 | num_layers=12, 12 | layer_decay_rate=0.75, 13 | custom_keys={ 14 | 'bias': dict(decay_multi=0.), 15 | 'pos_embed': dict(decay_mult=0.), 16 | 'relative_position_bias_table': dict(decay_mult=0.), 17 | 'norm': dict(decay_mult=0.) 18 | } 19 | ) 20 | ) 21 | 22 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) 23 | 24 | # learning policy 25 | lr_config = dict( 26 | policy='step', 27 | warmup='linear', 28 | warmup_iters=500, 29 | warmup_ratio=0.001, 30 | step=[170, 200]) 31 | total_epochs = 210 32 | target_type = 'GaussianHeatmap' 33 | channel_cfg = dict( 34 | num_output_channels=17, 35 | dataset_joints=17, 36 | dataset_channel=[ 37 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 38 | ], 39 | inference_channel=[ 40 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 41 | ]) 42 | 43 | # model settings 44 | model = dict( 45 | type='TopDown', 46 | pretrained=None, 47 | backbone=dict( 48 | type='ViT', 49 | img_size=(256, 192), 50 | patch_size=16, 51 | embed_dim=768, 52 | depth=12, 53 | num_heads=12, 54 | ratio=1, 55 | use_checkpoint=False, 56 | mlp_ratio=4, 57 | qkv_bias=True, 58 | drop_path_rate=0.3, 59 | ), 60 | keypoint_head=dict( 61 | type='TopdownHeatmapSimpleHead', 62 | in_channels=768, 63 | num_deconv_layers=0, 64 | num_deconv_filters=[], 65 | num_deconv_kernels=[], 66 | upsample=4, 67 | extra=dict(final_conv_kernel=3, ), 68 | out_channels=channel_cfg['num_output_channels'], 69 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), 70 | train_cfg=dict(), 71 | test_cfg=dict( 72 | flip_test=True, 73 | post_process='default', 74 | shift_heatmap=False, 75 | target_type=target_type, 76 | modulate_kernel=11, 77 | use_udp=True)) 78 | 79 | data_cfg = dict( 80 | image_size=[192, 256], 81 | heatmap_size=[48, 64], 82 | num_output_channels=channel_cfg['num_output_channels'], 83 | num_joints=channel_cfg['dataset_joints'], 84 | dataset_channel=channel_cfg['dataset_channel'], 85 | inference_channel=channel_cfg['inference_channel'], 86 | soft_nms=False, 87 | nms_thr=1.0, 88 | oks_thr=0.9, 89 | vis_thr=0.2, 90 | use_gt_bbox=False, 91 | det_bbox_thr=0.0, 92 | bbox_file='data/coco/person_detection_results/' 93 | 'COCO_val2017_detections_AP_H_56_person.json', 94 | ) 95 | 96 | train_pipeline = [ 97 | dict(type='LoadImageFromFile'), 98 | dict(type='TopDownRandomFlip', 
flip_prob=0.5), 99 | dict( 100 | type='TopDownHalfBodyTransform', 101 | num_joints_half_body=8, 102 | prob_half_body=0.3), 103 | dict( 104 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), 105 | dict(type='TopDownAffine', use_udp=True), 106 | dict(type='ToTensor'), 107 | dict( 108 | type='NormalizeTensor', 109 | mean=[0.485, 0.456, 0.406], 110 | std=[0.229, 0.224, 0.225]), 111 | dict( 112 | type='TopDownGenerateTarget', 113 | sigma=2, 114 | encoding='UDP', 115 | target_type=target_type), 116 | dict( 117 | type='Collect', 118 | keys=['img', 'target', 'target_weight'], 119 | meta_keys=[ 120 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 121 | 'rotation', 'bbox_score', 'flip_pairs' 122 | ]), 123 | ] 124 | 125 | val_pipeline = [ 126 | dict(type='LoadImageFromFile'), 127 | dict(type='TopDownAffine', use_udp=True), 128 | dict(type='ToTensor'), 129 | dict( 130 | type='NormalizeTensor', 131 | mean=[0.485, 0.456, 0.406], 132 | std=[0.229, 0.224, 0.225]), 133 | dict( 134 | type='Collect', 135 | keys=['img'], 136 | meta_keys=[ 137 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score', 138 | 'flip_pairs' 139 | ]), 140 | ] 141 | 142 | test_pipeline = val_pipeline 143 | 144 | data_root = 'datasets/coco' 145 | data = dict( 146 | samples_per_gpu=64, 147 | workers_per_gpu=4, 148 | val_dataloader=dict(samples_per_gpu=32), 149 | test_dataloader=dict(samples_per_gpu=32), 150 | train=dict( 151 | type='TopDownCocoDataset', 152 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', 153 | img_prefix=f'{data_root}/train2017/', 154 | data_cfg=data_cfg, 155 | pipeline=train_pipeline, 156 | dataset_info={{_base_.dataset_info}}), 157 | val=dict( 158 | type='TopDownCocoDataset', 159 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 160 | img_prefix=f'{data_root}/val2017/', 161 | data_cfg=data_cfg, 162 | pipeline=val_pipeline, 163 | dataset_info={{_base_.dataset_info}}), 164 | test=dict( 165 | type='TopDownCocoDataset', 166 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 167 | img_prefix=f'{data_root}/val2017/', 168 | data_cfg=data_cfg, 169 | pipeline=test_pipeline, 170 | dataset_info={{_base_.dataset_info}}), 171 | ) 172 | 173 | -------------------------------------------------------------------------------- /configs/ViTPose_huge_coco_256x192.py: -------------------------------------------------------------------------------- 1 | # _base_ = [ 2 | # '../../../../_base_/default_runtime.py', 3 | # '../../../../_base_/datasets/coco.py' 4 | # ] 5 | evaluation = dict(interval=10, metric='mAP', save_best='AP') 6 | 7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, 8 | constructor='LayerDecayOptimizerConstructor', 9 | paramwise_cfg=dict( 10 | num_layers=32, 11 | layer_decay_rate=0.85, 12 | custom_keys={ 13 | 'bias': dict(decay_multi=0.), 14 | 'pos_embed': dict(decay_mult=0.), 15 | 'relative_position_bias_table': dict(decay_mult=0.), 16 | 'norm': dict(decay_mult=0.) 
17 | } 18 | ) 19 | ) 20 | 21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) 22 | 23 | # learning policy 24 | lr_config = dict( 25 | policy='step', 26 | warmup='linear', 27 | warmup_iters=500, 28 | warmup_ratio=0.001, 29 | step=[170, 200]) 30 | total_epochs = 210 31 | target_type = 'GaussianHeatmap' 32 | channel_cfg = dict( 33 | num_output_channels=17, 34 | dataset_joints=17, 35 | dataset_channel=[ 36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 37 | ], 38 | inference_channel=[ 39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 40 | ]) 41 | 42 | # model settings 43 | model = dict( 44 | type='TopDown', 45 | pretrained=None, 46 | backbone=dict( 47 | type='ViT', 48 | img_size=(256, 192), 49 | patch_size=16, 50 | embed_dim=1280, 51 | depth=32, 52 | num_heads=16, 53 | ratio=1, 54 | use_checkpoint=False, 55 | mlp_ratio=4, 56 | qkv_bias=True, 57 | drop_path_rate=0.55, 58 | ), 59 | keypoint_head=dict( 60 | type='TopdownHeatmapSimpleHead', 61 | in_channels=1280, 62 | num_deconv_layers=2, 63 | num_deconv_filters=(256, 256), 64 | num_deconv_kernels=(4, 4), 65 | extra=dict(final_conv_kernel=1, ), 66 | out_channels=channel_cfg['num_output_channels'], 67 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), 68 | train_cfg=dict(), 69 | test_cfg=dict( 70 | flip_test=True, 71 | post_process='default', 72 | shift_heatmap=False, 73 | target_type=target_type, 74 | modulate_kernel=11, 75 | use_udp=True)) 76 | 77 | data_cfg = dict( 78 | image_size=[192, 256], 79 | heatmap_size=[48, 64], 80 | num_output_channels=channel_cfg['num_output_channels'], 81 | num_joints=channel_cfg['dataset_joints'], 82 | dataset_channel=channel_cfg['dataset_channel'], 83 | inference_channel=channel_cfg['inference_channel'], 84 | soft_nms=False, 85 | nms_thr=1.0, 86 | oks_thr=0.9, 87 | vis_thr=0.2, 88 | use_gt_bbox=False, 89 | det_bbox_thr=0.0, 90 | bbox_file='data/coco/person_detection_results/' 91 | 'COCO_val2017_detections_AP_H_56_person.json', 92 | ) 93 | 94 | train_pipeline = [ 95 | dict(type='LoadImageFromFile'), 96 | dict(type='TopDownRandomFlip', flip_prob=0.5), 97 | dict( 98 | type='TopDownHalfBodyTransform', 99 | num_joints_half_body=8, 100 | prob_half_body=0.3), 101 | dict( 102 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), 103 | dict(type='TopDownAffine', use_udp=True), 104 | dict(type='ToTensor'), 105 | dict( 106 | type='NormalizeTensor', 107 | mean=[0.485, 0.456, 0.406], 108 | std=[0.229, 0.224, 0.225]), 109 | dict( 110 | type='TopDownGenerateTarget', 111 | sigma=2, 112 | encoding='UDP', 113 | target_type=target_type), 114 | dict( 115 | type='Collect', 116 | keys=['img', 'target', 'target_weight'], 117 | meta_keys=[ 118 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 119 | 'rotation', 'bbox_score', 'flip_pairs' 120 | ]), 121 | ] 122 | 123 | val_pipeline = [ 124 | dict(type='LoadImageFromFile'), 125 | dict(type='TopDownAffine', use_udp=True), 126 | dict(type='ToTensor'), 127 | dict( 128 | type='NormalizeTensor', 129 | mean=[0.485, 0.456, 0.406], 130 | std=[0.229, 0.224, 0.225]), 131 | dict( 132 | type='Collect', 133 | keys=['img'], 134 | meta_keys=[ 135 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score', 136 | 'flip_pairs' 137 | ]), 138 | ] 139 | 140 | test_pipeline = val_pipeline 141 | 142 | data_root = 'datasets/coco' 143 | data = dict( 144 | samples_per_gpu=64, 145 | workers_per_gpu=4, 146 | val_dataloader=dict(samples_per_gpu=32), 147 | test_dataloader=dict(samples_per_gpu=32), 148 | train=dict( 149 | 
type='TopDownCocoDataset', 150 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', 151 | img_prefix=f'{data_root}/train2017/', 152 | data_cfg=data_cfg, 153 | pipeline=train_pipeline), 154 | val=dict( 155 | type='TopDownCocoDataset', 156 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 157 | img_prefix=f'{data_root}/val2017/', 158 | data_cfg=data_cfg, 159 | pipeline=val_pipeline), 160 | test=dict( 161 | type='TopDownCocoDataset', 162 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 163 | img_prefix=f'{data_root}/val2017/', 164 | data_cfg=data_cfg, 165 | pipeline=test_pipeline) 166 | ) 167 | 168 | -------------------------------------------------------------------------------- /configs/ViTPose_huge_simple_coco_256x192.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../../_base_/default_runtime.py', 3 | '../../../../_base_/datasets/coco.py' 4 | ] 5 | evaluation = dict(interval=10, metric='mAP', save_best='AP') 6 | 7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, 8 | constructor='LayerDecayOptimizerConstructor', 9 | paramwise_cfg=dict( 10 | num_layers=32, 11 | layer_decay_rate=0.85, 12 | custom_keys={ 13 | 'bias': dict(decay_multi=0.), 14 | 'pos_embed': dict(decay_mult=0.), 15 | 'relative_position_bias_table': dict(decay_mult=0.), 16 | 'norm': dict(decay_mult=0.) 17 | } 18 | ) 19 | ) 20 | 21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) 22 | 23 | # learning policy 24 | lr_config = dict( 25 | policy='step', 26 | warmup='linear', 27 | warmup_iters=500, 28 | warmup_ratio=0.001, 29 | step=[170, 200]) 30 | total_epochs = 210 31 | target_type = 'GaussianHeatmap' 32 | channel_cfg = dict( 33 | num_output_channels=17, 34 | dataset_joints=17, 35 | dataset_channel=[ 36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 37 | ], 38 | inference_channel=[ 39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 40 | ]) 41 | 42 | # model settings 43 | model = dict( 44 | type='TopDown', 45 | pretrained=None, 46 | backbone=dict( 47 | type='ViT', 48 | img_size=(256, 192), 49 | patch_size=16, 50 | embed_dim=1280, 51 | depth=32, 52 | num_heads=16, 53 | ratio=1, 54 | use_checkpoint=False, 55 | mlp_ratio=4, 56 | qkv_bias=True, 57 | drop_path_rate=0.55, 58 | ), 59 | keypoint_head=dict( 60 | type='TopdownHeatmapSimpleHead', 61 | in_channels=1280, 62 | num_deconv_layers=0, 63 | num_deconv_filters=[], 64 | num_deconv_kernels=[], 65 | upsample=4, 66 | extra=dict(final_conv_kernel=3, ), 67 | out_channels=channel_cfg['num_output_channels'], 68 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), 69 | train_cfg=dict(), 70 | test_cfg=dict( 71 | flip_test=True, 72 | post_process='default', 73 | shift_heatmap=False, 74 | target_type=target_type, 75 | modulate_kernel=11, 76 | use_udp=True)) 77 | 78 | data_cfg = dict( 79 | image_size=[192, 256], 80 | heatmap_size=[48, 64], 81 | num_output_channels=channel_cfg['num_output_channels'], 82 | num_joints=channel_cfg['dataset_joints'], 83 | dataset_channel=channel_cfg['dataset_channel'], 84 | inference_channel=channel_cfg['inference_channel'], 85 | soft_nms=False, 86 | nms_thr=1.0, 87 | oks_thr=0.9, 88 | vis_thr=0.2, 89 | use_gt_bbox=False, 90 | det_bbox_thr=0.0, 91 | bbox_file='data/coco/person_detection_results/' 92 | 'COCO_val2017_detections_AP_H_56_person.json', 93 | ) 94 | 95 | train_pipeline = [ 96 | dict(type='LoadImageFromFile'), 97 | dict(type='TopDownRandomFlip', 
flip_prob=0.5), 98 | dict( 99 | type='TopDownHalfBodyTransform', 100 | num_joints_half_body=8, 101 | prob_half_body=0.3), 102 | dict( 103 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), 104 | dict(type='TopDownAffine', use_udp=True), 105 | dict(type='ToTensor'), 106 | dict( 107 | type='NormalizeTensor', 108 | mean=[0.485, 0.456, 0.406], 109 | std=[0.229, 0.224, 0.225]), 110 | dict( 111 | type='TopDownGenerateTarget', 112 | sigma=2, 113 | encoding='UDP', 114 | target_type=target_type), 115 | dict( 116 | type='Collect', 117 | keys=['img', 'target', 'target_weight'], 118 | meta_keys=[ 119 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 120 | 'rotation', 'bbox_score', 'flip_pairs' 121 | ]), 122 | ] 123 | 124 | val_pipeline = [ 125 | dict(type='LoadImageFromFile'), 126 | dict(type='TopDownAffine', use_udp=True), 127 | dict(type='ToTensor'), 128 | dict( 129 | type='NormalizeTensor', 130 | mean=[0.485, 0.456, 0.406], 131 | std=[0.229, 0.224, 0.225]), 132 | dict( 133 | type='Collect', 134 | keys=['img'], 135 | meta_keys=[ 136 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score', 137 | 'flip_pairs' 138 | ]), 139 | ] 140 | 141 | test_pipeline = val_pipeline 142 | 143 | data_root = 'datasets/coco' 144 | data = dict( 145 | samples_per_gpu=64, 146 | workers_per_gpu=4, 147 | val_dataloader=dict(samples_per_gpu=32), 148 | test_dataloader=dict(samples_per_gpu=32), 149 | train=dict( 150 | type='TopDownCocoDataset', 151 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', 152 | img_prefix=f'{data_root}/train2017/', 153 | data_cfg=data_cfg, 154 | pipeline=train_pipeline, 155 | dataset_info={{_base_.dataset_info}}), 156 | val=dict( 157 | type='TopDownCocoDataset', 158 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 159 | img_prefix=f'{data_root}/val2017/', 160 | data_cfg=data_cfg, 161 | pipeline=val_pipeline, 162 | dataset_info={{_base_.dataset_info}}), 163 | test=dict( 164 | type='TopDownCocoDataset', 165 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 166 | img_prefix=f'{data_root}/val2017/', 167 | data_cfg=data_cfg, 168 | pipeline=test_pipeline, 169 | dataset_info={{_base_.dataset_info}}), 170 | ) 171 | 172 | -------------------------------------------------------------------------------- /configs/ViTPose_large_coco_256x192.py: -------------------------------------------------------------------------------- 1 | # _base_ = [ 2 | # '../../../../_base_/default_runtime.py', 3 | # '../../../../_base_/datasets/coco.py' 4 | # ] 5 | evaluation = dict(interval=10, metric='mAP', save_best='AP') 6 | 7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, 8 | constructor='LayerDecayOptimizerConstructor', 9 | paramwise_cfg=dict( 10 | num_layers=16, 11 | layer_decay_rate=0.8, 12 | custom_keys={ 13 | 'bias': dict(decay_multi=0.), 14 | 'pos_embed': dict(decay_mult=0.), 15 | 'relative_position_bias_table': dict(decay_mult=0.), 16 | 'norm': dict(decay_mult=0.) 
17 | } 18 | ) 19 | ) 20 | 21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) 22 | 23 | # learning policy 24 | lr_config = dict( 25 | policy='step', 26 | warmup='linear', 27 | warmup_iters=500, 28 | warmup_ratio=0.001, 29 | step=[170, 200]) 30 | total_epochs = 210 31 | target_type = 'GaussianHeatmap' 32 | channel_cfg = dict( 33 | num_output_channels=17, 34 | dataset_joints=17, 35 | dataset_channel=[ 36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 37 | ], 38 | inference_channel=[ 39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 40 | ]) 41 | 42 | # model settings 43 | model = dict( 44 | type='TopDown', 45 | pretrained=None, 46 | backbone=dict( 47 | type='ViT', 48 | img_size=(256, 192), 49 | patch_size=16, 50 | embed_dim=1024, 51 | depth=24, 52 | num_heads=16, 53 | ratio=1, 54 | use_checkpoint=False, 55 | mlp_ratio=4, 56 | qkv_bias=True, 57 | drop_path_rate=0.5, 58 | ), 59 | keypoint_head=dict( 60 | type='TopdownHeatmapSimpleHead', 61 | in_channels=1024, 62 | num_deconv_layers=2, 63 | num_deconv_filters=(256, 256), 64 | num_deconv_kernels=(4, 4), 65 | extra=dict(final_conv_kernel=1, ), 66 | out_channels=channel_cfg['num_output_channels'], 67 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), 68 | train_cfg=dict(), 69 | test_cfg=dict( 70 | flip_test=True, 71 | post_process='default', 72 | shift_heatmap=False, 73 | target_type=target_type, 74 | modulate_kernel=11, 75 | use_udp=True)) 76 | 77 | data_cfg = dict( 78 | image_size=[192, 256], 79 | heatmap_size=[48, 64], 80 | num_output_channels=channel_cfg['num_output_channels'], 81 | num_joints=channel_cfg['dataset_joints'], 82 | dataset_channel=channel_cfg['dataset_channel'], 83 | inference_channel=channel_cfg['inference_channel'], 84 | soft_nms=False, 85 | nms_thr=1.0, 86 | oks_thr=0.9, 87 | vis_thr=0.2, 88 | use_gt_bbox=False, 89 | det_bbox_thr=0.0, 90 | bbox_file='data/coco/person_detection_results/' 91 | 'COCO_val2017_detections_AP_H_56_person.json', 92 | ) 93 | 94 | train_pipeline = [ 95 | dict(type='LoadImageFromFile'), 96 | dict(type='TopDownRandomFlip', flip_prob=0.5), 97 | dict( 98 | type='TopDownHalfBodyTransform', 99 | num_joints_half_body=8, 100 | prob_half_body=0.3), 101 | dict( 102 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), 103 | dict(type='TopDownAffine', use_udp=True), 104 | dict(type='ToTensor'), 105 | dict( 106 | type='NormalizeTensor', 107 | mean=[0.485, 0.456, 0.406], 108 | std=[0.229, 0.224, 0.225]), 109 | dict( 110 | type='TopDownGenerateTarget', 111 | sigma=2, 112 | encoding='UDP', 113 | target_type=target_type), 114 | dict( 115 | type='Collect', 116 | keys=['img', 'target', 'target_weight'], 117 | meta_keys=[ 118 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 119 | 'rotation', 'bbox_score', 'flip_pairs' 120 | ]), 121 | ] 122 | 123 | val_pipeline = [ 124 | dict(type='LoadImageFromFile'), 125 | dict(type='TopDownAffine', use_udp=True), 126 | dict(type='ToTensor'), 127 | dict( 128 | type='NormalizeTensor', 129 | mean=[0.485, 0.456, 0.406], 130 | std=[0.229, 0.224, 0.225]), 131 | dict( 132 | type='Collect', 133 | keys=['img'], 134 | meta_keys=[ 135 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score', 136 | 'flip_pairs' 137 | ]), 138 | ] 139 | 140 | test_pipeline = val_pipeline 141 | 142 | data_root = 'datasets/coco' 143 | data = dict( 144 | samples_per_gpu=64, 145 | workers_per_gpu=4, 146 | val_dataloader=dict(samples_per_gpu=32), 147 | test_dataloader=dict(samples_per_gpu=32), 148 | train=dict( 149 | 
type='TopDownCocoDataset', 150 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', 151 | img_prefix=f'{data_root}/train2017/', 152 | data_cfg=data_cfg, 153 | pipeline=train_pipeline), 154 | val=dict( 155 | type='TopDownCocoDataset', 156 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 157 | img_prefix=f'{data_root}/val2017/', 158 | data_cfg=data_cfg, 159 | pipeline=val_pipeline), 160 | test=dict( 161 | type='TopDownCocoDataset', 162 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 163 | img_prefix=f'{data_root}/val2017/', 164 | data_cfg=data_cfg, 165 | pipeline=test_pipeline) 166 | ) 167 | 168 | -------------------------------------------------------------------------------- /configs/ViTPose_large_simple_coco_256x192.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../../_base_/default_runtime.py', 3 | '../../../../_base_/datasets/coco.py' 4 | ] 5 | evaluation = dict(interval=10, metric='mAP', save_best='AP') 6 | 7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, 8 | constructor='LayerDecayOptimizerConstructor', 9 | paramwise_cfg=dict( 10 | num_layers=24, 11 | layer_decay_rate=0.8, 12 | custom_keys={ 13 | 'bias': dict(decay_multi=0.), 14 | 'pos_embed': dict(decay_mult=0.), 15 | 'relative_position_bias_table': dict(decay_mult=0.), 16 | 'norm': dict(decay_mult=0.) 17 | } 18 | ) 19 | ) 20 | 21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) 22 | 23 | # learning policy 24 | lr_config = dict( 25 | policy='step', 26 | warmup='linear', 27 | warmup_iters=500, 28 | warmup_ratio=0.001, 29 | step=[170, 200]) 30 | total_epochs = 210 31 | target_type = 'GaussianHeatmap' 32 | channel_cfg = dict( 33 | num_output_channels=17, 34 | dataset_joints=17, 35 | dataset_channel=[ 36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 37 | ], 38 | inference_channel=[ 39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 40 | ]) 41 | 42 | # model settings 43 | model = dict( 44 | type='TopDown', 45 | pretrained=None, 46 | backbone=dict( 47 | type='ViT', 48 | img_size=(256, 192), 49 | patch_size=16, 50 | embed_dim=1024, 51 | depth=24, 52 | num_heads=16, 53 | ratio=1, 54 | use_checkpoint=False, 55 | mlp_ratio=4, 56 | qkv_bias=True, 57 | drop_path_rate=0.5, 58 | ), 59 | keypoint_head=dict( 60 | type='TopdownHeatmapSimpleHead', 61 | in_channels=1024, 62 | num_deconv_layers=0, 63 | num_deconv_filters=[], 64 | num_deconv_kernels=[], 65 | upsample=4, 66 | extra=dict(final_conv_kernel=3, ), 67 | out_channels=channel_cfg['num_output_channels'], 68 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), 69 | train_cfg=dict(), 70 | test_cfg=dict( 71 | flip_test=True, 72 | post_process='default', 73 | shift_heatmap=False, 74 | target_type=target_type, 75 | modulate_kernel=11, 76 | use_udp=True)) 77 | 78 | data_cfg = dict( 79 | image_size=[192, 256], 80 | heatmap_size=[48, 64], 81 | num_output_channels=channel_cfg['num_output_channels'], 82 | num_joints=channel_cfg['dataset_joints'], 83 | dataset_channel=channel_cfg['dataset_channel'], 84 | inference_channel=channel_cfg['inference_channel'], 85 | soft_nms=False, 86 | nms_thr=1.0, 87 | oks_thr=0.9, 88 | vis_thr=0.2, 89 | use_gt_bbox=False, 90 | det_bbox_thr=0.0, 91 | bbox_file='data/coco/person_detection_results/' 92 | 'COCO_val2017_detections_AP_H_56_person.json', 93 | ) 94 | 95 | train_pipeline = [ 96 | dict(type='LoadImageFromFile'), 97 | dict(type='TopDownRandomFlip', 
flip_prob=0.5), 98 | dict( 99 | type='TopDownHalfBodyTransform', 100 | num_joints_half_body=8, 101 | prob_half_body=0.3), 102 | dict( 103 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), 104 | dict(type='TopDownAffine', use_udp=True), 105 | dict(type='ToTensor'), 106 | dict( 107 | type='NormalizeTensor', 108 | mean=[0.485, 0.456, 0.406], 109 | std=[0.229, 0.224, 0.225]), 110 | dict( 111 | type='TopDownGenerateTarget', 112 | sigma=2, 113 | encoding='UDP', 114 | target_type=target_type), 115 | dict( 116 | type='Collect', 117 | keys=['img', 'target', 'target_weight'], 118 | meta_keys=[ 119 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 120 | 'rotation', 'bbox_score', 'flip_pairs' 121 | ]), 122 | ] 123 | 124 | val_pipeline = [ 125 | dict(type='LoadImageFromFile'), 126 | dict(type='TopDownAffine', use_udp=True), 127 | dict(type='ToTensor'), 128 | dict( 129 | type='NormalizeTensor', 130 | mean=[0.485, 0.456, 0.406], 131 | std=[0.229, 0.224, 0.225]), 132 | dict( 133 | type='Collect', 134 | keys=['img'], 135 | meta_keys=[ 136 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score', 137 | 'flip_pairs' 138 | ]), 139 | ] 140 | 141 | test_pipeline = val_pipeline 142 | 143 | data_root = 'datasets/coco' 144 | data = dict( 145 | samples_per_gpu=64, 146 | workers_per_gpu=4, 147 | val_dataloader=dict(samples_per_gpu=32), 148 | test_dataloader=dict(samples_per_gpu=32), 149 | train=dict( 150 | type='TopDownCocoDataset', 151 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', 152 | img_prefix=f'{data_root}/train2017/', 153 | data_cfg=data_cfg, 154 | pipeline=train_pipeline, 155 | dataset_info={{_base_.dataset_info}}), 156 | val=dict( 157 | type='TopDownCocoDataset', 158 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 159 | img_prefix=f'{data_root}/val2017/', 160 | data_cfg=data_cfg, 161 | pipeline=val_pipeline, 162 | dataset_info={{_base_.dataset_info}}), 163 | test=dict( 164 | type='TopDownCocoDataset', 165 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', 166 | img_prefix=f'{data_root}/val2017/', 167 | data_cfg=data_cfg, 168 | pipeline=test_pipeline, 169 | dataset_info={{_base_.dataset_info}}), 170 | ) 171 | 172 | -------------------------------------------------------------------------------- /datasets/HumanPoseEstimation.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | 3 | 4 | class HumanPoseEstimationDataset(Dataset): 5 | """ 6 | HumanPoseEstimationDataset class. 7 | 8 | Generic class for HPE datasets. 
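    Concrete datasets (e.g. datasets/COCO.py in this repo) are expected to subclass it
    and implement __len__ and __getitem__; the methods below are placeholder stubs.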
9 | """ 10 | def __init__(self): 11 | pass 12 | 13 | def __len__(self): 14 | pass 15 | 16 | def __getitem__(self, item): 17 | pass -------------------------------------------------------------------------------- /examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/examples/.DS_Store -------------------------------------------------------------------------------- /examples/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/examples/img1.jpg -------------------------------------------------------------------------------- /examples/img1_result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/examples/img1_result.jpg -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | 4 | import torch 5 | from torch import Tensor 6 | 7 | from pathlib import Path 8 | import cv2 9 | import numpy as np 10 | 11 | 12 | from time import time 13 | from PIL import Image 14 | from torchvision.transforms import transforms 15 | 16 | from models.model import ViTPose 17 | from utils.visualization import draw_points_and_skeleton, joints_dict 18 | from utils.dist_util import get_dist_info, init_dist 19 | from utils.top_down_eval import keypoints_from_heatmaps 20 | 21 | __all__ = ['inference'] 22 | 23 | 24 | @torch.no_grad() 25 | def inference(img_path: Path, img_size: tuple[int, int], 26 | model_cfg: dict, ckpt_path: Path, device: torch.device, save_result: bool=True) -> np.ndarray: 27 | 28 | # Prepare model 29 | vit_pose = ViTPose(model_cfg) 30 | 31 | 32 | ckpt = torch.load(ckpt_path) 33 | if 'state_dict' in ckpt: 34 | vit_pose.load_state_dict(ckpt['state_dict']) 35 | else: 36 | vit_pose.load_state_dict(ckpt) 37 | vit_pose.to(device) 38 | print(f">>> Model loaded: {ckpt_path}") 39 | 40 | # Prepare input data 41 | img = Image.open(img_path) 42 | org_w, org_h = img.size 43 | print(f">>> Original image size: {org_h} X {org_w} (height X width)") 44 | print(f">>> Resized image size: {img_size[1]} X {img_size[0]} (height X width)") 45 | print(f">>> Scale change: {org_h/img_size[1]}, {org_w/img_size[0]}") 46 | img_tensor = transforms.Compose ( 47 | [transforms.Resize((img_size[1], img_size[0])), 48 | transforms.ToTensor()] 49 | )(img).unsqueeze(0).to(device) 50 | 51 | 52 | # Feed to model 53 | tic = time() 54 | heatmaps = vit_pose(img_tensor).detach().cpu().numpy() # N, 17, h/4, w/4 55 | elapsed_time = time()-tic 56 | print(f">>> Output size: {heatmaps.shape} ---> {elapsed_time:.4f} sec. 
elapsed [{elapsed_time**-1: .1f} fps]\n") 57 | 58 | # points = heatmap2coords(heatmaps=heatmaps, original_resolution=(org_h, org_w)) 59 | points, prob = keypoints_from_heatmaps(heatmaps=heatmaps, center=np.array([[org_w//2, org_h//2]]), scale=np.array([[org_w, org_h]]), 60 | unbiased=True, use_udp=True) 61 | points = np.concatenate([points[:, :, ::-1], prob], axis=2) 62 | 63 | # Visualization 64 | if save_result: 65 | for pid, point in enumerate(points): 66 | img = np.array(img)[:, :, ::-1] # RGB to BGR for cv2 modules 67 | img = draw_points_and_skeleton(img.copy(), point, joints_dict()['coco']['skeleton'], person_index=pid, 68 | points_color_palette='gist_rainbow', skeleton_color_palette='jet', 69 | points_palette_samples=10, confidence_threshold=0.4) 70 | save_name = img_path.replace(".jpg", "_result.jpg") 71 | cv2.imwrite(save_name, img) 72 | 73 | return points 74 | 75 | 76 | if __name__ == "__main__": 77 | from configs.ViTPose_base_coco_256x192 import model as model_cfg 78 | from configs.ViTPose_base_coco_256x192 import data_cfg 79 | 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument('--image-path', nargs='+', type=str, default='examples/sample.jpg', help='image path(s)') 82 | args = parser.parse_args() 83 | 84 | CUR_DIR = osp.dirname(__file__) 85 | # CKPT_PATH = f"{CUR_DIR}/vitpose-b-multi-coco.pth" 86 | CKPT_PATH = "/home/jaehyun/workspace/PoseEstimation/ViTPose_pytorch/runs/train/002/epoch010.pth" 87 | 88 | img_size = data_cfg['image_size'] 89 | if type(args.image_path) != list: 90 | args.image_path = [args.image_path] 91 | for img_path in args.image_path: 92 | print(img_path) 93 | keypoints = inference(img_path=img_path, img_size=img_size, model_cfg=model_cfg, ckpt_path=CKPT_PATH, 94 | device=torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu'), 95 | save_result=True) -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path as osp 3 | 4 | sys.path.append(osp.dirname(osp.dirname(__file__))) 5 | 6 | from utils.util import load_checkpoint, resize, constant_init, normal_init 7 | from utils.top_down_eval import keypoints_from_heatmaps, pose_pck_accuracy 8 | from utils.post_processing import * -------------------------------------------------------------------------------- /models/head/topdown_heatmap_base_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABCMeta, abstractmethod 3 | 4 | import numpy as np 5 | import torch.nn as nn 6 | 7 | from .. import keypoints_from_heatmaps 8 | 9 | 10 | class TopdownHeatmapBaseHead(nn.Module): 11 | """Base class for top-down heatmap heads. 12 | 13 | All top-down heatmap heads should subclass it. 14 | All subclass should overwrite: 15 | 16 | Methods:`get_loss`, supporting to calculate loss. 17 | Methods:`get_accuracy`, supporting to calculate accuracy. 18 | Methods:`forward`, supporting to forward model. 19 | Methods:`inference_model`, supporting to inference model. 
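    The base class also provides the concrete helper `decode`, which converts predicted
    heatmaps back to image-space keypoints via `keypoints_from_heatmaps`, using the bbox
    center/scale stored in `img_metas`.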
20 | """ 21 | 22 | __metaclass__ = ABCMeta 23 | 24 | @abstractmethod 25 | def get_loss(self, **kwargs): 26 | """Gets the loss.""" 27 | 28 | @abstractmethod 29 | def get_accuracy(self, **kwargs): 30 | """Gets the accuracy.""" 31 | 32 | @abstractmethod 33 | def forward(self, **kwargs): 34 | """Forward function.""" 35 | 36 | @abstractmethod 37 | def inference_model(self, **kwargs): 38 | """Inference function.""" 39 | 40 | def decode(self, img_metas, output, **kwargs): 41 | """Decode keypoints from heatmaps. 42 | 43 | Args: 44 | img_metas (list(dict)): Information about data augmentation 45 | By default this includes: 46 | 47 | - "image_file: path to the image file 48 | - "center": center of the bbox 49 | - "scale": scale of the bbox 50 | - "rotation": rotation of the bbox 51 | - "bbox_score": score of bbox 52 | output (np.ndarray[N, K, H, W]): model predicted heatmaps. 53 | """ 54 | batch_size = len(img_metas) 55 | 56 | if 'bbox_id' in img_metas[0]: 57 | bbox_ids = [] 58 | else: 59 | bbox_ids = None 60 | 61 | c = np.zeros((batch_size, 2), dtype=np.float32) 62 | s = np.zeros((batch_size, 2), dtype=np.float32) 63 | image_paths = [] 64 | score = np.ones(batch_size) 65 | for i in range(batch_size): 66 | c[i, :] = img_metas[i]['center'] 67 | s[i, :] = img_metas[i]['scale'] 68 | image_paths.append(img_metas[i]['image_file']) 69 | 70 | if 'bbox_score' in img_metas[i]: 71 | score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) 72 | if bbox_ids is not None: 73 | bbox_ids.append(img_metas[i]['bbox_id']) 74 | 75 | preds, maxvals = keypoints_from_heatmaps( 76 | output, 77 | c, 78 | s, 79 | unbiased=self.test_cfg.get('unbiased_decoding', False), 80 | post_process=self.test_cfg.get('post_process', 'default'), 81 | kernel=self.test_cfg.get('modulate_kernel', 11), 82 | valid_radius_factor=self.test_cfg.get('valid_radius_factor', 83 | 0.0546875), 84 | use_udp=self.test_cfg.get('use_udp', False), 85 | target_type=self.test_cfg.get('target_type', 'GaussianHeatmap')) 86 | 87 | all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) 88 | all_boxes = np.zeros((batch_size, 6), dtype=np.float32) 89 | all_preds[:, :, 0:2] = preds[:, :, 0:2] 90 | all_preds[:, :, 2:3] = maxvals 91 | all_boxes[:, 0:2] = c[:, 0:2] 92 | all_boxes[:, 2:4] = s[:, 0:2] 93 | all_boxes[:, 4] = np.prod(s * 200.0, axis=1) 94 | all_boxes[:, 5] = score 95 | 96 | result = {} 97 | 98 | result['preds'] = all_preds 99 | result['boxes'] = all_boxes 100 | result['image_paths'] = image_paths 101 | result['bbox_ids'] = bbox_ids 102 | 103 | return result 104 | 105 | @staticmethod 106 | def _get_deconv_cfg(deconv_kernel): 107 | """Get configurations for deconv layers.""" 108 | if deconv_kernel == 4: 109 | padding = 1 110 | output_padding = 0 111 | elif deconv_kernel == 3: 112 | padding = 1 113 | output_padding = 1 114 | elif deconv_kernel == 2: 115 | padding = 0 116 | output_padding = 0 117 | else: 118 | raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') 119 | 120 | return deconv_kernel, padding, output_padding 121 | -------------------------------------------------------------------------------- /models/head/topdown_heatmap_simple_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | from .. import constant_init, normal_init 5 | 6 | from .. 
import pose_pck_accuracy, flip_back, resize 7 | import torch.nn.functional as F 8 | from .topdown_heatmap_base_head import TopdownHeatmapBaseHead 9 | 10 | 11 | class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead): 12 | """Top-down heatmap simple head. paper ref: Bin Xiao et al. ``Simple 13 | Baselines for Human Pose Estimation and Tracking``. 14 | 15 | TopdownHeatmapSimpleHead is consisted of (>=0) number of deconv layers 16 | and a simple conv2d layer. 17 | 18 | Args: 19 | in_channels (int): Number of input channels 20 | out_channels (int): Number of output channels 21 | num_deconv_layers (int): Number of deconv layers. 22 | num_deconv_layers should >= 0. Note that 0 means 23 | no deconv layers. 24 | num_deconv_filters (list|tuple): Number of filters. 25 | If num_deconv_layers > 0, the length of 26 | num_deconv_kernels (list|tuple): Kernel sizes. 27 | in_index (int|Sequence[int]): Input feature index. Default: 0 28 | input_transform (str|None): Transformation type of input features. 29 | Options: 'resize_concat', 'multiple_select', None. 30 | Default: None. 31 | 32 | - 'resize_concat': Multiple feature maps will be resized to the 33 | same size as the first one and then concat together. 34 | Usually used in FCN head of HRNet. 35 | - 'multiple_select': Multiple feature maps will be bundle into 36 | a list and passed into decode head. 37 | - None: Only one select feature map is allowed. 38 | align_corners (bool): align_corners argument of F.interpolate. 39 | Default: False. 40 | loss_keypoint (dict): Config for keypoint loss. Default: None. 41 | """ 42 | 43 | def __init__(self, 44 | in_channels, 45 | out_channels, 46 | num_deconv_layers=3, 47 | num_deconv_filters=(256, 256, 256), 48 | num_deconv_kernels=(4, 4, 4), 49 | extra=None, 50 | in_index=0, 51 | input_transform=None, 52 | align_corners=False, 53 | loss_keypoint=None, 54 | train_cfg=None, 55 | test_cfg=None, 56 | upsample=0,): 57 | super().__init__() 58 | 59 | self.in_channels = in_channels 60 | self.loss = loss_keypoint 61 | self.upsample = upsample 62 | 63 | self.train_cfg = {} if train_cfg is None else train_cfg 64 | self.test_cfg = {} if test_cfg is None else test_cfg 65 | self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') 66 | 67 | self._init_inputs(in_channels, in_index, input_transform) 68 | self.in_index = in_index 69 | self.align_corners = align_corners 70 | 71 | if extra is not None and not isinstance(extra, dict): 72 | raise TypeError('extra should be dict or None.') 73 | 74 | if num_deconv_layers > 0: 75 | self.deconv_layers = self._make_deconv_layer( 76 | num_deconv_layers, 77 | num_deconv_filters, 78 | num_deconv_kernels, 79 | ) 80 | elif num_deconv_layers == 0: 81 | self.deconv_layers = nn.Identity() 82 | else: 83 | raise ValueError( 84 | f'num_deconv_layers ({num_deconv_layers}) should >= 0.') 85 | 86 | identity_final_layer = False 87 | if extra is not None and 'final_conv_kernel' in extra: 88 | assert extra['final_conv_kernel'] in [0, 1, 3] 89 | if extra['final_conv_kernel'] == 3: 90 | padding = 1 91 | elif extra['final_conv_kernel'] == 1: 92 | padding = 0 93 | else: 94 | # 0 for Identity mapping. 
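                # (final_conv_kernel == 0 therefore skips the final conv entirely and
                #  self.final_layer becomes nn.Identity(); the configs in this repo use
                #  1, a 1x1 conv after the deconv stack, or 3, a single 3x3 conv in the
                #  deconv-free "simple" heads.)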
95 | identity_final_layer = True 96 | kernel_size = extra['final_conv_kernel'] 97 | else: 98 | kernel_size = 1 99 | padding = 0 100 | 101 | if identity_final_layer: 102 | self.final_layer = nn.Identity() 103 | else: 104 | conv_channels = num_deconv_filters[ 105 | -1] if num_deconv_layers > 0 else self.in_channels 106 | 107 | layers = [] 108 | if extra is not None: 109 | num_conv_layers = extra.get('num_conv_layers', 0) 110 | num_conv_kernels = extra.get('num_conv_kernels', 111 | [1] * num_conv_layers) 112 | 113 | for i in range(num_conv_layers): 114 | layers.append( 115 | nn.Conv2d(in_channels=conv_channels, 116 | out_channels=conv_channels, 117 | kernel_size=num_conv_kernels[i], 118 | stride=1, 119 | padding=(num_conv_kernels[i] - 1) // 2) 120 | ) 121 | layers.append(nn.BatchNorm2d(conv_channels)) 122 | layers.append(nn.ReLU(inplace=True)) 123 | 124 | layers.append( 125 | nn.Conv2d(in_channels=conv_channels, 126 | out_channels=out_channels, 127 | kernel_size=kernel_size, 128 | stride=1, 129 | padding=padding) 130 | ) 131 | 132 | if len(layers) > 1: 133 | self.final_layer = nn.Sequential(*layers) 134 | else: 135 | self.final_layer = layers[0] 136 | 137 | def get_loss(self, output, target, target_weight): 138 | """Calculate top-down keypoint loss. 139 | 140 | Note: 141 | - batch_size: N 142 | - num_keypoints: K 143 | - heatmaps height: H 144 | - heatmaps weight: W 145 | 146 | Args: 147 | output (torch.Tensor[N,K,H,W]): Output heatmaps. 148 | target (torch.Tensor[N,K,H,W]): Target heatmaps. 149 | target_weight (torch.Tensor[N,K,1]): 150 | Weights across different joint types. 151 | """ 152 | 153 | losses = dict() 154 | 155 | assert not isinstance(self.loss, nn.Sequential) 156 | assert target.dim() == 4 and target_weight.dim() == 3 157 | losses['heatmap_loss'] = self.loss(output, target, target_weight) 158 | 159 | return losses 160 | 161 | def get_accuracy(self, output, target, target_weight): 162 | """Calculate accuracy for top-down keypoint loss. 163 | 164 | Note: 165 | - batch_size: N 166 | - num_keypoints: K 167 | - heatmaps height: H 168 | - heatmaps weight: W 169 | 170 | Args: 171 | output (torch.Tensor[N,K,H,W]): Output heatmaps. 172 | target (torch.Tensor[N,K,H,W]): Target heatmaps. 173 | target_weight (torch.Tensor[N,K,1]): 174 | Weights across different joint types. 175 | """ 176 | 177 | accuracy = dict() 178 | 179 | if self.target_type == 'GaussianHeatmap': 180 | _, avg_acc, _ = pose_pck_accuracy( 181 | output.detach().cpu().numpy(), 182 | target.detach().cpu().numpy(), 183 | target_weight.detach().cpu().numpy().squeeze(-1) > 0) 184 | accuracy['acc_pose'] = float(avg_acc) 185 | 186 | return accuracy 187 | 188 | def forward(self, x): 189 | """Forward function.""" 190 | x = self._transform_inputs(x) 191 | x = self.deconv_layers(x) 192 | x = self.final_layer(x) 193 | return x 194 | 195 | def inference_model(self, x, flip_pairs=None): 196 | """Inference function. 197 | 198 | Returns: 199 | output_heatmap (np.ndarray): Output heatmaps. 200 | 201 | Args: 202 | x (torch.Tensor[N,K,H,W]): Input features. 203 | flip_pairs (None | list[tuple]): 204 | Pairs of keypoints which are mirrored. 
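        Example (illustrative sketch; parameter values are hypothetical, shapes
        assume the default three stride-2 deconv layers):

            >>> head = TopdownHeatmapSimpleHead(in_channels=32, out_channels=17)
            >>> feats = torch.randn(1, 32, 16, 12)
            >>> heatmaps = head.inference_model(feats, flip_pairs=None)
            >>> heatmaps.shape
            (1, 17, 128, 96)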
205 | """ 206 | output = self.forward(x) 207 | 208 | if flip_pairs is not None: 209 | output_heatmap = flip_back( 210 | output.detach().cpu().numpy(), 211 | flip_pairs, 212 | target_type=self.target_type) 213 | # feature is not aligned, shift flipped heatmap for higher accuracy 214 | if self.test_cfg.get('shift_heatmap', False): 215 | output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] 216 | else: 217 | output_heatmap = output.detach().cpu().numpy() 218 | return output_heatmap 219 | 220 | def _init_inputs(self, in_channels, in_index, input_transform): 221 | """Check and initialize input transforms. 222 | 223 | The in_channels, in_index and input_transform must match. 224 | Specifically, when input_transform is None, only single feature map 225 | will be selected. So in_channels and in_index must be of type int. 226 | When input_transform is not None, in_channels and in_index must be 227 | list or tuple, with the same length. 228 | 229 | Args: 230 | in_channels (int|Sequence[int]): Input channels. 231 | in_index (int|Sequence[int]): Input feature index. 232 | input_transform (str|None): Transformation type of input features. 233 | Options: 'resize_concat', 'multiple_select', None. 234 | 235 | - 'resize_concat': Multiple feature maps will be resize to the 236 | same size as first one and than concat together. 237 | Usually used in FCN head of HRNet. 238 | - 'multiple_select': Multiple feature maps will be bundle into 239 | a list and passed into decode head. 240 | - None: Only one select feature map is allowed. 241 | """ 242 | 243 | if input_transform is not None: 244 | assert input_transform in ['resize_concat', 'multiple_select'] 245 | self.input_transform = input_transform 246 | self.in_index = in_index 247 | if input_transform is not None: 248 | assert isinstance(in_channels, (list, tuple)) 249 | assert isinstance(in_index, (list, tuple)) 250 | assert len(in_channels) == len(in_index) 251 | if input_transform == 'resize_concat': 252 | self.in_channels = sum(in_channels) 253 | else: 254 | self.in_channels = in_channels 255 | else: 256 | assert isinstance(in_channels, int) 257 | assert isinstance(in_index, int) 258 | self.in_channels = in_channels 259 | 260 | def _transform_inputs(self, inputs): 261 | """Transform inputs for decoder. 262 | 263 | Args: 264 | inputs (list[Tensor] | Tensor): multi-level img features. 
265 | 266 | Returns: 267 | Tensor: The transformed inputs 268 | """ 269 | if not isinstance(inputs, list): 270 | if not isinstance(inputs, list): 271 | if self.upsample > 0: 272 | inputs = resize( 273 | input=F.relu(inputs), 274 | scale_factor=self.upsample, 275 | mode='bilinear', 276 | align_corners=self.align_corners 277 | ) 278 | return inputs 279 | 280 | if self.input_transform == 'resize_concat': 281 | inputs = [inputs[i] for i in self.in_index] 282 | upsampled_inputs = [ 283 | resize( 284 | input=x, 285 | size=inputs[0].shape[2:], 286 | mode='bilinear', 287 | align_corners=self.align_corners) for x in inputs 288 | ] 289 | inputs = torch.cat(upsampled_inputs, dim=1) 290 | elif self.input_transform == 'multiple_select': 291 | inputs = [inputs[i] for i in self.in_index] 292 | else: 293 | inputs = inputs[self.in_index] 294 | 295 | return inputs 296 | 297 | def _make_deconv_layer(self, num_layers, num_filters, num_kernels): 298 | """Make deconv layers.""" 299 | if num_layers != len(num_filters): 300 | error_msg = f'num_layers({num_layers}) ' \ 301 | f'!= length of num_filters({len(num_filters)})' 302 | raise ValueError(error_msg) 303 | if num_layers != len(num_kernels): 304 | error_msg = f'num_layers({num_layers}) ' \ 305 | f'!= length of num_kernels({len(num_kernels)})' 306 | raise ValueError(error_msg) 307 | 308 | layers = [] 309 | for i in range(num_layers): 310 | kernel, padding, output_padding = \ 311 | self._get_deconv_cfg(num_kernels[i]) 312 | 313 | planes = num_filters[i] 314 | layers.append( 315 | nn.ConvTranspose2d(in_channels=self.in_channels, 316 | out_channels=planes, 317 | kernel_size=kernel, 318 | stride=2, 319 | padding=padding, 320 | output_padding=output_padding, 321 | bias=False) 322 | ) 323 | layers.append(nn.BatchNorm2d(planes)) 324 | layers.append(nn.ReLU(inplace=True)) 325 | self.in_channels = planes 326 | 327 | return nn.Sequential(*layers) 328 | 329 | def init_weights(self): 330 | """Initialize model weights.""" 331 | for _, m in self.deconv_layers.named_modules(): 332 | if isinstance(m, nn.ConvTranspose2d): 333 | normal_init(m, std=0.001) 334 | elif isinstance(m, nn.BatchNorm2d): 335 | constant_init(m, 1) 336 | for m in self.final_layer.modules(): 337 | if isinstance(m, nn.Conv2d): 338 | normal_init(m, std=0.001, bias=0) 339 | elif isinstance(m, nn.BatchNorm2d): 340 | constant_init(m, 1) 341 | -------------------------------------------------------------------------------- /models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .classfication_loss import BCELoss 3 | from .heatmap_loss import AdaptiveWingLoss 4 | from .mesh_loss import GANLoss, MeshLoss 5 | from .mse_loss import JointsMSELoss, JointsOHKMMSELoss 6 | from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory 7 | from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, 8 | SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss, 9 | WingLoss) 10 | 11 | __all__ = [ 12 | 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss', 13 | 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss', 14 | 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', 15 | 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss' 16 | ] 17 | -------------------------------------------------------------------------------- /models/losses/classfication_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. 
All rights reserved. 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | __all__ = ['BCELoss'] 7 | 8 | 9 | class BCELoss(nn.Module): 10 | """Binary Cross Entropy loss.""" 11 | 12 | def __init__(self, use_target_weight=False, loss_weight=1.): 13 | super().__init__() 14 | self.criterion = F.binary_cross_entropy 15 | self.use_target_weight = use_target_weight 16 | self.loss_weight = loss_weight 17 | 18 | def forward(self, output, target, target_weight=None): 19 | """Forward function. 20 | 21 | Note: 22 | - batch_size: N 23 | - num_labels: K 24 | 25 | Args: 26 | output (torch.Tensor[N, K]): Output classification. 27 | target (torch.Tensor[N, K]): Target classification. 28 | target_weight (torch.Tensor[N, K] or torch.Tensor[N]): 29 | Weights across different labels. 30 | """ 31 | 32 | if self.use_target_weight: 33 | assert target_weight is not None 34 | loss = self.criterion(output, target, reduction='none') 35 | if target_weight.dim() == 1: 36 | target_weight = target_weight[:, None] 37 | loss = (loss * target_weight).mean() 38 | else: 39 | loss = self.criterion(output, target) 40 | 41 | return loss * self.loss_weight 42 | -------------------------------------------------------------------------------- /models/losses/heatmap_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class AdaptiveWingLoss(nn.Module): 7 | """Adaptive wing loss. paper ref: 'Adaptive Wing Loss for Robust Face 8 | Alignment via Heatmap Regression' Wang et al. ICCV'2019. 9 | 10 | Args: 11 | alpha (float), omega (float), epsilon (float), theta (float) 12 | are hyper-parameters. 13 | use_target_weight (bool): Option to use weighted MSE loss. 14 | Different joint types may have different target weights. 15 | loss_weight (float): Weight of the loss. Default: 1.0. 16 | """ 17 | 18 | def __init__(self, 19 | alpha=2.1, 20 | omega=14, 21 | epsilon=1, 22 | theta=0.5, 23 | use_target_weight=False, 24 | loss_weight=1.): 25 | super().__init__() 26 | self.alpha = float(alpha) 27 | self.omega = float(omega) 28 | self.epsilon = float(epsilon) 29 | self.theta = float(theta) 30 | self.use_target_weight = use_target_weight 31 | self.loss_weight = loss_weight 32 | 33 | def criterion(self, pred, target): 34 | """Criterion of wingloss. 35 | 36 | Note: 37 | batch_size: N 38 | num_keypoints: K 39 | 40 | Args: 41 | pred (torch.Tensor[NxKxHxW]): Predicted heatmaps. 42 | target (torch.Tensor[NxKxHxW]): Target heatmaps. 43 | """ 44 | H, W = pred.shape[2:4] 45 | delta = (target - pred).abs() 46 | 47 | A = self.omega * ( 48 | 1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) 49 | ) * (self.alpha - target) * (torch.pow( 50 | self.theta / self.epsilon, 51 | self.alpha - target - 1)) * (1 / self.epsilon) 52 | C = self.theta * A - self.omega * torch.log( 53 | 1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) 54 | 55 | losses = torch.where( 56 | delta < self.theta, 57 | self.omega * 58 | torch.log(1 + 59 | torch.pow(delta / self.epsilon, self.alpha - target)), 60 | A * delta - C) 61 | 62 | return torch.mean(losses) 63 | 64 | def forward(self, output, target, target_weight): 65 | """Forward function. 66 | 67 | Note: 68 | batch_size: N 69 | num_keypoints: K 70 | 71 | Args: 72 | output (torch.Tensor[NxKxHxW]): Output heatmaps. 73 | target (torch.Tensor[NxKxHxW]): Target heatmaps. 74 | target_weight (torch.Tensor[NxKx1]): 75 | Weights across different joint types. 
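        Example (illustrative sketch with hypothetical shapes):

            >>> criterion = AdaptiveWingLoss(use_target_weight=True)
            >>> output = torch.rand(2, 17, 64, 48)
            >>> target = torch.rand(2, 17, 64, 48)
            >>> target_weight = torch.ones(2, 17, 1)
            >>> loss = criterion(output, target, target_weight)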
76 | """ 77 | if self.use_target_weight: 78 | loss = self.criterion(output * target_weight.unsqueeze(-1), 79 | target * target_weight.unsqueeze(-1)) 80 | else: 81 | loss = self.criterion(output, target) 82 | 83 | return loss * self.loss_weight 84 | -------------------------------------------------------------------------------- /models/losses/mse_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | __all__ = ['JointsMSELoss', 'JointsOHKMMSELoss',] 7 | 8 | 9 | class JointsMSELoss(nn.Module): 10 | """MSE loss for heatmaps. 11 | 12 | Args: 13 | use_target_weight (bool): Option to use weighted MSE loss. 14 | Different joint types may have different target weights. 15 | loss_weight (float): Weight of the loss. Default: 1.0. 16 | """ 17 | 18 | def __init__(self, use_target_weight=False, loss_weight=1.): 19 | super().__init__() 20 | self.criterion = nn.MSELoss() 21 | self.use_target_weight = use_target_weight 22 | self.loss_weight = loss_weight 23 | 24 | def forward(self, output, target, target_weight): 25 | """Forward function.""" 26 | batch_size = output.size(0) 27 | num_joints = output.size(1) 28 | 29 | heatmaps_pred = output.reshape( 30 | (batch_size, num_joints, -1)).split(1, 1) 31 | heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) 32 | 33 | loss = 0. 34 | 35 | for idx in range(num_joints): 36 | heatmap_pred = heatmaps_pred[idx].squeeze(1) 37 | heatmap_gt = heatmaps_gt[idx].squeeze(1) 38 | if self.use_target_weight: 39 | loss += self.criterion(heatmap_pred * target_weight[:, idx], 40 | heatmap_gt * target_weight[:, idx]) 41 | else: 42 | loss += self.criterion(heatmap_pred, heatmap_gt) 43 | 44 | return loss / num_joints * self.loss_weight 45 | 46 | 47 | class CombinedTargetMSELoss(nn.Module): 48 | """MSE loss for combined target. 49 | CombinedTarget: The combination of classification target 50 | (response map) and regression target (offset map). 51 | Paper ref: Huang et al. The Devil is in the Details: Delving into 52 | Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 53 | 54 | Args: 55 | use_target_weight (bool): Option to use weighted MSE loss. 56 | Different joint types may have different target weights. 57 | loss_weight (float): Weight of the loss. Default: 1.0. 58 | """ 59 | 60 | def __init__(self, use_target_weight, loss_weight=1.): 61 | super().__init__() 62 | self.criterion = nn.MSELoss(reduction='mean') 63 | self.use_target_weight = use_target_weight 64 | self.loss_weight = loss_weight 65 | 66 | def forward(self, output, target, target_weight): 67 | batch_size = output.size(0) 68 | num_channels = output.size(1) 69 | heatmaps_pred = output.reshape( 70 | (batch_size, num_channels, -1)).split(1, 1) 71 | heatmaps_gt = target.reshape( 72 | (batch_size, num_channels, -1)).split(1, 1) 73 | loss = 0. 
74 | num_joints = num_channels // 3 75 | for idx in range(num_joints): 76 | heatmap_pred = heatmaps_pred[idx * 3].squeeze() 77 | heatmap_gt = heatmaps_gt[idx * 3].squeeze() 78 | offset_x_pred = heatmaps_pred[idx * 3 + 1].squeeze() 79 | offset_x_gt = heatmaps_gt[idx * 3 + 1].squeeze() 80 | offset_y_pred = heatmaps_pred[idx * 3 + 2].squeeze() 81 | offset_y_gt = heatmaps_gt[idx * 3 + 2].squeeze() 82 | if self.use_target_weight: 83 | heatmap_pred = heatmap_pred * target_weight[:, idx] 84 | heatmap_gt = heatmap_gt * target_weight[:, idx] 85 | # classification loss 86 | loss += 0.5 * self.criterion(heatmap_pred, heatmap_gt) 87 | # regression loss 88 | loss += 0.5 * self.criterion(heatmap_gt * offset_x_pred, 89 | heatmap_gt * offset_x_gt) 90 | loss += 0.5 * self.criterion(heatmap_gt * offset_y_pred, 91 | heatmap_gt * offset_y_gt) 92 | return loss / num_joints * self.loss_weight 93 | 94 | 95 | class JointsOHKMMSELoss(nn.Module): 96 | """MSE loss with online hard keypoint mining. 97 | 98 | Args: 99 | use_target_weight (bool): Option to use weighted MSE loss. 100 | Different joint types may have different target weights. 101 | topk (int): Only top k joint losses are kept. 102 | loss_weight (float): Weight of the loss. Default: 1.0. 103 | """ 104 | 105 | def __init__(self, use_target_weight=False, topk=8, loss_weight=1.): 106 | super().__init__() 107 | assert topk > 0 108 | self.criterion = nn.MSELoss(reduction='none') 109 | self.use_target_weight = use_target_weight 110 | self.topk = topk 111 | self.loss_weight = loss_weight 112 | 113 | def _ohkm(self, loss): 114 | """Online hard keypoint mining.""" 115 | ohkm_loss = 0. 116 | N = len(loss) 117 | for i in range(N): 118 | sub_loss = loss[i] 119 | _, topk_idx = torch.topk( 120 | sub_loss, k=self.topk, dim=0, sorted=False) 121 | tmp_loss = torch.gather(sub_loss, 0, topk_idx) 122 | ohkm_loss += torch.sum(tmp_loss) / self.topk 123 | ohkm_loss /= N 124 | return ohkm_loss 125 | 126 | def forward(self, output, target, target_weight): 127 | """Forward function.""" 128 | batch_size = output.size(0) 129 | num_joints = output.size(1) 130 | if num_joints < self.topk: 131 | raise ValueError(f'topk ({self.topk}) should not ' 132 | f'larger than num_joints ({num_joints}).') 133 | heatmaps_pred = output.reshape( 134 | (batch_size, num_joints, -1)).split(1, 1) 135 | heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) 136 | 137 | losses = [] 138 | for idx in range(num_joints): 139 | heatmap_pred = heatmaps_pred[idx].squeeze(1) 140 | heatmap_gt = heatmaps_gt[idx].squeeze(1) 141 | if self.use_target_weight: 142 | losses.append( 143 | self.criterion(heatmap_pred * target_weight[:, idx], 144 | heatmap_gt * target_weight[:, idx])) 145 | else: 146 | losses.append(self.criterion(heatmap_pred, heatmap_gt)) 147 | 148 | losses = [loss.mean(dim=1).unsqueeze(dim=1) for loss in losses] 149 | losses = torch.cat(losses, dim=1) 150 | 151 | return self._ohkm(losses) * self.loss_weight 152 | -------------------------------------------------------------------------------- /models/losses/multi_loss_factory.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Adapted from https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation 3 | # Original licence: Copyright (c) Microsoft, under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | __all__ = ['HeatmapLoss', 'AELoss', 'MultiLossFactory'] 11 | 12 | 13 | def _make_input(t, requires_grad=False, device=torch.device('cpu')): 14 | """Make zero inputs for AE loss. 15 | 16 | Args: 17 | t (torch.Tensor): input 18 | requires_grad (bool): Option to use requires_grad. 19 | device: torch device 20 | 21 | Returns: 22 | torch.Tensor: zero input. 23 | """ 24 | inp = torch.autograd.Variable(t, requires_grad=requires_grad) 25 | inp = inp.sum() 26 | inp = inp.to(device) 27 | return inp 28 | 29 | 30 | class HeatmapLoss(nn.Module): 31 | """Accumulate the heatmap loss for each image in the batch. 32 | 33 | Args: 34 | supervise_empty (bool): Whether to supervise empty channels. 35 | """ 36 | 37 | def __init__(self, supervise_empty=True): 38 | super().__init__() 39 | self.supervise_empty = supervise_empty 40 | 41 | def forward(self, pred, gt, mask): 42 | """Forward function. 43 | 44 | Note: 45 | - batch_size: N 46 | - heatmaps weight: W 47 | - heatmaps height: H 48 | - max_num_people: M 49 | - num_keypoints: K 50 | 51 | Args: 52 | pred (torch.Tensor[N,K,H,W]):heatmap of output. 53 | gt (torch.Tensor[N,K,H,W]): target heatmap. 54 | mask (torch.Tensor[N,H,W]): mask of target. 55 | """ 56 | assert pred.size() == gt.size( 57 | ), f'pred.size() is {pred.size()}, gt.size() is {gt.size()}' 58 | 59 | if not self.supervise_empty: 60 | empty_mask = (gt.sum(dim=[2, 3], keepdim=True) > 0).float() 61 | loss = ((pred - gt)**2) * empty_mask.expand_as( 62 | pred) * mask[:, None, :, :].expand_as(pred) 63 | else: 64 | loss = ((pred - gt)**2) * mask[:, None, :, :].expand_as(pred) 65 | loss = loss.mean(dim=3).mean(dim=2).mean(dim=1) 66 | return loss 67 | 68 | 69 | class AELoss(nn.Module): 70 | """Associative Embedding loss. 71 | 72 | `Associative Embedding: End-to-End Learning for Joint Detection and 73 | Grouping `_. 74 | """ 75 | 76 | def __init__(self, loss_type): 77 | super().__init__() 78 | self.loss_type = loss_type 79 | 80 | def singleTagLoss(self, pred_tag, joints): 81 | """Associative embedding loss for one image. 82 | 83 | Note: 84 | - heatmaps weight: W 85 | - heatmaps height: H 86 | - max_num_people: M 87 | - num_keypoints: K 88 | 89 | Args: 90 | pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image. 91 | joints (torch.Tensor[M,K,2]): joints information for one image. 
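        Note:
            The pull term is the mean squared deviation of each person's tags
            from that person's mean tag; the push term penalizes pairs of
            different persons whose mean tags are close, using exp(-diff**2)
            for loss_type='exp' or a hinge on (1 - |diff|) for loss_type='max'.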
92 | """ 93 | tags = [] 94 | pull = 0 95 | for joints_per_person in joints: 96 | tmp = [] 97 | for joint in joints_per_person: 98 | if joint[1] > 0: 99 | tmp.append(pred_tag[joint[0]]) 100 | if len(tmp) == 0: 101 | continue 102 | tmp = torch.stack(tmp) 103 | tags.append(torch.mean(tmp, dim=0)) 104 | pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2) 105 | 106 | num_tags = len(tags) 107 | if num_tags == 0: 108 | return ( 109 | _make_input(torch.zeros(1).float(), device=pred_tag.device), 110 | _make_input(torch.zeros(1).float(), device=pred_tag.device)) 111 | elif num_tags == 1: 112 | return (_make_input( 113 | torch.zeros(1).float(), device=pred_tag.device), pull) 114 | 115 | tags = torch.stack(tags) 116 | 117 | size = (num_tags, num_tags) 118 | A = tags.expand(*size) 119 | B = A.permute(1, 0) 120 | 121 | diff = A - B 122 | 123 | if self.loss_type == 'exp': 124 | diff = torch.pow(diff, 2) 125 | push = torch.exp(-diff) 126 | push = torch.sum(push) - num_tags 127 | elif self.loss_type == 'max': 128 | diff = 1 - torch.abs(diff) 129 | push = torch.clamp(diff, min=0).sum() - num_tags 130 | else: 131 | raise ValueError('Unknown ae loss type') 132 | 133 | push_loss = push / ((num_tags - 1) * num_tags) * 0.5 134 | pull_loss = pull / (num_tags) 135 | 136 | return push_loss, pull_loss 137 | 138 | def forward(self, tags, joints): 139 | """Accumulate the tag loss for each image in the batch. 140 | 141 | Note: 142 | - batch_size: N 143 | - heatmaps weight: W 144 | - heatmaps height: H 145 | - max_num_people: M 146 | - num_keypoints: K 147 | 148 | Args: 149 | tags (torch.Tensor[N,KxHxW,1]): tag channels of output. 150 | joints (torch.Tensor[N,M,K,2]): joints information. 151 | """ 152 | pushes, pulls = [], [] 153 | joints = joints.cpu().data.numpy() 154 | batch_size = tags.size(0) 155 | for i in range(batch_size): 156 | push, pull = self.singleTagLoss(tags[i], joints[i]) 157 | pushes.append(push) 158 | pulls.append(pull) 159 | return torch.stack(pushes), torch.stack(pulls) 160 | 161 | 162 | class MultiLossFactory(nn.Module): 163 | """Loss for bottom-up models. 164 | 165 | Args: 166 | num_joints (int): Number of keypoints. 167 | num_stages (int): Number of stages. 168 | ae_loss_type (str): Type of ae loss. 169 | with_ae_loss (list[bool]): Use ae loss or not in multi-heatmap. 170 | push_loss_factor (list[float]): 171 | Parameter of push loss in multi-heatmap. 172 | pull_loss_factor (list[float]): 173 | Parameter of pull loss in multi-heatmap. 174 | with_heatmap_loss (list[bool]): 175 | Use heatmap loss or not in multi-heatmap. 176 | heatmaps_loss_factor (list[float]): 177 | Parameter of heatmap loss in multi-heatmap. 178 | supervise_empty (bool): Whether to supervise empty channels. 
179 | """ 180 | 181 | def __init__(self, 182 | num_joints, 183 | num_stages, 184 | ae_loss_type, 185 | with_ae_loss, 186 | push_loss_factor, 187 | pull_loss_factor, 188 | with_heatmaps_loss, 189 | heatmaps_loss_factor, 190 | supervise_empty=True): 191 | super().__init__() 192 | 193 | assert isinstance(with_heatmaps_loss, (list, tuple)), \ 194 | 'with_heatmaps_loss should be a list or tuple' 195 | assert isinstance(heatmaps_loss_factor, (list, tuple)), \ 196 | 'heatmaps_loss_factor should be a list or tuple' 197 | assert isinstance(with_ae_loss, (list, tuple)), \ 198 | 'with_ae_loss should be a list or tuple' 199 | assert isinstance(push_loss_factor, (list, tuple)), \ 200 | 'push_loss_factor should be a list or tuple' 201 | assert isinstance(pull_loss_factor, (list, tuple)), \ 202 | 'pull_loss_factor should be a list or tuple' 203 | 204 | self.num_joints = num_joints 205 | self.num_stages = num_stages 206 | self.ae_loss_type = ae_loss_type 207 | self.with_ae_loss = with_ae_loss 208 | self.push_loss_factor = push_loss_factor 209 | self.pull_loss_factor = pull_loss_factor 210 | self.with_heatmaps_loss = with_heatmaps_loss 211 | self.heatmaps_loss_factor = heatmaps_loss_factor 212 | 213 | self.heatmaps_loss = \ 214 | nn.ModuleList( 215 | [ 216 | HeatmapLoss(supervise_empty) 217 | if with_heatmaps_loss else None 218 | for with_heatmaps_loss in self.with_heatmaps_loss 219 | ] 220 | ) 221 | 222 | self.ae_loss = \ 223 | nn.ModuleList( 224 | [ 225 | AELoss(self.ae_loss_type) if with_ae_loss else None 226 | for with_ae_loss in self.with_ae_loss 227 | ] 228 | ) 229 | 230 | def forward(self, outputs, heatmaps, masks, joints): 231 | """Forward function to calculate losses. 232 | 233 | Note: 234 | - batch_size: N 235 | - heatmaps weight: W 236 | - heatmaps height: H 237 | - max_num_people: M 238 | - num_keypoints: K 239 | - output_channel: C C=2K if use ae loss else K 240 | 241 | Args: 242 | outputs (list(torch.Tensor[N,C,H,W])): outputs of stages. 243 | heatmaps (list(torch.Tensor[N,K,H,W])): target of heatmaps. 244 | masks (list(torch.Tensor[N,H,W])): masks of heatmaps. 245 | joints (list(torch.Tensor[N,M,K,2])): joints of ae loss. 
246 | """ 247 | heatmaps_losses = [] 248 | push_losses = [] 249 | pull_losses = [] 250 | for idx in range(len(outputs)): 251 | offset_feat = 0 252 | if self.heatmaps_loss[idx]: 253 | heatmaps_pred = outputs[idx][:, :self.num_joints] 254 | offset_feat = self.num_joints 255 | heatmaps_loss = self.heatmaps_loss[idx](heatmaps_pred, 256 | heatmaps[idx], 257 | masks[idx]) 258 | heatmaps_loss = heatmaps_loss * self.heatmaps_loss_factor[idx] 259 | heatmaps_losses.append(heatmaps_loss) 260 | else: 261 | heatmaps_losses.append(None) 262 | 263 | if self.ae_loss[idx]: 264 | tags_pred = outputs[idx][:, offset_feat:] 265 | batch_size = tags_pred.size()[0] 266 | tags_pred = tags_pred.contiguous().view(batch_size, -1, 1) 267 | 268 | push_loss, pull_loss = self.ae_loss[idx](tags_pred, 269 | joints[idx]) 270 | push_loss = push_loss * self.push_loss_factor[idx] 271 | pull_loss = pull_loss * self.pull_loss_factor[idx] 272 | 273 | push_losses.append(push_loss) 274 | pull_losses.append(pull_loss) 275 | else: 276 | push_losses.append(None) 277 | pull_losses.append(None) 278 | 279 | return heatmaps_losses, push_losses, pull_losses 280 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from .backbone.vit import ViT 4 | from .head.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead 5 | 6 | 7 | __all__ = ['ViTPose'] 8 | 9 | 10 | class ViTPose(nn.Module): 11 | def __init__(self, cfg: dict) -> None: 12 | super(ViTPose, self).__init__() 13 | 14 | backbone_cfg = {k: v for k, v in cfg['backbone'].items() if k != 'type'} 15 | head_cfg = {k: v for k, v in cfg['keypoint_head'].items() if k != 'type'} 16 | 17 | self.backbone = ViT(**backbone_cfg) 18 | self.keypoint_head = TopdownHeatmapSimpleHead(**head_cfg) 19 | 20 | def forward_features(self, x): 21 | return self.backbone(x) 22 | 23 | def forward(self, x): 24 | return self.keypoint_head(self.backbone(x)) -------------------------------------------------------------------------------- /models/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | 3 | class LayerDecayOptimizer: 4 | def __init__(self, optimizer, layerwise_decay_rate): 5 | self.optimizer = optimizer 6 | self.layerwise_decay_rate = layerwise_decay_rate 7 | self.param_groups = optimizer.param_groups 8 | 9 | def step(self, *args, **kwargs): 10 | for i, group in enumerate(self.optimizer.param_groups): 11 | group['lr'] *= self.layerwise_decay_rate[i] 12 | self.optimizer.step(*args, **kwargs) 13 | 14 | def zero_grad(self, *args, **kwargs): 15 | self.optimizer.zero_grad(*args, **kwargs) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ffmpeg==1.4 2 | matplotlib==3.6.2 3 | munkres==1.1.4 4 | numpy==1.23.5 5 | opencv_python==4.6.0.66 6 | Pillow==9.3.0 7 | torch==1.9.0+cu111 8 | torchvision==0.10.0+cu111 9 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import argparse 3 | import copy 4 | import os 5 | import os.path as osp 6 | import time 7 | import warnings 8 | import click 9 | import yaml 10 | 11 | from glob import glob 12 | 13 | import torch 14 | import torch.distributed as dist 15 | 16 | from utils.util import init_random_seed, set_random_seed 17 | from utils.dist_util import get_dist_info, init_dist 18 | from utils.logging import get_root_logger 19 | 20 | import configs.ViTPose_base_coco_256x192 as b_cfg 21 | import configs.ViTPose_large_coco_256x192 as l_cfg 22 | import configs.ViTPose_huge_coco_256x192 as h_cfg 23 | 24 | from models.model import ViTPose 25 | from datasets.COCO import COCODataset 26 | from utils.train_valid_fn import train_model 27 | 28 | CUR_PATH = osp.dirname(__file__) 29 | 30 | @click.command() 31 | @click.option('--config-path', type=click.Path(exists=True), default='config.yaml', required=True, help='train config file path') 32 | @click.option('--model-name', type=str, default='b', required=True, help='[b: ViT-B, l: ViT-L, h: ViT-H]') 33 | def main(config_path, model_name): 34 | 35 | cfg = {'b':b_cfg, 36 | 'l':l_cfg, 37 | 'h':h_cfg}.get(model_name.lower()) 38 | # Load config.yaml 39 | with open(config_path, 'r') as f: 40 | cfg_yaml = yaml.load(f, Loader=yaml.SafeLoader) 41 | 42 | for k, v in cfg_yaml.items(): 43 | if hasattr(cfg, k): 44 | raise ValueError(f"Already exsist {k} in config") 45 | else: 46 | cfg.__setattr__(k, v) 47 | 48 | # set cudnn_benchmark 49 | if cfg.cudnn_benchmark: 50 | torch.backends.cudnn.benchmark = True 51 | 52 | # Set work directory (session-level) 53 | if not hasattr(cfg, 'work_dir'): 54 | cfg.__setattr__('work_dir', f"{CUR_PATH}/runs/train") 55 | 56 | if not osp.exists(cfg.work_dir): 57 | os.makedirs(cfg.work_dir) 58 | session_list = sorted(glob(f"{cfg.work_dir}/*")) 59 | if len(session_list) == 0: 60 | session = 1 61 | else: 62 | session = int(os.path.basename(session_list[-1])) + 1 63 | session_dir = osp.join(cfg.work_dir, str(session).zfill(3)) 64 | os.makedirs(session_dir) 65 | cfg.__setattr__('work_dir', session_dir) 66 | 67 | 68 | if cfg.autoscale_lr: 69 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) 70 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 71 | 72 | # init distributed env first, since logger depends on the dist info. 
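    # cfg.launcher is expected to be 'none' for single-process training, or one of
    # 'pytorch', 'mpi', 'slurm', which are dispatched by init_dist in utils/dist_util.py.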
73 | if cfg.launcher == 'none': 74 | distributed = False 75 | if len(cfg.gpu_ids) > 1: 76 | warnings.warn( 77 | f"We treat {cfg['gpu_ids']} as gpu-ids, and reset to " 78 | f"{cfg['gpu_ids'][0:1]} as gpu-ids to avoid potential error in " 79 | "non-distribute training time.") 80 | cfg.gpu_ids = cfg.gpu_ids[0:1] 81 | else: 82 | distributed = True 83 | init_dist(cfg.launcher, **cfg.dist_params) 84 | # re-set gpu_ids with distributed training mode 85 | _, world_size = get_dist_info() 86 | cfg.gpu_ids = range(world_size) 87 | 88 | # init the logger before other steps 89 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 90 | log_file = osp.join(session_dir, f'{timestamp}.log') 91 | logger = get_root_logger(log_file=log_file) 92 | 93 | # init the meta dict to record some important information such as 94 | # environment info and seed, which will be logged 95 | meta = dict() 96 | 97 | # log some basic info 98 | logger.info(f'Distributed training: {distributed}') 99 | 100 | # set random seeds 101 | seed = init_random_seed(cfg.seed) 102 | logger.info(f"Set random seed to {seed}, " 103 | f"deterministic: {cfg.deterministic}") 104 | set_random_seed(seed, deterministic=cfg.deterministic) 105 | meta['seed'] = seed 106 | 107 | # Set model 108 | model = ViTPose(cfg.model) 109 | if cfg.resume_from: 110 | model.load_state_dict(torch.load(cfg.resume_from)['state_dict']) 111 | 112 | # Set dataset 113 | datasets_train = COCODataset( 114 | root_path=cfg.data_root, 115 | data_version="train_custom", 116 | is_train=True, 117 | use_gt_bboxes=True, 118 | image_width=192, 119 | image_height=256, 120 | scale=True, 121 | scale_factor=0.35, 122 | flip_prob=0.5, 123 | rotate_prob=0.5, 124 | rotation_factor=45., 125 | half_body_prob=0.3, 126 | use_different_joints_weight=True, 127 | heatmap_sigma=3, 128 | soft_nms=False 129 | ) 130 | 131 | datasets_valid = COCODataset( 132 | root_path=cfg.data_root, 133 | data_version="valid_custom", 134 | is_train=False, 135 | use_gt_bboxes=True, 136 | image_width=192, 137 | image_height=256, 138 | scale=False, 139 | scale_factor=0.35, 140 | flip_prob=0.5, 141 | rotate_prob=0.5, 142 | rotation_factor=45., 143 | half_body_prob=0.3, 144 | use_different_joints_weight=True, 145 | heatmap_sigma=3, 146 | soft_nms=False 147 | ) 148 | 149 | train_model( 150 | model=model, 151 | datasets_train=datasets_train, 152 | datasets_valid=datasets_valid, 153 | cfg=cfg, 154 | distributed=distributed, 155 | validate=cfg.validate, 156 | timestamp=timestamp, 157 | meta=meta 158 | ) 159 | 160 | 161 | if __name__ == '__main__': 162 | main() 163 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .top_down_eval import * 3 | from .post_processing import * 4 | from .visualization import * 5 | from .dist_util import * 6 | from .logging import * 7 | -------------------------------------------------------------------------------- /utils/dist_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | import functools 4 | import os 5 | import socket 6 | import subprocess 7 | from collections import OrderedDict 8 | from typing import Callable, List, Optional, Tuple 9 | 10 | import torch 11 | import torch.multiprocessing as mp 12 | from torch import distributed as dist 13 | from torch._utils import (_flatten_dense_tensors, _take_tensors, 14 | _unflatten_dense_tensors) 15 | 16 | 17 | def is_mps_available() -> bool: 18 | """Return True if mps devices exist. 19 | 20 | It's specialized for mac m1 chips and require torch version 1.12 or higher. 21 | """ 22 | try: 23 | import torch 24 | return hasattr(torch.backends, 25 | 'mps') and torch.backends.mps.is_available() 26 | except Exception: 27 | return False 28 | 29 | def _find_free_port() -> str: 30 | # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 31 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 32 | # Binding to port 0 will cause the OS to find an available port for us 33 | sock.bind(('', 0)) 34 | port = sock.getsockname()[1] 35 | sock.close() 36 | # NOTE: there is still a chance the port could be taken by other processes. 37 | return port 38 | 39 | 40 | def _is_free_port(port: int) -> bool: 41 | ips = socket.gethostbyname_ex(socket.gethostname())[-1] 42 | ips.append('localhost') 43 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 44 | return all(s.connect_ex((ip, port)) != 0 for ip in ips) 45 | 46 | 47 | def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None: 48 | if mp.get_start_method(allow_none=True) is None: 49 | mp.set_start_method('spawn') 50 | if launcher == 'pytorch': 51 | _init_dist_pytorch(backend, **kwargs) 52 | elif launcher == 'mpi': 53 | _init_dist_mpi(backend, **kwargs) 54 | elif launcher == 'slurm': 55 | _init_dist_slurm(backend, **kwargs) 56 | else: 57 | raise ValueError(f'Invalid launcher type: {launcher}') 58 | 59 | 60 | def _init_dist_pytorch(backend: str, **kwargs) -> None: 61 | # TODO: use local_rank instead of rank % num_gpus 62 | rank = int(os.environ['RANK']) 63 | num_gpus = torch.cuda.device_count() 64 | torch.cuda.set_device(rank % num_gpus) 65 | dist.init_process_group(backend=backend, **kwargs) 66 | 67 | 68 | def _init_dist_mpi(backend: str, **kwargs) -> None: 69 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 70 | torch.cuda.set_device(local_rank) 71 | if 'MASTER_PORT' not in os.environ: 72 | # 29500 is torch.distributed default port 73 | os.environ['MASTER_PORT'] = '29500' 74 | if 'MASTER_ADDR' not in os.environ: 75 | raise KeyError('The environment variable MASTER_ADDR is not set') 76 | os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] 77 | os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] 78 | dist.init_process_group(backend=backend, **kwargs) 79 | 80 | 81 | def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None: 82 | """Initialize slurm distributed training environment. 83 | 84 | If argument ``port`` is not specified, then the master port will be system 85 | environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system 86 | environment variable, then a default port ``29500`` will be used. 87 | 88 | Args: 89 | backend (str): Backend of torch.distributed. 90 | port (int, optional): Master port. Defaults to None. 
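    Note:
        The function reads SLURM_PROCID, SLURM_NTASKS and SLURM_NODELIST from
        the environment, then exports MASTER_PORT, MASTER_ADDR (if not already
        set), WORLD_SIZE, LOCAL_RANK and RANK before calling
        dist.init_process_group.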
91 | """ 92 | proc_id = int(os.environ['SLURM_PROCID']) 93 | ntasks = int(os.environ['SLURM_NTASKS']) 94 | node_list = os.environ['SLURM_NODELIST'] 95 | num_gpus = torch.cuda.device_count() 96 | torch.cuda.set_device(proc_id % num_gpus) 97 | addr = subprocess.getoutput( 98 | f'scontrol show hostname {node_list} | head -n1') 99 | # specify master port 100 | if port is not None: 101 | os.environ['MASTER_PORT'] = str(port) 102 | elif 'MASTER_PORT' in os.environ: 103 | pass # use MASTER_PORT in the environment variable 104 | else: 105 | # if torch.distributed default port(29500) is available 106 | # then use it, else find a free port 107 | if _is_free_port(29500): 108 | os.environ['MASTER_PORT'] = '29500' 109 | else: 110 | os.environ['MASTER_PORT'] = str(_find_free_port()) 111 | # use MASTER_ADDR in the environment variable if it already exists 112 | if 'MASTER_ADDR' not in os.environ: 113 | os.environ['MASTER_ADDR'] = addr 114 | os.environ['WORLD_SIZE'] = str(ntasks) 115 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 116 | os.environ['RANK'] = str(proc_id) 117 | dist.init_process_group(backend=backend) 118 | 119 | 120 | def get_dist_info() -> Tuple[int, int]: 121 | if dist.is_available() and dist.is_initialized(): 122 | rank = dist.get_rank() 123 | world_size = dist.get_world_size() 124 | else: 125 | rank = 0 126 | world_size = 1 127 | return rank, world_size 128 | 129 | 130 | def master_only(func: Callable) -> Callable: 131 | 132 | @functools.wraps(func) 133 | def wrapper(*args, **kwargs): 134 | rank, _ = get_dist_info() 135 | if rank == 0: 136 | return func(*args, **kwargs) 137 | 138 | return wrapper 139 | 140 | 141 | def allreduce_params(params: List[torch.nn.Parameter], 142 | coalesce: bool = True, 143 | bucket_size_mb: int = -1) -> None: 144 | """Allreduce parameters. 145 | 146 | Args: 147 | params (list[torch.nn.Parameter]): List of parameters or buffers 148 | of a model. 149 | coalesce (bool, optional): Whether allreduce parameters as a whole. 150 | Defaults to True. 151 | bucket_size_mb (int, optional): Size of bucket, the unit is MB. 152 | Defaults to -1. 153 | """ 154 | _, world_size = get_dist_info() 155 | if world_size == 1: 156 | return 157 | params = [param.data for param in params] 158 | if coalesce: 159 | _allreduce_coalesced(params, world_size, bucket_size_mb) 160 | else: 161 | for tensor in params: 162 | dist.all_reduce(tensor.div_(world_size)) 163 | 164 | 165 | def allreduce_grads(params: List[torch.nn.Parameter], 166 | coalesce: bool = True, 167 | bucket_size_mb: int = -1) -> None: 168 | """Allreduce gradients. 169 | 170 | Args: 171 | params (list[torch.nn.Parameter]): List of parameters of a model. 172 | coalesce (bool, optional): Whether allreduce parameters as a whole. 173 | Defaults to True. 174 | bucket_size_mb (int, optional): Size of bucket, the unit is MB. 175 | Defaults to -1. 
176 | """ 177 | grads = [ 178 | param.grad.data for param in params 179 | if param.requires_grad and param.grad is not None 180 | ] 181 | _, world_size = get_dist_info() 182 | if world_size == 1: 183 | return 184 | if coalesce: 185 | _allreduce_coalesced(grads, world_size, bucket_size_mb) 186 | else: 187 | for tensor in grads: 188 | dist.all_reduce(tensor.div_(world_size)) 189 | 190 | 191 | def _allreduce_coalesced(tensors: torch.Tensor, 192 | world_size: int, 193 | bucket_size_mb: int = -1) -> None: 194 | if bucket_size_mb > 0: 195 | bucket_size_bytes = bucket_size_mb * 1024 * 1024 196 | buckets = _take_tensors(tensors, bucket_size_bytes) 197 | else: 198 | buckets = OrderedDict() 199 | for tensor in tensors: 200 | tp = tensor.type() 201 | if tp not in buckets: 202 | buckets[tp] = [] 203 | buckets[tp].append(tensor) 204 | buckets = buckets.values() 205 | 206 | for bucket in buckets: 207 | flat_tensors = _flatten_dense_tensors(bucket) 208 | dist.all_reduce(flat_tensors) 209 | flat_tensors.div_(world_size) 210 | for tensor, synced in zip( 211 | bucket, _unflatten_dense_tensors(flat_tensors, bucket)): 212 | tensor.copy_(synced) 213 | -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import logging 3 | 4 | import torch.distributed as dist 5 | 6 | logger_initialized: dict = {} 7 | 8 | 9 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 10 | """Initialize and get a logger by name. 11 | 12 | If the logger has not been initialized, this method will initialize the 13 | logger by adding one or two handlers, otherwise the initialized logger will 14 | be directly returned. During initialization, a StreamHandler will always be 15 | added. If `log_file` is specified and the process rank is 0, a FileHandler 16 | will also be added. 17 | 18 | Args: 19 | name (str): Logger name. 20 | log_file (str | None): The log filename. If specified, a FileHandler 21 | will be added to the logger. 22 | log_level (int): The logger level. Note that only the process of 23 | rank 0 is affected, and other processes will set the level to 24 | "Error" thus be silent most of the time. 25 | file_mode (str): The file mode used in opening log file. 26 | Defaults to 'w'. 27 | 28 | Returns: 29 | logging.Logger: The expected logger. 30 | """ 31 | logger = logging.getLogger(name) 32 | if name in logger_initialized: 33 | return logger 34 | # handle hierarchical names 35 | # e.g., logger "a" is initialized, then logger "a.b" will skip the 36 | # initialization since it is a child of "a". 37 | for logger_name in logger_initialized: 38 | if name.startswith(logger_name): 39 | return logger 40 | 41 | # handle duplicate logs to the console 42 | # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET) 43 | # to the root logger. As logger.propagate is True by default, this root 44 | # level handler causes logging messages from rank>0 processes to 45 | # unexpectedly show up on the console, creating much unwanted clutter. 46 | # To fix this issue, we set the root logger's StreamHandler, if any, to log 47 | # at the ERROR level. 
48 | for handler in logger.root.handlers: 49 | if type(handler) is logging.StreamHandler: 50 | handler.setLevel(logging.ERROR) 51 | 52 | stream_handler = logging.StreamHandler() 53 | handlers = [stream_handler] 54 | 55 | if dist.is_available() and dist.is_initialized(): 56 | rank = dist.get_rank() 57 | else: 58 | rank = 0 59 | 60 | # only rank 0 will add a FileHandler 61 | if rank == 0 and log_file is not None: 62 | # Here, the default behaviour of the official logger is 'a'. Thus, we 63 | # provide an interface to change the file mode to the default 64 | # behaviour. 65 | file_handler = logging.FileHandler(log_file, file_mode) 66 | handlers.append(file_handler) 67 | 68 | formatter = logging.Formatter( 69 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 70 | for handler in handlers: 71 | handler.setFormatter(formatter) 72 | handler.setLevel(log_level) 73 | logger.addHandler(handler) 74 | 75 | if rank == 0: 76 | logger.setLevel(log_level) 77 | else: 78 | logger.setLevel(logging.ERROR) 79 | 80 | logger_initialized[name] = True 81 | 82 | return logger 83 | 84 | 85 | def print_log(msg, logger=None, level=logging.INFO): 86 | """Print a log message. 87 | 88 | Args: 89 | msg (str): The message to be logged. 90 | logger (logging.Logger | str | None): The logger to be used. 91 | Some special loggers are: 92 | 93 | - "silent": no message will be printed. 94 | - other str: the logger obtained with `get_root_logger(logger)`. 95 | - None: The `print()` method will be used to print log messages. 96 | level (int): Logging level. Only available when `logger` is a Logger 97 | object or "root". 98 | """ 99 | if logger is None: 100 | print(msg) 101 | elif isinstance(logger, logging.Logger): 102 | logger.log(level, msg) 103 | elif logger == 'silent': 104 | pass 105 | elif isinstance(logger, str): 106 | _logger = get_logger(logger) 107 | _logger.log(level, msg) 108 | else: 109 | raise TypeError( 110 | 'logger should be either a logging.Logger object, str, ' 111 | f'"silent" or None, but got {type(logger)}') 112 | 113 | 114 | def get_root_logger(log_file=None, log_level=logging.INFO): 115 | """Use `get_logger` method in mmcv to get the root logger. 116 | 117 | The logger will be initialized if it has not been initialized. By default a 118 | StreamHandler will be added. If `log_file` is specified, a FileHandler will 119 | also be added. The name of the root logger is the top-level package name, 120 | e.g., "mmpose". 121 | 122 | Args: 123 | log_file (str | None): The log filename. If specified, a FileHandler 124 | will be added to the root logger. 125 | log_level (int): The root logger level. Note that only the process of 126 | rank 0 is affected, while other processes will set the level to 127 | "Error" and be silent most of the time. 128 | 129 | Returns: 130 | logging.Logger: The root logger. 
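    Example (illustrative; the log path is hypothetical):

        >>> logger = get_root_logger(log_file='runs/train/001/train.log')
        >>> logger.info('Distributed training: False')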
131 | """ 132 | return get_logger(__name__.split('.')[0], log_file, log_level) 133 | 134 | -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/__init__.py -------------------------------------------------------------------------------- /utils/nms/cpu_nms.cpython-37m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/cpu_nms.cpython-37m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /utils/nms/cpu_nms.cpython-39-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/cpu_nms.cpython-39-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft 3 | # Licensed under the MIT License. 4 | # Written by Bin Xiao (Bin.Xiao@microsoft.com) 5 | # ------------------------------------------------------------------------------ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 15 | return a if a >= b else b 16 | 17 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 18 | return a if a <= b else b 19 | 20 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 21 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 22 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 23 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 24 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 25 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 26 | 27 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 28 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1].astype('i') 29 | 30 | cdef int ndets = dets.shape[0] 31 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 32 | np.zeros((ndets), dtype=np.int) 33 | 34 | # nominal indices 35 | cdef int _i, _j 36 | # sorted indices 37 | cdef int i, j 38 | # temp variables for box i's (the box currently under consideration) 39 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 40 | # variables for computing overlap with box j (lower scoring box) 41 | cdef np.float32_t xx1, yy1, xx2, yy2 42 | cdef np.float32_t w, h 43 | cdef np.float32_t inter, ovr 44 | 45 | keep = [] 46 | for _i in range(ndets): 47 | i = order[_i] 48 | if suppressed[i] == 1: 49 | continue 50 | keep.append(i) 51 | ix1 = x1[i] 52 | iy1 = y1[i] 53 | ix2 = x2[i] 54 | iy2 = y2[i] 55 | iarea = areas[i] 56 | for _j in range(_i + 1, ndets): 57 | j = order[_j] 58 | if suppressed[j] == 1: 59 | continue 60 | xx1 = max(ix1, x1[j]) 61 | yy1 = max(iy1, y1[j]) 62 | xx2 = min(ix2, x2[j]) 63 | yy2 = min(iy2, y2[j]) 64 | w = max(0.0, xx2 - xx1 + 1) 65 | h = max(0.0, yy2 - yy1 + 1) 66 | 
inter = w * h 67 | ovr = inter / (iarea + areas[j] - inter) 68 | if ovr >= thresh: 69 | suppressed[j] = 1 70 | 71 | return keep 72 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.cpython-37m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/gpu_nms.cpython-37m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /utils/nms/gpu_nms.cpython-39-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/gpu_nms.cpython-39-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft 3 | # Licensed under the MIT License. 4 | # Written by Bin Xiao (Bin.Xiao@microsoft.com) 5 | # ------------------------------------------------------------------------------ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | assert sizeof(int) == sizeof(np.int32_t) 15 | 16 | cdef extern from "gpu_nms.hpp": 17 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 18 | 19 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 20 | np.int32_t device_id=0): 21 | cdef int boxes_num = dets.shape[0] 22 | cdef int boxes_dim = dets.shape[1] 23 | cdef int num_out 24 | cdef np.ndarray[np.int32_t, ndim=1] \ 25 | keep = np.zeros(boxes_num, dtype=np.int32) 26 | cdef np.ndarray[np.float32_t, ndim=1] \ 27 | scores = dets[:, 4] 28 | cdef np.ndarray[np.int32_t, ndim=1] \ 29 | order = scores.argsort()[::-1].astype(np.int32) 30 | cdef np.ndarray[np.float32_t, ndim=2] \ 31 | sorted_dets = dets[order, :] 32 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 33 | keep = keep[:num_out] 34 | return list(order[keep]) 35 | -------------------------------------------------------------------------------- /utils/nms/nms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft 3 | # Licensed under the MIT License. 
4 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 5 | # ------------------------------------------------------------------------------ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | from .cpu_nms import cpu_nms 14 | from .gpu_nms import gpu_nms 15 | 16 | 17 | def py_nms_wrapper(thresh): 18 | def _nms(dets): 19 | return nms(dets, thresh) 20 | return _nms 21 | 22 | 23 | def cpu_nms_wrapper(thresh): 24 | def _nms(dets): 25 | return cpu_nms(dets, thresh) 26 | return _nms 27 | 28 | 29 | def gpu_nms_wrapper(thresh, device_id): 30 | def _nms(dets): 31 | return gpu_nms(dets, thresh, device_id) 32 | return _nms 33 | 34 | 35 | def nms(dets, thresh): 36 | """ 37 | greedily select boxes with high confidence and overlap with current maximum <= thresh 38 | rule out overlap >= thresh 39 | :param dets: [[x1, y1, x2, y2 score]] 40 | :param thresh: retain overlap < thresh 41 | :return: indexes to keep 42 | """ 43 | if dets.shape[0] == 0: 44 | return [] 45 | 46 | x1 = dets[:, 0] 47 | y1 = dets[:, 1] 48 | x2 = dets[:, 2] 49 | y2 = dets[:, 3] 50 | scores = dets[:, 4] 51 | 52 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 53 | order = scores.argsort()[::-1] 54 | 55 | keep = [] 56 | while order.size > 0: 57 | i = order[0] 58 | keep.append(i) 59 | xx1 = np.maximum(x1[i], x1[order[1:]]) 60 | yy1 = np.maximum(y1[i], y1[order[1:]]) 61 | xx2 = np.minimum(x2[i], x2[order[1:]]) 62 | yy2 = np.minimum(y2[i], y2[order[1:]]) 63 | 64 | w = np.maximum(0.0, xx2 - xx1 + 1) 65 | h = np.maximum(0.0, yy2 - yy1 + 1) 66 | inter = w * h 67 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 68 | 69 | inds = np.where(ovr <= thresh)[0] 70 | order = order[inds + 1] 71 | 72 | return keep 73 | 74 | 75 | def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None): 76 | if not isinstance(sigmas, np.ndarray): 77 | sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89]) / 10.0 78 | vars = (sigmas * 2) ** 2 79 | xg = g[0::3] 80 | yg = g[1::3] 81 | vg = g[2::3] 82 | ious = np.zeros((d.shape[0])) 83 | for n_d in range(0, d.shape[0]): 84 | xd = d[n_d, 0::3] 85 | yd = d[n_d, 1::3] 86 | vd = d[n_d, 2::3] 87 | dx = xd - xg 88 | dy = yd - yg 89 | e = (dx ** 2 + dy ** 2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 90 | if in_vis_thre is not None: 91 | ind = list(vg > in_vis_thre) and list(vd > in_vis_thre) 92 | e = e[ind] 93 | ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0 94 | return ious 95 | 96 | 97 | def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): 98 | """ 99 | greedily select boxes with high confidence and overlap with current maximum <= thresh 100 | rule out overlap >= thresh, overlap = oks 101 | :param kpts_db 102 | :param thresh: retain overlap < thresh 103 | :return: indexes to keep 104 | """ 105 | if len(kpts_db) == 0: 106 | return [] 107 | 108 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) 109 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) 110 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) 111 | 112 | order = scores.argsort()[::-1] 113 | 114 | keep = [] 115 | while order.size > 0: 116 | i = order[0] 117 | keep.append(i) 118 | 119 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre) 120 | 121 | inds = np.where(oks_ovr <= thresh)[0] 122 | order = order[inds + 1] 123 | 124 | return 
keep 125 | 126 | 127 | def rescore(overlap, scores, thresh, type='gaussian'): 128 | assert overlap.shape[0] == scores.shape[0] 129 | if type == 'linear': 130 | inds = np.where(overlap >= thresh)[0] 131 | scores[inds] = scores[inds] * (1 - overlap[inds]) 132 | else: 133 | scores = scores * np.exp(- overlap**2 / thresh) 134 | 135 | return scores 136 | 137 | 138 | def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): 139 | """ 140 | greedily select boxes with high confidence and overlap with current maximum <= thresh 141 | rule out overlap >= thresh, overlap = oks 142 | :param kpts_db 143 | :param thresh: retain overlap < thresh 144 | :return: indexes to keep 145 | """ 146 | if len(kpts_db) == 0: 147 | return [] 148 | 149 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) 150 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) 151 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) 152 | 153 | order = scores.argsort()[::-1] 154 | scores = scores[order] 155 | 156 | # max_dets = order.size 157 | max_dets = 20 158 | keep = np.zeros(max_dets, dtype=np.intp) 159 | keep_cnt = 0 160 | while order.size > 0 and keep_cnt < max_dets: 161 | i = order[0] 162 | 163 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre) 164 | 165 | order = order[1:] 166 | scores = rescore(oks_ovr, scores[1:], thresh) 167 | 168 | tmp = scores.argsort()[::-1] 169 | order = order[tmp] 170 | scores = scores[tmp] 171 | 172 | keep[keep_cnt] = i 173 | keep_cnt += 1 174 | 175 | keep = keep[:keep_cnt] 176 | 177 | return keep 178 | # kpts_db = kpts_db[:keep_cnt] 179 | 180 | # return kpts_db 181 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Copyright (c) Microsoft 3 | // Licensed under The MIT License 4 | // Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn) 5 | // ------------------------------------------------------------------ 6 | 7 | #include "gpu_nms.hpp" 8 | #include <vector> 9 | #include <iostream> 10 | 11 | #define CUDA_CHECK(condition) \ 12 | /* Code block avoids redefinition of cudaError_t error */ \ 13 | do { \ 14 | cudaError_t error = condition; \ 15 | if (error != cudaSuccess) { \ 16 | std::cout << cudaGetErrorString(error) << std::endl; \ 17 | } \ 18 | } while (0) 19 | 20 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 21 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 22 | 23 | __device__ inline float devIoU(float const * const a, float const * const b) { 24 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 25 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 26 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 27 | float interS = width * height; 28 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 29 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 30 | return interS / (Sa + Sb - interS); 31 | } 32 | 33 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 34 | const float *dev_boxes, unsigned long long *dev_mask) { 35 | const int row_start = blockIdx.y; 36 | const int col_start = blockIdx.x; 37 | 38 | // if (row_start > col_start) return; 39 | 40 | const int row_size = 41 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 42 | const int col_size = 43 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
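  // Each block compares one chunk of up to threadsPerBlock (64) boxes selected
  // by blockIdx.y (rows) against another chunk selected by blockIdx.x (columns).
  // threadsPerBlock equals the bit width of unsigned long long, so each thread
  // can pack its overlap decisions for the whole column chunk into the single
  // 64-bit word written to dev_mask below.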
44 | 45 | __shared__ float block_boxes[threadsPerBlock * 5]; 46 | if (threadIdx.x < col_size) { 47 | block_boxes[threadIdx.x * 5 + 0] = 48 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 49 | block_boxes[threadIdx.x * 5 + 1] = 50 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 51 | block_boxes[threadIdx.x * 5 + 2] = 52 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 53 | block_boxes[threadIdx.x * 5 + 3] = 54 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 55 | block_boxes[threadIdx.x * 5 + 4] = 56 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 57 | } 58 | __syncthreads(); 59 | 60 | if (threadIdx.x < row_size) { 61 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 62 | const float *cur_box = dev_boxes + cur_box_idx * 5; 63 | int i = 0; 64 | unsigned long long t = 0; 65 | int start = 0; 66 | if (row_start == col_start) { 67 | start = threadIdx.x + 1; 68 | } 69 | for (i = start; i < col_size; i++) { 70 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 71 | t |= 1ULL << i; 72 | } 73 | } 74 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 75 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 76 | } 77 | } 78 | 79 | void _set_device(int device_id) { 80 | int current_device; 81 | CUDA_CHECK(cudaGetDevice(&current_device)); 82 | if (current_device == device_id) { 83 | return; 84 | } 85 | // The call to cudaSetDevice must come before any calls to Get, which 86 | // may perform initialization using the GPU. 87 | CUDA_CHECK(cudaSetDevice(device_id)); 88 | } 89 | 90 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 91 | int boxes_dim, float nms_overlap_thresh, int device_id) { 92 | _set_device(device_id); 93 | 94 | float* boxes_dev = NULL; 95 | unsigned long long* mask_dev = NULL; 96 | 97 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 98 | 99 | CUDA_CHECK(cudaMalloc(&boxes_dev, 100 | boxes_num * boxes_dim * sizeof(float))); 101 | CUDA_CHECK(cudaMemcpy(boxes_dev, 102 | boxes_host, 103 | boxes_num * boxes_dim * sizeof(float), 104 | cudaMemcpyHostToDevice)); 105 | 106 | CUDA_CHECK(cudaMalloc(&mask_dev, 107 | boxes_num * col_blocks * sizeof(unsigned long long))); 108 | 109 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 110 | DIVUP(boxes_num, threadsPerBlock)); 111 | dim3 threads(threadsPerBlock); 112 | nms_kernel<<<blocks, threads>>>(boxes_num, 113 | nms_overlap_thresh, 114 | boxes_dev, 115 | mask_dev); 116 | 117 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 118 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 119 | mask_dev, 120 | sizeof(unsigned long long) * boxes_num * col_blocks, 121 | cudaMemcpyDeviceToHost)); 122 | 123 | std::vector<unsigned long long> remv(col_blocks); 124 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 125 | 126 | int num_to_keep = 0; 127 | for (int i = 0; i < boxes_num; i++) { 128 | int nblock = i / threadsPerBlock; 129 | int inblock = i % threadsPerBlock; 130 | 131 | if (!(remv[nblock] & (1ULL << inblock))) { 132 | keep_out[num_to_keep++] = i; 133 | unsigned long long *p = &mask_host[0] + i * col_blocks; 134 | for (int j = nblock; j < col_blocks; j++) { 135 | remv[j] |= p[j]; 136 | } 137 | } 138 | } 139 | *num_out = num_to_keep; 140 | 141 | CUDA_CHECK(cudaFree(boxes_dev)); 142 | CUDA_CHECK(cudaFree(mask_dev)); 143 | } 144 | -------------------------------------------------------------------------------- /utils/nms/nms_ori.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft 3 | # Licensed under the MIT License. 4 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 5 | # ------------------------------------------------------------------------------ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | from cpu_nms import cpu_nms 14 | from gpu_nms import gpu_nms 15 | 16 | 17 | def py_nms_wrapper(thresh): 18 | def _nms(dets): 19 | return nms(dets, thresh) 20 | return _nms 21 | 22 | 23 | def cpu_nms_wrapper(thresh): 24 | def _nms(dets): 25 | return cpu_nms(dets, thresh) 26 | return _nms 27 | 28 | 29 | def gpu_nms_wrapper(thresh, device_id): 30 | def _nms(dets): 31 | return gpu_nms(dets, thresh, device_id) 32 | return _nms 33 | 34 | 35 | def nms(dets, thresh): 36 | """ 37 | greedily select boxes with high confidence and overlap with current maximum <= thresh 38 | rule out overlap >= thresh 39 | :param dets: [[x1, y1, x2, y2 score]] 40 | :param thresh: retain overlap < thresh 41 | :return: indexes to keep 42 | """ 43 | if dets.shape[0] == 0: 44 | return [] 45 | 46 | x1 = dets[:, 0] 47 | y1 = dets[:, 1] 48 | x2 = dets[:, 2] 49 | y2 = dets[:, 3] 50 | scores = dets[:, 4] 51 | 52 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 53 | order = scores.argsort()[::-1] 54 | 55 | keep = [] 56 | while order.size > 0: 57 | i = order[0] 58 | keep.append(i) 59 | xx1 = np.maximum(x1[i], x1[order[1:]]) 60 | yy1 = np.maximum(y1[i], y1[order[1:]]) 61 | xx2 = np.minimum(x2[i], x2[order[1:]]) 62 | yy2 = np.minimum(y2[i], y2[order[1:]]) 63 | 64 | w = np.maximum(0.0, xx2 - xx1 + 1) 65 | h = np.maximum(0.0, yy2 - yy1 + 1) 66 | inter = w * h 67 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 68 | 69 | inds = np.where(ovr <= thresh)[0] 70 | order = order[inds + 1] 71 | 72 | return keep 73 | 74 | 75 | def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None): 76 | if not isinstance(sigmas, np.ndarray): 77 | sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89]) / 10.0 78 | vars = (sigmas * 2) ** 2 79 | xg = g[0::3] 80 | yg = g[1::3] 81 | vg = g[2::3] 82 | ious = np.zeros((d.shape[0])) 83 | for n_d in range(0, d.shape[0]): 84 | xd = d[n_d, 0::3] 85 | yd = d[n_d, 1::3] 86 | vd = d[n_d, 2::3] 87 | dx = xd - xg 88 | dy = yd - yg 89 | e = (dx ** 2 + dy ** 2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 90 | if in_vis_thre is not None: 91 | ind = list(vg > in_vis_thre) and list(vd > in_vis_thre) 92 | e = e[ind] 93 | ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0 94 | return ious 95 | 96 | 97 | def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): 98 | """ 99 | greedily select boxes with high confidence and overlap with current maximum <= thresh 100 | rule out overlap >= thresh, overlap = oks 101 | :param kpts_db 102 | :param thresh: retain overlap < thresh 103 | :return: indexes to keep 104 | """ 105 | if len(kpts_db) == 0: 106 | return [] 107 | 108 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) 109 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) 110 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) 111 | 112 | order = scores.argsort()[::-1] 113 | 114 | keep = [] 115 | while order.size > 0: 116 | i = 
order[0] 117 | keep.append(i) 118 | 119 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre) 120 | 121 | inds = np.where(oks_ovr <= thresh)[0] 122 | order = order[inds + 1] 123 | 124 | return keep 125 | 126 | 127 | def rescore(overlap, scores, thresh, type='gaussian'): 128 | assert overlap.shape[0] == scores.shape[0] 129 | if type == 'linear': 130 | inds = np.where(overlap >= thresh)[0] 131 | scores[inds] = scores[inds] * (1 - overlap[inds]) 132 | else: 133 | scores = scores * np.exp(- overlap**2 / thresh) 134 | 135 | return scores 136 | 137 | 138 | def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): 139 | """ 140 | greedily select boxes with high confidence and overlap with current maximum <= thresh 141 | rule out overlap >= thresh, overlap = oks 142 | :param kpts_db 143 | :param thresh: retain overlap < thresh 144 | :return: indexes to keep 145 | """ 146 | if len(kpts_db) == 0: 147 | return [] 148 | 149 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) 150 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) 151 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) 152 | 153 | order = scores.argsort()[::-1] 154 | scores = scores[order] 155 | 156 | # max_dets = order.size 157 | max_dets = 20 158 | keep = np.zeros(max_dets, dtype=np.intp) 159 | keep_cnt = 0 160 | while order.size > 0 and keep_cnt < max_dets: 161 | i = order[0] 162 | 163 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre) 164 | 165 | order = order[1:] 166 | scores = rescore(oks_ovr, scores[1:], thresh) 167 | 168 | tmp = scores.argsort()[::-1] 169 | order = order[tmp] 170 | scores = scores[tmp] 171 | 172 | keep[keep_cnt] = i 173 | keep_cnt += 1 174 | 175 | keep = keep[:keep_cnt] 176 | 177 | return keep 178 | # kpts_db = kpts_db[:keep_cnt] 179 | 180 | # return kpts_db 181 | -------------------------------------------------------------------------------- /utils/nms/setup_linux.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pose.gluon 3 | # Copyright (c) 2018-present Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | from setuptools import setup 11 | from distutils.extension import Extension 12 | from Cython.Distutils import build_ext 13 | import numpy as np 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # Adapted fom 19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 20 | for dir in path.split(os.pathsep): 21 | binpath = pjoin(dir, name) 22 | if os.path.exists(binpath): 23 | return os.path.abspath(binpath) 24 | return None 25 | 26 | 27 | def locate_cuda(): 28 | """Locate the CUDA environment on the system 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | Starts by looking for the CUDAHOME env variable. If not found, everything 32 | is based on finding 'nvcc' in the PATH. 
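    Raises an EnvironmentError if nvcc cannot be found or if any of the
    expected sub-directories ('include', 'lib64') does not exist.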
33 | """ 34 | 35 | # first check if the CUDAHOME env variable is in use 36 | if 'CUDAHOME' in os.environ: 37 | home = os.environ['CUDAHOME'] 38 | nvcc = pjoin(home, 'bin', 'nvcc') 39 | else: 40 | # otherwise, search the PATH for NVCC 41 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 42 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 43 | if nvcc is None: 44 | raise EnvironmentError('The nvcc binary could not be ' 45 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 46 | home = os.path.dirname(os.path.dirname(nvcc)) 47 | 48 | cudaconfig = {'home':home, 'nvcc':nvcc, 49 | 'include': pjoin(home, 'include'), 50 | 'lib64': pjoin(home, 'lib64')} 51 | for k, v in cudaconfig.items(): 52 | if not os.path.exists(v): 53 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 54 | 55 | return cudaconfig 56 | CUDA = locate_cuda() 57 | 58 | 59 | # Obtain the numpy include directory. This logic works across numpy versions. 60 | try: 61 | numpy_include = np.get_include() 62 | except AttributeError: 63 | numpy_include = np.get_numpy_include() 64 | 65 | 66 | def customize_compiler_for_nvcc(self): 67 | """inject deep into distutils to customize how the dispatch 68 | to gcc/nvcc works. 69 | If you subclass UnixCCompiler, it's not trivial to get your subclass 70 | injected in, and still have the right customizations (i.e. 71 | distutils.sysconfig.customize_compiler) run on it. So instead of going 72 | the OO route, I have this. Note, it's kindof like a wierd functional 73 | subclassing going on.""" 74 | 75 | # tell the compiler it can processes .cu 76 | self.src_extensions.append('.cu') 77 | 78 | # save references to the default compiler_so and _comple methods 79 | default_compiler_so = self.compiler_so 80 | super = self._compile 81 | 82 | # now redefine the _compile method. This gets executed for each 83 | # object but distutils doesn't have the ability to change compilers 84 | # based on source extension: we add it. 
85 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 86 | if os.path.splitext(src)[1] == '.cu': 87 | # use the cuda for .cu files 88 | self.set_executable('compiler_so', CUDA['nvcc']) 89 | # use only a subset of the extra_postargs, which are 1-1 translated 90 | # from the extra_compile_args in the Extension class 91 | postargs = extra_postargs['nvcc'] 92 | else: 93 | postargs = extra_postargs['gcc'] 94 | 95 | super(obj, src, ext, cc_args, postargs, pp_opts) 96 | # reset the default compiler_so, which we might have changed for cuda 97 | self.compiler_so = default_compiler_so 98 | 99 | # inject our redefined _compile method into the class 100 | self._compile = _compile 101 | 102 | 103 | # run the customize_compiler 104 | class custom_build_ext(build_ext): 105 | def build_extensions(self): 106 | customize_compiler_for_nvcc(self.compiler) 107 | build_ext.build_extensions(self) 108 | 109 | 110 | ext_modules = [ 111 | Extension( 112 | "cpu_nms", 113 | ["cpu_nms.pyx"], 114 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 115 | include_dirs = [numpy_include] 116 | ), 117 | Extension('gpu_nms', 118 | ['nms_kernel.cu', 'gpu_nms.pyx'], 119 | library_dirs=[CUDA['lib64']], 120 | libraries=['cudart'], 121 | language='c++', 122 | runtime_library_dirs=[CUDA['lib64']], 123 | # this syntax is specific to this build system 124 | # we're only going to use certain compiler args with nvcc and not with 125 | # gcc the implementation of this trick is in customize_compiler() below 126 | extra_compile_args={'gcc': ["-Wno-unused-function"], 127 | 'nvcc': ['-arch=sm_35', 128 | '--ptxas-options=-v', 129 | '-c', 130 | '--compiler-options', 131 | "'-fPIC'"]}, 132 | include_dirs = [numpy_include, CUDA['include']] 133 | ), 134 | ] 135 | 136 | setup( 137 | name='nms', 138 | ext_modules=ext_modules, 139 | # inject our custom trigger 140 | cmdclass={'build_ext': custom_build_ext}, 141 | ) 142 | -------------------------------------------------------------------------------- /utils/post_processing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .nms import oks_iou, oks_nms, soft_oks_nms 3 | from .one_euro_filter import OneEuroFilter 4 | from .post_transforms import (affine_transform, flip_back, fliplr_joints, 5 | fliplr_regression, get_affine_transform, 6 | get_warp_matrix, rotate_point, transform_preds, 7 | warp_affine_joints) 8 | 9 | __all__ = [ 10 | 'oks_nms', 'soft_oks_nms', 'affine_transform', 'rotate_point', 'flip_back', 11 | 'fliplr_joints', 'fliplr_regression', 'transform_preds', 12 | 'get_affine_transform', 'get_warp_matrix', 'warp_affine_joints', 13 | 'OneEuroFilter', 'oks_iou' 14 | ] 15 | -------------------------------------------------------------------------------- /utils/post_processing/group.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Adapted from https://github.com/princeton-vl/pose-ae-train/ 3 | # Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | import numpy as np 7 | import torch 8 | from munkres import Munkres 9 | 10 | from ..top_down_eval import post_dark_udp 11 | 12 | 13 | def _py_max_match(scores): 14 | """Apply munkres algorithm to get the best match. 15 | 16 | Args: 17 | scores(np.ndarray): cost matrix. 
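            Rows correspond to newly detected joints of the current keypoint
            type and columns to already grouped people (see _match_by_tag).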
18 | 19 | Returns: 20 | np.ndarray: best match. 21 | """ 22 | m = Munkres() 23 | tmp = m.compute(scores) 24 | tmp = np.array(tmp).astype(int) 25 | return tmp 26 | 27 | 28 | def _match_by_tag(inp, params): 29 | """Match joints by tags. Use Munkres algorithm to calculate the best match 30 | for keypoints grouping. 31 | 32 | Note: 33 | number of keypoints: K 34 | max number of people in an image: M (M=30 by default) 35 | dim of tags: L 36 | If use flip testing, L=2; else L=1. 37 | 38 | Args: 39 | inp(tuple): 40 | tag_k (np.ndarray[KxMxL]): tag corresponding to the 41 | top k values of feature map per keypoint. 42 | loc_k (np.ndarray[KxMx2]): top k locations of the 43 | feature maps for keypoint. 44 | val_k (np.ndarray[KxM]): top k value of the 45 | feature maps per keypoint. 46 | params(Params): class Params(). 47 | 48 | Returns: 49 | np.ndarray: result of pose groups. 50 | """ 51 | assert isinstance(params, _Params), 'params should be class _Params()' 52 | 53 | tag_k, loc_k, val_k = inp 54 | 55 | default_ = np.zeros((params.num_joints, 3 + tag_k.shape[2]), 56 | dtype=np.float32) 57 | 58 | joint_dict = {} 59 | tag_dict = {} 60 | for i in range(params.num_joints): 61 | idx = params.joint_order[i] 62 | 63 | tags = tag_k[idx] 64 | joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1) 65 | mask = joints[:, 2] > params.detection_threshold 66 | tags = tags[mask] 67 | joints = joints[mask] 68 | 69 | if joints.shape[0] == 0: 70 | continue 71 | 72 | if i == 0 or len(joint_dict) == 0: 73 | for tag, joint in zip(tags, joints): 74 | key = tag[0] 75 | joint_dict.setdefault(key, np.copy(default_))[idx] = joint 76 | tag_dict[key] = [tag] 77 | else: 78 | grouped_keys = list(joint_dict.keys())[:params.max_num_people] 79 | grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] 80 | 81 | if (params.ignore_too_much 82 | and len(grouped_keys) == params.max_num_people): 83 | continue 84 | 85 | diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] 86 | diff_normed = np.linalg.norm(diff, ord=2, axis=2) 87 | diff_saved = np.copy(diff_normed) 88 | 89 | if params.use_detection_val: 90 | diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] 91 | 92 | num_added = diff.shape[0] 93 | num_grouped = diff.shape[1] 94 | 95 | if num_added > num_grouped: 96 | diff_normed = np.concatenate( 97 | (diff_normed, 98 | np.zeros((num_added, num_added - num_grouped), 99 | dtype=np.float32) + 1e10), 100 | axis=1) 101 | 102 | pairs = _py_max_match(diff_normed) 103 | for row, col in pairs: 104 | if (row < num_added and col < num_grouped 105 | and diff_saved[row][col] < params.tag_threshold): 106 | key = grouped_keys[col] 107 | joint_dict[key][idx] = joints[row] 108 | tag_dict[key].append(tags[row]) 109 | else: 110 | key = tags[row][0] 111 | joint_dict.setdefault(key, np.copy(default_))[idx] = \ 112 | joints[row] 113 | tag_dict[key] = [tags[row]] 114 | 115 | results = np.array([joint_dict[i] for i in joint_dict]).astype(np.float32) 116 | return results 117 | 118 | 119 | class _Params: 120 | """A class of parameter. 121 | 122 | Args: 123 | cfg(Config): config. 
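            Expected keys: 'num_joints', 'max_num_people',
            'detection_threshold', 'tag_threshold', 'use_detection_val'
            and 'ignore_too_much' (read in __init__ below).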
124 | """ 125 | 126 | def __init__(self, cfg): 127 | self.num_joints = cfg['num_joints'] 128 | self.max_num_people = cfg['max_num_people'] 129 | 130 | self.detection_threshold = cfg['detection_threshold'] 131 | self.tag_threshold = cfg['tag_threshold'] 132 | self.use_detection_val = cfg['use_detection_val'] 133 | self.ignore_too_much = cfg['ignore_too_much'] 134 | 135 | if self.num_joints == 17: 136 | self.joint_order = [ 137 | i - 1 for i in 138 | [1, 2, 3, 4, 5, 6, 7, 12, 13, 8, 9, 10, 11, 14, 15, 16, 17] 139 | ] 140 | else: 141 | self.joint_order = list(np.arange(self.num_joints)) 142 | 143 | 144 | class HeatmapParser: 145 | """The heatmap parser for post processing.""" 146 | 147 | def __init__(self, cfg): 148 | self.params = _Params(cfg) 149 | self.tag_per_joint = cfg['tag_per_joint'] 150 | self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1, 151 | cfg['nms_padding']) 152 | self.use_udp = cfg.get('use_udp', False) 153 | self.score_per_joint = cfg.get('score_per_joint', False) 154 | 155 | def nms(self, heatmaps): 156 | """Non-Maximum Suppression for heatmaps. 157 | 158 | Args: 159 | heatmap(torch.Tensor): Heatmaps before nms. 160 | 161 | Returns: 162 | torch.Tensor: Heatmaps after nms. 163 | """ 164 | 165 | maxm = self.pool(heatmaps) 166 | maxm = torch.eq(maxm, heatmaps).float() 167 | heatmaps = heatmaps * maxm 168 | 169 | return heatmaps 170 | 171 | def match(self, tag_k, loc_k, val_k): 172 | """Group keypoints to human poses in a batch. 173 | 174 | Args: 175 | tag_k (np.ndarray[NxKxMxL]): tag corresponding to the 176 | top k values of feature map per keypoint. 177 | loc_k (np.ndarray[NxKxMx2]): top k locations of the 178 | feature maps for keypoint. 179 | val_k (np.ndarray[NxKxM]): top k value of the 180 | feature maps per keypoint. 181 | 182 | Returns: 183 | list 184 | """ 185 | 186 | def _match(x): 187 | return _match_by_tag(x, self.params) 188 | 189 | return list(map(_match, zip(tag_k, loc_k, val_k))) 190 | 191 | def top_k(self, heatmaps, tags): 192 | """Find top_k values in an image. 193 | 194 | Note: 195 | batch size: N 196 | number of keypoints: K 197 | heatmap height: H 198 | heatmap width: W 199 | max number of people: M 200 | dim of tags: L 201 | If use flip testing, L=2; else L=1. 202 | 203 | Args: 204 | heatmaps (torch.Tensor[NxKxHxW]) 205 | tags (torch.Tensor[NxKxHxWxL]) 206 | 207 | Returns: 208 | dict: A dict containing top_k values. 209 | 210 | - tag_k (np.ndarray[NxKxMxL]): 211 | tag corresponding to the top k values of 212 | feature map per keypoint. 213 | - loc_k (np.ndarray[NxKxMx2]): 214 | top k location of feature map per keypoint. 215 | - val_k (np.ndarray[NxKxM]): 216 | top k value of feature map per keypoint. 217 | """ 218 | heatmaps = self.nms(heatmaps) 219 | N, K, H, W = heatmaps.size() 220 | heatmaps = heatmaps.view(N, K, -1) 221 | val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2) 222 | 223 | tags = tags.view(tags.size(0), tags.size(1), W * H, -1) 224 | if not self.tag_per_joint: 225 | tags = tags.expand(-1, self.params.num_joints, -1, -1) 226 | 227 | tag_k = torch.stack( 228 | [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))], 229 | dim=3) 230 | 231 | x = ind % W 232 | y = ind // W 233 | 234 | ind_k = torch.stack((x, y), dim=3) 235 | 236 | results = { 237 | 'tag_k': tag_k.cpu().numpy(), 238 | 'loc_k': ind_k.cpu().numpy(), 239 | 'val_k': val_k.cpu().numpy() 240 | } 241 | 242 | return results 243 | 244 | @staticmethod 245 | def adjust(results, heatmaps): 246 | """Adjust the coordinates for better accuracy. 
247 | 248 | Note: 249 | batch size: N 250 | number of keypoints: K 251 | heatmap height: H 252 | heatmap width: W 253 | 254 | Args: 255 | results (list(np.ndarray)): Keypoint predictions. 256 | heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. 257 | """ 258 | _, _, H, W = heatmaps.shape 259 | for batch_id, people in enumerate(results): 260 | for people_id, people_i in enumerate(people): 261 | for joint_id, joint in enumerate(people_i): 262 | if joint[2] > 0: 263 | x, y = joint[0:2] 264 | xx, yy = int(x), int(y) 265 | tmp = heatmaps[batch_id][joint_id] 266 | if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1), 267 | xx]: 268 | y += 0.25 269 | else: 270 | y -= 0.25 271 | 272 | if tmp[yy, min(W - 1, xx + 1)] > tmp[yy, 273 | max(0, xx - 1)]: 274 | x += 0.25 275 | else: 276 | x -= 0.25 277 | results[batch_id][people_id, joint_id, 278 | 0:2] = (x + 0.5, y + 0.5) 279 | return results 280 | 281 | @staticmethod 282 | def refine(heatmap, tag, keypoints, use_udp=False): 283 | """Given initial keypoint predictions, we identify missing joints. 284 | 285 | Note: 286 | number of keypoints: K 287 | heatmap height: H 288 | heatmap width: W 289 | dim of tags: L 290 | If use flip testing, L=2; else L=1. 291 | 292 | Args: 293 | heatmap: np.ndarray(K, H, W). 294 | tag: np.ndarray(K, H, W) | np.ndarray(K, H, W, L) 295 | keypoints: np.ndarray of size (K, 3 + L) 296 | last dim is (x, y, score, tag). 297 | use_udp: bool-unbiased data processing 298 | 299 | Returns: 300 | np.ndarray: The refined keypoints. 301 | """ 302 | 303 | K, H, W = heatmap.shape 304 | if len(tag.shape) == 3: 305 | tag = tag[..., None] 306 | 307 | tags = [] 308 | for i in range(K): 309 | if keypoints[i, 2] > 0: 310 | # save tag value of detected keypoint 311 | x, y = keypoints[i][:2].astype(int) 312 | x = np.clip(x, 0, W - 1) 313 | y = np.clip(y, 0, H - 1) 314 | tags.append(tag[i, y, x]) 315 | 316 | # mean tag of current detected people 317 | prev_tag = np.mean(tags, axis=0) 318 | results = [] 319 | 320 | for _heatmap, _tag in zip(heatmap, tag): 321 | # distance of all tag values with mean tag of 322 | # current detected people 323 | distance_tag = (((_tag - 324 | prev_tag[None, None, :])**2).sum(axis=2)**0.5) 325 | norm_heatmap = _heatmap - np.round(distance_tag) 326 | 327 | # find maximum position 328 | y, x = np.unravel_index(np.argmax(norm_heatmap), _heatmap.shape) 329 | xx = x.copy() 330 | yy = y.copy() 331 | # detection score at maximum position 332 | val = _heatmap[y, x] 333 | if not use_udp: 334 | # offset by 0.5 335 | x += 0.5 336 | y += 0.5 337 | 338 | # add a quarter offset 339 | if _heatmap[yy, min(W - 1, xx + 1)] > _heatmap[yy, max(0, xx - 1)]: 340 | x += 0.25 341 | else: 342 | x -= 0.25 343 | 344 | if _heatmap[min(H - 1, yy + 1), xx] > _heatmap[max(0, yy - 1), xx]: 345 | y += 0.25 346 | else: 347 | y -= 0.25 348 | 349 | results.append((x, y, val)) 350 | results = np.array(results) 351 | 352 | if results is not None: 353 | for i in range(K): 354 | # add keypoint if it is not detected 355 | if results[i, 2] > 0 and keypoints[i, 2] == 0: 356 | keypoints[i, :3] = results[i, :3] 357 | 358 | return keypoints 359 | 360 | def parse(self, heatmaps, tags, adjust=True, refine=True): 361 | """Group keypoints into poses given heatmap and tag. 362 | 363 | Note: 364 | batch size: N 365 | number of keypoints: K 366 | heatmap height: H 367 | heatmap width: W 368 | dim of tags: L 369 | If use flip testing, L=2; else L=1. 370 | 371 | Args: 372 | heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. 
373 | tags (torch.Tensor[NxKxHxWxL]): model output tagmaps. 374 | 375 | Returns: 376 | tuple: A tuple containing keypoint grouping results. 377 | 378 | - results (list(np.ndarray)): Pose results. 379 | - scores (list/list(np.ndarray)): Score of people. 380 | """ 381 | results = self.match(**self.top_k(heatmaps, tags)) 382 | 383 | if adjust: 384 | if self.use_udp: 385 | for i in range(len(results)): 386 | if results[i].shape[0] > 0: 387 | results[i][..., :2] = post_dark_udp( 388 | results[i][..., :2].copy(), heatmaps[i:i + 1, :]) 389 | else: 390 | results = self.adjust(results, heatmaps) 391 | 392 | if self.score_per_joint: 393 | scores = [i[:, 2] for i in results[0]] 394 | else: 395 | scores = [i[:, 2].mean() for i in results[0]] 396 | 397 | if refine: 398 | results = results[0] 399 | # for every detected person 400 | for i in range(len(results)): 401 | heatmap_numpy = heatmaps[0].cpu().numpy() 402 | tag_numpy = tags[0].cpu().numpy() 403 | if not self.tag_per_joint: 404 | tag_numpy = np.tile(tag_numpy, 405 | (self.params.num_joints, 1, 1, 1)) 406 | results[i] = self.refine( 407 | heatmap_numpy, tag_numpy, results[i], use_udp=self.use_udp) 408 | results = [results] 409 | 410 | return results, scores 411 | -------------------------------------------------------------------------------- /utils/post_processing/nms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch 3 | # Original licence: Copyright (c) Microsoft, under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | import numpy as np 7 | 8 | 9 | def nms(dets, thr): 10 | """Greedily select boxes with high confidence and overlap <= thr. 11 | 12 | Args: 13 | dets: [[x1, y1, x2, y2, score]]. 14 | thr: Retain overlap < thr. 15 | 16 | Returns: 17 | list: Indexes to keep. 18 | """ 19 | if len(dets) == 0: 20 | return [] 21 | 22 | x1 = dets[:, 0] 23 | y1 = dets[:, 1] 24 | x2 = dets[:, 2] 25 | y2 = dets[:, 3] 26 | scores = dets[:, 4] 27 | 28 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 29 | order = scores.argsort()[::-1] 30 | 31 | keep = [] 32 | while len(order) > 0: 33 | i = order[0] 34 | keep.append(i) 35 | xx1 = np.maximum(x1[i], x1[order[1:]]) 36 | yy1 = np.maximum(y1[i], y1[order[1:]]) 37 | xx2 = np.minimum(x2[i], x2[order[1:]]) 38 | yy2 = np.minimum(y2[i], y2[order[1:]]) 39 | 40 | w = np.maximum(0.0, xx2 - xx1 + 1) 41 | h = np.maximum(0.0, yy2 - yy1 + 1) 42 | inter = w * h 43 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 44 | 45 | inds = np.where(ovr <= thr)[0] 46 | order = order[inds + 1] 47 | 48 | return keep 49 | 50 | 51 | def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): 52 | """Calculate oks ious. 53 | 54 | Args: 55 | g: Ground truth keypoints. 56 | d: Detected keypoints. 57 | a_g: Area of the ground truth object. 58 | a_d: Area of the detected object. 59 | sigmas: standard deviation of keypoint labelling. 60 | vis_thr: threshold of the keypoint visibility. 61 | 62 | Returns: 63 | list: The oks ious. 
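    Note:
        For each detection the OKS is the mean of exp(-e_i) over the
        (optionally visibility-filtered) keypoints, where
        e_i = d_i^2 / (2 * vars_i * s^2), vars_i = (2 * sigma_i)^2 and
        s^2 = (a_g + a_d) / 2, with d_i the distance between matching
        keypoints.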
64 | """ 65 | if sigmas is None: 66 | sigmas = np.array([ 67 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, 68 | .87, .87, .89, .89 69 | ]) / 10.0 70 | vars = (sigmas * 2)**2 71 | xg = g[0::3] 72 | yg = g[1::3] 73 | vg = g[2::3] 74 | ious = np.zeros(len(d), dtype=np.float32) 75 | for n_d in range(0, len(d)): 76 | xd = d[n_d, 0::3] 77 | yd = d[n_d, 1::3] 78 | vd = d[n_d, 2::3] 79 | dx = xd - xg 80 | dy = yd - yg 81 | e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 82 | if vis_thr is not None: 83 | ind = list(vg > vis_thr) and list(vd > vis_thr) 84 | e = e[ind] 85 | ious[n_d] = np.sum(np.exp(-e)) / len(e) if len(e) != 0 else 0.0 86 | return ious 87 | 88 | 89 | def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): 90 | """OKS NMS implementations. 91 | 92 | Args: 93 | kpts_db: keypoints. 94 | thr: Retain overlap < thr. 95 | sigmas: standard deviation of keypoint labelling. 96 | vis_thr: threshold of the keypoint visibility. 97 | score_per_joint: the input scores (in kpts_db) are per joint scores 98 | 99 | Returns: 100 | np.ndarray: indexes to keep. 101 | """ 102 | if len(kpts_db) == 0: 103 | return [] 104 | 105 | if score_per_joint: 106 | scores = np.array([k['score'].mean() for k in kpts_db]) 107 | else: 108 | scores = np.array([k['score'] for k in kpts_db]) 109 | 110 | kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) 111 | areas = np.array([k['area'] for k in kpts_db]) 112 | 113 | order = scores.argsort()[::-1] 114 | 115 | keep = [] 116 | while len(order) > 0: 117 | i = order[0] 118 | keep.append(i) 119 | 120 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], 121 | sigmas, vis_thr) 122 | 123 | inds = np.where(oks_ovr <= thr)[0] 124 | order = order[inds + 1] 125 | 126 | keep = np.array(keep) 127 | 128 | return keep 129 | 130 | 131 | def _rescore(overlap, scores, thr, type='gaussian'): 132 | """Rescoring mechanism gaussian or linear. 133 | 134 | Args: 135 | overlap: calculated ious 136 | scores: target scores. 137 | thr: retain oks overlap < thr. 138 | type: 'gaussian' or 'linear' 139 | 140 | Returns: 141 | np.ndarray: indexes to keep 142 | """ 143 | assert len(overlap) == len(scores) 144 | assert type in ['gaussian', 'linear'] 145 | 146 | if type == 'linear': 147 | inds = np.where(overlap >= thr)[0] 148 | scores[inds] = scores[inds] * (1 - overlap[inds]) 149 | else: 150 | scores = scores * np.exp(-overlap**2 / thr) 151 | 152 | return scores 153 | 154 | 155 | def soft_oks_nms(kpts_db, 156 | thr, 157 | max_dets=20, 158 | sigmas=None, 159 | vis_thr=None, 160 | score_per_joint=False): 161 | """Soft OKS NMS implementations. 162 | 163 | Args: 164 | kpts_db 165 | thr: retain oks overlap < thr. 166 | max_dets: max number of detections to keep. 167 | sigmas: Keypoint labelling uncertainty. 168 | score_per_joint: the input scores (in kpts_db) are per joint scores 169 | 170 | Returns: 171 | np.ndarray: indexes to keep. 
172 | """ 173 | if len(kpts_db) == 0: 174 | return [] 175 | 176 | if score_per_joint: 177 | scores = np.array([k['score'].mean() for k in kpts_db]) 178 | else: 179 | scores = np.array([k['score'] for k in kpts_db]) 180 | 181 | kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) 182 | areas = np.array([k['area'] for k in kpts_db]) 183 | 184 | order = scores.argsort()[::-1] 185 | scores = scores[order] 186 | 187 | keep = np.zeros(max_dets, dtype=np.intp) 188 | keep_cnt = 0 189 | while len(order) > 0 and keep_cnt < max_dets: 190 | i = order[0] 191 | 192 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], 193 | sigmas, vis_thr) 194 | 195 | order = order[1:] 196 | scores = _rescore(oks_ovr, scores[1:], thr) 197 | 198 | tmp = scores.argsort()[::-1] 199 | order = order[tmp] 200 | scores = scores[tmp] 201 | 202 | keep[keep_cnt] = i 203 | keep_cnt += 1 204 | 205 | keep = keep[:keep_cnt] 206 | 207 | return keep 208 | -------------------------------------------------------------------------------- /utils/post_processing/one_euro_filter.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy 3 | # Original licence: Copyright (c) HoBeom Jeon, under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | from time import time 6 | 7 | import numpy as np 8 | 9 | 10 | def smoothing_factor(t_e, cutoff): 11 | r = 2 * np.pi * cutoff * t_e 12 | return r / (r + 1) 13 | 14 | 15 | def exponential_smoothing(a, x, x_prev): 16 | return a * x + (1 - a) * x_prev 17 | 18 | 19 | class OneEuroFilter: 20 | 21 | def __init__(self, 22 | x0, 23 | dx0=0.0, 24 | min_cutoff=1.7, 25 | beta=0.3, 26 | d_cutoff=30.0, 27 | fps=None): 28 | """One Euro Filter for keypoints smoothing. 29 | 30 | Args: 31 | x0 (np.ndarray[K, 2]): Initialize keypoints value 32 | dx0 (float): 0.0 33 | min_cutoff (float): parameter for one euro filter 34 | beta (float): parameter for one euro filter 35 | d_cutoff (float): Input data FPS 36 | fps (float): Video FPS for video inference 37 | """ 38 | 39 | # The parameters. 40 | self.data_shape = x0.shape 41 | self.min_cutoff = np.full(x0.shape, min_cutoff) 42 | self.beta = np.full(x0.shape, beta) 43 | self.d_cutoff = np.full(x0.shape, d_cutoff) 44 | # Previous values. 45 | self.x_prev = x0.astype(np.float32) 46 | self.dx_prev = np.full(x0.shape, dx0) 47 | self.mask_prev = np.ma.masked_where(x0 <= 0, x0) 48 | self.realtime = True 49 | if fps is None: 50 | # Using in realtime inference 51 | self.t_e = None 52 | self.skip_frame_factor = d_cutoff 53 | else: 54 | # fps using video inference 55 | self.realtime = False 56 | self.d_cutoff = np.full(x0.shape, float(fps)) 57 | self.t_prev = time() 58 | 59 | def __call__(self, x, t_e=1.0): 60 | """Compute the filtered signal. 61 | 62 | Hyper-parameters (cutoff, beta) are from `VNect 63 | `__ . 64 | 65 | Realtime Camera fps (d_cutoff) default 30.0 66 | 67 | Args: 68 | x (np.ndarray[K, 2]): keypoints results in frame 69 | t_e (Optional): video skip frame count for posetrack 70 | evaluation 71 | """ 72 | assert x.shape == self.data_shape 73 | 74 | t = 0 75 | if self.realtime: 76 | t = time() 77 | t_e = (t - self.t_prev) * self.skip_frame_factor 78 | t_e = np.full(x.shape, t_e) 79 | 80 | # missing keypoints mask 81 | mask = np.ma.masked_where(x <= 0, x) 82 | 83 | # The filtered derivative of the signal. 
84 | a_d = smoothing_factor(t_e, self.d_cutoff) 85 | dx = (x - self.x_prev) / t_e 86 | dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) 87 | 88 | # The filtered signal. 89 | cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) 90 | a = smoothing_factor(t_e, cutoff) 91 | x_hat = exponential_smoothing(a, x, self.x_prev) 92 | 93 | # missing keypoints remove 94 | np.copyto(x_hat, -10, where=mask.mask) 95 | 96 | # Memorize the previous values. 97 | self.x_prev = x_hat 98 | self.dx_prev = dx_hat 99 | self.t_prev = t 100 | self.mask_prev = mask 101 | 102 | return x_hat 103 | -------------------------------------------------------------------------------- /utils/post_processing/post_transforms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch 3 | # Original licence: Copyright (c) Microsoft, under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | import math 7 | 8 | import cv2 9 | import numpy as np 10 | import torch 11 | 12 | 13 | def fliplr_joints(joints_3d, joints_3d_visible, img_width, flip_pairs): 14 | """Flip human joints horizontally. 15 | 16 | Note: 17 | - num_keypoints: K 18 | 19 | Args: 20 | joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. 21 | joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. 22 | img_width (int): Image width. 23 | flip_pairs (list[tuple]): Pairs of keypoints which are mirrored 24 | (for example, left ear and right ear). 25 | 26 | Returns: 27 | tuple: Flipped human joints. 28 | 29 | - joints_3d_flipped (np.ndarray([K, 3])): Flipped joints. 30 | - joints_3d_visible_flipped (np.ndarray([K, 1])): Joint visibility. 31 | """ 32 | 33 | assert len(joints_3d) == len(joints_3d_visible) 34 | assert img_width > 0 35 | 36 | joints_3d_flipped = joints_3d.copy() 37 | joints_3d_visible_flipped = joints_3d_visible.copy() 38 | 39 | # Swap left-right parts 40 | for left, right in flip_pairs: 41 | joints_3d_flipped[left, :] = joints_3d[right, :] 42 | joints_3d_flipped[right, :] = joints_3d[left, :] 43 | 44 | joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] 45 | joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] 46 | 47 | # Flip horizontally 48 | joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] 49 | joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped 50 | 51 | return joints_3d_flipped, joints_3d_visible_flipped 52 | 53 | 54 | def fliplr_regression(regression, 55 | flip_pairs, 56 | center_mode='static', 57 | center_x=0.5, 58 | center_index=0): 59 | """Flip human joints horizontally. 60 | 61 | Note: 62 | - batch_size: N 63 | - num_keypoint: K 64 | 65 | Args: 66 | regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K 67 | is the joint number and C is the dimension. Example shapes are: 68 | 69 | - [N, K, C]: a batch of keypoints where N is the batch size. 70 | - [N, T, K, C]: a batch of pose sequences, where T is the frame 71 | number. 72 | flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored 73 | (for example, left ear -- right ear). 74 | center_mode (str): The mode to set the center location on the x-axis 75 | to flip around. 
Options are: 76 | 77 | - static: use a static x value (see center_x also) 78 | - root: use a root joint (see center_index also) 79 | center_x (float): Set the x-axis location of the flip center. Only used 80 | when center_mode=static. 81 | center_index (int): Set the index of the root joint, whose x location 82 | will be used as the flip center. Only used when center_mode=root. 83 | 84 | Returns: 85 | np.ndarray([..., K, C]): Flipped joints. 86 | """ 87 | assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' 88 | 89 | allowed_center_mode = {'static', 'root'} 90 | assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ 91 | f'{center_mode}, allowed choices are {allowed_center_mode}' 92 | 93 | if center_mode == 'static': 94 | x_c = center_x 95 | elif center_mode == 'root': 96 | assert regression.shape[-2] > center_index 97 | x_c = regression[..., center_index:center_index + 1, 0] 98 | 99 | regression_flipped = regression.copy() 100 | # Swap left-right parts 101 | for left, right in flip_pairs: 102 | regression_flipped[..., left, :] = regression[..., right, :] 103 | regression_flipped[..., right, :] = regression[..., left, :] 104 | 105 | # Flip horizontally 106 | regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] 107 | return regression_flipped 108 | 109 | 110 | def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'): 111 | """Flip the flipped heatmaps back to the original form. 112 | 113 | Note: 114 | - batch_size: N 115 | - num_keypoints: K 116 | - heatmap height: H 117 | - heatmap width: W 118 | 119 | Args: 120 | output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained 121 | from the flipped images. 122 | flip_pairs (list[tuple()): Pairs of keypoints which are mirrored 123 | (for example, left ear -- right ear). 124 | target_type (str): GaussianHeatmap or CombinedTarget 125 | 126 | Returns: 127 | np.ndarray: heatmaps that flipped back to the original image 128 | """ 129 | assert output_flipped.ndim == 4, \ 130 | 'output_flipped should be [batch_size, num_keypoints, height, width]' 131 | shape_ori = output_flipped.shape 132 | channels = 1 133 | if target_type.lower() == 'CombinedTarget'.lower(): 134 | channels = 3 135 | output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] 136 | output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, 137 | shape_ori[2], shape_ori[3]) 138 | output_flipped_back = output_flipped.copy() 139 | 140 | # Swap left-right parts 141 | for left, right in flip_pairs: 142 | output_flipped_back[:, left, ...] = output_flipped[:, right, ...] 143 | output_flipped_back[:, right, ...] = output_flipped[:, left, ...] 144 | output_flipped_back = output_flipped_back.reshape(shape_ori) 145 | # Flip horizontally 146 | output_flipped_back = output_flipped_back[..., ::-1] 147 | return output_flipped_back 148 | 149 | 150 | def transform_preds(coords, center, scale, output_size, use_udp=False): 151 | """Get final keypoint predictions from heatmaps and apply scaling and 152 | translation to map them back to the image. 153 | 154 | Note: 155 | num_keypoints: K 156 | 157 | Args: 158 | coords (np.ndarray[K, ndims]): 159 | 160 | * If ndims=2, corrds are predicted keypoint location. 161 | * If ndims=4, corrds are composed of (x, y, scores, tags) 162 | * If ndims=5, corrds are composed of (x, y, scores, tags, 163 | flipped_tags) 164 | 165 | center (np.ndarray[2, ]): Center of the bounding box (x, y). 166 | scale (np.ndarray[2, ]): Scale of the bounding box 167 | wrt [width, height]. 
168 | output_size (np.ndarray[2, ] | list(2,)): Size of the 169 | destination heatmaps. 170 | use_udp (bool): Use unbiased data processing 171 | 172 | Returns: 173 | np.ndarray: Predicted coordinates in the images. 174 | """ 175 | assert coords.shape[1] in (2, 4, 5) 176 | assert len(center) == 2 177 | assert len(scale) == 2 178 | assert len(output_size) == 2 179 | 180 | # Recover the scale which is normalized by a factor of 200. 181 | # scale = scale * 200.0 182 | 183 | if use_udp: 184 | scale_x = scale[0] / (output_size[0] - 1.0) 185 | scale_y = scale[1] / (output_size[1] - 1.0) 186 | else: 187 | scale_x = scale[0] / output_size[0] 188 | scale_y = scale[1] / output_size[1] 189 | 190 | target_coords = np.ones_like(coords) 191 | target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 192 | target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 193 | 194 | return target_coords 195 | 196 | 197 | def get_affine_transform(center, 198 | scale, 199 | rot, 200 | output_size, 201 | shift=(0., 0.), 202 | inv=False): 203 | """Get the affine transform matrix, given the center/scale/rot/output_size. 204 | 205 | Args: 206 | center (np.ndarray[2, ]): Center of the bounding box (x, y). 207 | scale (np.ndarray[2, ]): Scale of the bounding box 208 | wrt [width, height]. 209 | rot (float): Rotation angle (degree). 210 | output_size (np.ndarray[2, ] | list(2,)): Size of the 211 | destination heatmaps. 212 | shift (0-100%): Shift translation ratio wrt the width/height. 213 | Default (0., 0.). 214 | inv (bool): Option to inverse the affine transform direction. 215 | (inv=False: src->dst or inv=True: dst->src) 216 | 217 | Returns: 218 | np.ndarray: The transform matrix. 219 | """ 220 | assert len(center) == 2 221 | assert len(scale) == 2 222 | assert len(output_size) == 2 223 | assert len(shift) == 2 224 | 225 | # pixel_std is 200. 226 | scale_tmp = scale * 200.0 227 | 228 | shift = np.array(shift) 229 | src_w = scale_tmp[0] 230 | dst_w = output_size[0] 231 | dst_h = output_size[1] 232 | 233 | rot_rad = np.pi * rot / 180 234 | src_dir = rotate_point([0., src_w * -0.5], rot_rad) 235 | dst_dir = np.array([0., dst_w * -0.5]) 236 | 237 | src = np.zeros((3, 2), dtype=np.float32) 238 | src[0, :] = center + scale_tmp * shift 239 | src[1, :] = center + src_dir + scale_tmp * shift 240 | src[2, :] = _get_3rd_point(src[0, :], src[1, :]) 241 | 242 | dst = np.zeros((3, 2), dtype=np.float32) 243 | dst[0, :] = [dst_w * 0.5, dst_h * 0.5] 244 | dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir 245 | dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) 246 | 247 | if inv: 248 | trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) 249 | else: 250 | trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) 251 | 252 | return trans 253 | 254 | 255 | def affine_transform(pt, trans_mat): 256 | """Apply an affine transformation to the points. 257 | 258 | Args: 259 | pt (np.ndarray): a 2 dimensional point to be transformed 260 | trans_mat (np.ndarray): 2x3 matrix of an affine transform 261 | 262 | Returns: 263 | np.ndarray: Transformed points. 264 | """ 265 | assert len(pt) == 2 266 | new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) 267 | 268 | return new_pt 269 | 270 | 271 | def _get_3rd_point(a, b): 272 | """To calculate the affine matrix, three pairs of points are required. This 273 | function is used to get the 3rd point, given 2D points a & b. 
274 | 275 | The 3rd point is defined by rotating vector `a - b` by 90 degrees 276 | anticlockwise, using b as the rotation center. 277 | 278 | Args: 279 | a (np.ndarray): point(x,y) 280 | b (np.ndarray): point(x,y) 281 | 282 | Returns: 283 | np.ndarray: The 3rd point. 284 | """ 285 | assert len(a) == 2 286 | assert len(b) == 2 287 | direction = a - b 288 | third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) 289 | 290 | return third_pt 291 | 292 | 293 | def rotate_point(pt, angle_rad): 294 | """Rotate a point by an angle. 295 | 296 | Args: 297 | pt (list[float]): 2 dimensional point to be rotated 298 | angle_rad (float): rotation angle by radian 299 | 300 | Returns: 301 | list[float]: Rotated point. 302 | """ 303 | assert len(pt) == 2 304 | sn, cs = np.sin(angle_rad), np.cos(angle_rad) 305 | new_x = pt[0] * cs - pt[1] * sn 306 | new_y = pt[0] * sn + pt[1] * cs 307 | rotated_pt = [new_x, new_y] 308 | 309 | return rotated_pt 310 | 311 | 312 | def get_warp_matrix(theta, size_input, size_dst, size_target): 313 | """Calculate the transformation matrix under the constraint of unbiased. 314 | Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased 315 | Data Processing for Human Pose Estimation (CVPR 2020). 316 | 317 | Args: 318 | theta (float): Rotation angle in degrees. 319 | size_input (np.ndarray): Size of input image [w, h]. 320 | size_dst (np.ndarray): Size of output image [w, h]. 321 | size_target (np.ndarray): Size of ROI in input plane [w, h]. 322 | 323 | Returns: 324 | np.ndarray: A matrix for transformation. 325 | """ 326 | theta = np.deg2rad(theta) 327 | matrix = np.zeros((2, 3), dtype=np.float32) 328 | scale_x = size_dst[0] / size_target[0] 329 | scale_y = size_dst[1] / size_target[1] 330 | matrix[0, 0] = math.cos(theta) * scale_x 331 | matrix[0, 1] = -math.sin(theta) * scale_x 332 | matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + 333 | 0.5 * size_input[1] * math.sin(theta) + 334 | 0.5 * size_target[0]) 335 | matrix[1, 0] = math.sin(theta) * scale_y 336 | matrix[1, 1] = math.cos(theta) * scale_y 337 | matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - 338 | 0.5 * size_input[1] * math.cos(theta) + 339 | 0.5 * size_target[1]) 340 | return matrix 341 | 342 | 343 | def warp_affine_joints(joints, mat): 344 | """Apply affine transformation defined by the transform matrix on the 345 | joints. 346 | 347 | Args: 348 | joints (np.ndarray[..., 2]): Origin coordinate of joints. 349 | mat (np.ndarray[3, 2]): The affine matrix. 350 | 351 | Returns: 352 | np.ndarray[..., 2]: Result coordinate of joints. 
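    Note:
        The joints are flattened to (N, 2), extended with a homogeneous
        coordinate and multiplied by `mat.T`, so a 2x3 affine matrix as
        returned by `get_warp_matrix` or `cv2.getAffineTransform` can be
        passed directly.

    Example (illustrative):
        >>> import numpy as np
        >>> mat = np.array([[1., 0., 10.], [0., 1., 20.]])  # pure translation
        >>> warp_affine_joints(np.zeros((17, 2)), mat)[0]
        array([10., 20.])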
353 | """ 354 | joints = np.array(joints) 355 | shape = joints.shape 356 | joints = joints.reshape(-1, 2) 357 | return np.dot( 358 | np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), 359 | mat.T).reshape(shape) 360 | 361 | 362 | def affine_transform_torch(pts, t): 363 | npts = pts.shape[0] 364 | pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1) 365 | out = torch.mm(t, torch.t(pts_homo)) 366 | return torch.t(out[:2, :]) 367 | -------------------------------------------------------------------------------- /utils/train_valid_fn.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from models.losses import JointsMSELoss 7 | from models.optimizer import LayerDecayOptimizer 8 | 9 | from torch.nn.parallel import DataParallel, DistributedDataParallel 10 | from torch.nn.utils import clip_grad_norm_ 11 | from torch.optim import AdamW 12 | from torch.optim.lr_scheduler import LambdaLR, MultiStepLR 13 | from torch.utils.data import DataLoader, Dataset 14 | from torch.utils.data.distributed import DistributedSampler 15 | from torch.cuda.amp import autocast, GradScaler 16 | from tqdm import tqdm 17 | from time import time 18 | 19 | from utils.dist_util import get_dist_info, init_dist 20 | from utils.logging import get_root_logger 21 | 22 | @torch.no_grad() 23 | def valid_model(model: nn.Module, dataloaders: DataLoader, criterion: nn.Module, cfg: dict) -> None: 24 | total_loss = 0 25 | total_metric = 0 26 | model.eval() 27 | for dataloader in dataloaders: 28 | for batch_idx, batch in enumerate(dataloader): 29 | images, targets, target_weights, __ = batch 30 | images = images.to('cuda') 31 | targets = targets.to('cuda') 32 | target_weights = target_weights.to('cuda') 33 | 34 | outputs = model(images) 35 | loss = criterion(outputs, targets, target_weights) 36 | total_loss += loss.item() 37 | 38 | avg_loss = total_loss/(len(dataloader)*len(dataloaders)) 39 | return avg_loss 40 | 41 | def train_model(model: nn.Module, datasets_train: Dataset, datasets_valid: Dataset, cfg: dict, distributed: bool, validate: bool, timestamp: str, meta: dict) -> None: 42 | logger = get_root_logger() 43 | 44 | # Prepare data loaders 45 | datasets_train = datasets_train if isinstance(datasets_train, (list, tuple)) else [datasets_train] 46 | datasets_valid = datasets_valid if isinstance(datasets_valid, (list, tuple)) else [datasets_valid] 47 | 48 | if distributed: 49 | samplers_train = [DistributedSampler(ds, num_replicas=len(cfg.gpu_ids), rank=torch.cuda.current_device(), shuffle=True, drop_last=False) for ds in datasets_train] 50 | samplers_valid = [DistributedSampler(ds, num_replicas=len(cfg.gpu_ids), rank=torch.cuda.current_device(), shuffle=False, drop_last=False) for ds in datasets_valid] 51 | else: 52 | samplers_train = [None for ds in datasets_train] 53 | samplers_valid = [None for ds in datasets_valid] 54 | 55 | dataloaders_train = [DataLoader(ds, batch_size=cfg.data['samples_per_gpu'], shuffle=True, sampler=sampler, num_workers=cfg.data['workers_per_gpu'], pin_memory=False) for ds, sampler in zip(datasets_train, samplers_train)] 56 | dataloaders_valid = [DataLoader(ds, batch_size=cfg.data['samples_per_gpu'], shuffle=False, sampler=sampler, num_workers=cfg.data['workers_per_gpu'], pin_memory=False) for ds, sampler in zip(datasets_valid, samplers_valid)] 57 | 58 | # put model on gpus 59 | if distributed: 60 | find_unused_parameters = cfg.get('find_unused_parameters', False) 61 | 
# Sets the `find_unused_parameters` parameter in 62 | # torch.nn.parallel.DistributedDataParallel 63 | 64 | model = DistributedDataParallel( 65 | module=model, 66 | device_ids=[torch.cuda.current_device()], 67 | broadcast_buffers=False, 68 | find_unused_parameters=find_unused_parameters) 69 | else: 70 | model = DataParallel(model, device_ids=cfg.gpu_ids) 71 | 72 | # Loss function 73 | criterion = JointsMSELoss(use_target_weight=cfg.model['keypoint_head']['loss_keypoint']['use_target_weight']) 74 | 75 | # Optimizer 76 | optimizer = AdamW(model.parameters(), lr=cfg.optimizer['lr'], betas=cfg.optimizer['betas'], weight_decay=cfg.optimizer['weight_decay']) 77 | 78 | # Layer-wise learning rate decay 79 | lr_mult = [cfg.optimizer['paramwise_cfg']['layer_decay_rate']] * cfg.optimizer['paramwise_cfg']['num_layers'] 80 | layerwise_optimizer = LayerDecayOptimizer(optimizer, lr_mult) 81 | 82 | 83 | # Learning rate scheduler (MultiStepLR) 84 | milestones = cfg.lr_config['step'] 85 | gamma = 0.1 86 | scheduler = MultiStepLR(optimizer, milestones, gamma) 87 | 88 | # Warm-up scheduler 89 | num_warmup_steps = cfg.lr_config['warmup_iters'] # Number of warm-up steps 90 | warmup_factor = cfg.lr_config['warmup_ratio'] # Initial learning rate = warmup_factor * learning_rate 91 | warmup_scheduler = LambdaLR( 92 | optimizer, 93 | lr_lambda=lambda step: warmup_factor + (1.0 - warmup_factor) * step / num_warmup_steps 94 | ) 95 | 96 | # AMP setting 97 | if cfg.use_amp: 98 | logger.info("Using Automatic Mixed Precision (AMP) training...") 99 | # Create a GradScaler object for FP16 training 100 | scaler = GradScaler() 101 | 102 | # Logging config 103 | total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 104 | logger.info(f'''\n 105 | #========= [Train Configs] =========# 106 | # - Num GPUs: {len(cfg.gpu_ids)} 107 | # - Batch size (per gpu): {cfg.data['samples_per_gpu']} 108 | # - LR: {cfg.optimizer['lr']: .6f} 109 | # - Num params: {total_params:,d} 110 | # - AMP: {cfg.use_amp} 111 | #===================================# 112 | ''') 113 | 114 | global_step = 0 115 | for dataloader in dataloaders_train: 116 | for epoch in range(cfg.total_epochs): 117 | model.train() 118 | train_pbar = tqdm(dataloader) 119 | total_loss = 0 120 | tic = time() 121 | for batch_idx, batch in enumerate(train_pbar): 122 | layerwise_optimizer.zero_grad() 123 | 124 | images, targets, target_weights, __ = batch 125 | images = images.to('cuda') 126 | targets = targets.to('cuda') 127 | target_weights = target_weights.to('cuda') 128 | 129 | if cfg.use_amp: 130 | with autocast(): 131 | outputs = model(images) 132 | loss = criterion(outputs, targets, target_weights) 133 | scaler.scale(loss).backward() 134 | clip_grad_norm_(model.parameters(), **cfg.optimizer_config['grad_clip']) 135 | scaler.step(layerwise_optimizer) 136 | scaler.update() 137 | else: 138 | outputs = model(images) 139 | loss = criterion(outputs, targets, target_weights) 140 | loss.backward() 141 | clip_grad_norm_(model.parameters(), **cfg.optimizer_config['grad_clip']) 142 | layerwise_optimizer.step() 143 | 144 | if global_step < num_warmup_steps: 145 | warmup_scheduler.step() 146 | global_step += 1 147 | 148 | total_loss += loss.item() 149 | train_pbar.set_description(f"🏋️> Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Loss {loss.item():.4f} | LR {optimizer.param_groups[0]['lr']:.6f} | Step") 150 | scheduler.step() 151 | 152 | avg_loss_train = total_loss/len(dataloader) 153 | logger.info(f"[Summary-train] Epoch 
[{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Average Loss (train) {avg_loss_train:.4f} --- {time()-tic:.5f} sec. elapsed") 154 | ckpt_name = f"epoch{str(epoch).zfill(3)}.pth" 155 | ckpt_path = osp.join(cfg.work_dir, ckpt_name) 156 | torch.save(model.module.state_dict(), ckpt_path) 157 | 158 | # validation 159 | if validate: 160 | tic2 = time() 161 | avg_loss_valid = valid_model(model, dataloaders_valid, criterion, cfg) 162 | logger.info(f"[Summary-valid] Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Average Loss (valid) {avg_loss_valid:.4f} --- {time()-tic2:.5f} sec. elapsed") 163 | -------------------------------------------------------------------------------- /utils/transform.py: -------------------------------------------------------------------------------- 1 | import math 2 | import cv2 3 | import munkres 4 | import numpy as np 5 | import torch 6 | 7 | 8 | # solution proposed in https://github.com/pytorch/pytorch/issues/229#issuecomment-299424875 9 | def flip_tensor(tensor, dim=0): 10 | """ 11 | flip the tensor on the dimension dim 12 | """ 13 | inv_idx = torch.arange(tensor.shape[dim] - 1, -1, -1).to(tensor.device) 14 | return tensor.index_select(dim, inv_idx) 15 | 16 | 17 | # 18 | # derived from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch 19 | def flip_back(output_flipped, matched_parts): 20 | assert len(output_flipped.shape) == 4, 'output_flipped has to be [batch_size, num_joints, height, width]' 21 | 22 | output_flipped = flip_tensor(output_flipped, dim=-1) 23 | 24 | for pair in matched_parts: 25 | tmp = output_flipped[:, pair[0]].clone() 26 | output_flipped[:, pair[0]] = output_flipped[:, pair[1]] 27 | output_flipped[:, pair[1]] = tmp 28 | 29 | return output_flipped 30 | 31 | 32 | def fliplr_joints(joints, joints_vis, width, matched_parts): 33 | # Flip horizontal 34 | joints[:, 0] = width - joints[:, 0] - 1 35 | 36 | # Change left-right parts 37 | for pair in matched_parts: 38 | joints[pair[0], :], joints[pair[1], :] = \ 39 | joints[pair[1], :], joints[pair[0], :].copy() 40 | joints_vis[pair[0], :], joints_vis[pair[1], :] = \ 41 | joints_vis[pair[1], :], joints_vis[pair[0], :].copy() 42 | 43 | return joints * joints_vis, joints_vis 44 | 45 | 46 | def get_affine_transform(center, scale, pixel_std, rot, output_size, shift=np.array([0, 0], dtype=np.float32), inv=0): 47 | if not isinstance(scale, np.ndarray) and not isinstance(scale, list): 48 | print(scale) 49 | scale = np.array([scale, scale]) 50 | 51 | scale_tmp = scale * 1.0 * pixel_std # It was scale_tmp = scale * 200.0 52 | src_w = scale_tmp[0] 53 | dst_w = output_size[0] 54 | dst_h = output_size[1] 55 | 56 | rot_rad = np.pi * rot / 180 57 | src_dir = get_dir([0, src_w * -0.5], rot_rad) 58 | dst_dir = np.array([0, dst_w * -0.5], np.float32) 59 | 60 | src = np.zeros((3, 2), dtype=np.float32) 61 | dst = np.zeros((3, 2), dtype=np.float32) 62 | src[0, :] = center + scale_tmp * shift 63 | src[1, :] = center + src_dir + scale_tmp * shift 64 | dst[0, :] = [dst_w * 0.5, dst_h * 0.5] 65 | dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir 66 | 67 | src[2:, :] = get_3rd_point(src[0, :], src[1, :]) 68 | dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) 69 | 70 | if inv: 71 | trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) 72 | else: 73 | trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) 74 | 75 | return trans 76 | 77 | 78 | def affine_transform(pt, t): 79 | new_pt = np.array([pt[0], pt[1], 1.]).T 80 | new_pt = np.dot(t, new_pt) 81 | return 
new_pt[:2] 82 | 83 | 84 | def get_3rd_point(a, b): 85 | direct = a - b 86 | return b + np.array([-direct[1], direct[0]], dtype=np.float32) 87 | 88 | 89 | def get_dir(src_point, rot_rad): 90 | sn, cs = np.sin(rot_rad), np.cos(rot_rad) 91 | 92 | src_result = [0, 0] 93 | src_result[0] = src_point[0] * cs - src_point[1] * sn 94 | src_result[1] = src_point[0] * sn + src_point[1] * cs 95 | 96 | return src_result -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | import random 4 | import numpy as np 5 | 6 | from collections import OrderedDict 7 | import os.path as osp 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | from torch import distributed as dist 13 | from torch.nn.parallel import DataParallel, DistributedDataParallel 14 | 15 | from .dist_util import get_dist_info 16 | 17 | MODULE_WRAPPERS = [DataParallel, DistributedDataParallel] 18 | 19 | 20 | def init_random_seed(seed=None, device='cuda'): 21 | """Initialize random seed. 22 | 23 | If the seed is not set, the seed will be automatically randomized, 24 | and then broadcast to all processes to prevent some potential bugs. 25 | 26 | Args: 27 | seed (int, Optional): The seed. Default to None. 28 | device (str): The device where the seed will be put on. 29 | Default to 'cuda'. 30 | 31 | Returns: 32 | int: Seed to be used. 33 | """ 34 | if seed is not None: 35 | return seed 36 | 37 | # Make sure all ranks share the same random seed to prevent 38 | # some potential bugs. Please refer to 39 | # https://github.com/open-mmlab/mmdetection/issues/6339 40 | rank, world_size = get_dist_info() 41 | seed = np.random.randint(2**31) 42 | if world_size == 1: 43 | return seed 44 | 45 | if rank == 0: 46 | random_num = torch.tensor(seed, dtype=torch.int32, device=device) 47 | else: 48 | random_num = torch.tensor(0, dtype=torch.int32, device=device) 49 | dist.broadcast(random_num, src=0) 50 | return random_num.item() 51 | 52 | 53 | def set_random_seed(seed: int, 54 | deterministic: bool = False, 55 | use_rank_shift: bool = False) -> None: 56 | """Set random seed. 57 | 58 | Args: 59 | seed (int): Seed to be used. 60 | deterministic (bool): Whether to set the deterministic option for 61 | CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` 62 | to True and `torch.backends.cudnn.benchmark` to False. 63 | Default: False. 64 | rank_shift (bool): Whether to add rank number to the random seed to 65 | have different random seed in different threads. Default: False. 
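    Example (illustrative only; assumes any distributed process group has
    already been initialized before seeding):
        >>> seed = init_random_seed(None)
        >>> set_random_seed(seed, deterministic=False, use_rank_shift=True)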
66 | """ 67 | if use_rank_shift: 68 | rank, _ = get_dist_info() 69 | seed += rank 70 | random.seed(seed) 71 | np.random.seed(seed) 72 | torch.manual_seed(seed) 73 | torch.cuda.manual_seed(seed) 74 | torch.cuda.manual_seed_all(seed) 75 | os.environ['PYTHONHASHSEED'] = str(seed) 76 | if deterministic: 77 | torch.backends.cudnn.deterministic = True 78 | torch.backends.cudnn.benchmark = False 79 | 80 | def is_module_wrapper(module: nn.Module) -> bool: 81 | """ Check if module wrrapper exists recursively """ 82 | def is_module_in_wrapper(module, module_wrapper): 83 | module_wrappers = tuple(module_wrapper.module_dict.values()) 84 | if isinstance(module, module_wrappers): 85 | return True 86 | for child in module_wrapper.children.values(): 87 | if is_module_in_wrapper(module, child): 88 | return True 89 | return is_module_in_wrapper(module, MODULE_WRAPPERS) 90 | 91 | 92 | def load_state_dict(module, state_dict, strict=False, logger=None): 93 | """Load state_dict to a module. 94 | 95 | This method is modified from :meth:`torch.nn.Module.load_state_dict`. 96 | Default value for ``strict`` is set to ``False`` and the message for 97 | param mismatch will be shown even if strict is False. 98 | 99 | Args: 100 | module (Module): Module that receives the state_dict. 101 | state_dict (OrderedDict): Weights. 102 | strict (bool): whether to strictly enforce that the keys 103 | in :attr:`state_dict` match the keys returned by this module's 104 | :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. 105 | logger (:obj:`logging.Logger`, optional): Logger to log the error 106 | message. If not specified, print function will be used. 107 | """ 108 | unexpected_keys = [] 109 | all_missing_keys = [] 110 | err_msg = [] 111 | 112 | metadata = getattr(state_dict, '_metadata', None) 113 | state_dict = state_dict.copy() 114 | if metadata is not None: 115 | state_dict._metadata = metadata 116 | 117 | # use _load_from_state_dict to enable checkpoint version control 118 | def load(module, prefix=''): 119 | # recursively check parallel module in case that the model has a 120 | # complicated structure, e.g., nn.Module(nn.Module(DDP)) 121 | if is_module_wrapper(module): 122 | module = module.module 123 | local_metadata = {} if metadata is None else metadata.get( 124 | prefix[:-1], {}) 125 | module._load_from_state_dict(state_dict, prefix, local_metadata, True, 126 | all_missing_keys, unexpected_keys, 127 | err_msg) 128 | for name, child in module._modules.items(): 129 | if child is not None: 130 | load(child, prefix + name + '.') 131 | 132 | load(module) 133 | load = None # break load->load reference cycle 134 | 135 | # ignore "num_batches_tracked" of BN layers 136 | missing_keys = [ 137 | key for key in all_missing_keys if 'num_batches_tracked' not in key 138 | ] 139 | 140 | if unexpected_keys: 141 | err_msg.append('unexpected key in source ' 142 | f'state_dict: {", ".join(unexpected_keys)}\n') 143 | if missing_keys: 144 | err_msg.append( 145 | f'missing keys in source state_dict: {", ".join(missing_keys)}\n') 146 | 147 | rank, _ = get_dist_info() 148 | if len(err_msg) > 0 and rank == 0: 149 | err_msg.insert( 150 | 0, 'The model and loaded state dict do not match exactly\n') 151 | err_msg = '\n'.join(err_msg) 152 | if strict: 153 | raise RuntimeError(err_msg) 154 | elif logger is not None: 155 | logger.warning(err_msg) 156 | else: 157 | print(err_msg) 158 | 159 | 160 | def load_checkpoint(model, 161 | filename, 162 | map_location='cpu', 163 | strict=False, 164 | logger=None): 165 | """Load checkpoint from a 
file or URI. 166 | 167 | Args: 168 | model (Module): Module to load checkpoint. 169 | filename (str): Accept local filepath, URL, ``torchvision://xxx``, 170 | ``open-mmlab://xxx``. 171 | map_location (str): Same as :func:`torch.load`. 172 | strict (bool): Whether to allow different params for the model and 173 | checkpoint. 174 | logger (:mod:`logging.Logger` or None): The logger for error message. 175 | 176 | Returns: 177 | dict or OrderedDict: The loaded checkpoint. 178 | """ 179 | checkpoint = torch.load(filename, map_location=map_location) 180 | # OrderedDict is a subclass of dict 181 | if not isinstance(checkpoint, dict): 182 | raise RuntimeError( 183 | f'No state_dict found in checkpoint file {filename}') 184 | # get state_dict from checkpoint 185 | if 'state_dict' in checkpoint: 186 | state_dict_tmp = checkpoint['state_dict'] 187 | else: 188 | state_dict_tmp = checkpoint 189 | 190 | state_dict = OrderedDict() 191 | # strip prefix of state_dict 192 | for k, v in state_dict_tmp.items(): 193 | if k.startswith('module.backbone.'): 194 | state_dict[k[16:]] = v 195 | elif k.startswith('module.'): 196 | state_dict[k[7:]] = v 197 | elif k.startswith('backbone.'): 198 | state_dict[k[9:]] = v 199 | else: 200 | state_dict[k] = v 201 | # load state_dict 202 | load_state_dict(model, state_dict, strict, logger) 203 | return checkpoint 204 | 205 | 206 | def resize(input, 207 | size=None, 208 | scale_factor=None, 209 | mode='nearest', 210 | align_corners=None, 211 | warning=True): 212 | if warning: 213 | if size is not None and align_corners: 214 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 215 | output_h, output_w = tuple(int(x) for x in size) 216 | if output_h > input_h or output_w > output_h: 217 | if ((output_h > 1 and output_w > 1 and input_h > 1 218 | and input_w > 1) and (output_h - 1) % (input_h - 1) 219 | and (output_w - 1) % (input_w - 1)): 220 | warnings.warn( 221 | f'When align_corners={align_corners}, ' 222 | 'the output would more aligned if ' 223 | f'input size {(input_h, input_w)} is `x+1` and ' 224 | f'out size {(output_h, output_w)} is `nx+1`') 225 | if isinstance(size, torch.Size): 226 | size = tuple(int(x) for x in size) 227 | 228 | def constant_init(module: nn.Module, val: float, bias: float = 0) -> None: 229 | if hasattr(module, 'weight') and module.weight is not None: 230 | nn.init.constant_(module.weight, val) 231 | if hasattr(module, 'bias') and module.bias is not None: 232 | nn.init.constant_(module.bias, bias) 233 | 234 | def normal_init(module: nn.Module, 235 | mean: float = 0, 236 | std: float = 1, 237 | bias: float = 0) -> None: 238 | if hasattr(module, 'weight') and module.weight is not None: 239 | nn.init.normal_(module.weight, mean, std) 240 | if hasattr(module, 'bias') and module.bias is not None: 241 | nn.init.constant_(module.bias, bias) -------------------------------------------------------------------------------- /utils/visualization.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import torch 5 | import torchvision 6 | import ffmpeg 7 | 8 | 9 | __all__ = ["joints_dict", "draw_points_and_skeleton"] 10 | 11 | 12 | def joints_dict(): 13 | joints = { 14 | "coco": { 15 | "keypoints": { 16 | 0: "nose", 17 | 1: "left_eye", 18 | 2: "right_eye", 19 | 3: "left_ear", 20 | 4: "right_ear", 21 | 5: "left_shoulder", 22 | 6: "right_shoulder", 23 | 7: "left_elbow", 24 | 8: "right_elbow", 25 | 9: "left_wrist", 26 | 10: "right_wrist", 27 | 11: 
"left_hip", 28 | 12: "right_hip", 29 | 13: "left_knee", 30 | 14: "right_knee", 31 | 15: "left_ankle", 32 | 16: "right_ankle" 33 | }, 34 | "skeleton": [ 35 | # # [16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], 36 | # # [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7] 37 | # [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], 38 | # [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6] 39 | [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], 40 | [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], # [3, 5], [4, 6] 41 | [0, 5], [0, 6] 42 | ] 43 | }, 44 | "mpii": { 45 | "keypoints": { 46 | 0: "right_ankle", 47 | 1: "right_knee", 48 | 2: "right_hip", 49 | 3: "left_hip", 50 | 4: "left_knee", 51 | 5: "left_ankle", 52 | 6: "pelvis", 53 | 7: "thorax", 54 | 8: "upper_neck", 55 | 9: "head top", 56 | 10: "right_wrist", 57 | 11: "right_elbow", 58 | 12: "right_shoulder", 59 | 13: "left_shoulder", 60 | 14: "left_elbow", 61 | 15: "left_wrist" 62 | }, 63 | "skeleton": [ 64 | # [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [13, 3], [12, 2], [13, 12], [13, 14], 65 | # [12, 11], [14, 15], [11, 10], # [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7] 66 | [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [3, 6], [2, 6], [6, 7], [7, 8], [8, 9], 67 | [13, 7], [12, 7], [13, 14], [12, 11], [14, 15], [11, 10], 68 | ] 69 | }, 70 | } 71 | return joints 72 | 73 | 74 | def draw_points(image, points, color_palette='tab20', palette_samples=16, confidence_threshold=0.5): 75 | """ 76 | Draws `points` on `image`. 77 | 78 | Args: 79 | image: image in opencv format 80 | points: list of points to be drawn. 81 | Shape: (nof_points, 3) 82 | Format: each point should contain (y, x, confidence) 83 | color_palette: name of a matplotlib color palette 84 | Default: 'tab20' 85 | palette_samples: number of different colors sampled from the `color_palette` 86 | Default: 16 87 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1] 88 | Default: 0.5 89 | 90 | Returns: 91 | A new image with overlaid points 92 | 93 | """ 94 | try: 95 | colors = np.round( 96 | np.array(plt.get_cmap(color_palette).colors) * 255 97 | ).astype(np.uint8)[:, ::-1].tolist() 98 | except AttributeError: # if palette has not pre-defined colors 99 | colors = np.round( 100 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255 101 | ).astype(np.uint8)[:, -2::-1].tolist() 102 | 103 | circle_size = max(1, min(image.shape[:2]) // 150) # ToDo Shape it taking into account the size of the detection 104 | # circle_size = max(2, int(np.sqrt(np.max(np.max(points, axis=0) - np.min(points, axis=0)) // 16))) 105 | 106 | for i, pt in enumerate(points): 107 | if pt[2] > confidence_threshold: 108 | image = cv2.circle(image, (int(pt[1]), int(pt[0])), circle_size, tuple(colors[i % len(colors)]), -1) 109 | 110 | return image 111 | 112 | 113 | def draw_skeleton(image, points, skeleton, color_palette='Set2', palette_samples=8, person_index=0, 114 | confidence_threshold=0.5): 115 | """ 116 | Draws a `skeleton` on `image`. 117 | 118 | Args: 119 | image: image in opencv format 120 | points: list of points to be drawn. 
121 | Shape: (nof_points, 3) 122 | Format: each point should contain (y, x, confidence) 123 | skeleton: list of joints to be drawn 124 | Shape: (nof_joints, 2) 125 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points` 126 | color_palette: name of a matplotlib color palette 127 | Default: 'Set2' 128 | palette_samples: number of different colors sampled from the `color_palette` 129 | Default: 8 130 | person_index: index of the person in `image` 131 | Default: 0 132 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1] 133 | Default: 0.5 134 | 135 | Returns: 136 | A new image with overlaid joints 137 | 138 | """ 139 | try: 140 | colors = np.round( 141 | np.array(plt.get_cmap(color_palette).colors) * 255 142 | ).astype(np.uint8)[:, ::-1].tolist() 143 | except AttributeError: # if palette has not pre-defined colors 144 | colors = np.round( 145 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255 146 | ).astype(np.uint8)[:, -2::-1].tolist() 147 | 148 | for i, joint in enumerate(skeleton): 149 | pt1, pt2 = points[joint] 150 | if pt1[2] > confidence_threshold and pt2[2] > confidence_threshold: 151 | image = cv2.line( 152 | image, (int(pt1[1]), int(pt1[0])), (int(pt2[1]), int(pt2[0])), 153 | tuple(colors[person_index % len(colors)]), 2 154 | ) 155 | 156 | return image 157 | 158 | 159 | def draw_points_and_skeleton(image, points, skeleton, points_color_palette='tab20', points_palette_samples=16, 160 | skeleton_color_palette='Set2', skeleton_palette_samples=8, person_index=0, 161 | confidence_threshold=0.5): 162 | """ 163 | Draws `points` and `skeleton` on `image`. 164 | 165 | Args: 166 | image: image in opencv format 167 | points: list of points to be drawn. 168 | Shape: (nof_points, 3) 169 | Format: each point should contain (y, x, confidence) 170 | skeleton: list of joints to be drawn 171 | Shape: (nof_joints, 2) 172 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points` 173 | points_color_palette: name of a matplotlib color palette 174 | Default: 'tab20' 175 | points_palette_samples: number of different colors sampled from the `color_palette` 176 | Default: 16 177 | skeleton_color_palette: name of a matplotlib color palette 178 | Default: 'Set2' 179 | skeleton_palette_samples: number of different colors sampled from the `color_palette` 180 | Default: 8 181 | person_index: index of the person in `image` 182 | Default: 0 183 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1] 184 | Default: 0.5 185 | 186 | Returns: 187 | A new image with overlaid joints 188 | 189 | """ 190 | image = draw_skeleton(image, points, skeleton, color_palette=skeleton_color_palette, 191 | palette_samples=skeleton_palette_samples, person_index=person_index, 192 | confidence_threshold=confidence_threshold) 193 | image = draw_points(image, points, color_palette=points_color_palette, palette_samples=points_palette_samples, 194 | confidence_threshold=confidence_threshold) 195 | return image 196 | 197 | 198 | def save_images(images, target, joint_target, output, joint_output, joint_visibility, summary_writer=None, step=0, 199 | prefix=''): 200 | """ 201 | Creates a grid of images with gt joints and a grid with predicted joints. 202 | This is a basic function for debugging purposes only. 
203 | 204 | If summary_writer is not None, the grid will be written in that SummaryWriter with name "{prefix}_images" and 205 | "{prefix}_predictions". 206 | 207 | Args: 208 | images (torch.Tensor): a tensor of images with shape (batch x channels x height x width). 209 | target (torch.Tensor): a tensor of gt heatmaps with shape (batch x channels x height x width). 210 | joint_target (torch.Tensor): a tensor of gt joints with shape (batch x joints x 2). 211 | output (torch.Tensor): a tensor of predicted heatmaps with shape (batch x channels x height x width). 212 | joint_output (torch.Tensor): a tensor of predicted joints with shape (batch x joints x 2). 213 | joint_visibility (torch.Tensor): a tensor of joint visibility with shape (batch x joints). 214 | summary_writer (tb.SummaryWriter): a SummaryWriter where write the grids. 215 | Default: None 216 | step (int): summary_writer step. 217 | Default: 0 218 | prefix (str): summary_writer name prefix. 219 | Default: "" 220 | 221 | Returns: 222 | A pair of images which are built from torchvision.utils.make_grid 223 | """ 224 | # Input images with gt 225 | images_ok = images.detach().clone() 226 | images_ok[:, 0].mul_(0.229).add_(0.485) 227 | images_ok[:, 1].mul_(0.224).add_(0.456) 228 | images_ok[:, 2].mul_(0.225).add_(0.406) 229 | for i in range(images.shape[0]): 230 | joints = joint_target[i] * 4. 231 | joints_vis = joint_visibility[i] 232 | 233 | for joint, joint_vis in zip(joints, joints_vis): 234 | if joint_vis[0]: 235 | a = int(joint[1].item()) 236 | b = int(joint[0].item()) 237 | # images_ok[i][:, a-1:a+1, b-1:b+1] = torch.tensor([1, 0, 0]) 238 | images_ok[i][0, a - 1:a + 1, b - 1:b + 1] = 1 239 | images_ok[i][1:, a - 1:a + 1, b - 1:b + 1] = 0 240 | grid_gt = torchvision.utils.make_grid(images_ok, nrow=int(images_ok.shape[0] ** 0.5), padding=2, normalize=False) 241 | if summary_writer is not None: 242 | summary_writer.add_image(prefix + 'images', grid_gt, global_step=step) 243 | 244 | # Input images with prediction 245 | images_ok = images.detach().clone() 246 | images_ok[:, 0].mul_(0.229).add_(0.485) 247 | images_ok[:, 1].mul_(0.224).add_(0.456) 248 | images_ok[:, 2].mul_(0.225).add_(0.406) 249 | for i in range(images.shape[0]): 250 | joints = joint_output[i] * 4. 
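        # The factor of 4 above maps joint coordinates from heatmap space back to
        # input-image pixels, assuming the usual 4x heatmap stride (e.g. a 256x192
        # input with a 64x48 heatmap). The per-channel mul_/add_ calls earlier in
        # this block undo the ImageNet normalization (std 0.229/0.224/0.225,
        # mean 0.485/0.456/0.406) so the grid renders as a viewable image, and the
        # slice assignments below paint a small red marker (R=1, G=B=0) at every
        # joint whose visibility flag is set.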
251 | joints_vis = joint_visibility[i] 252 | 253 | for joint, joint_vis in zip(joints, joints_vis): 254 | if joint_vis[0]: 255 | a = int(joint[1].item()) 256 | b = int(joint[0].item()) 257 | # images_ok[i][:, a-1:a+1, b-1:b+1] = torch.tensor([1, 0, 0]) 258 | images_ok[i][0, a - 1:a + 1, b - 1:b + 1] = 1 259 | images_ok[i][1:, a - 1:a + 1, b - 1:b + 1] = 0 260 | grid_pred = torchvision.utils.make_grid(images_ok, nrow=int(images_ok.shape[0] ** 0.5), padding=2, normalize=False) 261 | if summary_writer is not None: 262 | summary_writer.add_image(prefix + 'predictions', grid_pred, global_step=step) 263 | 264 | # Heatmaps 265 | # ToDo 266 | # for h in range(0,17): 267 | # heatmap = torchvision.utils.make_grid(output[h].detach(), nrow=int(np.sqrt(output.shape[0])), 268 | # padding=2, normalize=True, range=(0, 1)) 269 | # summary_writer.add_image('train_heatmap_%d' % h, heatmap, global_step=step + epoch*len_dl_train) 270 | 271 | return grid_gt, grid_pred 272 | 273 | 274 | def check_video_rotation(filename): 275 | # thanks to 276 | # https://stackoverflow.com/questions/53097092/frame-from-video-is-upside-down-after-extracting/55747773#55747773 277 | 278 | # this returns meta-data of the video file in form of a dictionary 279 | meta_dict = ffmpeg.probe(filename) 280 | 281 | # from the dictionary, meta_dict['streams'][0]['tags']['rotate'] is the key 282 | # we are looking for 283 | rotation_code = None 284 | try: 285 | if int(meta_dict['streams'][0]['tags']['rotate']) == 90: 286 | rotation_code = cv2.ROTATE_90_CLOCKWISE 287 | elif int(meta_dict['streams'][0]['tags']['rotate']) == 180: 288 | rotation_code = cv2.ROTATE_180 289 | elif int(meta_dict['streams'][0]['tags']['rotate']) == 270: 290 | rotation_code = cv2.ROTATE_90_COUNTERCLOCKWISE 291 | else: 292 | raise ValueError 293 | except KeyError: 294 | pass 295 | 296 | return rotation_code 297 | --------------------------------------------------------------------------------
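To tie the helpers in utils/visualization.py together, the following is a minimal usage sketch (not part of the repository, written for illustration): it assumes the repository root is on PYTHONPATH, reuses the bundled examples/img1.jpg, and fills the keypoint array with dummy values where real predictions from the model's decoded heatmaps would go. Points are expected in (y, x, confidence) order, matching the docstrings above.

import cv2
import numpy as np

from utils.visualization import joints_dict, draw_points_and_skeleton

# Hypothetical keypoints for one person: 17 COCO joints as (y, x, confidence).
points = np.zeros((17, 3), dtype=np.float32)
points[0] = [80.0, 120.0, 0.9]  # nose, dummy values for illustration
# ... the remaining rows would be filled from the model's decoded heatmaps ...

image = cv2.imread('examples/img1.jpg')        # sample image shipped with the repo
skeleton = joints_dict()['coco']['skeleton']   # COCO limb connectivity

overlay = draw_points_and_skeleton(
    image, points, skeleton,
    points_color_palette='tab20', points_palette_samples=16,
    skeleton_color_palette='Set2', skeleton_palette_samples=8,
    person_index=0, confidence_threshold=0.5)

cv2.imwrite('examples/img1_overlay.jpg', overlay)

Only joints whose confidence exceeds confidence_threshold are drawn, and a skeleton edge is drawn only when both of its endpoints pass that threshold, so low-confidence detections simply drop out of the overlay.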